【標題:部分封裝OpenCL導致段錯誤,附代碼示例】這讓我完全摸不著頭緒。兩段邏輯上應該等價的代碼,其中一段只在GPU上崩潰,而在CPU上運行良好。下面是測試代碼:
#include <cstddef>
#include <cstdint>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <CL/cl.hpp>
class Device
{
public:
cl::Platform platform_;
cl::Device device_;
cl::Context context_;
cl::CommandQueue queue_;
Device(void) : platform_()
, device_()
, context_()
, queue_() {}
Device(int32_t platform, int32_t device) : platform_()
, device_()
, context_()
, queue_()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
platform_ = platforms[platform];
std::vector<cl::Device> devices;
platform_.getDevices(CL_DEVICE_TYPE_GPU, &devices);
device_ = devices[device];
cl_context_properties properties[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platform_)(),
0
};
cl_int clErr = CL_SUCCESS;
context_ = cl::Context(device_, properties, NULL, NULL, &clErr);
queue_ = cl::CommandQueue(context_,device_,0,&clErr);
}
};
int main()
{
Device device(0,0);
cl::Program::Sources source;
std::string src =
"__kernel void Pointless(uint total, __global uint *data)"\
"{"\
" uint perStream=total/get_global_size(0);"\
" __global uint *dest=data+get_global_id(0)*perStream;"\
" for(uint i=0;i<perStream;i++)"\
" dest[i] = 1;"\
"}";
source.push_back({src.c_str(),src.length()});
cl_int clErr = CL_SUCCESS;
cl::Program program = cl::Program(device.context_,source,&clErr);
if (clErr != CL_SUCCESS)
{
std::cerr << "Failed to create program: " << clErr << std::endl;
return 1;
}
clErr = program.build({device.device_});
if(clErr != CL_SUCCESS)
{
std::cerr << "Failed to build program: " << clErr << std::endl;
std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device.device_) << std::endl;
return 1;
}
uint32_t samples = 16*256;
cl::make_kernel<cl_uint,cl::Buffer> Pointless(cl::Kernel(program,"Pointless"));
cl::Buffer device_samples(device.context_,CL_MEM_READ_WRITE,sizeof(cl_uint)*samples);
Pointless(cl::EnqueueArgs(device.queue_, cl::NDRange(16)), samples, device_samples).wait();
std::vector<cl_uint> host_samples(samples);
device.queue_.enqueueReadBuffer(device_samples,CL_TRUE,0,sizeof(cl_uint)*samples,host_samples.data());
for (auto x: host_samples)
std::cout << x;
std::cout << std::endl;
return 0;
}
上面的代碼會失敗:在調用enqueueReadBuffer時發生段錯誤。更有趣的是,它只在GPU(英特爾P4000)上失敗;在CPU(i3 3xxx)上運行沒有問題(將CL_DEVICE_TYPE_GPU更改爲CL_DEVICE_TYPE_CPU即可在CPU上測試)。
下面的代碼適用於兩種設備類型。
#include <iostream>
#include <CL/cl.hpp>
int main()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
cl::Platform platform = platforms[0];
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
cl::Device device = devices[0];
cl_context_properties properties[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platform)(),
0
};
cl_int clErr = CL_SUCCESS;
cl::Context context(device, properties, NULL, NULL, &clErr);
cl::CommandQueue queue(context,device,0,&clErr);
cl::Program::Sources source;
std::string src =
"__kernel void Pointless(uint total, __global uint *data)"\
"{"\
" uint perStream=total/get_global_size(0);"\
" __global uint *dest=data+get_global_id(0)*perStream;"\
" for(uint i=0;i<perStream;i++)"\
" dest[i] = 1;"\
"}";
source.push_back({src.c_str(),src.length()});
cl::Program program = cl::Program(context,source,&clErr);
clErr = program.build({device});
if(clErr != CL_SUCCESS)
{
std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
}
uint32_t samples = 16*256;
cl::make_kernel<cl_uint,cl::Buffer> Pointless(cl::Kernel(program,"Pointless"));
cl::Buffer device_samples(context,CL_MEM_READ_WRITE,sizeof(cl_uint)*samples);
Pointless(cl::EnqueueArgs(queue, cl::NDRange(16)), samples, device_samples).wait();
std::vector<cl_uint> host_samples(samples);
queue.enqueueReadBuffer(device_samples,CL_TRUE,0,sizeof(cl_uint)*samples,host_samples.data());
for (auto x: host_samples)
std::cout << x;
std::cout << std::endl;
return 0;
}
顯然我在這裏漏掉了某些非常基本的東西。兩個程序都使用英特爾ICD(我這個系統上沒有AMD設備)。
你所有的opencl文件版本是否相互匹配? –
@huseyin GPU和CPU可能使用不同版本的ICD,但我不確定這是否有影響:如果問題出在ICD上,上面兩組代碼在GPU上不是都應該失敗嗎?目前只有第一組代碼會失敗。 – bhimberg
@huseyin兩者都使用相同的DLL:IntelOpenCL64.dll,版本1.0.1.1003。 – bhimberg