When I try to build and run an OpenCL vector addition kernel with NVIDIA (under Visual Studio 2010), I run into some strange behavior. I get the following output:
C[0]=1=0+1
C[1]=1.#INF=1+2
C[2]=-5.87747e-039=2+3
C[3]=-1.76324e-038=3+4
C[4]=-2.93874e-038=4+5
C[5]=-4.11423e-038=5+6
C[6]=-5.87747e-038=6+7
C[7]=-8.22846e-038=7+8
C[8]=-1.05794e-037=8+9
C[9]=-1.29304e-037=9+10
If I replace the operation in the kernel,
C[i] = A[i] + B[i];
with a plain assignment of A or B to C, with no arithmetic at all, like below:
C[i] = A[i];
the output that comes back is correct. I take this to mean the buffers are being read and written correctly.
Also, it returns weird values when I replace the operation with an assignment of a constant value to C, such as:
C[i] = 999;
Here is my host program:
#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdlib>
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
int main(int argc, char *argv[]) {
    try {
        std::cout << "C" << "\n";
        std::ifstream ifs("vector_add.cl");
        std::string kernelSource((std::istreambuf_iterator<char>(ifs)),
                                 (std::istreambuf_iterator<char>()));
        cl_uint N = 10;

        std::vector<cl::Platform> platform1;
        cl::Platform::get(&platform1);
        std::vector<cl::Device> device1;
        //platform1.front().getDevices(CL_DEVICE_TYPE_ALL, &device1);
        platform1.front().getDevices(CL_DEVICE_TYPE_GPU, &device1);
        cl::Context context1 = cl::Context(device1);

        cl::Program::Sources src_code1;
        src_code1.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size()));
        cl::Program program1 = cl::Program(context1, src_code1);
        program1.build(device1);
        cl::Kernel kernel1(program1, "vector_add");

        // Create a command queue and use the first device
        cl::Device& device1_addr = device1.front();
        cl::CommandQueue queue1(context1, device1_addr);

        float A[10], B[10], C[10];
        for (int i = 0; i < 10; i++) {
            A[i] = float(i);
            B[i] = 1.0 + i;
        }

        cl::Buffer A_buff(context1, CL_MEM_READ_ONLY,  sizeof(cl_float)*N);
        cl::Buffer B_buff(context1, CL_MEM_READ_ONLY,  sizeof(cl_float)*N);
        cl::Buffer C_buff(context1, CL_MEM_WRITE_ONLY, sizeof(cl_float)*N);
        queue1.enqueueWriteBuffer(A_buff, CL_TRUE, 0, sizeof(cl_float)*N, A);
        queue1.enqueueWriteBuffer(B_buff, CL_TRUE, 0, sizeof(cl_float)*N, B);

        kernel1.setArg(0, A_buff);
        kernel1.setArg(1, B_buff);
        kernel1.setArg(2, C_buff);
        queue1.enqueueNDRangeKernel(kernel1, 0, cl::NDRange(10), cl::NDRange(1));
        queue1.enqueueReadBuffer(C_buff, CL_TRUE, 0, sizeof(cl_float)*N, C);

        for (int i = 0; i < 10; i++) {
            std::cout << "C[" << i << "]=" << C[i] << "=" << A[i] << "+" << B[i] << "\n";
        }
    } catch (cl::Error& err) {
        std::cerr << "OpenCL error: " << err.what() << "(" << err.err() << ")" << std::endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
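One small note on the enqueue call above: the second parameter of enqueueNDRangeKernel is the global work offset. Passing a plain 0 appears to work because cl::NDRange can be constructed from a single size, but the more usual idiom is cl::NullRange for the offset, and cl::NullRange for the local size as well if you want the runtime to pick a work-group size instead of forcing groups of one work-item, e.g.:

    queue1.enqueueNDRangeKernel(kernel1, cl::NullRange, cl::NDRange(N), cl::NullRange);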
And here is my kernel:
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
    // Get the index of the current element to be processed
    int i = get_global_id(0);

    // Do the operation
    C[i] = A[i] + B[i];
    //C[i] = B[i];
    //C[i] = 999;
}
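Comparing the two listings explains the numbers: the host fills and reads the buffers as cl_float, while the kernel declares the pointers as int, so the kernel adds the float bit patterns as 32-bit integers and the host then reads the integer sums back as floats. For example, the bit patterns of 2.0f and 3.0f, added as integers and reinterpreted as a float, give about -5.87747e-39, which is exactly what C[2] shows. A small standalone C++ check of that reinterpretation (a sketch, independent of OpenCL and not part of the program above):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Copy the raw bytes of a float into a 32-bit integer and back again,
    // which is effectively what the int/float mismatch between kernel and
    // host does to the data. Unsigned arithmetic keeps the wrap-around
    // well defined in plain C++.
    static std::uint32_t float_bits(float f) {
        std::uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        return u;
    }

    static float bits_as_float(std::uint32_t u) {
        float f;
        std::memcpy(&f, &u, sizeof f);
        return f;
    }

    int main() {
        // A[2] + B[2]: the bit patterns of 2.0f and 3.0f added as integers,
        // then read back as a float -> about -5.87747e-39 (matches C[2]).
        std::uint32_t sum = float_bits(2.0f) + float_bits(3.0f);
        std::cout << bits_as_float(sum) << "\n";

        // A[1] + B[1]: 1.0f and 2.0f added the same way -> +infinity (matches C[1]).
        std::cout << bits_as_float(float_bits(1.0f) + float_bits(2.0f)) << "\n";

        // The constant case: the integer 999, read back as a float, is a
        // tiny denormal (about 1.4e-42), not 999.
        std::cout << bits_as_float(999) << "\n";
        return 0;
    }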
Oops... that is such an embarrassing mistake... Thanks! – user2280883
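Judging from the code, the mistake is presumably the type mismatch noted above: the host treats the buffers as cl_float while the kernel declares them as int. A minimal corrected kernel sketch (assuming the host side keeps using cl_float) would just change the pointer types:

    __kernel void vector_add(__global const float *A, __global const float *B, __global float *C) {
        // Same indexing as before, but with element types that match the
        // cl_float data the host writes and reads.
        int i = get_global_id(0);
        C[i] = A[i] + B[i];
    }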