2013-06-25 78 views
0

當我嘗試使用nvidia編譯opencl向量添加內核(在visual studio 2010下)時,我遇到了這種奇怪的行爲。我得到以下輸出:用nvidia(visual studio 2010)奇怪的opencl向量添加行爲

C[0]=1=0+1 
C[1]=1.#INF=1+2 
C[2]=-5.87747e-039=2+3 
C[3]=-1.76324e-038=3+4 
C[4]=-2.93874e-038=4+5 
C[5]=-4.11423e-038=5+6 
C[6]=-5.87747e-038=6+7 
C[7]=-8.22846e-038=7+8 
C[8]=-1.05794e-037=8+9 
C[9]=-1.29304e-037=9+10 

如果我通過分配與A或B C替換籽粒

C[i] = A[i] + B[i]; 

的操作,無需算術運算,如低於:

C[i] = A[i]; 

返回的輸出將是正確的。我想這意味着它正確地讀取&寫入緩衝區。

而且,它返回怪異的值時,我與常數值分配c更換操作,如:

C[i] = 999; 

這裏是我的主機程序:

#include "stdafx.h" 
#include <stdio.h> 
#include <stdlib.h> 
#include <iostream> 
#include <fstream> 
#include <vector> 
#include <cstdlib> 

#define __CL_ENABLE_EXCEPTIONS 
#include <CL/cl.hpp> 

int main(int argc, char *argv[]) { 
    try { 
std::cout<<"C"<<"\n"; 

std::ifstream ifs("vector_add.cl"); 
std::string kernelSource((std::istreambuf_iterator<char>(ifs)), 
         (std::istreambuf_iterator<char>() )); 

     cl_uint N = 10; 
     std::vector<cl::Platform> platform1; 
     cl::Platform::get(&platform1); 
     std::vector<cl::Device> device1; 
     //platform1.front().getDevices(CL_DEVICE_TYPE_ALL, &device1); 
     platform1.front().getDevices(CL_DEVICE_TYPE_GPU, &device1); 
     cl::Context context1 = cl::Context(device1); 
     cl::Program::Sources src_code1; 
     src_code1.push_back(std::make_pair(kernelSource.c_str(),kernelSource.size())); 
     cl::Program program1 = cl::Program(context1, src_code1); 
     program1.build(device1); 
     cl::Kernel kernel1(program1, "vector_add"); 
     // Create a command queue and use the first device 
     cl::Device& device1_addr = device1.front(); 
     cl::CommandQueue queue1(context1, device1_addr); 

     float A[10], B[10], C[10]; 
     for (int i=0;i<10;i++) { 
      A[i]=float(i); 
      B[i]=1.0+i; 
     } 
     cl::Buffer A_buff(context1,CL_MEM_READ_ONLY,sizeof(cl_float)*N); 
     cl::Buffer B_buff(context1,CL_MEM_READ_ONLY,sizeof(cl_float)*N); 
     cl::Buffer C_buff(context1,CL_MEM_WRITE_ONLY,sizeof(cl_float)*N); 
     queue1.enqueueWriteBuffer(A_buff, CL_TRUE, 0, sizeof(cl_float)*N, A); 
     queue1.enqueueWriteBuffer(B_buff, CL_TRUE, 0, sizeof(cl_float)*N, B); 
     kernel1.setArg(0,A_buff); 
     kernel1.setArg(1,B_buff); 
     kernel1.setArg(2,C_buff); 

     queue1.enqueueNDRangeKernel(kernel1, 0, cl::NDRange(10), cl::NDRange(1)); 
     queue1.enqueueReadBuffer(C_buff, CL_TRUE, 0, sizeof(cl_float)*N, C); 
     for (int i=0;i<10;i++) { 
      std::cout<<"C["<<i<<"]="<<C[i]<<"="<<A[i]<<"+"<<B[i]<<"\n"; 
     } 
    } catch(cl::Error& err) { 
     std::cerr << "OpenCL error: " << err.what() << "(" << err.err() << 
      ")" << std::endl; 

     return EXIT_FAILURE; 
    } 

    return EXIT_SUCCESS; 
} 

,而這裏是我的內核:

__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) { 

     // Get the index of the current element to be processed 
     int i = get_global_id(0); 



    // Do the operation 
    C[i] = A[i] + B[i]; 
    //C[i] = B[i]; 
    //C[i] = 999; 
} 

回答

0

你的內核對其所有參數使用「__global const int *」,這應該是「__global const float *」作爲您的主機代碼使用float。

+0

糟糕......這是如此尷尬的錯誤...謝謝! – user2280883