2013-10-22 38 views
0

我在cuda中爲Visual Studio 2010中的圖像處理製作算法。在我的編碼中,我遇到了處理線程和cuda塊的問題。所以我的C和CUDA示例代碼如下,C代碼工作正常,但CUDA代碼不能正常工作。 我的C代碼Cuda的線程和塊不能正常工作

void checkGpuBlockValue(unsigned int *a,unsigned int *b,int length) 
{ 
    for(int i=0;i<length;i++){ 
     b[i]=a[i]+i; 
    } 

} 

int main() 
{ 
    const int range=1000; 
    unsigned int *a=new unsigned int[range]; 
    unsigned int *b=new unsigned int[range]; 

    for(int i=0;i<range;i++) 
    { 
     a[i]=i; 
    } 

checkGpuBlockValue(a,b,range); 

for(int j=0;j<range;j++) 
    { 
     cout<<"b["<<j<<"] = "<<b[j]<<std::endl; 
    } 
} 

輸出=

OutPut : 
b[0] = 0 
b[1] = 2 
b[2] = 4 
b[3] = 6 
b[4] = 8 
. 
. 
. 
. 
. 

b[996] = 1992 
b[997] = 1994 
b[998] = 1996 
b[999] = 1998 

能正常工作。

我的CUDA代碼(工作不正常)是;

__global__ 
void checkGpuBlockValue(unsigned int *a,unsigned int *b,int length) 
{ 
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x; 

    if(i<length){ 
     b[i]=a[i]+i; 
    } 

} 

int main() 
{ 
    const int range=1000; 
    unsigned int *a=new unsigned int[range]; 
    unsigned int *b=new unsigned int[range]; 

    unsigned int *dev_a; 
    unsigned int *dev_b; 

    for(int i=0;i<range;i++) 
    { 
     a[i]=i; 
    } 

    cudaMalloc((void**)&dev_a, range* sizeof(unsigned int)); 
    cudaMalloc((void**)&dev_b, range* sizeof(unsigned int)); 

    cudaMemcpy(dev_a, a, range, cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_b, a, range, cudaMemcpyHostToDevice); 

    static const int BLOCK_WIDTH = 8;  

    //1024 is the maximum number of threads per block for modern GPUs. 

    int x = static_cast<int>(ceilf(static_cast<float>(range)/BLOCK_WIDTH)); 


    const dim3 grid (x,1);        
    const dim3 block(BLOCK_WIDTH,1);  

    checkGpuBlockValue<<<grid,block>>>(dev_a,dev_b,range); 
    cudaDeviceSynchronize(); 

    cudaMemcpy(b, dev_b, range, cudaMemcpyDeviceToHost); 


    for(int j=0;j<range;j++) 
    { 
     cout<<"b["<<j<<"] = "<<b[j]<<std::endl; 
    } 

    cudaFree(dev_a); 
    cudaFree(dev_b); 
} 

OUT PUT是:

Out Put = 
b[0] = 0 
b[1] = 2 
b[2] = 4 
b[3] = 6 
. 
. 
. 
. 
. 
b[242] = 484 
b[243] = 486 
b[244] = 488 
b[245] = 490 
b[246] = 492 
b[247] = 494 
b[248] = 496 
b[249] = 498 
b[250] = 3452816845 
b[251] = 3452816845 
b[252] = 3452816845 
b[253] = 3452816845 
b[254] = 3452816845 
b[255] = 3452816845 
b[256] = 3452816845 
. 
. 
. 
. 
. 
. 
b[996] = 3452816845 
b[997] = 3452816845 
b[998] = 3452816845 
b[999] = 3452816845 

在我的代碼1M在INT * a和比添加puting值0到1000 *一個帶有值從0到1000和結果在int值存儲* b。 所以我的代碼在0到249(高達250)循環中工作良好,但在250之後它給出了錯誤的值。 那麼我在這裏做什麼錯了?請給我建議。

回答

1

只要看一眼你的代碼看起來像你的問題是在這些線路

cudaMemcpy(dev_a, a, range, cudaMemcpyHostToDevice); 
cudaMemcpy(dev_b, a, range, cudaMemcpyHostToDevice); 
.... 
.... 
cudaMemcpy(b, dev_b, range, cudaMemcpyDeviceToHost); 

應該

cudaMemcpy(dev_a, a, range* sizeof(unsigned int), cudaMemcpyHostToDevice); 
cudaMemcpy(dev_b, a, range* sizeof(unsigned int), cudaMemcpyHostToDevice); 
.... 
.... 
cudaMemcpy(b, dev_b, range * sizeof(unsigned int), cudaMemcpyDeviceToHost); 

我只是通過修改代碼,它的工作原理如你預期檢查。但我強烈建議你做適當的錯誤檢查作爲良好的編程習慣。

+0

謝謝薩加爾先生。現在工作正常。我接受我的錯誤。 – Jay