2013-12-08 104 views
2

我想把資料從全局內存複製到共享內存,於是寫了下面這個內核:

// Copies `image` to `out`, three consecutive bytes per element (presumably RGB
// pixels), staging each element in block-local shared memory on the way.
//
// NOTE(review): the shared-memory slot is derived from threadIdx.x alone. With
// the 2D launch used by the accompanying host code (dim3 threads(16,16)),
// threadIdx.x only ranges over 0..15, so both `threadIdx.x < 256` guards are
// always true, and all threads that share a threadIdx.x but differ in
// threadIdx.y use the same slot: they race on the write and may read back a
// value stored by another thread. A block-local linear index such as
// threadIdx.x + threadIdx.y * blockDim.x would give each thread its own slot.
//
// image: input buffer, read at index*3 + {0,1,2}.
// out:   output buffer, written at index*3 + {0,1,2}.
// n, m:  not used by this kernel (no bounds check against the buffer size).
__global__ void test(unsigned char *image, unsigned char *out, int n, int m) 
{ 
     // Global 2D coordinates of this thread within the whole grid.
     int x = threadIdx.x + blockIdx.x * blockDim.x; 
     int y = threadIdx.y + blockIdx.y * blockDim.y; 
     // Linear element index; the row stride is the total thread count along x.
     int index = x + y * blockDim.x * gridDim.x; 


    // Staging buffer sized for 16*16 elements of 3 bytes each.
    __shared__ unsigned char shared [16*16*3]; 

    if (threadIdx.x < 256) 

    { 

    // Global -> shared copy of the three bytes of this thread's element.
    shared[threadIdx.x*3+0] = image[index*3+0]; 
    shared[threadIdx.x*3+1] = image[index*3+1]; 
    shared[threadIdx.x*3+2] = image[index*3+2]; 


    } 

    // Block-wide barrier; placed outside the conditionals so every thread reaches it.
    __syncthreads(); 

    if (threadIdx.x < 256) 

    { 
    // Shared -> global copy back to the same element position in `out`.
    out[index*3+0] = shared[threadIdx.x*3+0]; 
    out[index*3+1] = shared[threadIdx.x*3+1]; 
    out[index*3+2] = shared[threadIdx.x*3+2]; 
    } 

} 

我有一張 512×512 的圖像,並且像下面這樣調用內核:

// Host-side driver for the `test` kernel: allocates the host result buffer and
// both device buffers, uploads the input image, launches the kernel and
// downloads the result. `image`, `out`, `dev_image`, `dev_out`, `n` and `m`
// are expected to be declared by the surrounding code.
out = (unsigned char*) malloc(n*m*3);
cudaMalloc((void**)&dev_image, n*m*3);
cudaMalloc((void**)&dev_out, n*m*3);
cudaMemcpy(dev_image, image, n*m*3, cudaMemcpyHostToDevice);
// dev_out is only written by the kernel, so the freshly malloc'ed
// (uninitialized) `out` buffer is not uploaded into it.

// 32 x 32 blocks of 16 x 16 threads = 512 x 512 threads in total.
dim3 threads(16,16);
dim3 blocks(32, 32);

test<<<blocks, threads>>>(dev_image, dev_out, n, m);
// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
cudaDeviceSynchronize();

cudaMemcpy(out, dev_out, n*m*3, cudaMemcpyDeviceToHost);

有人知道我哪裡做錯了嗎?要如何把全局內存的一部分複製到(一維的)共享內存?

回答

3

在您的內核中,檢查 threadIdx.x < 256 是錯誤的,因爲在 16x16 的線程塊中 threadIdx.x 不會大於 15。您應該改爲檢查線程在 16x16 線程塊內的線性索引。

我把你的內核改成了這樣:

// Copies `image` to `out`, three consecutive bytes per element, passing each
// element through a per-thread slot in block-local shared memory.
//
// The shared buffer holds 16*16 three-byte elements; a thread takes part only
// if its block-local linear index is below 256 and its global linear index is
// below n*m.
//
// image: input buffer, read at index*3 + {0,1,2}.
// out:   output buffer, written at index*3 + {0,1,2}.
// n, m:  their product bounds the global element index.
__global__ void test(unsigned char *image, unsigned char *out, int n, int m)
{
    __shared__ unsigned char shared[16*16*3];

    // Global 2D coordinates of this thread within the whole grid.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Row stride is the total number of threads along x.
    const unsigned int rowStride = blockDim.x * gridDim.x;
    const int index = col + row * rowStride;

    // Linear position of this thread inside its block; selects the shared slot.
    const int slot = threadIdx.y * blockDim.x + threadIdx.x;

    // The same condition guards both the load and the store phase.
    const bool active = (slot < 256) && (index < n*m);

    if (active)
    {
        for (int c = 0; c < 3; ++c)
        {
            shared[slot*3 + c] = image[index*3 + c];
        }
    }

    // Kept outside the guards so that every thread of the block reaches it.
    __syncthreads();

    if (active)
    {
        for (int c = 0; c < 3; ++c)
        {
            out[index*3 + c] = shared[slot*3 + c];
        }
    }
}

此外,永遠不要忘記在內核中做邊界檢查(我也已經加上了),以及對內核啓動和 CUDA API 調用做適當的錯誤檢查。