2
我想從全局內存複製到共享內存中,我做了以下複製全局共享內存
__global__ void test(unsigned char *image, unsigned char *out, int n, int m)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int index = x + y * blockDim.x * gridDim.x;
__shared__ unsigned char shared [16*16*3];
if (threadIdx.x < 256)
{
shared[threadIdx.x*3+0] = image[index*3+0];
shared[threadIdx.x*3+1] = image[index*3+1];
shared[threadIdx.x*3+2] = image[index*3+2];
}
__syncthreads();
if (threadIdx.x < 256)
{
out[index*3+0] = shared[threadIdx.x*3+0];
out[index*3+1] = shared[threadIdx.x*3+1];
out[index*3+2] = shared[threadIdx.x*3+2];
}
}
我有一個512×512的圖像和我打電話那樣的內核:
out = (unsigned char*) malloc(n*m*3);
cudaMalloc((void**)&dev_image, n*m*3);
cudaMalloc((void**)&dev_out, n*m*3);
cudaMemcpy(dev_image, image, n*m*3, cudaMemcpyHostToDevice);
cudaMemcpy(dev_out, out, n*m*3, cudaMemcpyHostToDevice);
dim3 threads(16,16);
dim3 blocks(32, 32);
test<<<blocks, threads>>>(dev_image, dev_out, n, m);
cudaThreadSynchronize();
cudaMemcpy(out, dev_out, n*m*3, cudaMemcpyDeviceToHost);
任何想法我做錯了什麼?如何將全局內存的一部分複製到共享內存(一維)?