我有我的程序有2個問題。 這裏是我的程序的一部分C CUDA卷積bug(s)
主程序將調用卷積2d函數 此時內核只包含順序代碼。 因爲我可以測試所有的數據傳遞是否正確。
問題1是經過過濾的dev_filter在kenel 我嘗試了很多東西,但沒有奏效
問題2是如何在順序部分的所有這些循環並行化這一點。
我希望我做了我的問題,明確
#define FILTER_WIDTH 3
#define FILTER_HEIGTH 3
float SOBEL_FILTER_X[FILTER_HEIGTH][FILTER_WIDTH] = { {-1, 0, 1}, {-2, 0, 2}, {-1, 0, 1} };
float SOBEL_FILTER_Y[FILTER_HEIGTH][FILTER_WIDTH] = { { 1, 2, 1}, { 0, 0, 0}, {-1,-2,-1} };
gray_image_t convolution2D(gray_image_t in, int imgW, int imgH, float filter[FILTER_HEIGTH][FILTER_WIDTH]) {
int imgS = imgW * imgH;
gray_image_t out, dev_in, dev_out;
float dev_filter[FILTER_HEIGTH][FILTER_WIDTH];
int filterS = FILTER_HEIGTH * FILTER_WIDTH;
//allocate memory
out = (gray_image_t) calloc(imgS, sizeof(float));
if (out == NULL) return NULL;
checkCudaCall(cudaMalloc(&dev_in, imgS * sizeof(float)));
checkCudaCall(cudaMalloc(&dev_out, imgS * sizeof(float)));
//memcopy
checkCudaCall(cudaMemcpy(dev_in,in,imgS * sizeof(float), cudaMemcpyHostToDevice));
timer convolution2D_kernel_timer("Convolution2D_kernel_timer");
convolution2D_kernel_timer.start();
convolution_2DKernel<<<AMOUNT_OF_BLOCKS, THREADS_PER_BLOCK>>>(dev_in,dev_out,imgW,imgH,dev_filter);
convolution2D_kernel_timer.stop();
std::cout << convolution2D_kernel_timer;
checkCudaCall(cudaThreadSynchronize());
checkCudaCall(cudaMemcpy(out,dev_out,imgS * sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(dev_in);
cudaFree(dev_out);
return out;
}
,這裏是內核
__global__ void convolution_2DKernel(gray_image_t dev_in, gray_image_t dev_out, int imgW,int imgH,float dev_filter[FILTER_HEIGTH][FILTER_WIDTH]){
// find center position of kernel (half of kernel size)
int kCenterX = FILTER_WIDTH/2;
int kCenterY = FILTER_HEIGTH/2;
for(int y=0; y < imgH; y++) {
for(int x=0; x < imgW; x++) {
for(int m=0; m < FILTER_HEIGTH; ++m) {
for(int n=0; n < FILTER_WIDTH; ++n) {
// index of input signal, used for checking boundary
int yy = y + (m - kCenterY);
int xx = x + (n - kCenterX);
// ignore input samples which are out of bound
if(yy >= 0 && yy < imgH && xx >= 0 && xx < imgW) {
dev_out[y*imgW+x] += dev_in[yy*imgW+xx] * dev_filter[m][n];
}
}
}
}
}
}
您好我試過它與使用cudaMallocPitch和cudaMemcpy2D ,但我仍然得到相同的錯誤
這段代碼有很多低效率的問題,所以你可以考慮使用一個已經可用的CUDA卷積包。免責聲明是,我工作的一個,我認爲會對你有用,ArrayFire,http://www.accelereyes.com/arrayfire_cuda/group__CONV__mat.htm – arrayfire 2012-02-15 03:37:03