1
我想在設備上分配矩陣,在內核中填充一些數字,然後將其複製回主機。問題是在主機上只有一行似乎被填滿。CUDA二維數組
我得到了這樣的事情:
9 9 9 9
-1 -1 -1 -1
-1 -1 -1 -1
-1 -1 -1 -1
這裏是我的代碼:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
void check(cudaError x) {
fprintf(stderr, "%s\n", cudaGetErrorString(x));
}
void showMatrix2(int* v1, int width, int height) {
printf("---------------------\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
printf("%d ", v1[i * width + j]);
}
printf("\n");
}
}
__global__ void kernel(int* tab,int width, int height, int pitch) {
int row = threadIdx.x + blockIdx.x * blockDim.x;
int col = threadIdx.y + blockIdx.y * blockDim.y;
if (row < width && col < height) {
tab[col * pitch + row] = 9;
}
}
int main()
{
int width = 4;
int height = 4;
int* d_tab;
int* h_tab;
int realSize = width * height* sizeof(int);
size_t pitch;
check(cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height));
h_tab = (int*)malloc(realSize);
check(cudaMemset(d_tab, 0, realSize));
dim3 grid(4, 4);
dim3 block(4, 4);
kernel <<<grid, block>>>(d_tab, width, height, pitch);
check(cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost));
showMatrix2(h_tab, width, height);
printf("\nPitch size: %d \n", pitch);
getchar();
return 0;
}
非常感謝幫助:) – Knight