我正在努力學習CUDA。我有一些MPI的基本經驗,所以我想我會從一些非常簡單的矢量操作開始。我正在嘗試編寫一個並行化的點積產品。我要麼無法爲CUDA設備分配/寫入內存,要麼無法正確地將其返回主機(cudaMemcpy())。CUDA內存分配和訪問問題
/*Code for a CUDA test project doing a basic dot product with doubles
*
*
*
*/
#include <stdio.h>
#include <cuda.h>
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *dot){
dot[0] += array_a[threadIdx.x] * array_b[threadIdx.x];
}
__global__ void GPU_parallelSetupVector(double *vector, int dim, int incrSize, int start){
if(threadIdx.x<dim){
vector[threadIdx.x] = start + threadIdx.x * incrSize;
}
}
__host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){
for(int i=0; i<dim; ++i){
dot[0] += first[i] * second[i];
}
}
__host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){
for(int i=0; i<dim; ++i){
vector[i] = start + i * incrSize;
}
}
int main(){
//define array size to be used
//int i,j;
int VECTOR_LENGTH = 8;
int ELEMENT_SIZE = sizeof(double);
//arrays for dot product
//host
double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product = (double*) malloc(ELEMENT_SIZE);
double host_dot_product = 0.0;
//fill with values
CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0);
CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0);
//host dot
CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH);
//device
double *dev_array_a;
double *dev_array_b;
double *dev_dot;
//allocate cuda memory
cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot, ELEMENT_SIZE);
//copy to from host to device
cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot, &dev_dot_product, ELEMENT_SIZE, cudaMemcpyHostToDevice);
//init vectors
//GPU_parallelSetupVector<<<1, VECTOR_LENGTH>>>(dev_array_a, VECTOR_LENGTH, 1, 0);
//GPU_parallelSetupVector<<<1, VECTOR_LENGTH>>>(dev_array_b, VECTOR_LENGTH, 1, 0);
//GPU_parallelSetupVector<<<1, 1>>>(dev_dot, VECTOR_LENGTH, 0, 0);
//perform CUDA dot product
GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_dot);
//get computed product back to the machine
cudaMemcpy(dev_dot, dev_dot_product, ELEMENT_SIZE, cudaMemcpyDeviceToHost);
FILE *output = fopen("test_dotProduct_1.txt", "w");
fprintf(output, "HOST CALCULATION: %f \n", host_dot_product);
fprintf(output, "DEV CALCULATION: %f \n", dev_dot_product[0]);
fprintf(output, "PRINTING DEV ARRAY VALS: ARRAY A\n");
for(int i=0; i<VECTOR_LENGTH; ++i){
fprintf(output, "value %i: %f\n", i, dev_array_a[i]);
}
free(array_a);
free(array_b);
cudaFree(dev_array_a);
cudaFree(dev_array_b);
cudaFree(dev_dot);
return(0);
}
下面是一個例子輸出:
HOST CALCULATION: 140.000000
DEV CALCULATION: 0.000000
PRINTING DEV ARRAY VALS: ARRAY A
value 0: -0.000000
value 1: 387096841637590350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 2: -9188929998371095800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 3: 242247762331550610000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 4: -5628111589595087500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 5: 395077289052074410000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 6: 0.000000
value 7: -13925691551991564000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
我實現了您發佈的代碼。它會拋出每個CUDA調用。設置CUDA或我的卡時是否缺少某些東西? – Joe 2012-01-18 22:48:34
您正在使用哪些版本的CUDA驅動程序和編譯器?從http://developer.nvidia.com/cuda-downloads – keveman 2012-01-18 23:35:41