CUDA vector addition gives a large number of errors

I am running vector-addition code written in CUDA. Everything about the code works fine, but the problem appears when I increase the vector size: the number of errors (elements where the CPU and GPU results differ) becomes very large. I have attached the code below:
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "timer.h"
/*
 * **CUDA KERNEL**
 *
 * Compute the sum of two vectors
 * C[i] = A[i] + B[i]
 *
 */
__global__ void vecAdd(float* a, float* b, float* c) {
  /* Calculate index for this thread */
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  /* Compute the element of C */
  c[i] = a[i] + b[i];
}
void compute_vec_add(int N, float *a, float* b, float *c);
/*
 *
 * Host code to drive the CUDA Kernel
 *
 */
int main() {
  float *d_a, *d_b, *d_c;
  float *h_a, *h_b, *h_c, *h_temp;
  int i;
  int N = 1024 * 1024 * 512;
  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init();
  timer = stopwatch_create();

  /*
    Create the vectors
  */
  h_a = (float *) malloc(sizeof(float) * N);
  h_b = (float *) malloc(sizeof(float) * N);
  h_c = (float *) malloc(sizeof(float) * N);

  /*
    Set the initial values of h_a, h_b, and h_c
  */
  for (i = 0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = 0.0;
  }

  /*
    Allocate space on the GPU
  */
  CUDA_CHECK_ERROR(cudaMalloc(&d_a, sizeof(float) * N));
  CUDA_CHECK_ERROR(cudaMalloc(&d_b, sizeof(float) * N));
  CUDA_CHECK_ERROR(cudaMalloc(&d_c, sizeof(float) * N));

  /*
    Copy d_a and d_b from CPU to GPU
  */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(
      cudaMemcpy(d_a, h_a, sizeof(float) * N, cudaMemcpyHostToDevice));
  CUDA_CHECK_ERROR(
      cudaMemcpy(d_b, h_b, sizeof(float) * N, cudaMemcpyHostToDevice));
  t_pcie_htd = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",
          t_pcie_htd);

  /*
    Run N/256 blocks of 256 threads each
  */
  dim3 GS(N / 256, 1, 1);
  dim3 BS(256, 1, 1);

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  cudaThreadSynchronize();
  t_kernel = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);
  /*
    Copy d_c from GPU to CPU
  */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(
      cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
  t_pcie_dth = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",
          t_pcie_dth);

  /*
    Double check errors
  */
  h_temp = (float *) malloc(sizeof(float) * N);
  stopwatch_start(timer);
  compute_vec_add(N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);

  int cnt = 0;
  for (int i = 0; i < N; i++) {
    if (abs(h_temp[i] - h_c[i]) > 1e-5)
      cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /*
    Free the device memory
  */
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  /*
    Free the host memory
  */
  free(h_a);
  free(h_b);
  free(h_c);

  /*
    Free timer
  */
  stopwatch_destroy(timer);

  if (cnt == 0) {
    printf("\n\nSuccess\n");
  }
}
void compute_vec_add(int N, float *a, float* b, float *c) {
  int i;
  for (i = 0; i < N; i++)
    c[i] = a[i] + b[i];
}
EDIT: This is how I compile the code:
nvcc vecAdd.cu timer.o
The output of the above code, when run on a GTX TITAN X, is as follows:
Timer: gettimeofday
Timer resolution: ~ 1 us (?)
Time to transfer data from host to device: 1.44104 secs
Time to execute GPU kernel: 0.000121 secs
Time to transfer data from device to host: 0.725893 secs
Time to execute CPU program: 2.96071 secs
number of errors: 350576933 out of 536870912
Also, why does it take 0.72 secs to transfer around 2 GB of data from the device to the host, and 1.44 secs to transfer ~4 GB of data from the host to the device, in spite of the high-bandwidth connection between the CPU and the GPU? Thanks.
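For scale, those timings work out to roughly 4 GB / 1.44 s ≈ 2.8 GB/s host-to-device and 2 GB / 0.73 s ≈ 2.8 GB/s device-to-host, which is in the typical range for transfers from pageable (non-pinned) host memory rather than a hardware limit.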
Don't spam the tags! – Olaf
You have incomplete error checking at the kernel launch, which means that at large sizes you are failing to detect the (almost certainly occurring) kernel launch failure. Also, 'cudaThreadSynchronize' has been deprecated for a long time; you should use 'cudaDeviceSynchronize' instead. Please edit your question to include the compile statement used to build this code. – talonmies
@talonmies I have added the compile statement. Also, I tried cudaDeviceSynchronize, but it didn't help me. I was not aware of error checking when launching the kernel; I will look it up. Thanks. – amritkrs
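For reference, a minimal sketch of the fuller launch error checking talonmies describes, assuming the CUDA_CHECK_ERROR macro from the poster's cuda_utils.h simply reports a non-success cudaError_t:

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  /* Check the launch itself: cudaGetLastError() returns (and clears) any error
     recorded when the kernel was launched, e.g. an invalid grid configuration. */
  CUDA_CHECK_ERROR(cudaGetLastError());
  /* Wait for the kernel to finish and surface any error raised during execution;
     cudaDeviceSynchronize() is the non-deprecated replacement for
     cudaThreadSynchronize(). */
  CUDA_CHECK_ERROR(cudaDeviceSynchronize());
  t_kernel = stopwatch_stop(timer);

If the first check fires, one candidate worth ruling out (an assumption, not something confirmed in this thread) is the build architecture: nvcc with no -arch flag compiles for a default compute capability, and on 2.x-class targets the grid x-dimension is limited to 65535 blocks, far fewer than the N/256 = 2097152 blocks this launch requests. Compiling with nvcc -arch=sm_52 vecAdd.cu timer.o would target the GTX TITAN X (compute capability 5.2) directly.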