執行內核功能後,打印輸出始終爲0。 經過一些測試,cudaMemcpy仍然是正確的。但內核似乎不工作,無法從d_inputs獲取正確的數據。 有人可以幫忙解釋嗎?謝謝!CUDA輸出始終爲0
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>
#define N 32
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid<N) {
double val =(double) d_inputs[tid];
/*for (int iter=0; iter < niters; iter++){
val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
val = (val/3.0) + 102.0;
val = (val + 1.07) - 103.0;
val = (val/1.037) + 104.0;
val = (val + 3.00) - 105.0;
val = (val/0.22) + 106.0;
}*/
val = val + 1.0;
//printf("This is %f\n",val);
d_outputs[tid] = val;
}
}
int main(int argc, char **argv)
{
int niters = 10;
printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);
int inputs[N];
for (int i = 0; i<N; i++){
inputs[i] = i+1;
}
int d_inputs[N];
double d_outputs[N];
double outputs[N];
cudaMalloc((void**)&d_inputs, N*sizeof(int));
cudaMalloc((void**)&d_outputs, N*sizeof(double));
printf("test %d \n", inputs[3]);
cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
printf("test %d \n", d_inputs[1]);
Kernel_double<<<16,2>>>(niters, d_inputs,d_outputs);
//cudaDeviceSynchronize();
cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);
for(int j =0;j<10; j++){
printf("Outputs[%d] is: %f and %f\n",j, d_outputs[j], outputs[j]);
}
cudaFree(d_inputs);
cudaFree(d_outputs);
return EXIT_SUCCESS;
}
你絕對沒有錯誤在你的代碼檢查,所以它隨時可能被失敗點,你不會知道。在每次可能會失敗的呼叫之後添加錯誤檢查,並查看是否有助於縮小問題範圍。 –
CHECK(cudaMalloc((void **)&d_inputs,N * sizeof(int))); \t CHECK(cudaGetLastError()); (cudaMalloc((void **)&d_outputs,N * sizeof(double))); \t CHECK(cudaGetLastError()); \t printf(「test%d \ n」,inputs [3]); (cudaMemcpy(d_inputs,inputs,N * sizeof(int),cudaMemcpyHostToDevice)); \t CHECK(cudaGetLastError()); \t printf(「test%d \ n」,d_inputs [1]); \t Kernel_double <<<16,2> >>(niters,d_inputs,d_outputs); \t CHECK(cudaGetLastError()); – user45690
添加後,我得到錯誤,在cudaMemcpy行中,說無效的參數 – user45690