4
以下是CUDA編程示例,它基本上是C但內部帶有NVidia CUDA函數。我一直試圖解釋這個代碼示例,並找出它正在嘗試做什麼。我的問題是這個程序編譯得很好,但是它需要什麼參數?例如,這CUDA程序正在在Linux模擬器但是一旦運行./program返回運行:C/CUDA程序輸出
用法:./program數 分段故障
哪些程序輸入參數。謝謝。
#include <assert.h>
#include <stdio.h>
//#define N 100000
__host__ void saxpy_host(int length, float alpha, float * x, float * y)
{
for (int i = 0; i < length; ++i)
y[i] = alpha*x[i] + y[i];
}
__global__ void saxpy (int length, float alpha, float * x, float * y)
{
int i;
i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < length) y[i] = alpha*x[i]+y[i];
__syncthreads();
}
int main(int argc, char* argv[]) {
if (argc != 2) {
printf("Usage: %s number\n", argv[0]);
return -1;
}
int N = atoi(argv[1]);
// host data
float alpha = 0.5;
float x[N], xback[N];
float y[N], yback[N];
int size;
int i;
int blocks;
// determining size
size = sizeof(float)*N;
// device data
float * dxp, * dyp;
// fill host data
for (i = 0; i < N; i++) {
x[i] = (float) (rand() % 128);
y[i] = (float) (rand() % 256);
}
// Allocating and Moving data to device
cudaMalloc((void**) &dxp, size);
cudaMalloc((void**) &dyp, size);
cudaMemcpy (dxp, x, size, cudaMemcpyHostToDevice);
cudaMemcpy (dyp, y, size, cudaMemcpyHostToDevice);
// size of thread blocks
blocks = (N + 31)/32;
saxpy <<< blocks, 32 >>> (N, alpha, dxp, dyp);
// bring back data
cudaMemcpy (xback, dxp, size, cudaMemcpyDeviceToHost);
cudaMemcpy (yback, dyp, size, cudaMemcpyDeviceToHost);
// Calculating host SAXPY
saxpy_host (N, alpha, (float *) &x, (float *) &y);
// checking computation on host matches computation on GPU
for (i = 0; i < N; i++) {
assert (yback[i] == y[i]) ;
//printf ("%i %f %f \n", i, yback[i], y[i]);
}
// free device data
cudaFree(dxp); cudaFree(dyp);
return 0;
}
任何想法是什麼int值。將值設置爲100000會返回以下錯誤:「」int main(int,char **):聲明'yback [i] == y [i]'失敗。中止「」一個非常大的數字會導致錯誤「」Segmentation Fault「」 – 2013-03-15 20:08:30
它運行的值是小的 - '5'還是'32',可能? – us2012 2013-03-15 20:12:22
不,它錯誤與相同斷言失敗錯誤 – 2013-03-15 20:13:57