0
我想只將數組的第一個元素設置爲5.0(說)。即只有一個線程應該設置一個值,而其他線程的其餘部分不會執行任何操作。CUDA編程任務
這裏是我完整的代碼
#include <stdio.h>
#include <cuda.h>
#define GPUERRCHK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void writeBuf(char * fName, float * out_frame, int dim)
{
FILE * fp = fopen(fName, "w+");
int baseIndx = 0;
for(int i=0 ; i<dim ; i++)
{
for(int j=0 ; j<dim ; j++)
{
fprintf(fp, "%f ", out_frame[ dim + j ]);
}
baseIndx += dim;
fprintf(fp, "\n");
}
fclose(fp);
}
__global__ void kernel(float * s1, float * s2, int dim, int * hx, int *hy, float *hT, int nHeaters)
{
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
int offset = x + y*blockDim.x*gridDim.x;
if(offset < 1)
{
s2[0] = 1.0;
}
__syncthreads();
}
int main()
{
srand48(time(NULL));
int dim = 1024;
float *dev_s1, *dev_s2;
GPUERRCHK(cudaMalloc((void**)&dev_s1, dim*dim * sizeof(float)));
GPUERRCHK(cudaMalloc((void**)&dev_s2, dim*dim * sizeof(float)));
GPUERRCHK(cudaMemset(dev_s1, 0x00, dim*dim * sizeof(float)));
GPUERRCHK(cudaMemset(dev_s2, 0x00, dim*dim * sizeof(float)));
//heaters
int *dev_hx, *dev_hy;
float *dev_hT;
int nHeaters = 20;
GPUERRCHK(cudaMalloc((void**)&dev_hx, nHeaters * sizeof(int)));
GPUERRCHK(cudaMalloc((void**)&dev_hy, nHeaters * sizeof(int)));
GPUERRCHK(cudaMalloc((void**)&dev_hT, nHeaters * sizeof(float)));
//init heaters on cpu
int * hx, *hy;
float * hT;
hx = (int*) malloc(nHeaters * sizeof(int));
hy = (int*) malloc(nHeaters * sizeof(int));
hT = (float*) malloc(nHeaters * sizeof(float));
for(int i=0 ; i<nHeaters ; i++)
{
hx[i] = (int) ((float)drand48() * (float)dim) + 5;
hy[i] = (int) (drand48() * dim) + 5;
hT[i] = (float) (drand48() * 100) + 50;
}
//transfer hx, hy, hT to GPU
GPUERRCHK(cudaMemcpy(dev_hx, hx, nHeaters * sizeof(int), cudaMemcpyHostToDevice));
GPUERRCHK(cudaMemcpy(dev_hy, hy, nHeaters * sizeof(int), cudaMemcpyHostToDevice));
GPUERRCHK(cudaMemcpy(dev_hT, hT, nHeaters * sizeof(float), cudaMemcpyHostToDevice));
float *out_frame = (float *) malloc(dim*dim*sizeof(float));
// run kernel
int nThreadsPerBlock = 16;
int nBlockX = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;
int nBlockY = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;
kernel<<< dim3(nBlockX, nBlockY), dim3(nThreadsPerBlock, nThreadsPerBlock) >>>(dev_s1, dev_s2, dim, dev_hx, dev_hy, dev_hT, nHeaters);
GPUERRCHK(cudaPeekAtLastError());
GPUERRCHK(cudaDeviceSynchronize());
// collect result
GPUERRCHK(cudaMemcpy(out_frame, dev_s2, dim*dim * sizeof(float), cudaMemcpyDeviceToHost));
int f=1;
char fName[100];
snprintf(fName, 100, "out/file_%04d.data", f);
writeBuf(fName, out_frame, dim);
cudaFree(dev_s1);
cudaFree(dev_s2);
free(out_frame);
}
當我運行此,該文件包含所有零。我如何實現我計劃實現的目標? 可能是什麼問題?
如果你要發佈一個「我的代碼不能正常工作」的問題,請確認您發表的*實際代碼*(和代碼應儘可能簡潔,儘可能最小)這是有問題的,因爲你已經明確發佈的不是你正在編譯和運行的代碼..... – talonmies
我已經把我正在編譯的確切代碼。請幫忙..! – mkuse