此代碼無需分配一個全爲 `1.0f` 的數組,即可跨多個線程塊工作。
`if (index < 5000)` 語句並不是要把您限制在單個線程塊內,
而是爲了確保整個網格中只有合法(索引未越界)的線程參與運算。
可以嘗試如下寫法:
#include <iostream>
// Number of threads that should contribute to the sum. Used as the bounds
// guard inside kernelCode and to size the grid in main.
#define TOTAL_SIZE 100000
// Threads per block used for the kernel launch in main.
#define nTPB 256
/*
 * cudaCheckErrors(msg)
 *
 * Reads (and clears) the most recent CUDA runtime error via
 * cudaGetLastError(). If that error is anything other than cudaSuccess,
 * prints `msg`, the CUDA error string and the file/line of the macro
 * invocation to stderr, then terminates the process with exit code 1.
 * Does nothing when no error is pending.
 *
 * Place it immediately after the runtime call or kernel launch it is meant
 * to check. Wrapped in do { ... } while (0) so it behaves as one statement.
 */
#define cudaCheckErrors(msg)                                           \
    do {                                                               \
        const cudaError_t status_ = cudaGetLastError();                \
        if (status_ == cudaSuccess) break; /* nothing to report */     \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",             \
                msg, cudaGetErrorString(status_),                      \
                __FILE__, __LINE__);                                   \
        fprintf(stderr, "*** FAILED - ABORTING\n");                    \
        exit(1);                                                       \
    } while (0)
/*
 * kernelCode: each thread whose global index is below TOTAL_SIZE atomically
 * adds 1.0f to the single float pointed to by `result`.
 *
 * Layout: 1D grid of 1D blocks (only the .x components are used). The grid
 * may contain more threads than TOTAL_SIZE; the surplus threads exit
 * without touching `result`.
 *
 * result: pointer to one float accumulator; the caller is expected to have
 *         initialised it before the launch.
 */
__global__ void kernelCode(float *result)
{
    // Flat global thread index across the whole grid.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the logical problem size do not participate.
    if (gid >= TOTAL_SIZE)
        return;

    atomicAdd(result, 1.0f);
}
/*
 * Host driver for the atomic-sum demo.
 *
 * Allocates one float on the device, zeroes it from the host, launches
 * kernelCode with enough nTPB-sized blocks to cover TOTAL_SIZE threads,
 * copies the accumulated value back and prints it. Every CUDA call is
 * followed by cudaCheckErrors, which aborts the process on failure.
 *
 * Returns 0 on success.
 */
int main(){
    float h_result = 0.0f;   // host copy of the accumulator, starts at zero
    float *d_result = 0;     // device-side accumulator (single float)

    cudaMalloc((void **)&d_result, sizeof(float));
    cudaCheckErrors("cuda malloc fail");

    // Initialise the device accumulator to 0.0f.
    cudaMemcpy(d_result, &h_result, sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    // Ceil-divide so the grid covers TOTAL_SIZE even when it is not a
    // multiple of nTPB; the kernel's bounds guard discards surplus threads.
    kernelCode<<<(TOTAL_SIZE+nTPB-1)/nTPB, nTPB>>>(d_result);
    // Launch-configuration errors are reported here, before the sync.
    cudaCheckErrors("kernel launch fail");

    // Errors raised while the kernel executes surface at this sync.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    std::cout<< "result = " << h_result << std::endl;

    // Release the device allocation (the original never freed it).
    cudaFree(d_result);
    cudaCheckErrors("cudaFree fail");
    return 0;
}
您可以將 TOTAL_SIZE 改爲任何能用 float 精確表示的數量
(float 只能精確表示不超過 2^24 = 16777216 的連續整數)。
請注意,這段代碼是我直接在瀏覽器中輸入的,可能存在錯字。
你的代碼運行正常,請看: http://pastebin.com/daAGkZZu