2016-02-19 80 views
0

考慮下面的代碼:如果網格尺寸太大,CUDA 不會修改輸入數組

#include<iostream> 
#include<vector> 

#include <cuda.h> 
#include <cuda_runtime_api.h> 

using namespace std; 

// Marks one output byte per launched block: A[blockIdx.x] is set to 1.
//
// Only the thread with threadIdx.x == 0 in each block performs the store;
// every other thread exits immediately. A is indexed by blockIdx.x without
// any bounds check, so the caller must supply a buffer of at least
// gridDim.x chars.
__global__ void reduce_or(char* A) {
    // Threads other than the first one in the block have nothing to do.
    if (threadIdx.x != 0) {
        return;
    }

    A[blockIdx.x] = 1;
}

// Wraps an expression yielding a cudaError_t and forwards it, together with
// the call site's file and line, to gpuAssert. The do/while(0) wrapper makes
// the macro behave as a single statement, so it is safe inside an unbraced
// if/else (a bare `{ ... }` followed by `;` would break the `else`).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param code  Status returned by the CUDA runtime call being checked.
 * @param file  Source file of the call site (supplied by gpuErrchk).
 * @param line  Source line of the call site (supplied by gpuErrchk).
 * @param abort When true (the default), the process exits with `code` as its
 *              exit status after the message is printed.
 *
 * Does nothing when `code` is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
    {
        return;
    }

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

/**
 * Launches reduce_or with one block per output byte, copies the buffer back
 * and prints every index whose byte is non-zero (index in decimal, value in
 * hexadecimal).
 *
 * Every CUDA runtime call, including the kernel launch itself, is checked
 * through gpuErrchk, so a rejected launch configuration terminates the
 * program with a diagnostic instead of silently leaving the buffer unchanged.
 *
 * @return 0 on success; on a CUDA error gpuErrchk exits with the error code.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    // Number of blocks to launch and number of bytes in the buffer.
    // The original author noted that 1 << 15 works while 1 << 16 does not.
    // NOTE(review): this is presumably because the binary is built for an
    // architecture whose grid x-dimension limit is 65535 -- confirm the nvcc
    // -arch/-gencode flags match the target device.
    const uint64_t group_size = 1 << 16;
    const unsigned int threads_per_block = 32;
    char *dr = NULL;

    // Host buffer, zero-initialised so that any non-zero byte after the
    // round trip must have been written by the kernel.
    std::vector<char> result(group_size, 0);

    gpuErrchk(cudaMalloc((void **)&dr, group_size));
    gpuErrchk(cudaMemcpy(dr, result.data(), group_size, cudaMemcpyHostToDevice));

    reduce_or<<<static_cast<unsigned int>(group_size), threads_per_block>>>(dr);

    // A kernel launch returns no status of its own; an invalid launch
    // configuration is only visible through cudaGetLastError().
    gpuErrchk(cudaGetLastError());
    // Errors raised while the kernel executes surface at the next sync.
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(result.data(), dr, group_size, cudaMemcpyDeviceToHost));

    // Release the device allocation; the original leaked it.
    gpuErrchk(cudaFree(dr));

    // Unsigned index to match group_size (avoids a signed/unsigned compare).
    for (uint64_t kk = 0; kk < group_size; ++kk) {
        if (result[kk]) {
            cout << std::dec << kk << std::hex << " " << (unsigned long) result[kk] << endl;
        }
    }

    return 0;
}

如果塊的數量大於或等於 65536,CUDA 代碼就不會修改輸入數組,儘管 deviceQuery 範例程式顯示 x 維度的最大網格尺寸遠大於 65536:

Device 0: "Tesla K20Xm" 
    CUDA Driver Version/Runtime Version   6.5/6.5 
    CUDA Capability Major/Minor version number: 3.5 
    Total amount of global memory:     5760 MBytes (6039339008 bytes)
    [...] 
    Maximum number of threads per multiprocessor: 2048 
    Maximum number of threads per block:   1024 
    Max dimension size of a thread block (x,y,z): (1024, 1024, 64) 
    Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) 
    [...] 

是我做錯了什麼,還是硬體對自身能力的報告不實?這是一個已知的錯誤嗎?難道不應該拋出錯誤嗎?

回答