無效的配置參數：塊數超出 16 位範圍

此代碼工作正常：

#include <stdio.h> 
#define N 1000 // <-- Works for values < 2^16 

// Writes 2 * a[k] into b[k], where k is the x-index of the calling block.
// Only blockIdx.x selects the element, so the kernel needs a 1D grid with at
// least N blocks to cover the arrays; blocks with an index >= N do nothing.
__global__
void add(int *a, int *b) {
    const int idx = blockIdx.x;
    if (idx >= N) {
        return;
    }
    b[idx] = 2 * a[idx];
}
// Host driver: fills ha with 0..N-1, doubles it on the GPU with `add`
// (one single-thread block per element), copies the result back and reports
// the largest value together with its index.
// NOTE(review): the return codes of the CUDA calls are not inspected in this
// version, so a failed allocation, copy or launch goes unnoticed here.
int main() {
    int ha[N], hb[N];   // host input / output buffers (on the stack)
    int *da, *db;       // device input / output buffers

    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int k = 0; k < N; ++k) {
        ha[k] = k;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    add<<<N, 1>>>(da, db);  // N blocks, one thread each
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);

    // max_value[0] holds the largest value found, max_value[1] its index.
    // The index starts at 0 so that it matches the initial candidate hb[0];
    // previously it stayed uninitialized (and was printed as garbage) whenever
    // no later element was greater than hb[0].
    int max_value[2] = { hb[0], 0 };
    for (int k = 1; k < N; k++) {
        if (hb[k] > max_value[0]) {
            max_value[0] = hb[k];
            max_value[1] = k;
        }
    }

    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    getchar();  // keep the console window open until a key is pressed
    return 0;
}

但是當我把 N（數組中的元素個數）從 1000 改爲大於 2^16-1 的值時，程序崩潰。

我認爲這是主機端的溢出，所以我把 ha 和 hb 數組的聲明移到了 BSS 段（BSS segment），並把 N 改爲 100 萬。

#include <stdio.h> 
#define N 1000000 // <---- 

// Element-wise doubling: the block with x-index k stores 2 * a[k] in b[k].
// The element is chosen by blockIdx.x alone; indices at or beyond N are
// skipped so surplus blocks never touch memory outside the arrays.
__global__
void add(int *a, int *b) {
    const int element = blockIdx.x;
    const bool in_range = element < N;
    if (in_range) {
        b[element] = 2 * a[element];
    }
}
// Host input / output buffers. They have static storage duration instead of
// being locals of main, which keeps the two N-element arrays off main's stack.
static int ha[N];
static int hb[N];

// Host driver: fills ha with 0..N-1, doubles it on the GPU with `add`
// (one single-thread block per element), copies the result back and reports
// the largest value together with its index.
// NOTE(review): the return codes of the CUDA calls are not inspected in this
// version, so a failed allocation, copy or launch goes unnoticed here.
int main() {
    int *da, *db;  // device input / output buffers

    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int k = 0; k < N; ++k) {
        ha[k] = k;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    add<<<N, 1>>>(da, db);  // N blocks, one thread each
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);

    // max_value[0] holds the largest value found, max_value[1] its index.
    // The index starts at 0 so that it matches the initial candidate hb[0].
    // It used to be left uninitialized, so when no element exceeded hb[0]
    // (for instance when hb was never filled in) garbage was printed.
    int max_value[2] = { hb[0], 0 };
    for (int k = 1; k < N; k++) {
        if (hb[k] > max_value[0]) {
            max_value[0] = hb[k];
            max_value[1] = k;
        }
    }

    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    getchar();  // keep the console window open until a key is pressed
    return 0;
}

現在我沒有得到一個錯誤，但hb數組爲空。
我的代碼有什麼問題？
如何分配大數組到設備並獲得有效結果？

更新：我已經插入錯誤檢查代碼，
我得到的錯誤是 - >「無效的配置參數」。
更新代碼是：

#include <stdio.h> 
#include <time.h> 
#include <math.h> 
#include <thrust/system_error.h> 
#include <thrust/system/cuda/error.h> 
#include <sstream> 
const int N = 70000;  // number of elements in each array

// Terminates the program with a file:line diagnostic when a CUDA runtime call
// reports anything other than cudaSuccess.
// The argument is evaluated exactly once and its status stored: the previous
// form expanded it a second time inside cudaGetErrorString(), which re-ran a
// failing call just to build the message. The do { } while (0) wrapper makes
// the macro a single statement, so it is safe in an unbraced if/else.
#define checkCudaErrors(error) do {\
    cudaError_t checkedStatus_ = (error);\
    if (checkedStatus_ != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(checkedStatus_));\
        exit(1);\
    }\
} while (0)

// Stores twice the value of a[k] in b[k] for the block whose x-index is k.
// blockIdx.x is the only index consulted, so covering all N elements takes a
// 1D grid of at least N blocks; out-of-range blocks return without writing.
__global__
void add(int *a, int *b) {
    const int pos = blockIdx.x;
    if (!(pos < N)) {
        return;
    }
    b[pos] = 2 * a[pos];
}
// Host input / output buffers, kept in static storage rather than on main's
// stack.
static int ha[N];
static int hb[N];

// Host driver: resets the device, fills ha with 0..N-1, doubles it on the GPU
// with `add` (one single-thread block per element), copies the result back
// and scans it for the maximum value and its index.
int main() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int k = 0; k < N; ++k) { ha[k] = k; }

    int *da, *db;  // device input / output buffers
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));

    add<<<N, 1>>>(da, db);  // N blocks, one thread each
    // A kernel launch returns no status itself; a rejected launch
    // configuration is only visible through cudaGetLastError(). Query it
    // right here, before the results are copied back and scanned, instead of
    // at the very end of the program. The status is kept in a variable so it
    // is read exactly once.
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }

    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));

    // max_value[0] holds the largest value found, max_value[1] its index.
    // The index starts at 0 to match the initial candidate hb[0] rather than
    // being left uninitialized.
    int max_value[2] = { hb[0], 0 };
    for (int k = 1; k < N; k++) {
        if (hb[k] > max_value[0]) {
            max_value[0] = hb[k];
            max_value[1] = k;
        }
    }

    // Release the device buffers, which were previously never freed.
    checkCudaErrors(cudaFree(da));
    checkCudaErrors(cudaFree(db));

    getchar();  // keep the console window open until a key is pressed
    return 0;
}

設備是一塊 GeForce GTX 470，我使用以下命令編譯：
nvcc -o foo new.cu

來源

2017-07-12 Panos Kal.

'cudaMalloc'和'cudaMemcpy'都返回'cudaError_t'類型的值 - 可能值得先檢查一下。 – iehrlich

謝謝@iehrlich我會檢查這個 –

也請看看[這個問題](https://stackoverflow.com/questions/34655893/cuda-large-input-arrays)，以及更一般的[這個搜索](https://www.google.ru/search?q=cuda+large+array)，可能會給你一些提示。祝你好運！ – iehrlich

您的設備（GTX 470）是 cc2.0（計算能力 2.0）設備。

「無效配置參數」錯誤的原因是：cc2.0 設備的一維網格最多只能有 65535 個線程塊。此信息可以在編程指南（programming guide，「線程塊網格的最大 x 維度」一項）中查到，也可以通過運行 CUDA 示例代碼查到。所以，你在這裏選擇的 N 過大：

add<<<N, 1>>>(da, db); 
    ^

在 cc2.0 設備上解決這個問題的通常辦法，是創建一個多維的線程塊網格，這樣可以使用的線程塊數量大得多。內核啓動參數實際上可以是 dim3 變量，它允許指定多維網格（由線程塊組成）或多維線程塊（由線程組成）。

要正確執行此操作，您還需要更改內核代碼，以便從可用的多維變量中創建適當的全局唯一線程ID。

下面這個可運行的例子用一組可能的最小改動說明了這一概念，在我這裏運行結果是正確的：

$ cat t363.cu 
#include <stdio.h> 
#include <time.h> 
#include <math.h> 
#include <thrust/system_error.h> 
#include <thrust/system/cuda/error.h> 
#include <sstream> 
const int N = 70000;  // number of elements in each array

// Terminates the program with a file:line diagnostic when a CUDA runtime call
// reports anything other than cudaSuccess.
// The argument is evaluated exactly once and its status stored: the previous
// form expanded it a second time inside cudaGetErrorString(), which re-ran a
// failing call just to build the message. The do { } while (0) wrapper makes
// the macro a single statement, so it is safe in an unbraced if/else.
#define checkCudaErrors(error) do {\
    cudaError_t checkedStatus_ = (error);\
    if (checkedStatus_ != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(checkedStatus_));\
        exit(1);\
    }\
} while (0)

// Writes 2 * a[k] into b[k], where k is the calling block's linear index in
// a 2D grid: blocks are numbered row by row, blockIdx.y selecting the row of
// gridDim.x blocks and blockIdx.x the position inside that row. Blocks whose
// linear index is >= N do nothing, so the grid may hold more blocks than
// there are elements.
__global__
void add(int *a, int *b) {
    const int idx = blockIdx.y*gridDim.x + blockIdx.x;
    if (idx >= N) {
        return;
    }
    b[idx] = 2 * a[idx];
}
// Host input / output buffers, kept in static storage rather than on main's
// stack.
static int ha[N];
static int hb[N];

// Host driver: resets the device, fills ha with 0..N-1, doubles it on the GPU
// with `add` using a 2D grid of single-thread blocks, copies the result back
// and prints the maximum value together with its index.
int main() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int k = 0; k < N; ++k) { ha[k] = k; }

    int *da, *db;  // device input / output buffers
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));

    // Spread the blocks over gridRows rows so that no single grid dimension
    // has to hold all N blocks. The row length is rounded up: a plain N/10
    // would silently drop the last elements whenever N is not a multiple of
    // 10. Surplus blocks are harmless because the kernel ignores indices
    // that are >= N.
    const int gridRows = 10;
    dim3 mygrid((N + gridRows - 1) / gridRows, gridRows);
    add<<<mygrid, 1>>>(da, db);
    // A kernel launch returns no status itself; a rejected launch
    // configuration is only visible through cudaGetLastError(). Query it
    // right here, before the results are copied back and printed, instead of
    // at the very end of the program. The status is kept in a variable so it
    // is read exactly once.
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }

    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));

    // max_value[0] holds the largest value found, max_value[1] its index.
    // The index starts at 0 to match the initial candidate hb[0]; it used to
    // be left uninitialized and was printed as garbage whenever no later
    // element exceeded hb[0].
    int max_value[2] = { hb[0], 0 };
    for (int k = 1; k < N; k++) {
        if (hb[k] > max_value[0]) {
            max_value[0] = hb[k];
            max_value[1] = k;
        }
    }
    printf("max_value[0] = %d, max_value[1] = %d\n", max_value[0], max_value[1]);

    // Release the device buffers, which were previously never freed.
    checkCudaErrors(cudaFree(da));
    checkCudaErrors(cudaFree(db));
    return 0;
}
$ nvcc -arch=sm_20 -o t363 t363.cu 
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). 
$ ./t363 
Device count: 4 
max_value[0] = 139998, max_value[1] = 69999 
$

注：

如果您在 cc3.0 或更高版本的設備上運行原來的代碼，它不應該拋出該錯誤。較新的 CUDA 設備將一維網格的限制提高到了 2^31-1。但是如果你想超過這個塊數（大約 20 億），仍然必須使用多維網格。

cc2.0 設備在 CUDA 8 中已被棄用，即將推出的 CUDA 9 版本將移除對它們的支持。

來源

2017-07-13 19:42:54

非常感謝羅伯特，爲你的時間和你的真棒回覆！ –

無效配置參數塊的16位比

回答

相關問題