嵌套循環在CUDA C++

-1

內環的可變範圍我需要CUDA並行嵌套循環：嵌套循環在CUDA C++

for(int ix=0; ix<Nx; ix++) { 
    for(int iy=0; iy<Ny[ix]; iy++) { 
     SomeFunction(ix, iy); 
     ...

其中Ny[]是在CUDA全局存儲器陣列。該循環在單次運行中被調用幾次，Nx和Ny[]的元素在不同調用中發生變化，並且它們可以很大（從0到約100萬的Nx，從0到約10000的Ny）。原則上我可以使用CUDA內核，其線程排列在由ix和iy索引的大小爲Nx*max(Ny)的二維網格中，因此內核的計算成本爲O[Nx*max(Ny)/Ncores]。問題是，在我的情況下max(Ny)可以大於數量級的平均值Ny。在一些調用中，Ny的一些元素可能非常大（例如大約1000），而大多數其他元素非常小。我希望上述循環的一個很好的並行實現有一個計算成本O[Nx*average(Ny)/Ncores]，在我的情況下，它會比O[Nx*max(Ny)/Ncores]小得多，但我不知道我這樣做。我有一個模糊的想法，一個好的解決方案可以基於首先對Ny的元素進行排序。僅用於測試目的，這裏是一段代碼，它產生與我的分佈類似的Ny值。

#include <stdio.h> 
#include <stdlib.h> 

int UpdateArray(int *array, int *Nx); 

int main() 
{ 
    int Nx_max=1000000; 
    int *Ny; 
    int Nx, i; 

    Ny=(int*)malloc(Nx_max*sizeof(int)); 
    UpdateArray(Ny, &Nx); 

    for(i=0; i<Nx; i++) { 
    printf("%d\n", Ny[i]); 
    } 

    return 0; 
} 

int UpdateArray(int *array, int *Nx) 
{ 
    int Nx0_min=500000, Nx0_max=1000000; 
    int Nx1_min=50000, Nx1_max=100000; 
    int Nx2_min=5000, Nx2_max=10000; 

    int Ny0_min=1, Ny0_max=10; 
    int Ny1_min=10, Ny1_max=100; 
    int Ny2_min=100, Ny2_max=1000; 

    int nx0, nx1, nx2, i, ix; 

    nx0 = Nx0_min + rand()%(Nx0_max-Nx0_min); 
    for(i=0; i<nx0; i++) { 
    array[i] = Ny0_min + rand()%(Ny0_max-Ny0_min); 
    } 
    nx1 = Nx1_min + rand()%(Nx1_max-Nx1_min); 
    for(i=0; i<nx1; i++) { 
    ix = rand()%nx0; 
    array[ix] = Ny1_min + rand()%(Ny1_max-Ny1_min); 
    } 
    nx2 = Nx2_min + rand()%(Nx2_max-Nx2_min); 
    for(i=0; i<nx2; i++) { 
    ix = rand()%nx0; 
    array[ix] = Ny2_min + rand()%(Ny2_max-Ny2_min); 
    } 
    *Nx = nx0; 

    return 0; 
}

來源

2016-12-17 Bruno Golosio

是否有'n_links'變化或是靜態的？請發佈一個小數字示例輸入數據以及您想要的輸出。 –

'n_links'在模擬過程中發生變化，即它不僅僅依賴於索引'is'，它隨時間步長't'而變化。我知道這個名字可能暗示它是靜態的，因此我將它重命名爲'n_hits'。當我處理大量數據時，發佈示例會是一個問題，但是我編輯了我的問題，並試圖描述一個數字示例。正如我現在所說的那樣，我有一個模糊的想法，一個好的解決方案可以基於首先對'n_hits [']進行排序。 –

我編輯了我的問題，使其更具可讀性和更普遍的興趣。 –

如果SomeFunction得多計算上昂貴的是簡單的加載和存儲，我會考慮創建INT2這將存儲SomeFunction的執行參數的陣列（）。然後這個數組複製到設備上，並使用他們跑這會從陣列讀取參數上的位置由螺紋指數表明，執行SomeFunction（）內核：

#include<iostream> 
#include<cuda_runtime.h> 

int SomeFunctionHost(int x, int y) 
{ 
    return x*y; 
} 

__device__ int SomeFunction(int x, int y) 
{ 
    return x*y; 
} 

__global__ void executionKernel(int2 * args, int * results, int n) 
{ 
    int blockId = blockIdx.x; 
    int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x; 
    if(threadId < n) 
    { 
     int2 myArgs = args[threadId]; 
     results[threadId] = SomeFunction(myArgs.x, myArgs.y); 
    } 
} 

int main() 
{ 
    //Create execution parameters 
    int Nx_max=1000000; 
    int *Ny; 
    int Nx, i; 

    Ny=(int*)malloc(Nx_max*sizeof(int)); 
    UpdateArray(Ny, &Nx); 

    int count = 0; 
    for(i=0; i<Nx; i++) { 
     count += Ny[i]; 
    } 

    int2 * hParams, * dParams; 
    int * hResults, * dResults; 

    hParams = (int2*)malloc(count*sizeof(int2)); 
    hResults = (int*)malloc(count*sizeof(int)); 

    cudaMalloc((void **) &dParams, count*sizeof(int2)); 
    cudaMalloc((void **) &dResults, count*sizeof(int)); 

    cudaMemset(dResults, 0 , count * sizeof(int)); 

    int index = 0; 
    for(i=0; i<Nx; i++) 
    { 
     for(int j=0; j<Ny[i];j++) 
     { 
      hParams[index].x = i; 
      hParams[index].y = j; 
      index++; 
     } 
    } 

    //Copy execution parameters to the device 
    cudaMemcpy(dParams, hParams, count * sizeof(int2), cudaMemcpyHostToDevice); 

    //Define the grid configuration 
    dim3 blockDim(32,32,1); 
    int gridLength = count/(blockDim.x*blockDim.y) + 1; 
    dim3 gridDim(gridLength, 1, 1); 

    //Run kernel 
    executionKernel<<<gridDim, blockDim>>>(dParams, dResults, count); 

    //Copy the results back to the host 
    cudaMemcpy(hResults, dResults, count * sizeof(int), cudaMemcpyDeviceToHost); 

    //TEST 
    for(int i=0;i<count;i++) 
    { 
     if(SomeFunctionHost(hParams[i].x, hParams[i].y) != hResults[i]) 
     { 
      std::cout << "WRONG RESULT !" << std::endl; 
     } 
    } 

    std::cout << "DONE!" << std::endl; 

    free(hParams); 
    free(hResults); 
    free(dParams); 
    free(dResults); 
    return 0;  
}

這樣可以確保黨建工作的均勻分佈網格。我強烈建議你嘗試不同的網格配置。也許嘗試平行化參數生成階段，看看它是否給你帶來任何好處。玩的開心！

來源

2016-12-19 08:54:20 pSoLT

謝謝@pSoLT，我會盡力實現你的方法，儘管它看起來不太容易。 –

嘿@BrunoGolosio我擴展了一下解決方案。我希望它能幫助你更多。 – pSoLT

嵌套循環在CUDA C++

回答

相關問題