#include <iostream> 
#include <omp.h> 

// Counts, by brute force, the triples (i, j, k) with 0 <= i, j, k <= 100000
// whose sum equals a number read from standard input, then prints the count.
// The outermost loop is split across OpenMP threads; each thread keeps a
// private counter that the reduction clause sums at the end.
// Returns 0 on success, 1 if the input could not be parsed as an integer.
int main()
{
    // Inclusive upper bound for each of i, j and k.
    const int kMax = 100000;

    int myNumber = 0;
    // 64-bit on purpose: the number of hits can exceed the range of a 32-bit
    // int (myNumber = 150000 yields roughly 7.5e9 matching triples).
    long long numOfHits = 0;

    std::cout << "Enter my Number Value" << std::endl;
    if(!(std::cin >> myNumber))
    {
        std::cerr << "Invalid input: expected an integer" << std::endl;
        return 1;
    }

    #pragma omp parallel for reduction(+:numOfHits)
    for(int i = 0; i <= kMax; ++i)
    {
        for(int j = 0; j <= kMax; ++j)
        {
            for(int k = 0; k <= kMax; ++k)
            {
                if(i + j + k == myNumber)
                    ++numOfHits;
            }
        }
    }

    std::cout << "Number of Hits" << numOfHits << std::endl;

    return 0;
}



你也可以直接計算'numOfHits',而不是用所有那些循環強制它...... – sth


你還沒有在這裏提問。你想知道什麼_exactly_? – talonmies




首先,你需要在 MS Visual Studio 中配置好 CUDA 開發環境,按照這份指南操作會很容易:http://www.ademiller.com/blogs/tech/2011/05/visual-studio-2010-and-cuda-easier-with-rc2/

接下來,你應該閱讀 NVIDIA CUDA 編程指南(免費 PDF 格式)、相關文檔以及 CUDA 示例(這是我強烈推薦的一本學習 CUDA 的書)。


這是一個算術運算量很大、數據量很小的計算——實際上,不使用這種蠻力方法也可以相當簡單地算出結果,但這不是您正在尋找的答案。我建議使用像這樣的內核:

// Counts the pairs (i, j) with 0 <= i, j <= 100000 for which
// k = *myNumber - i - j also lies in [0, 100000], i.e. the number of triples
// (i, j, k) in that range with i + j + k == *myNumber.
//
// Launch layout: 2D grid of 2D blocks. Any grid/block size is valid because
// each thread steps through i and j by the total number of threads launched
// along x and y respectively.
//
// myNumber  - pointer to the target sum, dereferenced on the device.
// numOfHits - pointer to the result accumulator, dereferenced on the device.
//             The kernel ADDS to it, so the caller must zero it before launch.
//
// NOTE(review): the counters are int because the interface is int*. The true
// total exceeds INT_MAX for mid-range targets (about 7.5e9 for 150000);
// widen the interface if such inputs matter.
__global__ void kernel(int* myNumber, int* numOfHits){

    // Inclusive upper bound for i, j and k.
    const int kMax = 100000;

    // Per-block counter kept in on-chip shared memory. __shared__ variables
    // cannot have an initializer, so one thread zeroes it and the barrier
    // makes that write visible to the whole block before anyone adds to it.
    __shared__ int s_hits;
    if(threadIdx.x == 0 && threadIdx.y == 0)
        s_hits = 0;
    __syncthreads();

    const int target = *myNumber;

    // Total number of threads launched along each dimension.
    const int strideX = blockDim.x*gridDim.x;
    const int strideY = blockDim.y*gridDim.y;

    // Each thread counts privately first so the shared counter is touched
    // once per thread rather than once per hit.
    int localHits = 0;

    for(int i = threadIdx.x + blockIdx.x*blockDim.x; i <= kMax; i += strideX){
        // j restarts for every i; otherwise only the first i of each thread
        // would ever see a non-empty inner loop.
        for(int j = threadIdx.y + blockIdx.y*blockDim.y; j <= kMax; j += strideY){
            // For fixed i and j at most one k satisfies i + j + k == target,
            // so a range test on that k replaces a loop over k
            // (simplification credited to talonmies).
            const int k = target - i - j;
            if(0 <= k && k <= kMax)
                ++localHits;
        }
    }

    // Atomic add: a plain read-modify-write on s_hits from many threads
    // would lose updates.
    atomicAdd(&s_hits, localHits);

    // Wait until every thread of the block has contributed to s_hits.
    __syncthreads();

    // One thread per block publishes the block total; atomic because other
    // blocks add to the same global counter concurrently.
    if(threadIdx.x == 0 && threadIdx.y == 0)
        atomicAdd(numOfHits, s_hits);
}



// Host-side launch sketch for `kernel`: a 2D grid of 2D thread blocks.
// `some_number` is a placeholder for the number of blocks along x and along y.
dim3 blocks(some_number, some_number, 1); //some_number should be hand-optimized 
// 16 x 16 x 1 = 256 threads per block.
dim3 threads(16, 16, 1); 
// `/*args*/` stands for the kernel's two pointer arguments
// (int* myNumber, int* numOfHits). The kernel dereferences both on the device,
// so they must point to device-accessible memory.
kernel<<<blocks, threads>>>(/*args*/); 


聲明:我沒有測試過我的代碼,我不是專家 - 它可能是愚蠢的。


內核中的內部循環完全沒有必要。 – talonmies


我認為只有在啓動的線程足以完全覆蓋 i 和 j 的取值範圍 [0, 100000] 時,這個循環才是不必要的。否則,你需要用「步進」(stride)的方式去覆蓋那些沒有專屬線程的 i 和 j。 –


否。在定義了i和j的情況下,只有一個可能的值滿足'i + j + k = myNumber'。所以它遵循'k = myNumber - i - j',因此對於'k'的所有可能的值只能得到最多一個「命中」,並且只有當'0 <= myNumber - i - j <= 100000 '。所以只要執行那個測試就足以知道一個給定的'i'和'j'是否會發生命中而沒有執行內部循環.... – talonmies