2011-12-07 64 views
-1

我正在試着製作一個小計算器,用於計算cuda設計,下面是第一個順序版本,然後是我的並行版本。 它運行沒有錯誤,但由於某種原因,它不會給結果回來,我一直試圖讓這個工作2個星期,但現在找不到錯誤!CUDA不返回結果

Serilized版本

int f(int x, int c, int n); 
int gcd(unsigned int u, unsigned int v); 

int main() 
{ 
    clock_t start = clock(); 

    srand (time(NULL)); 

    int x = 1; 
    int y = 2; 
    int d = 1; 


    int c = rand() % 100; 
    int n = 323; 

    if(n % y == 0) 
     d = y; 

    while(d == 1) 
    { 
     x = f(x, c, n); 
     y = f(f(y, c, n), c, n); 
     int abs = x - y; 
     if(abs < 0) 
      abs = abs * -1; 
     d = gcd(abs, n); 

     if(d == n) 
     { 
      printf("\nd == n"); 
      c = 0; 
      while(c == 0 || c == -2) 
       c = rand() % 100; 
      x = 2; 
      y = 2; 
     } 
    } 

    int d2 = n/d; 

    printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC); 
    printf("\nResult: %d", d); 
    printf("\nResult2: %d", d2); 


    int dummyReadForPause; 
    scanf_s("%d",&dummyReadForPause); 
} 

int f(int x, int c, int n) 
{ 
    return (int)(pow((float)x, 2) + c) % n; 
} 

int gcd(unsigned int u, unsigned int v){ 

    int shift; 

    /* GCD(0,x) := x */
    if (u == 0 || v == 0) 
     return u | v; 

    /* Let shift := lg K, where K is the greatest power of 2 
     dividing both u and v. */
    for (shift = 0; ((u | v) & 1) == 0; ++shift) { 
     u >>= 1; 
     v >>= 1; 
    } 

    while ((u & 1) == 0) 
     u >>= 1; 

    /* From here on, u is always odd. */
    do { 
     while ((v & 1) == 0)/* Loop X */
      v >>= 1; 

     /* Now u and v are both odd, so diff(u, v) is even. 
      Let u = min(u, v), v = diff(u, v)/2. */
     if (u < v) { 
      v -= u; 
     } else { 
      int diff = u - v; 
      u = v; 
      v = diff; 
     } 
     v >>= 1; 
    } while (v != 0); 

    return u << shift; 
} 

水貨版本

#define threads 512 
#define MaxBlocks 65535 
#define RunningTheads (512*100) 

__device__ int gcd(unsigned int u, unsigned int v) 
{ 
    int shift; 
    if (u == 0 || v == 0) 
     return u | v; 

    for (shift = 0; ((u | v) & 1) == 0; ++shift) { 
     u >>= 1; 
     v >>= 1; 
    } 

    while ((u & 1) == 0) 
     u >>= 1; 

    do { 
     while ((v & 1) == 0) 
      v >>= 1; 

     if (u < v) { 
      v -= u; 
     } else { 
      int diff = u - v; 
      u = v; 
      v = diff; 
     } 
     v >>= 1; 
    } while (v != 0); 

    return u << shift; 
} 

__device__ bool cuda_found; 
__global__ void cudaKernal(int *cArray, int n, int *outr) 
{ 
    int index = blockIdx.x * threads + threadIdx.x; 

    int x = 1; 
    int y = 2; 
    int d = 4; 
    int c = cArray[index]; 

    while(d == 1 && !cuda_found) 
    { 
     x = (int)(pow((float)x, 2) + c) % n; 
     y = (int)(pow((float)y, 2) + c) % n; 
     y = (int)(pow((float)y, 2) + c) % n; 

     int abs = x - y; 
     if(abs < 0) 
      abs = abs * -1; 
     d = gcd(abs, n); 
    } 
    if(d != 1 && !cuda_found) 
    { 
     cuda_found = true; 
     outr = &d; 
    } 
} 

int main() 
{ 
    int n = 323; 

    int cArray[RunningTheads]; 
    cArray[0] = 1; 
    for(int i = 1; i < RunningTheads-1; i++) 
    { 
     cArray[i] = i+2; 
    } 

    int dresult = 0; 
    int *dev_cArray; 
    int *dev_result; 

    HANDLE_ERROR(cudaMalloc((void**)&dev_cArray, RunningTheads*sizeof(int))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_result, sizeof(int))); 

    HANDLE_ERROR(cudaMemcpy(dev_cArray, cArray, RunningTheads*sizeof(int), cudaMemcpyHostToDevice)); 

    int TotalBlocks = ceil((float)RunningTheads/(float)threads); 
    if(TotalBlocks > MaxBlocks) 
     TotalBlocks = MaxBlocks; 

    printf("Blocks: %d\n", TotalBlocks); 
    printf("Threads: %d\n\n", threads); 

    cudaKernal<<<TotalBlocks,threads>>>(dev_cArray, n, dev_result); 

    HANDLE_ERROR(cudaMemcpy(&dresult, dev_result, sizeof(int), cudaMemcpyDeviceToHost)); 

    HANDLE_ERROR(cudaFree(dev_cArray)); 
    HANDLE_ERROR(cudaFree(dev_result)); 

    if(dresult == 0) 
     dresult = 1; 

    int d2 = n/dresult; 

    printf("\nResult: %d", dresult); 
    printf("\nResult2: %d", d2); 


    int dummyReadForPause; 
    scanf_s("%d",&dummyReadForPause); 
} 

回答

4

讓我們來看看你的內核代碼:

__global__ void cudaKernal(int *cArray, int n, int *outr) 
{ 
    int index = blockIdx.x * threads + threadIdx.x; 

    int x = 1; 
    int y = 2; 
    int d = 4; 
    int c = cArray[index]; 

    while(d == 1 && !cuda_found)  // always false because d is always 4 
    { 
     x = (int)(pow((float)x, 2) + c) % n; 
     y = (int)(pow((float)y, 2) + c) % n; 
     y = (int)(pow((float)y, 2) + c) % n; 

     int abs = x - y; 
     if(abs < 0) 
      abs = abs * -1; 
     d = gcd(abs, n);   // never writes to d because the loop won't 
            // be executed 
    } 
    if(d != 1 && !cuda_found)  // maybe true if cuda_found was initalized 
            // with false 
    { 
     cuda_found = true; // Memory race here. 
     outr = &d;   // you are changing the adresse where outr 
          // points to; the host code does not see this 
          // change. your cudaMemcpy dev -> host will copy 
          // the exact values back from device that have 
          // been uploaded by cudaMemcpy host -> dev 
          // if you want to set outr to 4 than write: 
          // *outr = d; 
     } 
    } 
+0

這是值得指出的是,即使在經過此回覆標識的所有問題都是固定的,該算法的基本前提和使用cuda_found'的'因爲全局同步標誌被完全破壞,代表了一個永遠無法可靠工作的巨大內存競爭。 – talonmies

2

其中的一個問題是你不返回結果。在您的代碼中,您只需更改outr,它在您的內核函數中具有本地範圍(即在此函數外部未看到更改)。您應該編寫*outr = d;以更改您指定的內存值outr

我不確定CUDA是否用零初始化全局變量。我的意思是你確定cuda_found總是初始化爲false