2011-11-30 37 views
0

我正在做一些CUDA(FFT),但我不知道爲什麼它在調用內核函數時會產生異常。CUDA異常

所有包含和定義:

#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include <time.h> 


#define CPU_ARRAY_SIZE 1024 // 1024, 2048, 4096 8192 
#define GPU_ARRAY_SIZE 512 // 
#define THREAD_SIZE 16  // fixed 
#define BLOCK_SIZE (GPU_ARRAY_SIZE/THREAD_SIZE) // 32 

#define PI 3.14 

由於我在NVIDIA GTX480運行它,我想這可能是共享內存空間,雖然它似乎沒有要(因爲有「一些許多「共享變量)。所以,我改變了GPU_ARRAY_SIZE來看它是如何工作的,當我將它定義爲32,64,256,512時,它給了我不同的結果(在512的情況下,它返回全零,我猜CUDA不能做任何事情 - 在其他情況下,它返回怪異的,因爲我不知道爲什麼它沒有任何計算跳轉16個單元格的原因)。在大多數情況下,在我的Microsoft Visual Studio的輸出窗口中,它會返回數十億例外「.exe中的0x75b9b9bc第一次機會異常:Microsoft C++異常:內存位置的cudaError_enum」異常。在你要求我進行調試之前,我無法調試它,因爲VS不會爲VS無法識別的文件(比如.cpp - 至少這種理論適用於我的情況)這樣做。 你們有什麼想法的問題: 1.爲什麼它會產生異常? 2.它爲什麼要計算,它應該爲每個塊中的每個單元格做什麼,只在幾個單元格內

我怎麼能解決這個問題......任何想法?

核心功能:

__global__ void twiddle_factor(double *d_isub_matrix, double *d_osub_matrix) 
{ 

    __shared__ double block[THREAD_SIZE][THREAD_SIZE]; 
    __shared__ double spectrum[THREAD_SIZE][THREAD_SIZE]; 
    __shared__ double sum_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin 
    __shared__ double sum_sin[THREAD_SIZE][THREAD_SIZE]; 
    __shared__ double local_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin 
    __shared__ double local_sin[THREAD_SIZE][THREAD_SIZE]; 

    unsigned int xIndex = threadIdx.x + blockIdx.x* blockDim.x; 
    unsigned int yIndex = threadIdx.y + blockIdx.y* blockDim.y; 


    int u; 
    int x=0,y=0; 

    int tx = threadIdx.x; 
    int ty = threadIdx.y; 

    double sum_sines=0.0,sum_cosines=0.0; 

    double angle=(2*PI)/GPU_ARRAY_SIZE;  

    block[tx][ty] = d_isub_matrix[yIndex*GPU_ARRAY_SIZE+xIndex]; 

    __syncthreads(); 


    //for every column! 

    for(u=0; u<THREAD_SIZE; u++) 
    { 

     /* All threads calculate its own sin and cos value. */ 
     local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u); 
     local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u); 


     /* Only one row is activate. The thread in row adds all element of its column. */ 
     if (ty == u) 
     { 
      sum_sines = 0.0; 
      sum_cosines = 0.0; 

      /* Access each column to add all elements of the column.*/ 
      for (y=0; y<THREAD_SIZE; y++) 
      { 
       sum_sines += local_sin[tx][y]; 
       sum_cosines += local_cos[tx][y]; 
      } 

      //if (sum_sines < 0) 
       //sum_sin[u][tx] = ((-1)*sum_sines)/GPU_ARRAY_SIZE; 
      //else 
       sum_sin[u][tx] = sum_sines/GPU_ARRAY_SIZE; 

      //if (sum_cosines < 0) 
       //sum_cos[u][tx] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE; 
      //else 
       sum_cos[u][tx] = sum_cosines/GPU_ARRAY_SIZE; 

     } 

     __syncthreads(); 
    } 

    spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2) 
           +(double)pow(sum_cos[tx][ty],2)); 
    __syncthreads(); 


    block[tx][ty] = spectrum[tx][ty]; 


    __syncthreads(); 


    //for every row! 

    for(u=0; u<THREAD_SIZE; u++) 
    { 

     /* All threads calculate its own sin and cos value. */ 
     local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u); 
     local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u); 


     /* Only one column is activate. The thread in colum adds all element of its row. */ 
     if (tx == u) 
     { 
      sum_sines = 0.0; 
      sum_cosines = 0.0; 

      for (x=0; x<THREAD_SIZE; x++) 
      { 
       sum_sines += local_sin[x][ty]; 
       sum_cosines += local_cos[x][ty]; 
      } 

      //if (sum_sines < 0) 
       //sum_sin[ty][u] = ((-1)*sum_sines)/GPU_ARRAY_SIZE; 
      //else 
       sum_sin[ty][u] = sum_sines/GPU_ARRAY_SIZE; 

      //if (sum_cosines < 0) 
       //sum_cos[ty][u] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE; 
      //else 
       sum_cos[ty][u] = sum_cosines/GPU_ARRAY_SIZE; 

     } 

     __syncthreads(); 
    } 

    spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)+(double)pow(sum_cos[tx][ty],2)); 
    __syncthreads(); 


     /* Transpose! I think this is not necessary part. */ 

    d_osub_matrix[xIndex*GPU_ARRAY_SIZE + yIndex] = spectrum[threadIdx.y][threadIdx.x]; 

    __syncthreads(); 
} 

主要功能:

int main(int argc, char** argv) 
{ 

    int i,j, w, h, sw, sh; 

    int numSubblock = CPU_ARRAY_SIZE/GPU_ARRAY_SIZE; 
     double *d_isub_matrix,*d_osub_matrix; 

    double *big_matrix = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE]; 
    double *big_matrix2 = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE]; 

    double *isub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE]; 
    double *osub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE]; 
    cudaEvent_t start,stop; 
    float elapsedtime; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 


    for (i=0; i<CPU_ARRAY_SIZE; i++) 
    { 
     for (j=0; j<CPU_ARRAY_SIZE; j++) 
     big_matrix[i*CPU_ARRAY_SIZE + j] = rand();//i*CPU_ARRAY_SIZE + j; 
    } 



    cudaEventRecord(start,0); 


    //cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2); 
    //cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2); 

    for(i = 0; i < numSubblock; i++) 
    { 
     for (j=0; j < numSubblock; j++) 
     { 


     // start position of subarea of big array 
     cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)); 
     cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)); 

     h = i*GPU_ARRAY_SIZE; 

     w = j*GPU_ARRAY_SIZE; 
     //printf("h = %d, w=%d",h,w); 
     //system("PAUSE"); 

     // move subarea of big array into isub array. 

     for (sh = 0; sh < GPU_ARRAY_SIZE; sh++) 
     { 
      for (sw = 0; sw <GPU_ARRAY_SIZE; sw++) 
      { 
      isub_matrix[sh*GPU_ARRAY_SIZE+sw] = big_matrix[(h+sh)*CPU_ARRAY_SIZE + (w+sw)]; 

      } 
     } 



      cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyHostToDevice); 

     //call the cuda kernel 
     dim3 blocks(BLOCK_SIZE, BLOCK_SIZE); 
     dim3 threads(THREAD_SIZE, THREAD_SIZE); 

      twiddle_factor<<<blocks, threads>>>(d_isub_matrix,d_osub_matrix); 

     cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost); 


     for (sh = 0; sh < GPU_ARRAY_SIZE; sh++) 
     { 
      for (sw = 0; sw <GPU_ARRAY_SIZE; sw++) 
      { 
       big_matrix2[(h+sh)*CPU_ARRAY_SIZE + (w+sw)] = osub_matrix[sh*GPU_ARRAY_SIZE+sw]; 
       printf(" sh %d sw %d %lf \n", sh, sw, osub_matrix[sh*GPU_ARRAY_SIZE+sw]); 

      } 

     } 
     printf("passei por aqui algumas vezes\n"); 
     cudaFree(d_osub_matrix); 
     cudaFree(d_isub_matrix); 

     } 
    } 
// cudaFree(d_osub_matrix); 
// cudaFree(d_isub_matrix); 

     //Stop the time 
     cudaEventRecord(stop,0); 
     cudaEventSynchronize(stop); 
     cudaEventElapsedTime(&elapsedtime,start,stop); 

    //showing the processing time 
    printf("The processing time took... %fms to execute everything",elapsedtime); 
    system("PAUSE"); 

     for (sh = 0; sh < CPU_ARRAY_SIZE; sh++) 
     { 
      for (sw = 0; sw <CPU_ARRAY_SIZE; sw++) 
      { 

       printf(" sh %d sw %d %lf \n", sh, sw, big_matrix2[sh*CPU_ARRAY_SIZE+sw]); 

      } 
     } 


    system("PAUSE"); 
    // I guess the result is "[1][0] = [1], [1][512] = [513], [513][0] = [524289], [513][512] = [524801]". 

} 

回答

1

通過短期看問題可能和應該是folling線:

// start position of subarea of big array 
    cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)); 
    cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)); 

你分配公正爲GPU上的雙重值提供少量內存。您的子矩陣在每個需要8個字節的位置分配4個字節。

+0

好吧,似乎你得到了正確的拍攝! :) 我乘以2,你說我應該需要8個字節......那麼至少它是給我答案!好的,克羅諾斯! :) 謝謝您的幫助! –

+0

好吧......這是一個很好的選擇......直到我開始打印所有的......並從sh = 256開始,它又開始給我ZERO了! :(你有什麼猜測嗎?(我開始再次看到它,它重新開始給我一些很好的跡象,我猜sh = 500左右......非常奇怪!) –

+1

你是否改變了cudaMemcpy到?尺寸計算他們也使用sizeof(浮動),而不是sizeof(雙) –