2011-11-16 43 views
0

當我直接運行我的代碼時,它可以完整地執行;但是當我嘗試在 Visual Profiler 中運行它時,第一次運行正常,而 Profiler 似乎要把該程序運行七次,第二次運行時就會出現「未指定的啓動失敗」(unspecified launch failure)。爲什麼會發生這種情況?我的代碼如下所示,我的錯誤檢查告訴我錯誤發生在 cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost); (在代碼中搜索 memcpy11 應該很容易找到,出錯的就是它上面的那一行)。 Cuda Visual Profiler 未指定的啓動失敗

我想不出爲什麼程序基本上是在第二次運行時出錯,而第一次卻不會;如果我在終端上多次運行它,則完全正常。有人能想到可能是什麼原因嗎? 謝謝!

/*
 * Host-side wrapper around the fillinBoth kernel.
 *
 * Copies the numsteps and siteset arrays of `left` and `rt` to the device,
 * launches fillinBoth with one block per element (`chars` blocks of one
 * thread each), then copies the kernel's results -- which it writes in place
 * over the right-hand buffers -- back into p->numsteps and p->siteset.
 * All device buffers are allocated and released inside this call.
 *
 * p    : node whose numsteps/siteset receive the result.
 * left : node supplying the left-hand input arrays.
 * rt   : node supplying the right-hand input arrays.
 *
 * `chars` (element count) and checkCUDAError() are defined elsewhere in the
 * file. NOTE(review): checkCUDAError presumably inspects cudaGetLastError()
 * and aborts on failure -- confirm against its definition.
 */
void fillin(node *p, node *left, node *rt)
{
    // Nothing to compute; a grid of 0 blocks is also an invalid launch
    // configuration, so bail out before touching the device.
    if (chars <= 0)
        return;

    size_t stepsize = chars * sizeof(long);
    size_t sitesize = chars * sizeof(sitearray);

    // Device copies of the left-hand inputs. lsteps is declared locally,
    // mirroring rsteps below; its lifetime is confined to this call.
    steptr lsteps;
    seqptr lsites;
    cudaMalloc((void **) &lsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &lsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy7");
    cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy8");

    // Device copies of the right-hand inputs; the kernel overwrites these
    // buffers with its output.
    steptr rsteps;
    seqptr rsites;
    cudaMalloc((void **) &rsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &rsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy9");
    cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy");

    // Launch: one single-thread block per element.
    int block_size = 1;
    int n_blocks = chars;
    fillinBoth <<<n_blocks, block_size>>> (lsteps, lsites, rsteps, rsites, chars);
    // A kernel launch returns no status of its own. Query the error state
    // immediately so a rejected launch (e.g. a grid that is too large) is
    // reported here instead of being attributed to the following memcpy.
    checkCUDAError("kernel launch");

    // These blocking copies wait for the kernel to finish, so a fault raised
    // while the kernel was executing is reported by the first check below.
    cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy10");
    cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy11");

    cudaFree(rsites); cudaFree(rsteps);
    cudaFree(lsites); cudaFree(lsteps);
    checkCUDAError("free");
}

}

/*
 * Combines the left and right site sets of one element per thread.
 *
 * For element idx the kernel reads lsite[idx][0..2], rsite[idx][0..2],
 * lsteps[idx] and rsteps[idx], derives up to three combined sets qs[0..2],
 * and then overwrites rsteps[idx] with the accumulated step count and
 * rsite[idx][0..2] with qs. lsteps and lsite are only read.
 *
 * Launch layout: any 1-D grid/block whose total thread count is >= max;
 * threads with idx >= max do nothing. The host code in this file launches
 * it as <<<max, 1>>>, for which idx == blockIdx.x.
 * Shared memory: none.
 *
 * Also reads cudaWeight[] and cudaTranslate[][], and the enum bounds
 * ala/stop, all defined elsewhere in the file.
 * NOTE(review): cudaWeight/cudaTranslate are presumably device-resident
 * tables set up before the launch -- confirm against their definitions.
 */
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
    aas aa;
    long s = 0;
    long i, j, k, n;
    boolean counted = false;

    // Global element index; reduces to blockIdx.x when blockDim.x == 1.
    const long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= max)
        return;

    // Per-thread scratch. These must not be __shared__: every thread works on
    // a different element, so a block with more than one thread would race.
    long ls[3];
    long rs[3];
    // Zero-initialised because the search loop below may fill fewer than
    // three slots, and every slot is read and written back afterwards.
    long qs[3] = {0, 0, 0};

    for (i = 0; i < 3; i++) {
        rs[i] = rsite[idx][i];
        ls[i] = lsite[idx][i];
    }

    n = lsteps[idx] + rsteps[idx];
    k = 0;

    // Walk the six combination levels in order. Levels that yield an empty
    // set before the first non-empty one each add cudaWeight[idx] to n; from
    // the first non-empty level onward, consecutive results fill qs[0..2].
    for (i = 0; i <= 5; i++) {
        if (k < 3) {
            switch (i) {

                case 0:
                    s = ls[0] & rs[0];
                    break;

                case 1:
                    s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
                    break;

                case 2:
                    s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
                    break;

                case 3:
                    s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
                    break;

                case 4:
                    s = ls[1] | (ls[2] & rs[2]) | rs[1];
                    break;

                case 5:
                    s = ls[2] | rs[2];
                    break;
            }
            if (counted || s != 0) {
                qs[k] = s;
                k++;
                counted = true;
            } else {
                // Still searching for the first non-empty set.
                n += cudaWeight[idx];
            }
        }
    }

    // For every member bit set in qs[i], OR the matching cudaTranslate row
    // into the later slots qs[i+1..2].
    for (i = 0; i <= 1; i++) {
        for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
            if (((1L << ((long)aa)) & qs[i]) != 0) {
                for (j = i + 1; j <= 2; j++)
                    qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
            }
        }
    }

    // Results are written in place over the right-hand inputs.
    rsteps[idx] = n;
    for (i = 0; i < 3; i++)
        rsite[idx][i] = qs[i];
}

}

+1

嘗試用 cuda-memcheck 運行代碼,看看它是否報告了越界內存訪問。 Profiler 的每次運行所做的事情都有所不同;如果代碼存在越界訪問,這個問題可能一直「潛伏」着,直到某次運行時代碼破壞了 Profiler 所使用的特定芯片資源才會暴露出來。 – talonmies

+0

我用 cuda-memcheck 運行了它,沒有得到任何錯誤(========= 錯誤摘要:0 錯誤)。還有其他可能出錯的地方嗎? – Izri

回答

1

嘗試在 Profiler 的會話設置中禁用所有計數器。另外,嘗試從工作文件夾中刪除所有類似「temp_compute_profiler_1_1.csv」的文件(請參閱 Profiler 設置中的「工作文件夾」,默認情況下與您的可執行文件的位置相同)。

我也遇到過同樣的錯誤(在 CUDA 之上運行 OpenCL 時):http://www.khronos.org/message_boards/viewtopic.php?t=4324

相關問題