Cuda Visual Profiler:未指定的啓動失敗(unspecified launch failure)
當我直接運行我的代碼時,它可以完整、正常地執行;但是當我嘗試在 Visual Profiler 中運行它時,第一次運行正常,而 Profiler 似乎要把該程序運行七次,第二次運行時就會出現「未指定的啓動失敗」。爲什麼會發生這種情況?我的代碼如下所示,我的錯誤檢查告訴我錯誤發生在 cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost); 這一行(在代碼中搜索 memcpy11 很容易找到,出錯的就是它上面的那一行)。
我想不出爲什麼程序基本上第一次運行正常、第二次運行卻失敗;而且如果我在終端上多次運行它,它是完全正常的。有人能想到可能發生了什麼嗎?謝謝!
/*
 * fillin -- compute the per-site step counts and site sets of node `p`
 * from its two children `left` and `rt`, using the GPU.
 *
 * The numsteps/siteset arrays of both children (`chars` entries each) are
 * copied to the device, the fillinBoth kernel combines them in place in the
 * right-hand buffers, and those buffers are copied back into `p`.
 *
 * Parameters:
 *   p    - destination node; p->numsteps and p->siteset are overwritten.
 *   left - first child; its numsteps/siteset are read.
 *   rt   - second child; its numsteps/siteset are read.
 *
 * Both children are dereferenced unconditionally, so neither may be NULL.
 *
 * NOTE(review): checkCUDAError is a project helper that presumably reports
 * the pending CUDA runtime error under the given label -- confirm whether it
 * aborts or returns, since on return the device buffers would still be
 * freed at the end but later steps would run on a failed context.
 */
void fillin(node *p, node *left, node *rt)
{
    /* Nothing to combine, and a zero-block launch is an invalid configuration. */
    if (chars <= 0)
        return;

    size_t stepsize = chars * sizeof(long);
    size_t sitesize = chars * sizeof(sitearray);

    /* Device copies of the left child's data. */
    steptr lsteps;
    seqptr lsites;
    cudaMalloc((void **) &lsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &lsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy7");
    cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy8");

    /* Device copies of the right child's data; the kernel writes its results here. */
    steptr rsteps;
    seqptr rsites;
    cudaMalloc((void **) &rsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &rsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy9");
    cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy");

    /* Launch one single-thread block per site. */
    int block_size = 1;
    int n_blocks = chars;
    fillinBoth <<<n_blocks, block_size>>> (lsteps, lsites, rsteps, rsites, chars);
    /* A bad launch configuration is reported here rather than by a later call. */
    checkCUDAError("kernel launch");
    /* Kernel faults are asynchronous; synchronizing here attributes them to
       the kernel instead of to whichever memcpy happens to run next. */
    cudaDeviceSynchronize();
    checkCUDAError("kernel execution");

    /* Results were written in place into the right-hand device buffers. */
    cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy10");
    cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy11");

    cudaFree(rsites); cudaFree(rsteps);
    cudaFree(lsites); cudaFree(lsteps);
    checkCUDAError("free");
}
/*
 * fillinBoth -- combine the site sets and step counts of two child nodes,
 * one site per thread.
 *
 * Launch layout: any 1-D grid with at least `max` threads in total; thread
 * (blockIdx.x * blockDim.x + threadIdx.x) handles that site index and
 * threads past `max` do nothing. The original <<<max, 1>>> launch therefore
 * behaves exactly as before. No shared memory is used.
 *
 * Parameters:
 *   lsteps, lsite - step counts and site sets of the left child (read only).
 *   rsteps, rsite - step counts and site sets of the right child; the
 *                   combined result is written back in place.
 *   max           - number of sites.
 *
 * Also reads the device symbols cudaWeight and cudaTranslate, which are
 * defined outside this block.
 */
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
    boolean counted = false;
    aas aa;
    long s = 0;
    long i, j, n;
    long k = 0;
    long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;

    /* Per-thread scratch copies; private arrays stay correct for any block
       size, unlike block-shared ones. */
    long ls[3];
    long rs[3];
    /* Zero-filled so entries the loop below never writes hold an empty set
       instead of garbage when they are read or OR-ed into later. */
    long qs[3] = {0, 0, 0};

    if (idx >= max)
        return;

    for (i = 0; i < 3; i++) {
        rs[i] = rsite[idx][i];
        ls[i] = lsite[idx][i];
    }
    n = lsteps[idx] + rsteps[idx];

    /* Walk the six candidate sets in order. Until the first non-empty one is
       found, each empty candidate adds this site's weight to n; from then on
       every candidate is stored, up to three entries in qs. */
    for (i = 0; i <= 5; i++) {
        if (k < 3) {
            switch (i) {
            case 0:
                s = ls[0] & rs[0];
                break;
            case 1:
                s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
                break;
            case 2:
                s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
                break;
            case 3:
                s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
                break;
            case 4:
                s = ls[1] | (ls[2] & rs[2]) | rs[1];
                break;
            case 5:
                s = ls[2] | rs[2];
                break;
            }
            if (counted || s != 0) {
                qs[k] = s;
                k++;
                counted = true;
            } else if (!counted)
                n += cudaWeight[idx];
        }
    }

    /* For every member aa present in qs[i], OR the matching cudaTranslate
       row entry into each later set qs[j]. */
    for (i = 0; i <= 1; i++) {
        for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
            if (((1L << ((long)aa)) & qs[i]) != 0) {
                for (j = i + 1; j <= 2; j++)
                    qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
            }
        }
    }

    /* Write the result over the right child's device buffers. */
    rsteps[idx] = n;
    for (i = 0; i < 3; i++)
        rsite[idx][i] = qs[i];
}
嘗試用 cuda-memcheck 運行代碼,看看它是否報告了越界內存訪問。Profiler 的每次運行採集的內容都不同;如果代碼中存在「潛在」的越界訪問,專用於性能分析的芯片資源就可能被代碼破壞,而在平時運行時則不會出問題。 – talonmies
我用 cuda-memcheck 運行了它,沒有得到任何錯誤:「========= ERROR SUMMARY: 0 errors」。還有其他可能出錯的地方嗎? – Izri