與內存有關的莫名其妙的cuda行爲

所以基本上我把我的C++代碼（這是正常工作），並將其改寫爲cuda（我對cuda沒有經驗）。代碼的一部分（solve（）方法）工作不正常，我真的不知道爲什麼。與內存有關的莫名其妙的cuda行爲

所以我的問題是什麼意思是「未指定啓動失敗」錯誤期間cudaMemcpy和爲什麼它發生在我的代碼。

我的第二個問題是爲什麼變量backup_ans和ans在計算相同的事情時有所不同？

#include "stdio.h" 
#include <algorithm> 

__device__ unsigned int primes[1024]; 
__device__ long long n = 1ll<<32; // #unsigned_integers 




__device__ int hashh(long long x) { 
     return (x>>1)%1024; 
} 

// compute (x^e)%n 
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) { 
    unsigned long long ans = 1; 
    while(e>0) { 
     if(e&1) ans = (ans*x)%n; 
     x = (x*x)%n; 
     e>>=1; 
    } 
    return ans; 
} 

// determine whether n is strong probable prime base a or not. 
// n is ODD 
__device__ int is_SPRP(unsigned long long a,unsigned long long n) { 
    int d=0; 
    unsigned long long t = n-1; 
    while(t%2==0) { 
     ++d; 
     t>>=1; 
    } 
    unsigned long long x = mulmod(a,t,n); 
    if(x==1) return 1; 
    for(int i=0;i<d;++i) { 
     if(x==n-1) return 1; 
     x=(x*x)%n; 
    } 
    return 0; 
} 
__device__ int prime(long long x) { 
     return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x); 
} 

// copy all unsigned COMPOSITE ingeters which are not congruent to zero modulo 2,3,5,7 and their hashh value = 0; 
// count of those elements store in c 
// 335545 is just magic constant to distribute all integers equally on all 400*32 threads 
__global__ void find(unsigned int *out,unsigned int *c) { 
    unsigned int buff[4096]; 
    int local_c = 0; 
    long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545; 
    long long e = b+335545; 
    if(b%2==0) ++b; 
    for(long long i=b;i<e && i<n;i+=2) { 
     if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue; 
     if(hashh(i)==0) { 
      buff[local_c++]=(unsigned int)i; 
      if(local_c==4096) { 
       int start = atomicAdd(c,local_c); 
       for(int i=0;i<local_c;++i) out[i+start]=buff[i]; 
       local_c=0; 
      } 
     } 
    } 
    int start = atomicAdd(c,local_c); 
    for(int i=0;i<local_c;++i) out[i+start]=buff[i]; 
} 

// find base for which all elements in input are NOT SPRP. base is from {2,..,34} stored in 32bit uint 
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) { 
     __shared__ unsigned int s[32]; 
    unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1; 
    unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif; 
    unsigned int e = b+dif>(*count)?(*count):b+dif; 
    unsigned int mysol = 0; 
    for(long long i = 2; i<33; ++i) { 
      int sol = 1; 
      // each thread doing its part 
      for(unsigned int j = b; j<e ; ++j) { 
       //is some element is sprp base i break 
       if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) { 
       sol=0; 
       break; 
       } 
      } 
      // if all elements passed store base to mysol 
      if(sol==1) mysol|=1<<(i-2); 
    } 
    s[threadIdx.x] = mysol; 
    // save thread_result 
    backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol; 
    __syncthreads(); 
    // compute global resulte and store it to ans 
    if(threadIdx.x==0) { 
      unsigned int global_sol = ~0; 
      for(int i=0;i<blockDim.x;++i) global_sol&=s[i]; 
      atomicAnd(ans,global_sol); 
    } 
} 


int main(void) { 
// number of blocks & thread for solve 
const int blocks = 400; 
const int threads = 32; 

unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 }; 

printf("primes_copy: %s\n",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4))); 

/*-----*/ 

// allocate buffers 
unsigned int *dev_input,*dev_count; 
printf("alloc_input: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(int)*(1<<23)))); 
printf("alloc_count: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_count,4))); 
printf("memset_count: %s\n",cudaGetErrorString(cudaMemset(dev_count,0,4))); 
find<<<400,32>>>(dev_input,dev_count); 
cudaDeviceSynchronize(); 

unsigned int count; 
printf("copy_count: %s\n",cudaGetErrorString(cudaMemcpy(&count,dev_count,4,cudaMemcpyDeviceToHost))); 

// sort found elements just to make debbug easier, it is not necessary 
unsigned int *backup_numbers = new unsigned int[1000000]; 
printf("copy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,4*count,cudaMemcpyDeviceToHost))); 
std::sort(backup_numbers,backup_numbers+count); 
printf("copy_S_backup: %s\n",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,4*count,cudaMemcpyHostToDevice))); 
delete[] backup_numbers; 

printf("\nsize: %u\n",count); 

// allocate buffers 
unsigned int *dev_backup, *dev_ans; 
printf("malloc_backup: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(int)*blocks*threads))); 
printf("malloc_ans: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_ans,4))); 
printf("memset_ans: %s\n",cudaGetErrorString(cudaMemset(dev_ans,0xFF,4))); 

solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans); 
cudaDeviceSynchronize(); 

unsigned int ans,*backup; 
printf("memcpy_ans: %s\n",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost))); 
backup = new unsigned int[400*32]; 
printf("memcpy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup,dev_backup,4*blocks*threads,cudaMemcpyDeviceToHost))); 
unsigned int backup_ans = ~0; 

// compute global result using backuped thread_results 
// notice backup_ans and ans MUST be the same, but they are NOT (WHY!) 
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i]; 
printf("ans: %u\nbackup_ans %u\n",ans,backup_ans); 
printf("%u\n",backup[48]); 

delete[] backup; 
cudaFree(dev_ans); 
cudaFree(dev_backup); 
cudaFree(dev_count); 
cudaFree(dev_input); 
}

除solve（）方法以外的所有代碼都按照打算工作。 solve（）方法只是計算廢話（因爲backup_ans和ans不同），它也給了我最後兩個cudaMemcpy上的「未指定啓動失敗」錯誤。當我運行解決< < < 1,1 >>>（...）我得到

答：134816642個backup_ans 432501552

但是當我運行解決< < < 400,32 >>>（。 ..）它給了我

答：134816642個backup_ans 0 （正確的答案應該是0）

在所有情況下，它應該計算backup_ans = ANS = 0

任何意見，我做錯了會有所幫助。

代碼生成primes.bin

#include <cstdlib> 
#include <stdio.h> 
using namespace std; 

const unsigned long long n = 1ll<<32; 
const int buffer_size = 2000000; 

typedef unsigned char uch; 
typedef unsigned int uint; 
typedef unsigned long long ull; 

uch *primes; 

int prime(long long x) { 
if(x==2) return 1; 
if(x%2==0) return 0; 
long long pos = x/16; 
long long index = (x&15)>>1; 
return (1<<index)&(~(primes[pos])); 
} 
void eratosten_sieve(void) { 
    long long pos; 
    long long index; 
    for(long long i=3;i*i<n;++i) { 
     if(!prime(i)) continue; 
     for(long long j=i*i;j<n;j+=(i<<1)) { 
     pos = j/16; 
     index = ((j&15)>>1); 
     primes[pos]|=(1<<index); 
     } 
    } 

} 

int main(void) { 
primes = new uch[(n/16)+1]; 
for(long long i=0;i<(n/16)+1;++i) primes[i]=0; 
printf("generating\n"); 
eratosten_sieve(); 
int l = n/16 +1; 
printf("writing\n"); 
FILE *f = fopen("primes.bin","wb"); 
fwrite(primes,1,l,f); 
fclose(f); 
printf("done\n"); 
delete[] primes; 
}

PS：我對NVCC -arch compute_11

CUDA Driver Version/Runtime Version   5.5/5.5 
CUDA Capability Major/Minor version number: 1.1 
Total amount of global memory:     1023 MBytes (1073020928 bytes) 
(14) Multiprocessors, ( 8) CUDA Cores/MP:  112 CUDA Cores 
GPU Clock rate:        1500 MHz (1.50 GHz) 
Memory Clock rate:        900 Mhz 
Memory Bus Width:        256-bit 
Maximum Texture Dimension Size (x,y,z)   1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048) 
Maximum Layered 1D Texture Size, (num) layers 1D=(8192), 512 layers 
Maximum Layered 2D Texture Size, (num) layers 2D=(8192, 8192), 512 layers 
Total amount of constant memory:    65536 bytes 
Total amount of shared memory per block:  16384 bytes 
Total number of registers available per block: 8192 
Warp size:          32 
Maximum number of threads per multiprocessor: 768 
Maximum number of threads per block:   512 
Max dimension size of a thread block (x,y,z): (512, 512, 64) 
Max dimension size of a grid size (x,y,z): (65535, 65535, 1) 
Maximum memory pitch:       2147483647 bytes 
Texture alignment:        256 bytes 
Concurrent copy and kernel execution:   Yes with 1 copy engine(s) 
Run time limit on kernels:      Yes 
Integrated GPU sharing Host Memory:   No 
Support host page-locked memory mapping:  Yes 
Alignment requirement for Surfaces:   Yes 
Device has ECC support:      Disabled 
Device supports Unified Addressing (UVA):  No 
Device PCI Bus ID/PCI location ID:   1/0 
Compute Mode: 
    < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > 

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT 
Result = PASS

來源

2014-03-06 user3390078

在內核調用來自內核調用後，在'cudaMemcpy'調用中未指定的啓動失敗。您的內核正在進行某種非法訪問或其他非法操作。（您可以對內核調用進行適當的cuda錯誤檢查）。你可能想用'cuda-memcheck'運行你的代碼來獲得一些額外的洞察，看看內核爲什麼會失敗。 –

我試着運行您發佈的代碼（在生成primes.bin之後）並且我沒有在cc2.0設備上發生錯誤。輸出看起來像[this]（http://pastebin.com/zL6JcF7e）。令人費解。 –

9800 GT有多少內存？你是否還在其上託管一個顯示器？什麼是操作系統？ –

OK編譯它，你是內存不足。我花了一段時間才能弄清楚，因爲我並沒有考慮過大的靜態分配：

__device__ unsigned char primes[(1<<28)+1];

通常，當人們都出來了的記憶，他們發現它在cudaMalloc操作。在你的情況下，你的GPU有1GB的內存，而且我猜你還在上面顯示一個顯示（你沒有回答這個問題）。看看有在nvidia-smi -a輸出多少可用內存，它會是這個樣子：

FB Memory Usage 
    Total      : 1535 MiB 
    Used      : 3 MiB 
    Free      : 1532 MiB

您的號碼將會更小 - 空線是我們所關心的。

您的動態分配（即從cudaMalloc）分配約350MB。但內核啓動帶來了靜態分配，然後你的總佔用空間增加到超過700MB（2^28超過250MB）。如果您的顯示器在該GPU上運行，則會佔用1GB內存，使您無法運行需要700MB的內核。

如果您想在該GPU上運行，請參閱是否可以以某種方式減少問題的大小。

做proper cuda error checking總是好的，但除了這個問題之外，你的代碼似乎在有更多內存的設備上運行時沒有任何錯誤。

來源

2014-03-06 23:58:51

內存使用量合計：1023 MB 使用：186 MB 免費：837 MB 這在我看來已經足夠了。無論如何，我停止使用這個256MB的bitset並將其減少到4kB。但現在我正在「發佈超時並終止」。我已經更新了代碼，如果它適合你，你可以試試。（第一次終止是由於第一次cudaDeviceSynchronize（） – user3390078

啓動超時並終止是由於在運行CUDA的同一GPU上運行顯示（您尚未確認這一點，但我認爲是這種情況））。我無法很方便地再現這個問題，如果你想學習一些解決方法，請閱讀[this]（http://nvidia.custhelp.com/app/answers/detail/a_id/3029/）〜/ using-cuda-and-x）無論如何，你的原始問題我認爲是由於內存不足，內核失敗錯誤代碼（2）以及在減少內存佔用量時消失的事實所致。 –

與內存有關的莫名其妙的cuda行爲

回答

相關問題