我正在試着製作一個小計算器,用於計算cuda設計,下面是第一個順序版本,然後是我的並行版本。 它運行沒有錯誤,但由於某種原因,它不會給結果回來,我一直試圖讓這個工作2個星期,但現在找不到錯誤!CUDA不返回結果
Serilized版本
int f(int x, int c, int n);
int gcd(unsigned int u, unsigned int v);
int main()
{
clock_t start = clock();
srand (time(NULL));
int x = 1;
int y = 2;
int d = 1;
int c = rand() % 100;
int n = 323;
if(n % y == 0)
d = y;
while(d == 1)
{
x = f(x, c, n);
y = f(f(y, c, n), c, n);
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n);
if(d == n)
{
printf("\nd == n");
c = 0;
while(c == 0 || c == -2)
c = rand() % 100;
x = 2;
y = 2;
}
}
int d2 = n/d;
printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC);
printf("\nResult: %d", d);
printf("\nResult2: %d", d2);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
}
int f(int x, int c, int n)
{
return (int)(pow((float)x, 2) + c) % n;
}
int gcd(unsigned int u, unsigned int v){
int shift;
/* GCD(0,x) := x */
if (u == 0 || v == 0)
return u | v;
/* Let shift := lg K, where K is the greatest power of 2
dividing both u and v. */
for (shift = 0; ((u | v) & 1) == 0; ++shift) {
u >>= 1;
v >>= 1;
}
while ((u & 1) == 0)
u >>= 1;
/* From here on, u is always odd. */
do {
while ((v & 1) == 0)/* Loop X */
v >>= 1;
/* Now u and v are both odd, so diff(u, v) is even.
Let u = min(u, v), v = diff(u, v)/2. */
if (u < v) {
v -= u;
} else {
int diff = u - v;
u = v;
v = diff;
}
v >>= 1;
} while (v != 0);
return u << shift;
}
水貨版本
#define threads 512
#define MaxBlocks 65535
#define RunningTheads (512*100)
__device__ int gcd(unsigned int u, unsigned int v)
{
int shift;
if (u == 0 || v == 0)
return u | v;
for (shift = 0; ((u | v) & 1) == 0; ++shift) {
u >>= 1;
v >>= 1;
}
while ((u & 1) == 0)
u >>= 1;
do {
while ((v & 1) == 0)
v >>= 1;
if (u < v) {
v -= u;
} else {
int diff = u - v;
u = v;
v = diff;
}
v >>= 1;
} while (v != 0);
return u << shift;
}
__device__ bool cuda_found;
__global__ void cudaKernal(int *cArray, int n, int *outr)
{
int index = blockIdx.x * threads + threadIdx.x;
int x = 1;
int y = 2;
int d = 4;
int c = cArray[index];
while(d == 1 && !cuda_found)
{
x = (int)(pow((float)x, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n);
}
if(d != 1 && !cuda_found)
{
cuda_found = true;
outr = &d;
}
}
int main()
{
int n = 323;
int cArray[RunningTheads];
cArray[0] = 1;
for(int i = 1; i < RunningTheads-1; i++)
{
cArray[i] = i+2;
}
int dresult = 0;
int *dev_cArray;
int *dev_result;
HANDLE_ERROR(cudaMalloc((void**)&dev_cArray, RunningTheads*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_result, sizeof(int)));
HANDLE_ERROR(cudaMemcpy(dev_cArray, cArray, RunningTheads*sizeof(int), cudaMemcpyHostToDevice));
int TotalBlocks = ceil((float)RunningTheads/(float)threads);
if(TotalBlocks > MaxBlocks)
TotalBlocks = MaxBlocks;
printf("Blocks: %d\n", TotalBlocks);
printf("Threads: %d\n\n", threads);
cudaKernal<<<TotalBlocks,threads>>>(dev_cArray, n, dev_result);
HANDLE_ERROR(cudaMemcpy(&dresult, dev_result, sizeof(int), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaFree(dev_cArray));
HANDLE_ERROR(cudaFree(dev_result));
if(dresult == 0)
dresult = 1;
int d2 = n/dresult;
printf("\nResult: %d", dresult);
printf("\nResult2: %d", d2);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
}
這是值得指出的是,即使在經過此回覆標識的所有問題都是固定的,該算法的基本前提和使用cuda_found'的'因爲全局同步標誌被完全破壞,代表了一個永遠無法可靠工作的巨大內存競爭。 – talonmies