2016-03-24 46 views
0

我複製了一個異步CUDA/C++示例並對其進行了修改以評估素數。我的問題是,對於每個打印的素數,數組中的下一個值是該值的重複值。這是預期的行爲還是與我編程示例的方式有關?CUDA計算之後在數組中重複值

驗證碼:

//////////////////////////////////////////////////////////////////////////// 
// 
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 
// 
// Please refer to the NVIDIA end user license agreement (EULA) associated 
// with this source code for terms and conditions that govern your use of 
// this software. Any use, reproduction, disclosure, or distribution of 
// this software and related documentation outside the terms of the EULA 
// is strictly prohibited. 
// 
//////////////////////////////////////////////////////////////////////////// 

// 
// This sample illustrates the usage of CUDA events for both GPU timing and 
// overlapping CPU and GPU execution. Events are inserted into a stream 
// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can 
// perform computations while GPU is executing (including DMA memcopies 
// between the host and device). CPU can query CUDA events to determine 
// whether GPU has completed tasks. 
// 

// includes, system 
#include <stdio.h> 

// includes CUDA Runtime 
#include <cuda_runtime.h> 

// includes, project 
#include <helper_cuda.h> 
#include <helper_functions.h> // helper utility functions 


//set matrix to possible prime values 
//evaluate if input is prime, sets variable to 0 if not prime 
__global__ void testPrimality(int * g_data) { 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    g_data[idx] = 3 + idx/2; 

    if (g_data[idx] <= 3) { 
     if (g_data[idx] <= 1) { 
      g_data[idx] = 0; 
     } 
    } 

    else if (g_data[idx] % 2 == 0 || g_data[idx] % 3 == 0) { 
     g_data[idx] = 0; 
    } 

    else { 
     for (unsigned short i = 5; i * i <= g_data[idx]; i += 6) { 
      if (g_data[idx] % i == 0 || g_data[idx] % (i + 2) == 0) { 
       g_data[idx] = 0; 
      } 
     } 
    } 

} 

bool correct_output(int *data, const int n, const int x) 
{ 
    for (int i = 0; i < n; i++) 
     if (data[i] != x) 
     { 
      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); 
      return false; 
     } 

    return true; 
} 

int main(int argc, char *argv[]) 
{ 
    int devID; 
    cudaDeviceProp deviceProps; 

    printf("[%s] - Starting...\n", argv[0]); 

    // This will pick the best possible CUDA capable device 
    devID = findCudaDevice(argc, (const char **)argv); 

    // get device name 
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); 
    printf("CUDA device [%s]\n", deviceProps.name); 

    const int n = 16 * 1024 * 1024; 
    int nbytes = n * sizeof(int); 
    int value = 1; 

    // allocate host memory 
    int *a = 0; 
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes)); 
    memset(a, 0, nbytes); 



    // allocate device memory 
    int *d_a=0; 
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); 
    checkCudaErrors(cudaMemset(d_a, 255, nbytes)); 

    // set kernel launch configuration 
    dim3 threads = dim3(512, 1); 
    dim3 blocks = dim3(n/threads.x, 1); 

    // create cuda event handles 
    cudaEvent_t start, stop; 
    checkCudaErrors(cudaEventCreate(&start)); 
    checkCudaErrors(cudaEventCreate(&stop)); 

    StopWatchInterface *timer = NULL; 
    sdkCreateTimer(&timer); 
    sdkResetTimer(&timer); 

    checkCudaErrors(cudaDeviceSynchronize()); 
    float gpu_time = 0.0f; 

    // asynchronously issue work to the GPU (all to stream 0) 
    sdkStartTimer(&timer); 
    cudaEventRecord(start, 0); 
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0); 
    //increment_kernel<<<blocks, threads, 0, 0>>>(d_a); 
    testPrimality<<<blocks, threads, 0, 0 >>>(d_a); 
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0); 
    cudaEventRecord(stop, 0); 
    sdkStopTimer(&timer); 

    // have CPU do some work while waiting for stage 1 to finish 
    unsigned long int counter=0; 

    while (cudaEventQuery(stop) == cudaErrorNotReady) 
    { 
     counter++; 
    } 

    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); 

    // print the cpu and gpu times 
    printf("time spent executing by the GPU: %.2f\n", gpu_time); 
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); 
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); 

    //print values for all allocated memory space 
    for (int i = 0; i < n; i++) { 
     if (a[i] != 0) { 
      std::cout << a[i]<< " : " << i << std::endl; 
     } 
    } 

    // check the output for correctness 
    //bool bFinalResults = correct_output(a, n, value); 
    bool bFinalResults = true; 



    // release resources 
    checkCudaErrors(cudaEventDestroy(start)); 
    checkCudaErrors(cudaEventDestroy(stop)); 
    checkCudaErrors(cudaFreeHost(a)); 
    checkCudaErrors(cudaFree(d_a)); 

    // cudaDeviceReset causes the driver to clean up all state. While 
    // not mandatory in normal operation, it is good practice. It is also 
    // needed to ensure correct operation when the application is being 
    // profiled. Calling cudaDeviceReset causes all profile data to be 
    // flushed before the application exits 
    cudaDeviceReset(); 

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE); 
} 

回答

2

的複製被從實際的「輸入」值的操作產生的。這是我不清楚你想要什麼樣的數字序列,但此行的代碼:

g_data[idx] = 3 + idx/2; 

確實整數師(idxint型的,所以是g_data[idx])。

整數除以二的結果意味着「輸入」中的每個值都將被複制,因此輸出中的每個值也將被複制。如果你想看到的輸入值,修改了上cout聲明,如下所示:

 std::cout << a[i]<< " : " << i << " " << 3+i/2 << std::endl; 

以「模擬」輸入數據生成你在內核中做的事情。如果你這樣做,你會在最後一列數字中看到重複。

編輯:根據下面的評論,似乎有一些不確定的關於idx變量將如何產生數字。這是產生一個全球唯一的線程ID的規範方法:

int idx = blockIdx.x * blockDim.x + threadIdx.x; 

在一般使用每個線程將獲取比「前」線程更高一個唯一的正指數:

0,1,2,3,... 

看來,需要的情況下,是創造一個「輸入」數據集是這樣的:

3,5,7,9,... 

因此,正確的算法代替這個:

g_data[idx] = 3 + idx/2; 

是這樣的:

g_data[idx] = 3 + idx * 2; 

這裏是一個完全樣例與變化,與以前cout變化,我建議:

$ cat t1119.cu 
//////////////////////////////////////////////////////////////////////////// 
// 
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 
// 
// Please refer to the NVIDIA end user license agreement (EULA) associated 
// with this source code for terms and conditions that govern your use of 
// this software. Any use, reproduction, disclosure, or distribution of 
// this software and related documentation outside the terms of the EULA 
// is strictly prohibited. 
// 
//////////////////////////////////////////////////////////////////////////// 

// 
// This sample illustrates the usage of CUDA events for both GPU timing and 
// overlapping CPU and GPU execution. Events are inserted into a stream 
// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can 
// perform computations while GPU is executing (including DMA memcopies 
// between the host and device). CPU can query CUDA events to determine 
// whether GPU has completed tasks. 
// 

// includes, system 
#include <stdio.h> 

// includes CUDA Runtime 
#include <cuda_runtime.h> 

// includes, project 
#include <helper_cuda.h> 
#include <helper_functions.h> // helper utility functions 


//set matrix to possible prime values 
//evaluate if input is prime, sets variable to 0 if not prime 
__global__ void testPrimality(int * g_data) { 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    g_data[idx] = 3 + idx * 2; 

    if (g_data[idx] <= 3) { 
     if (g_data[idx] <= 1) { 
      g_data[idx] = 0; 
     } 
    } 

    else if (g_data[idx] % 2 == 0 || g_data[idx] % 3 == 0) { 
     g_data[idx] = 0; 
    } 

    else { 
     for (unsigned short i = 5; i * i <= g_data[idx]; i += 6) { 
      if (g_data[idx] % i == 0 || g_data[idx] % (i + 2) == 0) { 
       g_data[idx] = 0; 
      } 
     } 
    } 

} 

bool correct_output(int *data, const int n, const int x) 
{ 
    for (int i = 0; i < n; i++) 
     if (data[i] != x) 
     { 
      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); 
      return false; 
     } 

    return true; 
} 

int main(int argc, char *argv[]) 
{ 
    int devID; 
    cudaDeviceProp deviceProps; 

    printf("[%s] - Starting...\n", argv[0]); 

    // This will pick the best possible CUDA capable device 
    devID = findCudaDevice(argc, (const char **)argv); 

    // get device name 
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); 
    printf("CUDA device [%s]\n", deviceProps.name); 

    //const int n = 16 * 1024 * 1024; 
    const int n = 1024; 
    int nbytes = n * sizeof(int); 
    //int value = 1; 

    // allocate host memory 
    int *a = 0; 
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes)); 
    memset(a, 0, nbytes); 



    // allocate device memory 
    int *d_a=0; 
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); 
    checkCudaErrors(cudaMemset(d_a, 255, nbytes)); 

    // set kernel launch configuration 
    dim3 threads = dim3(512, 1); 
    dim3 blocks = dim3(n/threads.x, 1); 

    // create cuda event handles 
    cudaEvent_t start, stop; 
    checkCudaErrors(cudaEventCreate(&start)); 
    checkCudaErrors(cudaEventCreate(&stop)); 

    StopWatchInterface *timer = NULL; 
    sdkCreateTimer(&timer); 
    sdkResetTimer(&timer); 

    checkCudaErrors(cudaDeviceSynchronize()); 
    float gpu_time = 0.0f; 

    // asynchronously issue work to the GPU (all to stream 0) 
    sdkStartTimer(&timer); 
    cudaEventRecord(start, 0); 
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0); 
    //increment_kernel<<<blocks, threads, 0, 0>>>(d_a); 
    testPrimality<<<blocks, threads, 0, 0 >>>(d_a); 
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0); 
    cudaEventRecord(stop, 0); 
    sdkStopTimer(&timer); 

    // have CPU do some work while waiting for stage 1 to finish 
    unsigned long int counter=0; 

    while (cudaEventQuery(stop) == cudaErrorNotReady) 
    { 
     counter++; 
    } 

    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); 

    // print the cpu and gpu times 
    printf("time spent executing by the GPU: %.2f\n", gpu_time); 
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); 
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); 

    //print values for all allocated memory space 
    for (int i = 0; i < n; i++) { 
     if (a[i] != 0) { 
      std::cout << a[i]<< " : " << i << " " << 3 + i * 2 << std::endl; 
     } 
    } 

    // check the output for correctness 
    //bool bFinalResults = correct_output(a, n, value); 
    bool bFinalResults = true; 



    // release resources 
    checkCudaErrors(cudaEventDestroy(start)); 
    checkCudaErrors(cudaEventDestroy(stop)); 
    checkCudaErrors(cudaFreeHost(a)); 
    checkCudaErrors(cudaFree(d_a)); 

    // cudaDeviceReset causes the driver to clean up all state. While 
    // not mandatory in normal operation, it is good practice. It is also 
    // needed to ensure correct operation when the application is being 
    // profiled. Calling cudaDeviceReset causes all profile data to be 
    // flushed before the application exits 
    cudaDeviceReset(); 

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE); 
} 
$ nvcc -I/usr/local/cuda/samples/common/inc t1119.cu -o t1119 
$ cuda-memcheck ./t1119 

(excerpted output:) 

337 : 167 337 
347 : 172 347 
349 : 173 349 
353 : 175 353 
359 : 178 359 
367 : 182 367 
373 : 185 373 
379 : 188 379 
383 : 190 383 
389 : 193 389 
397 : 197 397 
401 : 199 401 
409 : 203 409 
419 : 208 419 
421 : 209 421 
431 : 214 431 
433 : 215 433 
439 : 218 439 
443 : 220 443 
449 : 223 449 
457 : 227 457 
461 : 229 461 
463 : 230 463 
467 : 232 467 
479 : 238 479 
487 : 242 487 
491 : 244 491 
499 : 248 499 
503 : 250 503 
509 : 253 509 
521 : 259 521 
523 : 260 523 
541 : 269 541 
547 : 272 547 
557 : 277 557 
563 : 280 563 
569 : 283 569 
571 : 284 571 
577 : 287 577 
587 : 292 587 
593 : 295 593 
599 : 298 599 
601 : 299 601 
607 : 302 607 
613 : 305 613 
617 : 307 617 
619 : 308 619 

正如上面可以看出,沒有在輸出序列中重複。

+0

我試圖從整數3開始,對於每個單獨的計算,它應該增加2(因爲素數從來沒有)。我認爲我可以做到這一點,因爲idx似乎總是4的倍數。我想我可以通過將idx除以2來得到2的增量。 – Stephen

+0

然後,也許你想要3+'idx' * 2。 'idx'是你的全局唯一線程索引,它從0開始,每個線程增加1個0,1,2,3,...所以如果你想要一個數值輸入序列3,5,7,9, ...正確的算術是3 +'idx' * 2。 'idx'因爲你有它並不總是4的倍數。 –

+0

我按你的建議做了,但它似乎仍然是重複的價值。 – Stephen