How to keep track of executed CUDA blocks?

2017-04-18

To test my understanding of how things work, I decided to modify the vector addition from the CUDA samples so that the kernel exits after a specific amount of time and is then relaunched to complete. The way I implement the "timeout" is to have the host set a pinned, mapped variable to 1 after some period. Inside the kernel, this variable is checked to determine whether execution should continue; if a thread does continue, it is marked as complete. To test that each thread executes only once, I modified the addition from C[i] = A[i] + B[i] to C[i] = C[i] + B[i]. The device code follows:

/* Function 
* Internal device function used for getting the current thread's global ID 
* regardless of the block/grid configuration. It assumes that the 
* grid and block are 3 dimensional. 
* 
* @return: The thread's global ID 
*/ 
static __device__ int get_global_idx() 
{ 
    int blockId = blockIdx.x 
    + blockIdx.y * gridDim.x 
    + gridDim.x * gridDim.y * blockIdx.z; 
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) 
    + (threadIdx.z * (blockDim.x * blockDim.y)) 
    + (threadIdx.y * blockDim.x) 
    + threadIdx.x; 
    return threadId; 
} 

/* Function 
* Device function that determines if the current thread should continue execution. 
 * A check should be used on the return value. If the timeout has not been set 
 * and the thread has not previously executed, the element at the thread's ID in the 
 * thread_ids array is set to 1 to indicate it was allowed to proceed. 
* 
* @param thread_ids: A pointer to the array with a size that matches the max number 
*      of threads that will be spawned 
* 
* @param time_out: Memory mapped variable used by the host to signal the kernel when 
*     execution should suspend 
* 
* @return: A boolean value indicating whether the current thread should continue or not 
*/ 
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out) 
{ 
    if(*time_out == 1){ 
     return false; 
    } 

    int tid = get_global_idx(); 

    if(thread_ids[tid] == 1) 
    { 
     return false; 
    } 
    thread_ids[tid] = 1; 

    return true; 
} 

__global__ void 
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout) 
{ 
    if(!continue(thread_ids, timeout)) 
    { 
     return; 
    } 

    int i = blockDim.x * blockIdx.x + threadIdx.x; 


    if (i < numElements) 
    { 
     /* C[i] = A[i] + B[i]; */ 
     C[i] = C[i] + B[i]; //Modified from above 
    } 
} 
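
For reference, the timeout flag on the host side is just mapped, pinned memory. Here is a minimal, self-contained sketch of that pattern with an illustrative kernel and illustrative names (this is not my actual program; the full code is in the edit further down):

#include <stdio.h>
#include <unistd.h>
#include <cuda_runtime.h>

/* Illustrative kernel: threads poll the mapped flag until the host sets it. */
__global__ void spin_until_timeout(volatile unsigned int *time_out)
{
    while (*time_out == 0) { /* keep running until the host signals */ }
}

int main(void)
{
    volatile unsigned int *timeout = NULL;
    /* Page-locked, mapped host memory; the device reads it directly. */
    cudaHostAlloc((void **)&timeout, sizeof(unsigned int), cudaHostAllocMapped);
    *timeout = 0;

    spin_until_timeout<<<1, 32>>>(timeout);  /* launch returns immediately */
    usleep(1000);                            /* let the kernel run for a while */
    *timeout = 1;                            /* signal the kernel to stop */
    cudaDeviceSynchronize();                 /* wait for the kernel to exit */

    printf("kernel exited after timeout\n");
    cudaFreeHost((void *)timeout);
    return 0;
}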

I realized that this could fail if __syncthreads() were used, so I decided to do block-level suspension instead. From my understanding, I thought this would be straightforward: keep track of whether a block has started, count how many of that block's threads have executed, only suspend once all of the threads in every started block have completed, and refuse any thread whose block has not started. So I used a struct and a modified continue function as follows:

typedef struct block_info_t{ 
    int started; /* Initialized to zero before any kernel launch */ 
    unsigned int thread_count; 
}block_info; 

__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info) 
{ 
    int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z); 
    unsigned int bsize = blockDim.x * blockDim.y * blockDim.z; 

    if(*time_out == 1 && b_info[bid].started == 0) 
    { 
     return false; 
    } 

    if(b_info[bid].thread_count == bsize) 
    { 
     return false; 
    } 

    b_info[bid].started = 1; 
    atomicInc(&b_info[bid].thread_count, bsize); 

    return true; 
} 

This does not work. When I perform the verification on the host (h_B[i] - h_C[i]), I do not consistently get zero as the result, which means some threads somehow managed to execute more than once. Any ideas how/why this happens with this latter attempt? Thanks.

I do not care about performance at this point; I am just trying to understand what is really happening.

EDIT

Here is the complete code. Compile with nvcc file_name.cu and run as program_name <vector-length>.
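
For example (both names below are placeholders for whatever you call the file and the executable):

$ nvcc file_name.cu -o vector_add 
$ ./vector_add 327678 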

#include <stdio.h> 
#include <stdlib.h> 
#include <unistd.h> 

// For the CUDA runtime routines (prefixed with "cuda_") 
#include <cuda_runtime.h> 

typedef struct block_info_t{ 
    int started; /* Initialized to zero before any kernel launch */ 
    unsigned int thread_count; 
}block_info; 

__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info) 
{ 
    int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z); 
    unsigned int bsize = blockDim.x * blockDim.y * blockDim.z; 

    if(*time_out == 1 && b_info[bid].started == 0) 
    { 
     return false; 
    } 

    if(b_info[bid].thread_count == bsize) 
    { 
     return false; 
    } 

    b_info[bid].started = 1; 
    atomicInc(&b_info[bid].thread_count, bsize); 

    return true; 
} 

__global__ void 
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info) 
{ 
    if(!continue_execution(time_out, b_info)) 
    { 
     return; 
    } 

    int i = blockDim.x * blockIdx.x + threadIdx.x; 

    if (i < numElements) 
    { 
     //C[i] = A[i] + B[i]; 
     C[i] = C[i] + B[i]; //Modified from above 
    } 
} 

void computation_complete(int *complete, int block_amt, block_info *h_block_info) 
{ 
    size_t i; 
    for(i = 0; i < block_amt; i++) 
    { 
    if(h_block_info[i].started == 1) 
    { 
     continue; 
    } 
    break; 
    } 
    *complete = (i == block_amt) ? 1 : 0; 
} 

int main(int argc, char *argv[]) 
{ 
    if(argc != 2) 
    { 
     fprintf(stderr, "usage: <program-name> <vector-length>\n"); 
     exit(EXIT_FAILURE); 
    } 

    // Print the vector length to be used, and compute its size 
    long numElements = strtol(argv[1], NULL, 10); 
    size_t size = numElements * sizeof(float); 
    printf("[Vector addition of %d elements]\n", numElements); 

    float *h_A = (float *)malloc(size); 
    float *h_B = (float *)malloc(size); 
    float *h_C = (float *)malloc(size); 

    // Initialize the host input vectors 
    for (int i = 0; i < numElements; ++i) 
    { 
     h_A[i] = rand()/(float)RAND_MAX; 
     h_B[i] = rand()/(float)RAND_MAX; 
     h_C[i] = 0.0; 
    } 

    float *d_A = NULL; 
    cudaMalloc((void **)&d_A, size); 

    float *d_B = NULL; 
    cudaMalloc((void **)&d_B, size); 

    float *d_C = NULL; 
    cudaMalloc((void **)&d_C, size); 

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice); 

    int threadsPerBlock = 256; 
    int blocksPerGrid =(numElements + threadsPerBlock - 1)/threadsPerBlock; 

    size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t); 
    block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes); 

    for(int i = 0; i < blocksPerGrid; i++) 
    { 
     h_block_info[i].started = 0; 
     h_block_info[i].thread_count = 0; 
    } 

    block_info *d_block_info = NULL; 
    cudaMalloc(&d_block_info, block_info_bytes); 
    cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice); 

    volatile unsigned int *timeout = NULL; 
    cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped); 
    *timeout = 0; 

    double quantum = 0.0001 * 1000000.0; 
    double initial_quantum = quantum; 

    int complete = 0; 

    /* Here the kernel launch is looped until all blocks are complete */ 
    while(complete == 0) 
    { 
     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info); 
     usleep(quantum); 
     *timeout = 1; 
     cudaDeviceSynchronize(); 

     cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost); 
     computation_complete(&complete, blocksPerGrid, h_block_info); 

     if(complete == 0) 
     { 
     quantum = quantum + initial_quantum; 
     *timeout = 0; 
     } 
    } 

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 

    // Verify that the result vector is correct 
    for (int i = 0; i < numElements; ++i) 
    { 
     if (fabs(h_B[i] - h_C[i]) > 1e-5) 
     { 
      fprintf(stderr, "Result verification failed at element %d!\n", i); 
      exit(EXIT_FAILURE); 
     } 
    } 

    printf("Test PASSED\n"); 

    // Free device global memory 
    cudaFree(d_A); 
    cudaFree(d_B); 
    cudaFree(d_C); 

    free(h_A); 
    free(h_B); 
    free(h_C); 

    cudaDeviceReset(); 
    return 0; 
} 

When asking "why isn't this code working?" you are [expected](http://stackoverflow.com/help/on-topic) (<-- click here to read) to provide an [mcve]. What you have shown is not one. It should be a complete code that someone else could compile and run and see the problem, without having to add anything or change anything. The kernel by itself is not an [mcve]. – Robert Crovella


@RobertCrovella, thank you. I have edited the question to include a minimal, complete, and verifiable example. – John

Answer

You have a race condition in your continue_execution routine. Consider the following scenario:

  1. warp0 of a threadblock enters the continue_execution routine. At the moment it checks the variables *time_out and b_info[bid].started, it witnesses those to be 0 and 0 respectively, so it proceeds to the next test.
  2. warp1 of the same threadblock enters the continue_execution routine (let's say slightly later), and it witnesses the variables to be 1 and 0 respectively, so it returns false and the warp1 threads exit.
  3. warp0 continues on, eventually sets b_info[bid].started to 1, and then updates thread_count. It then returns true and proceeds with the vector add.

I could continue with this, but I think if you consider the above 3 items carefully you will realize it is a case you had not accounted for. Your implicit expectation is that every thread reads a coherent (i.e. the same, across a given threadblock) value of *time_out. But your code does not guarantee that, and when it fails to hold, we end up with threadblocks in which some threads have completed their work and some have not.

So how do we fix this? The description above should point the way. One possible approach is to guarantee that, for any given threadblock, every thread gets the same value of *time_out, whether it be 1 or 0. One possible solution would be to make the following changes to the beginning of your vectorAdd kernel:

__shared__ volatile unsigned int my_time_out; 
if (!threadIdx.x) my_time_out = *time_out; 
__syncthreads(); 
if(!continue_execution(&my_time_out, b_info)) 

With these changes, we guarantee that every thread in a block gets a coherent view of the timeout variable, and according to my testing the problem is resolved:

$ cat t100.cu 
#include <stdio.h> 
#include <stdlib.h> 
#include <unistd.h> 

// For the CUDA runtime routines (prefixed with "cuda_") 
#include <cuda_runtime.h> 

typedef struct block_info_t{ 
    int started; /* Initialized to zero before any kernel launch */ 
    unsigned int thread_count; 
}block_info; 

__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info) 
{ 
    int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z); 
    unsigned int bsize = blockDim.x * blockDim.y * blockDim.z; 

    if(*time_out == 1 && b_info[bid].started == 0) 
    { 
     return false; 
    } 

    if(b_info[bid].thread_count == bsize) 
    { 
     return false; 
    } 

    b_info[bid].started = 1; 
    atomicInc(&b_info[bid].thread_count, bsize); 

    return true; 
} 

__global__ void 
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info) 
{ 
#ifdef USE_FIX 
    __shared__ volatile unsigned int my_time_out; 
    if (!threadIdx.x) my_time_out = *time_out; 
    __syncthreads(); 
    if(!continue_execution(&my_time_out, b_info)) 
#else 
    if(!continue_execution(time_out, b_info)) 
#endif 
    { 
     return; 
    } 

    int i = blockDim.x * blockIdx.x + threadIdx.x; 

    if (i < numElements) 
    { 
     //C[i] = A[i] + B[i]; 
     C[i] = C[i] + B[i]; //Modified from above 
    } 
} 

void computation_complete(int *complete, int block_amt, block_info *h_block_info) 
{ 
    size_t i; 
    for(i = 0; i < block_amt; i++) 
    { 
    if(h_block_info[i].started == 1) 
    { 
     continue; 
    } 
    break; 
    } 
    *complete = (i == block_amt) ? 1 : 0; 
} 

int main(int argc, char *argv[]) 
{ 
    if(argc != 2) 
    { 
     fprintf(stderr, "usage: <program-name> <vector-length>\n"); 
     exit(EXIT_FAILURE); 
    } 

    // Print the vector length to be used, and compute its size 
    long numElements = strtol(argv[1], NULL, 10); 
    size_t size = numElements * sizeof(float); 
    printf("[Vector addition of %ld elements]\n", numElements); 

    float *h_A = (float *)malloc(size); 
    float *h_B = (float *)malloc(size); 
    float *h_C = (float *)malloc(size); 

    // Initialize the host input vectors 
    for (int i = 0; i < numElements; ++i) 
    { 
     h_A[i] = rand()/(float)RAND_MAX; 
     h_B[i] = rand()/(float)RAND_MAX; 
     h_C[i] = 0.0; 
    } 

    float *d_A = NULL; 
    cudaMalloc((void **)&d_A, size); 

    float *d_B = NULL; 
    cudaMalloc((void **)&d_B, size); 

    float *d_C = NULL; 
    cudaMalloc((void **)&d_C, size); 

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice); 

    int threadsPerBlock = 256; 
    int blocksPerGrid =(numElements + threadsPerBlock - 1)/threadsPerBlock; 

    size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t); 
    block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes); 

    for(int i = 0; i < blocksPerGrid; i++) 
    { 
     h_block_info[i].started = 0; 
     h_block_info[i].thread_count = 0; 
    } 

    block_info *d_block_info = NULL; 
    cudaMalloc(&d_block_info, block_info_bytes); 
    cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice); 

    volatile unsigned int *timeout = NULL; 
    cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped); 
    *timeout = 0; 

    double quantum = 0.0001 * 1000000.0; 
    double initial_quantum = quantum; 

    int complete = 0; 

    /* Here the kernel launch is looped until all blocks are complete */ 
    while(complete == 0) 
    { 
     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info); 
     usleep(quantum); 
     *timeout = 1; 
     cudaDeviceSynchronize(); 

     cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost); 
     computation_complete(&complete, blocksPerGrid, h_block_info); 

     if(complete == 0) 
     { 
     quantum = quantum + initial_quantum; 
     *timeout = 0; 
     } 
    } 

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 

    // Verify that the result vector is correct 
    for (int i = 0; i < numElements; ++i) 
    { 
     if (fabs(h_B[i] - h_C[i]) > 1e-5) 
     { 
      fprintf(stderr, "Result verification failed at element %d!\n", i); 
      exit(EXIT_FAILURE); 
     } 
    } 

    printf("Test PASSED\n"); 

    // Free device global memory 
    cudaFree(d_A); 
    cudaFree(d_B); 
    cudaFree(d_C); 

    free(h_A); 
    free(h_B); 
    free(h_C); 

    cudaDeviceReset(); 
    return 0; 
} 
$ nvcc -arch=sm_61 -o t100 t100.cu 
$ ./t100 327678 
[Vector addition of 327678 elements] 
Result verification failed at element 0! 
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX 
$ ./t100 327678 
[Vector addition of 327678 elements] 
Test PASSED 
$ ./t100 327678 
[Vector addition of 327678 elements] 
Test PASSED 
$ ./t100 327678 
[Vector addition of 327678 elements] 
Test PASSED 
$ 

One other change I made to your code is in this line:

printf("[Vector addition of %d elements]\n", numElements); 

This has no bearing on the problem, but the format specifier did not match your variable type. I changed it to %ld.
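
With numElements declared as long, that line becomes (as in the full listing above):

printf("[Vector addition of %ld elements]\n", numElements); 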