從循環更改爲內核

我目前工作的一個網格的插值多線程，並具有多線程方面的一些問題。該代碼假設讀取由2x2矩陣表示的映射，然後對其進行插值以將點數增加100倍。在內核中使用for循環時，它的效果很好。從循環更改爲內核

當我試圖改變一個與線程循環，它產生了一些怪異的結果。在數字代替，它與-1填充生成的矩陣。＃QNAN

這裏是我工作的代碼與for循環內核

#include <stdlib.h> 
#include <stdio.h> 
#include <math.h> 
#include <fstream> 
#include "cuda.h" 

using namespace std; 

float Z[41][41]; 

// Macro to catch CUDA errors in CUDA runtime calls 
#define CUDA_SAFE_CALL(call)           \ 
do {                 \ 
    cudaError_t err = call;           \ 
    if (cudaSuccess != err) {           \ 
     fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 
       __FILE__, __LINE__, cudaGetErrorString(err));  \ 
     exit(EXIT_FAILURE);           \ 
    }                 \ 
} while (0) 

// Macro to catch CUDA errors in kernel launches 
#define CHECK_LAUNCH_ERROR()           \ 
do {                 \ 
    /* Check synchronous errors, i.e. pre-launch */     \ 
    cudaError_t err = cudaGetLastError();        \ 
    if (cudaSuccess != err) {           \ 
     fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 
       __FILE__, __LINE__, cudaGetErrorString(err));  \ 
     exit(EXIT_FAILURE);           \ 
    }                 \ 
    /* Check asynchronous errors, i.e. kernel failed (ULF) */   \ 
    err = cudaThreadSynchronize();         \ 
    if (cudaSuccess != err) {           \ 
     fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 
       __FILE__, __LINE__, cudaGetErrorString(err));  \ 
     exit(EXIT_FAILURE);           \ 
    }                 \ 
} while (0) 

texture<float, 2, cudaReadModeElementType> tex; 


__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{ 
    int k = sqrt(numberOfInterpolationsPerSquare); 


    for (float i=0; i<n*k; i++) 
    { 
     for (float j=0; j<m*k; j++) 
     { 
     f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f); 
     } 
    } 

} 

int main (void) 
{ 
    // Start timer 
    clock_t tStart = clock(); 

    // Size of map 
    int n=41; 
    int m=41; 

    int g = 0; 

    float numberOfInterpolationsPerSquare = 100; 
    float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2); 

    size_t pitch, tex_ofs; 
    float *f; 
    float *r; 
    float *map_d = 0; 

    // Build read-Streams 
    ifstream map; 

    //Create and open a txt file for MATLAB 
    ofstream file; 

    // Open data 
    map.open("Map.txt", ios_base::in); 
    file.open("Bilinear.txt"); 

    // Store the map in a 2D array 
    for (int i=0; i<n; i++) 
    { 
     for (int j=0; j<m; j++) 
     { 
      map >> Z[i][j]; 
     } 
    } 

    // Allocate memory on host and device 
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m)); 
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float))); 
    r = (float*)malloc(numberOfElements*sizeof(float)); 

    // Copy map from host to device 
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice)); 

    // Set texture mode to bilinear interpolation 
    tex.normalized = false; 
    tex.filterMode = cudaFilterModeLinear; 

    // Bind the map to texture 
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch)); 

    // Checking for offset 
    if (tex_ofs !=0) { 
     printf ("tex_ofs = %zu\n", tex_ofs); 
     return EXIT_FAILURE; 
    } 

    // Launch Kernel 
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare); 
    CHECK_LAUNCH_ERROR();  
    CUDA_SAFE_CALL (cudaDeviceSynchronize()); 

    // Copy result from device to host 
    cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost); 

    // Write results to file 
    for(int h=0;h<numberOfElements;h++) 
    { 
     if(g==sqrt(numberOfElements)) 
     { 
      file << endl; 
      g=0; 
     } 
     file << r[h] << " "; 
     g++; 
    } 

    // Free memory 
    CUDA_SAFE_CALL (cudaUnbindTexture (tex)); 
    CUDA_SAFE_CALL (cudaFree (map_d)); 
    CUDA_SAFE_CALL (cudaFree (f)); 
    free(r); 

    // Print out execution time 
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC); 

    return EXIT_SUCCESS; 
}

這裏的多線程內核，不工作

__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{ 
    int k = sqrt(numberOfInterpolationsPerSquare); 

    int i= blockIdx.x * blockDim.x + threadIdx.x; 
    int j= blockIdx.y * blockDim.y + threadIdx.y; 


    if(i>=n*k || j>=m*k) 
     return; 

    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f); 

}

有誰知道爲什麼多線程版本不起作用？

問候

鬆德雷

來源

2013-07-18 user2594166

如果有人想嘗試的代碼，這裏的地圖：http://codepad.org/fe8aWGMt – user2594166

你是如何啓動多線程內核？另外，在第二個內核中，'i'和'j'是'int'而不是'float'。所以'tex2D'中的'j/k'和'i/k'將會導致整數除法。考慮將'k'聲明爲'float'。 – sgarizvi

謝謝你的回覆！我改變了i和j以浮動，現在它產生了一些數字，但仍然是-1。＃QNAN。內核被稱爲是這樣的： //查找塊 INT來確定nthreads = 1024的號碼; int blocksize = 512; INT Nblocks屬=小區（（N * M * numberOfInterpolationsPerSquare）/來確定nthreads）; //啓動內核內核<<< Nblocks屬，塊大小>>>（M，N，F，numberOfInterpolationsPerSquare）; – user2594166

在第二內核，i和j是int代替float。因此，在tex2Dj/k和i/k將導致整數除法。將k聲明爲float以避免整數除法。

最初，內核是用下面的配置推出：

//Find number of blocks 
int nthreads = 1024; 
int blocksize = 512; 
int nblocks = ceil((n*m*numberOfInterpolationsPerSquare)/nthreads); 

// Launch Kernel 
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);

與上述代碼的問題是，將推出1D塊的一維網格，但在內核中，使用2D索引。內核需要2D網格/塊配置才能正常工作。從內核代碼的外觀，下面的網格/塊的配置應該工作：

float k = sqrt(numberOfInterpolationsPerSquare); 

const int threads_x = (int)ceil(n * k); 
const int threads_y = (int)ceil(m * k); 

const dim3 dimBlock(16,16); 

dim3 dimGrid; 
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x; 
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y; 

kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

來源

2013-07-19 13:28:02 sgarizvi

從循環更改爲內核

回答

相關問題