2013-06-12 76 views
2

我有一個線性的 unsigned char 數組，用來表示一個二維數組。我想把它放到 CUDA 2D 紋理中，並對其進行（浮點）線性插值：也就是說，紋理讀取會取得最近的 4 個 unsigned char 鄰居，在內部將它們轉換爲浮點數，在它們之間進行插值，並返回得到的浮點值。

標題：設置用於線性插值的 CUDA 2D「unsigned char」紋理

我有一些困難設置紋理並將其綁定到紋理參考。我已經通過CUDA參考手冊&附錄,但我只是沒有任何運氣。

下面是可運行的代碼，用於設置並綁定 1) 浮點紋理和 2) unsigned char 紋理。浮點部分運行得很好。但是，如果取消代碼底部附近那兩行被註釋掉的 unsigned char 綁定語句的註釋，就會引發「無效參數」（invalid argument）錯誤。

#include <cstdio> 
#include <cuda_runtime.h> 

typedef unsigned char uchar;

// File-scope texture references, one per texel type. Both are declared as
// 2D textures with cudaReadModeNormalizedFloat, which the integer-typed
// (uchar) texture needs in order to return float values from a fetch.
texture<float, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefFloat;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefUChar;

// Dimensions, in texels, of the row-major images backing the textures
const size_t WIDTH = 1000;
const size_t HEIGHT = 1000;
const size_t TOT_PIX = WIDTH*HEIGHT;

// Sets up linear filtering on the float and unsigned char texture
// references, allocates one linear device buffer per texture and binds the
// float texture to its buffer. The unsigned char binding is left commented
// out on purpose: enabling it reproduces the error discussed in the text.
//
// Returns 0 on success and -1 if an allocation or a binding fails. Device
// buffers are released on every path.
int main(void)
{
    // Set texel formats
    cudaChannelFormatDesc descFloat = cudaCreateChannelDesc<float>();
    cudaChannelFormatDesc descUChar = cudaCreateChannelDesc<uchar>();

    // Choose to perform texture 2D linear interpolation
    texRefFloat.filterMode = cudaFilterModeLinear;
    texRefUChar.filterMode = cudaFilterModeLinear;

    // Allocate texture device memory; the pointers start out null so the
    // cleanup code can free them unconditionally.
    float * d_buffFloat = 0;
    uchar * d_buffUChar = 0;

    cudaError_t errAlloc = cudaMalloc(&d_buffFloat, sizeof(float)*TOT_PIX);
    if (errAlloc == cudaSuccess)
        errAlloc = cudaMalloc(&d_buffUChar, sizeof(uchar)*TOT_PIX);

    if (errAlloc != cudaSuccess)
    {
        printf("Error allocating device memory: %s\n",
               cudaGetErrorString(errAlloc));
        cudaFree(d_buffUChar);
        cudaFree(d_buffFloat);
        return -1;
    }

    // Bind texture references to textures
    cudaError_t errFloat = cudaSuccess;
    cudaError_t errUChar = cudaSuccess;

    errFloat = cudaBindTexture2D(0, texRefFloat, d_buffFloat, descFloat,
                                 WIDTH, HEIGHT, sizeof(float)*WIDTH);
    // Uncomment the following two lines for an error
    //errUChar = cudaBindTexture2D(0, texRefUChar, d_buffUChar, descUChar,
    //    WIDTH, HEIGHT, sizeof(uchar)*WIDTH);

    // Check for errors during binding; only the first failure is reported
    int status = 0;
    if (errFloat != cudaSuccess)
    {
        printf("Error binding float texture reference: %s\n",
               cudaGetErrorString(errFloat));
        status = -1;
    }
    else if (errUChar != cudaSuccess)
    {
        printf("Error binding unsigned char texture reference: %s\n",
               cudaGetErrorString(errUChar));
        status = -1;
    }

    // Best-effort cleanup: unbind what was bound, then free the buffers.
    // (If the unsigned char binding above is enabled, unbind texRefUChar
    // here as well.)
    if (errFloat == cudaSuccess)
        cudaUnbindTexture(texRefFloat);
    cudaFree(d_buffUChar);
    cudaFree(d_buffFloat);

    return status;
}

任何幫助/見解將不勝感激!

Aaron

+2

紋理的每一行都必須適當對齊。如果把紋理綁定到普通數組（而不是 CUDA 數組），一般無法保證這一點。要把普通內存綁定到 2D 紋理，需要使用 cudaMallocPitch() 來分配內存，它會把行間距（pitch）設置成適合綁定到紋理的值。另外請注意，把 '0' 作爲第一個參數傳給紋理綁定 API 調用是不好的做法：該參數用於讓 CUDA 向應用程序返回一個偏移量；如果偏移量不爲零，就需要在訪問紋理時把它加到紋理座標上。 – njuffa

回答

5

紋理的每一行都必須正確對齊。如果把紋理綁定到普通數組（而不是 CUDA 數組），一般無法保證這一點。要把普通內存綁定到 2D 紋理，需要使用 cudaMallocPitch() 來分配內存，它會把行間距（pitch）設置成適合綁定到紋理的值。另外請注意，把 0 作爲第一個參數傳給紋理綁定 API 調用是不好的做法：該參數用於讓 CUDA 向應用程序返回一個偏移量；如果偏移量不爲零，就需要在訪問紋理時把它加到紋理座標上。

下面是一個快速示例,顯示如何從元素爲unsigned char的紋理讀取插值。

#include <stdlib.h> 
#include <stdio.h> 

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once; if the result is not cudaSuccess, prints
// the file, line and CUDA error string to stderr and exits with
// EXIT_FAILURE. The argument is parenthesized and the temporary has a
// macro-specific name so that an expression mentioning a caller variable
// called `err` is not captured by the macro's own local.
#define CUDA_SAFE_CALL(call)                                              \
do {                                                                      \
    cudaError_t cuda_safe_call_err_ = (call);                             \
    if (cudaSuccess != cuda_safe_call_err_) {                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(cuda_safe_call_err_));                \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a kernel launch. It first reads the pending error
// with cudaGetLastError(), then blocks in cudaDeviceSynchronize() and checks
// that result too. On either failure it prints the file, line and CUDA
// error string to stderr and exits with EXIT_FAILURE.
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
#define CHECK_LAUNCH_ERROR()                                              \
do {                                                                      \
    /* Check synchronous errors, i.e. pre-launch */                       \
    cudaError_t check_launch_err_ = cudaGetLastError();                   \
    if (cudaSuccess != check_launch_err_) {                               \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(check_launch_err_));                  \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */             \
    check_launch_err_ = cudaDeviceSynchronize();                          \
    if (cudaSuccess != check_launch_err_) {                               \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(check_launch_err_));                  \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)

// File-scope 2D texture reference over unsigned char texels, declared with
// cudaReadModeNormalizedFloat so that fetches yield float values.
texture<unsigned char, cudaTextureType2D, cudaReadModeNormalizedFloat> tex;

// Debug kernel: samples the texture `tex` at the centre of every texel of an
// m-row by n-column grid, displaced by (shift_x, shift_y), and prints the
// fetched values one row per line with two decimals.
//
// The body does not use thread or block indices, so every launched thread
// walks and prints the whole grid.
//
//   m        number of rows to sample
//   n        number of columns to sample
//   shift_x  offset added to each x (column) texture coordinate
//   shift_y  offset added to each y (row) texture coordinate
__global__ void kernel (int m, int n, float shift_x, float shift_y)
{
    for (int r = 0; r < m; ++r) {
        // The row coordinate is the same for every column of this row.
        const float y = r+0.5f+shift_y;
        for (int c = 0; c < n; ++c) {
            const float sample = tex2D (tex, c+0.5f+shift_x, y);
            printf ("%.2f ", sample);
        }
        printf ("\n");
    }
}

// Copies a 4x3 unsigned char array into pitched device memory, binds it to
// the 2D texture reference `tex` with linear filtering and unnormalized
// coordinates, then launches `kernel` three times: unshifted, shifted by
// half a texel in x, and shifted by half a texel in y.
//
// Returns EXIT_SUCCESS on success. Returns EXIT_FAILURE if the binding
// reports a non-zero texture offset; any CUDA error terminates the program
// through CUDA_SAFE_CALL / CHECK_LAUNCH_ERROR.
int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width = #columns
    size_t pitch, tex_ofs;
    unsigned char arr[4][3]= {{11,12,13},{21,22,23},{31,32,33},{251,252,253}};
    unsigned char *arr_d = 0;

    // Pitched allocation, so the row stride is chosen by the runtime; the
    // returned pitch is then used for both the copy and the binding.
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        // The kernel does not add an offset to its texture coordinates, so
        // a non-zero offset cannot be handled; clean up before bailing out.
        printf ("tex_ofs = %zu\n", tex_ofs);
        CUDA_SAFE_CALL (cudaUnbindTexture (&tex));
        CUDA_SAFE_CALL (cudaFree (arr_d));
        return EXIT_FAILURE;
    }
    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted in x-direction\n");
    kernel<<<1,1>>>(m, n, 0.5f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted in y-direction\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.5f);
    // Checked like the two launches above.
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    // Unbind before freeing so the texture reference never refers to
    // released memory.
    CUDA_SAFE_CALL (cudaUnbindTexture (&tex));
    CUDA_SAFE_CALL (cudaFree (arr_d));
    return EXIT_SUCCESS;
}

該程序的輸出如下所示:

reading array straight 
0.04 0.05 0.05 
0.08 0.09 0.09 
0.12 0.13 0.13 
0.98 0.99 0.99 
reading array shifted in x-direction 
0.05 0.05 0.05 
0.08 0.09 0.09 
0.12 0.13 0.13 
0.99 0.99 0.99 
reading array shifted in y-direction 
0.06 0.07 0.07 
0.10 0.11 0.11 
0.55 0.56 0.56 
0.98 0.99 0.99 
+0

哇！用完整可運行的代碼來說明你的答案！非常感謝！那麼，不使用 cudaMallocPitch() 也能成功設置 float 紋理，只是僥倖嗎？ – Aaron

+0

我不知道紋理行的對齊要求具體是多少，但僅僅使用 'cudaMalloc()' 時，有可能碰巧滿足它。爲了說明問題，假設所需的行對齊是 32 字節。那麼每行 1000 個 float = 4000 字節，能被 32 整除（32 * 125 = 4000）；而每行 1000 個 unsigned char = 1000 字節，不能被 32 整除。 – njuffa