自我實現的cuIDFT.cu的遞歸使用導致每次重新運行代碼時改變輸出

我已經實現了CUDA版本的逆離散餘弦變換（IDCT），通過「翻譯」MATLAB內置函數idct.m到CUDA：自我實現的cuIDFT.cu的遞歸使用導致每次重新運行代碼時改變輸出

我實現cuIDCT.cu，工作時米= N兩者米和ñ是偶數。

cuIDCT.cu

#include <stdio.h> 
#include <stdlib.h> 
#include <cuda.h> 
#include <cufft.h> 
#include <cuComplex.h> 

// round up n/m 
inline int iDivUp(int n, int m) 
{ 
    return (n + m - 1)/m; 
} 

typedef cufftComplex complex; 

#define PI 3.1415926535897932384626433832795028841971693993751 

__global__ 
    void idct_ComputeWeightsKernel(const int n, complex *ww) 
{ 
    const int pos = threadIdx.x + blockIdx.x * blockDim.x; 

    if (pos >= n) return; 

    ww[pos].x = sqrtf(2*n) * cosf(pos*PI/(2*n)); 
    ww[pos].y = sqrtf(2*n) * sinf(pos*PI/(2*n)); 
} 

__global__ 
    void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y) 
{ 
    const int ix = threadIdx.x + blockIdx.x * blockDim.x; 
    const int iy = threadIdx.y + blockIdx.y * blockDim.y; 
    if (ix >= n || iy >= m) return; 

    const int pos = ix + iy*n; 

    // Compute precorrection factor 
    ww[0].x = ww[0].x/sqrtf(2); 
    ww[0].y = ww[0].y/sqrtf(2); 

    y[iy + ix*m].x = ww[iy].x * b[pos]; 
    y[iy + ix*m].y = ww[iy].y * b[pos]; 
} 

__global__ 
    void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy) 
{ 
    const int ix = threadIdx.x + blockIdx.x * blockDim.x; 
    const int iy = threadIdx.y + blockIdx.y * blockDim.y; 
    if (ix >= n || iy >= m) return; 

    const int pos = ix + iy*n; 

    yy[iy + ix*n].x = y[pos].x/(float) n; 
    yy[iy + ix*n].y = y[pos].y/(float) n; 
} 

__global__ 
    void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a) 
{ 
    const int ix = threadIdx.x + blockIdx.x * blockDim.x; 
    const int iy = threadIdx.y + blockIdx.y * blockDim.y; 
    if (ix >= n || iy >= m) return; 

    const int pos = ix + iy*n; 

    // Re-order elements of each column according to equations (5.93) and (5.94) in Jain 
    if (iy < n/2) 
    { 
     a[ix + 2*iy*n]  = yy[pos].x; 
     a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x; 
    } 
} 

/** 
* a = idct(b), where a is of size [n m]. 
* @param b, input array 
* @param n, first dimension of a 
* @param m, second dimension of a 
* @param a, output array 
*/ 
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m] 
{ 
    const int data_size = n * m * sizeof(float); 

    // device memory allocation 
    float *d_in, *d_out; 
    cudaMalloc(&d_in, data_size); 
    cudaMalloc(&d_out, data_size); 

    // transfer data from host to device 
    cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice); 

    // compute IDCT using CUDA 
    // begin============================================ 
    // Compute weights 
    complex *ww; 
    cudaMalloc(&ww, n*sizeof(complex)); 
    dim3 threads(256); 
    dim3 blocks(iDivUp(n, threads.x)); 
    idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww); 

    complex *y; 
    complex *yy; 
    cufftHandle plan; 

    dim3 threads1(32, 6); 
    dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case 

    int Length[1] = {m}; // for each IFFT, the length is m 

    cudaMalloc(&y, n*m*sizeof(complex)); 

    idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y); 

    cufftPlanMany(&plan, 1, Length, 
       Length, 1, m, 
       Length, 1, m, CUFFT_C2C, n); 
    cufftExecC2C(plan, y, y, CUFFT_INVERSE); // y is of size [n m] 

    cudaMalloc(&yy, n*m*sizeof(complex)); 
    Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy); 
    Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out); 
    // end============================================ 

    // transfer result from device to host 
    cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost); 

    // cleanup 
    cufftDestroy(plan); 
    cudaFree(ww); 
    cudaFree(y); 
    cudaFree(yy); 
    cudaFree(d_in); 
    cudaFree(d_out); 
}

然後，我比較我的CUDA IDCT（即cuIDCT.cu）針對MATLAB idct.m的使用下面的代碼的結果：

測試main.cpp功能，和
一個MATLAB的主函數main.m讀取CU的結果DA並將其與MATLAB進行比較。

的main.cpp

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include <helper_functions.h> 
#include <stdlib.h> 
#include <stdio.h> 

// N must equal to M, and both must be even numbers 
#define N 256 
#define M 256 

void WriteDataFile(const char *name, int w, int h, const float *in, const float *out) 
{ 
    FILE *stream; 
    stream = fopen(name, "wb"); 

    float data = 202021.25f; 
    fwrite(&data, sizeof(float), 1, stream); 
    fwrite(&w, sizeof(w), 1, stream); 
    fwrite(&h, sizeof(h), 1, stream); 

    for (int i = 0; i < h; i++) 
     for (int j = 0; j < w; j++) 
     { 
      const int pos = j + i * h; 
      fwrite(in + pos, sizeof(float), 1, stream); 
      fwrite(out + pos, sizeof(float), 1, stream); 
     } 

    fclose(stream); 
} 

void cuIDCT(float *b, int n, int m, float *a); 

int main() 
{ 
    // host memory allocation 
    float *h_in = new float [N * M]; 
    float *h_out = new float [N * M]; 
    float *h_temp = new float [N * M]; 

    // input data initialization 
    for (int i = 0; i < N * M; i++) 
    { 
     h_in[i] = (float)rand()/(float)RAND_MAX; 
     h_out[i] = h_in[i]; 
     h_temp[i] = h_in[i]; 
    } 

    // please comment either one case for testing 
    // test case 1: use cuIDCT.cu once 
    // cuIDCT(h_in, N, M, h_out); 

    // test case 2: iteratively use cuIDCT.cu 
    for (int i = 0; i < 4; i++) 
    { 
     if (i % 2 == 0) 
      cuIDCT(h_out, N, M, h_temp); 
     else 
      cuIDCT(h_temp, N, M, h_out); 
    } 

    // write data, for further visualization using MATLAB 
    WriteDataFile("test.flo", N, M, h_in, h_out); 

    // cleanup 
    delete [] h_in; 
    delete [] h_out; 
    delete [] h_temp; 
    cudaDeviceReset(); 
}

的main.m

clc;clear; 

% read 
[h_in, h_out] = read_data('test.flo'); 

% MATLAB result, for test case 1, comment the for-loop 
matlab_out = h_in; 
for i = 1:4 
    matlab_out = idct(matlab_out); 
end 

% compare 
err = matlab_out - h_out; 

% show 
figure(1); 
subplot(221); imshow(h_in, []);  title('h\_in');  colorbar 
subplot(222); imshow(h_out, []);  title('h\_out');  colorbar 
subplot(223); imshow(matlab_out, []); title('matlab\_out'); colorbar 
subplot(224); imshow(err, []);  title('error map'); colorbar 

disp(['maximum error between CUDA and MATLAB is ' ... 
     num2str(max(max(abs(err))))])

我在Windows 7上運行的Visual Studio 11（即VS2012）的代碼與Nvidia的GPU的Tesla K20C ，使用CUDA工具包版本7.5，而我的MATLAB版本是R2015b。

我的測試步驟：

對於測試用例1。未評論測試案例1和評論測試案例2.
1. 運行main.cpp。
2. 在MATLAB中運行main.m。
3. 重複步驟1和步驟2（沒有任何更改，只需重新運行代碼）。

我重複了第3步20次。輸出結果是不變的，並且導致main.m是：

results of test case 1

的最大誤差爲7.7152e-07。

對於測試案例2。未評論測試案例2和評論測試案例1.
1. 運行main.cpp。
2. 在MATLAB中運行main.m。
3. 重複步驟1和步驟2（沒有任何更改，只需重新運行代碼）。

我重複了第3步20次。輸出結果發生改變，並導致main.m是（沒有足夠的信譽來把所有的圖像，只有錯誤的情況如下所示）：

one situation (the wrong one) of test case 2

的最大誤差爲0.45341（2次），0.44898 （1次），0.26186（1次），0.26301（1次）和9.5716e-07（15次）。

從測試結果，我的結論是：

從測試用例1：cuIDCT.cu在數值上是正確的（錯誤〜10^-7）到idct.m。
從測試案例2：遞歸地使用的cuIDCT.cu導致不穩定的結果（即，輸出每變化時重新運行代碼和有時可能數值錯誤的時間，錯誤〜0.1）

我的問題：

從測試案例1我們知道cuIDCT.cu在數值上正確到idct.m。但是爲什麼遞歸使用cuIDCT.cu會導致每次重新運行代碼時產生不同的輸出結果？

任何幫助或建議，高度讚賞。

來源

2016-01-04 WDC

您的要求之一是測試用例2中運行的可變性。我無法再現該問題。作爲一項測試，我添加了一個'double din = 0.0; double dout = 0.0;'在'WriteDataFile'函數的for循環之前。在循環體中，我使用'din + = in [pos]創建了校驗和; dout + = out [pos];'。在for循環之後，我將這些數量輸出。它們的運行速度保持不變。我認爲你的CUDA代碼不會顯示任何運行的可變性。 [Here]（http://pastebin.com/VtYmcxux）是我的完整測試用例。作爲錯誤檢查，您可能想嘗試使用'cuda-memcheck'運行代碼。 –

如果您偶爾（四分之一中的一個）遇到WDDM TDR超時，可能會有變化，因爲您在Windows上。這對我來說似乎不太可能，但排除這一點很好。你應該添加[適當的cuda錯誤檢查]（http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api）任何時候您在使用CUDA代碼時遇到問題。（'cuda-memcheck'會減慢執行速度）最後，你在VS中構建一個調試項目還是一個發佈項目？你在VS中構建一個win32項目還是一個x64項目？ –

@RobertCrovella我根據你的建議修改了我的代碼[http://pastebin.com/3WznVxCa]，添加了（a）'checkCudaErrors'和（b）'cufftSafeCall'和（c）你的'din'和'dout'。我使用「Debug - > Start Debugging（F5）」每次運行VS中的代碼。並且每次使用「Debug - > Start Without Debugging（Ctrl + F5）」獲得前一鏈接中的結果，則手動記錄輸出。第一次運行花費了很長時間，但在後續運行中速度更快。從「Property Pages」中顯示「Platform：Active（x64）」。所以我猜測我正在VS中構建一個調試x64項目。 – WDC

我相信，在您的結果的可變性來了大約由於這個代碼在你idct_ComputeEvenKernel：

// Compute precorrection factor 
ww[0].x = ww[0].x/sqrtf(2); 
ww[0].y = ww[0].y/sqrtf(2);

這不是完全清楚你的目的是在這裏的，但它是值得懷疑的這個代碼可以做你想。您可能會對CUDA執行模型感到困惑。

if (ix >= n || iy >= m) return;

我相信，這意味着65536個線程都將在內核中執行此代碼：

上面的代碼將通過你推出針對通過線程檢查內核每 CUDA線程執行。此外，線程將以或多或少的順序執行該代碼（並非所有CUDA線程都在鎖步中執行）。他們甚至可能會互相推as，因爲他們正試圖將他們的價值寫到ww[0]的位置。所以ww[0]的最終結果將是相當不可預測的。

當我註釋掉這些代碼行時，結果對於我來說變得穩定（儘管與那些行的不同），從運行到運行都不變。

我想指出其他的東西。不管你正在計算一個復量的.x和.y值，我的建議是從該返工的代碼（例如）：

y[iy + ix*m].x = ww[iy].x * b[pos]; 
y[iy + ix*m].y = ww[iy].y * b[pos];

這樣：

complex temp1, temp2; 
temp1 = ww[iy]; 
temp2.x = temp1.x * b[pos]; 
temp2.y = temp2.y * b[pos]; 
y[iy + ix*m] = temp2;

至少根據我的測試，編譯器似乎沒有爲你做這種優化，而且一個副作用的好處是用cuda-memcheck --tool initcheck ...測試你的代碼要容易得多。在第一個實現中，編譯器將以8字節的數量加載y[iy + ix*m]，修改其中的4個或8個字節，然後將y[iy + ix*m]作爲8字節數量來存儲。第二種實現應該更高效（它消除了的負載），並消除了未初始化數量（）的負載，cuda-memcheck工具會將其報告爲危險。

我描述的這種可變性應該是可能的，無論是運行代碼的1遍版本還是4遍版本的代碼。因此，我認爲你關於1-pass版本的陳述是正確的陳述是可疑的。我認爲如果您足夠運行1-pass版本，您最終會看到可變性（儘管它可能需要不同的初始內存條件或運行在不同的GPU類型上）。即使在你自己的結果中，我們也可以看到4個密碼的20個運行中有15個產生了「正確」的結果，即殘差是〜1e-7

這是我修改後的cuIDCT.cu文件， here。我在下面做的假設是，你要計算在ww[0]比例只有一次，在這種情況下，我們可以輕鬆處理算術作爲附錄以前idct_ComputeWeightsKernel：

#include <stdio.h> 
#include <stdlib.h> 
#include <cuda.h> 
#include <cufft.h> 
#include <cuComplex.h> 
#include <helper_cuda.h> 
#include "assert.h" 

// round up n/m 
inline int iDivUp(int n, int m) 
{ 
    return (n + m - 1)/m; 
} 

typedef cufftComplex complex; 

#define PI 3.1415926535897932384626433832795028841971693993751 

#define cufftSafeCall(err)  __cufftSafeCall(err, __FILE__, __LINE__) 
inline void __cufftSafeCall(cufftResult err, const char *file, const int line) 
{ 
    if(CUFFT_SUCCESS != err) { 
     fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ 
      _cudaGetErrorEnum(err)); \ 
      cudaDeviceReset(); assert(0); \ 
    } 
} 


__global__ 
    void idct_ComputeWeightsKernel(const int n, complex *ww) 
{ 
    const int pos = threadIdx.x + blockIdx.x * blockDim.x; 

    if (pos >= n) return; 
    complex temp; 
    temp.x = sqrtf(2*n) * cosf(pos*PI/(2*n)); 
    temp.y = sqrtf(2*n) * sinf(pos*PI/(2*n)); 
    if (pos == 0) { 
     temp.x /= sqrtf(2); 
     temp.y /= sqrtf(2);} 
    ww[pos] = temp; 
} 

__global__ 
    void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y) 
{ 
    const int ix = threadIdx.x + blockIdx.x * blockDim.x; 
    const int iy = threadIdx.y + blockIdx.y * blockDim.y; 
    if (ix >= n || iy >= m) return; 

    const int pos = ix + iy*n; 
/* handle this in idct_ComputeWeightsKernel 
    // Compute precorrection factor 
    ww[0].x = ww[0].x/sqrtf(2); 
    ww[0].y = ww[0].y/sqrtf(2); 
*/ 
    complex temp1, temp2; 
    temp1 = ww[iy]; 
    temp2.x = temp1.x * b[pos]; 
    temp2.y = temp1.y * b[pos]; 
    y[iy + ix*m] = temp2; 
} 

__global__ 
    void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy) 
{ 
    const int ix = threadIdx.x + blockIdx.x * blockDim.x; 
    const int iy = threadIdx.y + blockIdx.y * blockDim.y; 
    if (ix >= n || iy >= m) return; 

    const int pos = ix + iy*n; 
    complex temp1, temp2; 
    temp1 = y[pos]; 
    temp2.x = temp1.x/(float) n; 
    temp2.y = temp1.y/(float) n; 
    yy[iy + ix*n] = temp2; 
} 

__global__ 
    void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a) 
{ 
    const int ix = threadIdx.x + blockIdx.x * blockDim.x; 
    const int iy = threadIdx.y + blockIdx.y * blockDim.y; 
    if (ix >= n || iy >= m) return; 

    const int pos = ix + iy*n; 

    // Re-order elements of each column according to equations (5.93) and (5.94) in Jain 
    if (iy < n/2) 
    { 
     a[ix + 2*iy*n]  = yy[pos].x; 
     a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x; 
    } 
} 

/** 
* a = idct(b), where a is of size [n m]. 
* @param b, input array 
* @param n, first dimension of a 
* @param m, second dimension of a 
* @param a, output array 
*/ 
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m] 
{ 
    const int data_size = n * m * sizeof(float); 

    // device memory allocation 
    float *d_in, *d_out; 
    checkCudaErrors(cudaMalloc(&d_in, data_size)); 
    checkCudaErrors(cudaMalloc(&d_out, data_size)); 

    // transfer data from host to device 
    checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice)); 

    // compute IDCT using CUDA 
    // begin============================================ 
    // Compute weights 
    complex *ww; 
    checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex))); 
    dim3 threads(256); 
    dim3 blocks(iDivUp(n, threads.x)); 
    idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww); 

    complex *y; 
    complex *yy; 
    cufftHandle plan; 

    dim3 threads1(32, 6); 
    dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case 

    int Length[1] = {m}; // for each IFFT, the length is m 

    checkCudaErrors(cudaMalloc(&y, n*m*sizeof(complex))); 

    idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y); 
    cufftSafeCall(cufftPlanMany(&plan, 1, Length, 
           Length, 1, m, 
           Length, 1, m, CUFFT_C2C, n)); 
    cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m] 

    checkCudaErrors(cudaMalloc(&yy, n*m*sizeof(complex))); 
    Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy); 
    cudaMemset(d_out, 0, data_size); 
    Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out); 
    // end============================================ 

    // transfer result from device to host 
    checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost)); 


    // cleanup 
    cufftDestroy(plan); 
    checkCudaErrors(cudaFree(ww)); 
    checkCudaErrors(cudaFree(y)); 
    checkCudaErrors(cudaFree(yy)); 
    checkCudaErrors(cudaFree(d_in)); 
    checkCudaErrors(cudaFree(d_out)); 
}

你會注意到我扔一個額外的cudaMemsetd_out在那裏，因爲它幫助我清理一個問題cuda-memcheck --tool initcheck ...。它不應該是必要的，你可以刪除它，如果你想。

來源

2016-01-05 03:52:51

自我實現的cuIDFT.cu的遞歸使用導致每次重新運行代碼時改變輸出

回答

相關問題