CUDA matrix multiplication - not working for some non-square matrices

I am experimenting with CUDA programming. As part of this, I am trying to develop a matrix multiplication algorithm that runs on the GPU. The algorithm works for square matrices but fails for non-square ones. Here is my kernel:

float* multiply_gpu(float* matrix1 , float* matrix2); 
    __global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, 
      float *b, float *result) { 
     int index = blockIdx.x * blockDim.x + threadIdx.x; 
     int result_size = rowsA*columnsB; 
     int value = 0;//the final result 
     //indices of values from input matrices 
     if (index < result_size) { 
      int index1 = (index/rowsA)*rowsA; //get nearest row 
      int index2 = index%columnsB; //get start column 
      int k = 0; 
      while (k<columnsA) { //columnsA == rowsB 
       value += a[index1]*b[index2]; //v = sum a_ik * b_kj 
       index1 ++; 
       index2 += columnsB; 
       k++; 
      } 
      result[index] = value; 
     } 
    } 

After a brief but thorough review by my instructor, he did not see any problem with it either. I believe the problem lies in this function:

float* multiply_gpu(float* matrix1 , float* matrix2) { 
    //the dimensions of the matrices 
    size_t available, total; 
    cudaError_t error; 
    cudaError err = cudaMemGetInfo(&available, &total); 
    if(err != cudaSuccess){ 
     printf("There was an error: %s\n", cudaGetErrorString(err)); 
    } 
    int height1 = matrix1[0]; 
    int width1 = matrix1[1]; 
    int height2 = matrix2[0]; 
    int width2 = matrix2[1]; 
    if (width1!=height2) { 
     return NULL; 
    } 
    //this array contains the result of the operation 
    float* result = (float *) malloc(height1*width2*sizeof(float)); 
    //pointers for device matrices 
    float *d_matrix1; 
    float *d_matrix2; 
    float *d_result; 
    //allocate memory for matrices 
    error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float)); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float)); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float)); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //now copy matrices onto device -- note the offset of 2 
    error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //launch multiplication kernel 
//note I have tried adjusting the kernel values between <<< , >>> to no avail 
    mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result); 
    printf("%d %d %d %d\n",height1,width1,height2,width2); 
    //make the host block until mult is finished running 
    //printf("finished multiplying\n"); 
    cudaDeviceSynchronize(); 
    //copy result back 
    error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //free now unneeded cuda memory 
    cudaFree(d_matrix1); 
    cudaFree(d_matrix2); 
    cudaFree(d_result); 
    printf("GOT RESULT\n"); 
    for (int i=0;i<height1*width2;i++) { 
     printf("%f ",result[i]); 
    } 
    printf("\n"); 
    //result ready to be returned 
    return result; 
} 

Note that the matrices passed as arguments to multiply_gpu store their height at index 0 and their width at index 1. The result matrix does not carry this information.
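To illustrate that layout (using the first input from the example below), the flat host array holds the two dimensions followed by the data in row-major order:

    // Flat host-side layout: [rows, cols, data in row-major order]
    float matrix1[] = {
        2.0f, 3.0f,          // height = 2, width = 3
        1.0f, 2.0f, 3.0f,    // row 0
        4.0f, 5.0f, 6.0f     // row 1
    };
    // multiply_gpu reads the dimensions from the first two entries
    // and copies the data starting at matrix1 + 2.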

An example of an incorrect computation: when I feed the arrays {2,3,1,2,3,4,5,6} and {3,2,1,2,3,4,5,6} into multiply_gpu, the answer should be {22,28,49,64}, but my unit test produces {22,28,40,52}. So close! Note that the algorithm is perfectly happy with the dot product (1,2,3)*(1,2,3) (which is not square)... What could the error be here? Thanks for any help. If I find a solution on my own, I will post it.
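For reference, the expected values can be checked by hand from the row-by-column dot products of the 2x3 and 3x2 inputs:

    // A = [1 2 3; 4 5 6] (2x3),  B = [1 2; 3 4; 5 6] (3x2)
    // c00 = 1*1 + 2*3 + 3*5 = 22
    // c01 = 1*2 + 2*4 + 3*6 = 28
    // c10 = 4*1 + 5*3 + 6*5 = 49
    // c11 = 4*2 + 5*4 + 6*6 = 64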


There are quite a few questions under the CUDA tag about matrix multiplication. Have you looked at any of them? What happens if you run your code with cuda-memcheck? SO expects: "Questions concerning problems with code you've written must describe the specific problem - and include valid code to reproduce it - in the question itself. See SSCCE.org for guidance." Voting to close. You have not provided an SSCCE. –


Yes, matrix multiplication on the GPU is common, and there are many SO questions about it. I have read them, though perhaps not thoroughly enough. I am at my wit's end and came here for a sanity check. Thanks for the link to SSCCE.org - I am reviewing it now. I am also learning cuda-memcheck. Overall, this bug is consuming me. I clearly need to pay closer attention to my own code and to the comments on other matrix multipliers. – YardGlassOfCode


I have updated my answer, since I still didn't quite have it right. I believe it is correct now - it works for the case you mention as well as three other cases I tried. –

Answers

This line is wrong:

 int index1 = (index/rowsA)*rowsA; //get nearest row 

It should instead be:

 int index1 = (index/columnsB)*columnsA; //get nearest row 

Why is this formulation correct? index1 is used to index into the elements of the row of A that corresponds to the output-matrix position we are computing. The output-matrix position is simply the thread index. If we integer-divide the thread index by the number of columns in the output matrix C, we get the row number in question. Then, to find the first element of that row in A, we multiply by the number of columns in A. This correctly indexes us to the first element of the relevant row of A.
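A minimal CPU-side sketch of the same index arithmetic (illustration only, not part of the original code) may make the mapping clearer:

    // CPU reference using the same flat-index arithmetic as the kernel:
    //   row of C (and of A)  = index / columnsB
    //   col of C (and of B)  = index % columnsB
    //   first element of that row of A sits at row * columnsA
    void matmul_cpu(int rowsA, int columnsA, int columnsB,
                    const float *a, const float *b, float *result) {
        for (int index = 0; index < rowsA * columnsB; index++) {
            int row = index / columnsB;
            int col = index % columnsB;
            float value = 0.0f;
            for (int k = 0; k < columnsA; k++)      // columnsA == rowsB
                value += a[row * columnsA + k] * b[k * columnsB + col];
            result[index] = value;
        }
    }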

Below is a complete application along with my test cases - the only change I made to your code is the one described above.

$ cat t290.cu 
#include <stdio.h> 

__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) { 
     int index = blockIdx.x * blockDim.x + threadIdx.x; 
     int result_size = rowsA*columnsB; 
     int value = 0;//the final result 
     //indices of values from input matrices 
     if (index < result_size) { 
      int index1 = (index/columnsB)*columnsA; //get nearest row 
      int index2 = index%columnsB; //get start column 
      int k = 0; 
      while (k<columnsA) { //columnsA == rowsB 
       value += a[index1]*b[index2]; //v = sum a_ik * b_kj 
       index1 ++; 
       index2 += columnsB; 
       k++; 
      } 
      result[index] = value; 
     } 
    } 

float* multiply_gpu(float* matrix1 , float* matrix2) { 
    //the dimensions of the matrices 
    size_t available, total; 
    cudaError_t error; 
    cudaError err = cudaMemGetInfo(&available, &total); 
    if(err != cudaSuccess){ 
     printf("There was an error: %s\n", cudaGetErrorString(err)); 
    } 
    int height1 = matrix1[0]; 
    int width1 = matrix1[1]; 
    int height2 = matrix2[0]; 
    int width2 = matrix2[1]; 
    if (width1!=height2) { 
     printf("fail!\n"); 
     return NULL; 
    } 
    //this array contains the result of the operation 
    float* result = (float *) malloc(height1*width2*sizeof(float)); 
    //pointers for device matrices 
    float *d_matrix1; 
    float *d_matrix2; 
    float *d_result; 
    //allocate memory for matrices 
    error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float)); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float)); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float)); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //now copy matrices onto device -- note the offset of 2 
    error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //launch multiplication kernel 
//note I have tried adjusting the kernel values between <<< , >>> to no avail 
    mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result); 
    printf("%d %d %d %d\n",height1,width1,height2,width2); 
    error = cudaGetLastError(); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //make the host block until mult is finished running 
    //printf("finished multiplying\n"); 
    error = cudaDeviceSynchronize(); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "kernel fail (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //copy result back 
    error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost); 
    if (error != cudaSuccess) { 
     fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error)); 
     exit(EXIT_FAILURE); 
    } 
    //free now unneeded cuda memory 
    cudaFree(d_matrix1); 
    cudaFree(d_matrix2); 
    cudaFree(d_result); 
    printf("GOT RESULT\n"); 
    for (int i=0;i<height1*width2;i++) { 
     printf("%f ",result[i]); 
    } 
    printf("\n"); 
    //result ready to be returned 
    return result; 
} 

int main(){ 

    float m1[8] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; 
    float m2[6] = {2.0, 2.0, 1.0, 1.0, 2.0, 2.0}; 
    float *my_result1 = multiply_gpu(m2, m1); 
    float m3[8] = {2,3,1,2,3,4,5,6}; 
    float m4[8] = {3,2,1,2,3,4,5,6}; 
    float *my_result2 = multiply_gpu(m3, m4); 
    float *my_result3 = multiply_gpu(m4, m3); 
    float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1}; 
    float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; 
    float *my_result4 = multiply_gpu(m5, m6); 
    return 0; 
} 

$ nvcc -arch=sm_20 -o t290 t290.cu 
t290.cu: In function 'float* multiply_gpu(float*, float*)':
t290.cu:30: warning: converting to 'int' from 'float'
t290.cu:31: warning: converting to 'int' from 'float'
t290.cu:32: warning: converting to 'int' from 'float'
t290.cu:33: warning: converting to 'int' from 'float'
$ cuda-memcheck ./t290 
========= CUDA-MEMCHECK 
2 2 2 3 
GOT RESULT 
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000 
2 3 3 2 
GOT RESULT 
22.000000 28.000000 49.000000 64.000000 
3 2 2 3 
GOT RESULT 
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000 
2 5 5 4 
GOT RESULT 
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 
========= ERROR SUMMARY: 0 errors 
$ 

Hmm, not exactly. But you are on the right track. I found the correct answer myself and have posted it. – YardGlassOfCode


So after reading through my matrix code more carefully, I found a simple problem in the math of my operation.

It is true that this line was wrong:

int index1 = (index/rowsA)*rowsA; //get nearest row 

I noticed that, since my matrices are stored in row-major order, the formula to get the correct index of element (i, j) is index = i * columns + j.

Therefore, the assignment to index1 should be:

int index1 = (index/rowsA)*columnsA 

Why? Well, obviously, to navigate to row n we have to move past n row lengths (which is the number of columns of the matrix). My code worked for square matrices but not for other rectangular ones, because there the number of columns does not match the number of rows.


This is wrong. I provided a complete code. Plug your formulation into my complete code and it produces incorrect results. –