2012-10-11 680 views
1

我覺得有點不好使一個已經有10個同名的論壇線程,但是在檢查完所有線程以及大多數指南後,我仍然無法確定問題。CUDA矩陣乘法 - 再次

我有一個char數組[40090] [11],我想對它的兩個元素(我認爲整個11字節串作爲一個元素)的每個可能的組合進行自定義操作。我明白這是一種矩陣乘法,矩陣是一列和一行。

遵循SDK手冊我想每個輸出元素有1個線程。由於40090 = 19 * 2110,我使用:

dim3 threadsperblock(19,19); 
dim3 blocksingrid(2110,2110); 
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2); 

問題1:這個罰款?

好吧,然後,我想想我正在關注SDK的maunal示例(不是使用共享內存的示例)。但是,每當我敢於在數據上進行我想要的操作的一部分時,我就會得到一個大量無用的錯誤30:未知的錯誤。所以,問題2:我做錯了什麼?注意:忽略內核不保存任何地方。

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include <cstdlib> 
#include <iostream> 
#include <fstream> 
#include <iomanip> 
#include <ctime> 
#include <stdio.h> 
using namespace std; 

cudaError_t cudafunct(void); 
__global__ void xkernel(char * dev_b2); 
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2); 


#define b2c 40090 
unsigned char block2[b2c][11];// 
//unsigned int i,b1,b2,counter=0;//Block(2),Piece,Rotation,Type(of block2),InterconnectinTriangle 
//unsigned char *block4,type=0; 
ofstream ofile; 




int main() 
{ 
    ifstream block2file("2.blk",ios::binary); 
    block2file.read((char*)(&block2),b2c*11); 
    block2file.close(); 
    //block4=new unsigned char[200000000];//200MB will do, better than doing constant reallocs 

    cudaError_t cudaStatus = cudafunct(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudafunct failed!"); 
     system("PAUSE"); 
     return 1; 
    } 
    /* 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceReset failed!"); 
     return 1; 
    }*/ 
    cout<<"Sequence end. Saving to file...\n";  
    //ofile.open("blk4.et2",ios::binary); 
    //ofile.write((char*)block4,17*counter); 
    //ofile.close(); 
    int t=clock(); 
    //cout<<"\nFound a total of "<<counter<<" block4s.\nTime elapsed: "<<t<<" clocks/"<<(double)t/(double)CLOCKS_PER_SEC<<" seconds\n"; 
    system("PAUSE"); 
} 

// Helper function for using CUDA to add vectors in parallel. 
cudaError_t cudafunct(void) 
{ 
    char *dev_b2 = 0; 
    cudaError_t cudaStatus; 

    cudaStatus = cudaMalloc((void**)&dev_b2, sizeof(block2)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMemcpy(dev_b2, block2, sizeof(block2), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    dim3 threadsperblock(19,19); 
    dim3 blocksingrid(2110,2110); 
    xkernel<<<blocksingrid, threadsperblock>>>(dev_b2); 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching xkernel!\n", cudaStatus); 
     goto Error; 
    } 
    /* 
    // Copy output vector from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    }*/ 

Error: 
    cudaFree(dev_b2); 
    return cudaStatus; 
} 


__global__ void xkernel(char *dev_b2) 
{ 
     int i = blockIdx.x * blockDim.x + threadIdx.x; 
     int j = blockIdx.y * blockDim.y + threadIdx.y; 
     /*for(int k=0;k<11;k++) 
     { 
      lb2[0][k]=dev_b2[i*b2c+k]; 
      lb2[1][k]=dev_b2[j*b2c+k]; 
     }*/ 
     int b00; 
     b00=dev_b2[i*b2c]; 

     //int type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]); 
     //if(!j && !(i % 100))cout<<setw(6)<<i<<"/"<<jc<<" ("<<setw(10)<<(float)100*i/jc<<" %)"<<endl;  
     /*if(
      (dev_b2[i*b2c+7]!=dev_b2[j*b2c+9])||//SW~NW  
      (dev_b2[i*b2c+6]!=dev_b2[j*b2c+10])//SE~NE                       
     ) return; 
     if((type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4])) ==255) return;*/ 
     /*if(
      (dev_b2[i*b2c+0]==dev_b2[j*b2c+0])||//1st=3rd 
      (dev_b2[i*b2c+0]==dev_b2[j*b2c+2])||//1st=4th 
      (dev_b2[i*b2c+2]==dev_b2[j*b2c+0])||//2nd=3rd 
      (dev_b2[i*b2c+2]==dev_b2[j*b2c+2])//2nd=4th 
     ) return;*/ 
     /* 
     *(block4+counter*17+0)=b2[i][0];//1st piece 
     *(block4+counter*17+1)=b2[i][1];//1st rotation 
     *(block4+counter*17+2)=b2[i][2];//2nd piece 
     *(block4+counter*17+3)=b2[i][3];//2nd rotation 
     *(block4+counter*17+4)=b2[j][0];//3rd piece 
     *(block4+counter*17+5)=b2[j][1];//3rd rotation 
     *(block4+counter*17+6)=b2[j][2];//4th piece 
     *(block4+counter*17+7)=b2[j][3];//4th rotation 
     *(block4+counter*17+8)=type; 
     *(block4+counter*17+9)=b2[i][5];//Right frame colours, down->up 
     *(block4+counter*17+10)=b2[j][5]; 
     *(block4+counter*17+11)=b2[j][6];//Up frame colours, right->left 
     *(block4+counter*17+12)=b2[j][7]; 
     *(block4+counter*17+13)=b2[j][8];//Left frame colours, up->down 
     *(block4+counter*17+14)=b2[i][8]; 
     *(block4+counter*17+15)=b2[i][9];//Down frame colours, left->right 
     *(block4+counter++*17+16)=b2[i][10];*/ 
} 



__device__ unsigned char typecheck(unsigned char type1,unsigned char type2) 
{//Warning! Previous error! First partenthesis is t*2* = upper piece! 
     if((type1==4) && (type2==0)) return 0; 
     if((type1==6) && (type2==1)) return 1; 
     if((type1==2) && (type2==6)) return 2; 
     if((type1==3) && (type2==4)) return 3; 
     if((type1==4) && (type2==4)) return 4; 
     if((type1==8) && (type2==5)) return 5; 
     if((type1==6) && (type2==6)) return 6; 
     if((type1==7) && (type2==8)) return 7; 
     if((type1==8) && (type2==8)) return 8; 
     if((type1==9) && (type2==8)) return 9; 
     if((type1==10) && (type2==8)) return 10; 
     if((type1==8) && (type2==11)) return 11; 
     if((type1==8) && (type2==12)) return 12; 
     if((type1==8) && (type2==13)) return 13; 
     return 255; 
} 
+0

您確定CUDA驅動程序正在運行?請從SDK中測試bandwidthTest或deviceQuery。 – ahmad

+0

帶寬測試工作正常。 – user1058795

回答

1

我中有你從你的dev_b2陣列讀出界外的感覺。 blockIdx.x[0..2110]的範圍內,所以變量i[0..]的範圍內。但是,你將它乘以b2c。 因此,您讀取的最高地址將是b2c*= 930488900

但是dev_b2只有大小b2c*11 = 440990

+0

我不認爲這些是範圍。正如我發佈的,blockIdx.x的範圍是2110,而線程equivelant是19.另一個有趣的事情:我發佈的代碼實際上工作。但是,如果不是int b00,我使用int b [0] [0]並嘗試將相同的值賦給b [0] [0],這就是我得到錯誤的地方。 – user1058795

+0

這可能會更好,如果你發佈的代碼實際上失敗了。我不太清楚你的意思是什麼「然而,如果不是int b00,我做了一個int b [0] [0] ...在內核中你有int b00; b00 = dev_b2 [i @ b2c];我將其更改爲int b [1] [1]; b [0] [0] = dev_b2 [i * b2c];它的編譯和運行方式與更改前相同。是32的倍數,warp大小 –

+0

@ user1058795是的,我把b2x和gridDim.x混合在一起,後者有點小,但即使如此,你仍然走出界限,我用數字來解決我的反應。你發佈的代碼實際上什麼都不做,CUDA將通過死代碼消除產生一個空內核。 – CygnusX1