2016-04-22 33 views
-3

我在CUDA下面的內核:Cuda的IDX犯規指數矩陣正確

__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
    int j; 
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if ((idx > 0) && (idx < N)){ 
     //for(j=0;j<N;j++){ 
     // outgoing[j].p_t1=ingoing[j].p_t1; 
     //} 
     outgoing[idx].p_t1=ingoing[idx].p_t1; 

    } 
} 

這行不通。以下作品:

__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
     int j; 
     int idx = threadIdx.x + blockIdx.x * blockDim.x; 
     if ((idx > 0) && (idx < N)){ 
      for(j=0;j<N;j++){ 
      outgoing[j].p_t1=ingoing[j].p_t1; 
      } 
      //outgoing[idx].p_t1=ingoing[idx].p_t1; 

     } 
    } 

出了什麼問題?爲什麼idx不會正確地對矩陣進行索引?

整個代碼寫在下面。理解它並不那麼容易。問題是,當我打印傳出[IDX] .p_t1場在他們打印的時候我做

outgoing[idx].p_t1=ingoing[idx].p_t1; 

0主要功能的結束,但他們是正確的,當我做

for(j=0;j<N;j++){ 
    outgoing[j].p_t1=ingoing[j].p_t1; 
} 

請告訴我錯誤?

/******************** Includes - Defines ****************/ 
#include "pagerank_serial.h" 
#include <stdio.h> 
#include <stdlib.h> 
#include <time.h> 
#include <math.h> 
#include <assert.h> 
#include <string.h> 
#include <sys/time.h> 
#include <fcntl.h> 
#include <cuda.h> 
#include "string.h" 

/******************** Defines ****************/ 
// Number of nodes 
int N; 

// Convergence threashold and algorithm's parameter d 
double threshold, d; 

// Table of node's data 
Node *Nodes; 

__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
     int j; 
     int idx = threadIdx.x + blockIdx.x * blockDim.x; 
     if ((idx > 0) && (idx < N)){ 
      for(j=0;j<N;j++){ 
      outgoing[j].p_t1=ingoing[j].p_t1; 
      } 
      //outgoing[idx].p_t1=ingoing[idx].p_t1; 

     } 
    } 
/***** Read graph connections from txt file *****/ 

void Read_from_txt_file(char* filename) 
{ 

FILE *fid; 

int from_idx, to_idx; 
int temp_size; 

fid = fopen(filename, "r"); 
if (fid == NULL){ 
    printf("Error opening data file\n"); 
} 

while (!feof(fid)) 
{ 

    if (fscanf(fid,"%d\t%d\n", &from_idx,&to_idx)) 
    { 
    Nodes[from_idx].con_size++; 
    temp_size = Nodes[from_idx].con_size; 
    //Nodes[from_idx].To_id =(int*) realloc(Nodes[from_idx].To_id, temp_size * sizeof(int)); 
    Nodes[from_idx].To_id[temp_size - 1] = to_idx; 
    } 
} 

//printf("End of connections insertion!\n"); 

fclose(fid); 

} 

/***** Read P vector from txt file*****/  

void Read_P_from_txt_file() 
{ 

FILE *fid; 
double temp_P; 
int index = 0; 

fid = fopen("P.txt", "r"); 
if (fid == NULL){printf("Error opening the Probabilities file\n");} 

while (!feof(fid)) 
{ 
    // P's values are double! 
    if (fscanf(fid," double sum = 0;%lf\n", &temp_P)) 
    { 
    Nodes[index].p_t1 = temp_P; 
    index++; 
    } 
} 
//printf("End of P insertion!"); 

fclose(fid);  

} 


/***** Read E vector from txt file*****/  

void Read_E_from_txt_file() 
{ 

FILE *fid; 
double temp_E; 
int index = 0; 

fid = fopen("E.txt", "r"); 
if (fid == NULL) 
    printf("Error opening the E file\n"); 

while (!feof(fid)) 
{ 
    // E's values are double! 
    if (fscanf(fid,"%lf\n", &temp_E)) 
    { 
    Nodes[index].e = temp_E; 
    index++; 
    } 
} 
//printf("End of E insertion!"); 

fclose(fid);  

} 

/***** Create P and E with equal probability *****/ 

void Random_P_E() 
{ 

int i; 
// Sum of P (it must be =1) 
double sum_P_1 = 0; 
// Sum of E (it must be =1) 
double sum_E_1 = 0; 

// Arrays initialization 
for (i = 0; i < N; i++) 
{ 
    Nodes[i].p_t0 = 0; 
    Nodes[i].p_t1 = 1; 
    Nodes[i].p_t1 = (double) Nodes[i].p_t1/N; 

    sum_P_1 = sum_P_1 + Nodes[i].p_t1; 

    Nodes[i].e = 1; 
    Nodes[i].e = (double) Nodes[i].e/N; 
    sum_E_1 = sum_E_1 + Nodes[i].e; 
} 

// Assert sum of probabilities is =1 

// Print sum of P (it must be =1) 
//printf("Sum of P = %f\n",sum_P_1); 

// Exit if sum of P is !=1 
assert(sum_P_1 = 1); 

//printf("\n"); 

// Print sum of E (it must be =1) 
//printf("Sum of E = %f\n",sum_E_1); 

// Exit if sum of Pt0 is !=1 
assert(sum_E_1 = 1); 

} 


/***** Main function *****/ 

int main(int argc, char** argv) 
{ 

int blockSize;  // The launch configurator returned block size 
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch 
int gridSize;  // The actual grid size needed, based on input size 

// Check input arguments 
if (argc < 5) 
{ 
    printf("Error in arguments! Three arguments required: graph filename, N, threshold and d\n"); 
    return 0; 
} 

// get arguments 
char filename[256]; 
strcpy(filename, argv[1]); 
N = atoi(argv[2]); 
threshold = atof(argv[3]); 
d = atof(argv[4]); 

int i; 


// a constant value contributed of all nodes with connectivity = 0 
// it's going to be addes to all node's new probability 


// Allocate memory for N nodes 
Nodes = (Node*) malloc(N * sizeof(Node)); 

for (i = 0; i < N; i++) 
{ 
    Nodes[i].con_size = 0; 
    //Nodes[i].To_id = (int*) malloc(sizeof(int)); 
} 

Read_from_txt_file(filename); 

// set random probabilities 
Random_P_E(); 


Node *h_ingoing; 

Node *h_outgoing; 

h_ingoing = Nodes; 

h_outgoing = (Node *)calloc(N, sizeof *h_outgoing); 

Node *d_ingoing; 

Node *d_outgoing; 

cudaMalloc(&d_ingoing, N * sizeof *d_ingoing); 

cudaMalloc(&d_outgoing, N * sizeof *d_outgoing); 

cudaMemcpy(d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice); 

cudaMemcpy(d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice); 

float time; 

cudaEvent_t begin, end; 

cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N); 

// Round up according to array size 
gridSize = (N + blockSize - 1)/blockSize; 
printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize); 

cudaEventCreate(&begin); 

cudaEventCreate(&end); 
cudaEventRecord(begin, 0); 

pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N, threshold, d); 

cudaEventRecord(end, 0); 


cudaEventSynchronize(end); 


cudaEventElapsedTime(&time, begin, end); 

cudaMemcpy(h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost); 

printf("%f\n", time) ; 



printf("\n"); 

// Print final probabilitities 
for (i = 0; i <100; i++) 
{ 
    printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1); 
} 
printf("\n"); 



printf("End of program!\n"); 

return (EXIT_SUCCESS); 
} 
+0

你得到的錯誤是什麼?我不是一個活的編譯器... –

+0

我相信我理解了這個問題,雖然沒有調用全局內核的代碼,但很難知道發生了什麼。 –

+0

我寫了整個代碼,你現在可以解釋一下嗎?謝謝 – Haris

回答

1

當你說主要功能,他們打印0,當我做,我以爲你是指的所有條目,而不僅僅是指數爲0。事實上,索引0不是由你的代碼最前一頁版本處理((idx > 0) && (idx < N))對於idx=0是錯誤的。

在代碼中進一步說明,我們缺少Node類型的定義。這是強制性的,以便更好地理解代碼中可能出現的錯誤。

根據您在編譯中使用的Node的大小,其內容和結構包裝,主機端的大小可能與設備上的Node大小不同。使用printf來驗證這將是有用的,或使用調試器。

此外,你似乎沒有檢查發射中的錯誤。在內核調用後,您一定要添加cudaPeekAtLastErrorcudaDeviceSynchronize以確保沒有錯誤發生。 (來自cuda Runtime API的任何其他方法調用也可能會返回代碼未檢查的錯誤)。

編輯 試圖重現,我寫了以下內容,儘可能接近您的代碼。我沒有足夠內存的卡,因此節點數量較少。

typedef struct 
{ 
    double p_t0; 
    double p_t1; 
    double e; 
    int To_id[460]; 
    int con_size; 
} Node ; 

__global__ void pagerank(Node* ingoing, Node* outgoing, int N) 
{ 
    int idx = threadIdx.x + blockIdx.x * blockDim.x ; 
    if ((idx > 0) && (idx < N)) 
     outgoing[idx].p_t1 = ingoing[idx].p_t1; 
} 

#include <cstdlib> 

#define cudaCheck(a) { cudaError_t cuerr = a ; if (cuerr != cudaSuccess) { printf("[ERROR @ %s : %d ] : (%d) - %s\n", __FILE__, __LINE__, cuerr, cudaGetErrorString(cuerr)) ; ::exit(1) ; } } 

int main() 
{ 
    // int N = 916428 ; // does not fit on my GPU 
    int N = 400000 ; 

    int blockSize; 
    int minGridSize; 
    int gridSize; 

    Node* Nodes = (Node*)malloc(N * sizeof (Node)) ; 

    for (int i = 0 ; i < N ; ++i) 
     Nodes[i].p_t1 = (double)i+1; 

    Node* h_ingoing = Nodes; 
    Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ; 

    Node* d_ingoing ; 
    Node* d_outgoing ; 

    cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing)); 
    cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing)); 

    cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice)); 
    cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice)); 

    float time; 

    cudaEvent_t begin, end ; 

    //blockSize = 256 ; 
    cudaOccupancyMaxPotentialBlockSize<> (&minGridSize, &blockSize, pagerank, 0, N) ; 
    gridSize = (N + blockSize -1)/blockSize ; 

    printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ; 

    cudaCheck (cudaEventCreate (&begin)) ; 
    cudaCheck (cudaEventCreate (&end)) ; 

    cudaCheck (cudaEventRecord (begin, 0)) ; 

    pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ; 

    cudaCheck (cudaEventRecord (end, 0)) ; 

    cudaCheck (cudaEventSynchronize (end)) ; 

    cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ; 

    for (int i = 0 ; i < 100 ; ++i) 
    { 
     printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ; 
    } 

    for (int i = 0 ; i < N ; ++i) 
    { 
     if (h_outgoing[i].p_t1 != (double)(i+1)) 
      printf ("Error @ %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1)); 
    } 

    return 0 ; 
} 

除了索引爲0的第一個答案草案出現問題時,每個輸出都是正確的。

+0

謝謝。我在下面添加了節點描述。這會改變什麼嗎? – Haris

+0

節點是一個大型結構。你沒有得到任何運行時錯誤? –

+0

實際上節點矩陣(有916428個節點)的總大小是1.7Gbit。我正在使用特斯拉k20m跑步。不,我沒有收到任何錯誤,至少不是沒有調試工具 – Haris