Cuda版本在串行工作時不工作

我的以下極簡主義Cuda代碼返回錯誤結果（所有多邊形在末尾有0個頂點），而在C++中以串行方式運行的相同代碼運行良好。問題是令人尷尬的並行：沒有通信，沒有syncthreads等，並且Cuda內存分配是成功的。即使我的虛擬變量存儲用於調試目的的輸入數組的內容，Cuda版本也是0。由於我的數組足夠大，因此無法訪問。用Cuda中的循環替換memcpy不會改變任何內容。
我真的不明白會發生什麼......任何想法？謝謝！Cuda版本在串行工作時不工作

Cuda的代碼：在C++

#include <stdio.h> 
    #include <iostream> 
    #include <stdlib.h> 
    #include <cuda.h> 

    class Point2D { 
    public: 
    __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){}; 
    double x, y; 
    }; 

    __device__ double dot(const Point2D &A, const Point2D &B) { 
    return A.x*B.x + A.y*B.y; 
    } 
    __device__ Point2D operator*(double a, const Point2D &P) { 
    return Point2D(a*P.x, a*P.y); 
    } 
    __device__ Point2D operator+(Point2D A, const Point2D &B) { 
    return Point2D(A.x + B.x, A.y + B.y); 
    } 
    __device__ Point2D operator-(Point2D A, const Point2D &B) { 
    return Point2D(A.x - B.x, A.y - B.y); 
    } 
    __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD 
     Point2D M = 0.5*(C+D); 
     return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A); 
    } 

    class Polygon { 
    public: 
     __device__ Polygon():nbpts(0){}; 
     __device__ void addPts(Point2D pt) { 
     pts[nbpts] = pt; 
     nbpts++; 
     }; 
     __device__ Polygon& operator=(const Polygon& rhs) { 
     nbpts = rhs.nbpts; 
     dummy = rhs.dummy; 
     memcpy(pts, rhs.pts, nbpts*sizeof(Point2D)); 
     return *this; 
     } 
     __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) { 

     int new_nbpts = 0; 
     Point2D newpts[128]; 
     Point2D AB(outside_pt-inside_pt); 
     Point2D M(0.5*(outside_pt+inside_pt)); 
     double ABM = dot(AB, M); 

     Point2D S = pts[nbpts-1]; 

     for (int i=0; i<nbpts; i++) { 

      Point2D E = pts[i]; 

      double ddot = -ABM + dot(AB, E); 
      if (ddot<0) { // E inside clip edge 
      double ddot2 = -ABM + dot(AB, S); 
      if (ddot2>0) { 
       newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt); 
       new_nbpts++; 
      } 
      newpts[new_nbpts] = E; 
      new_nbpts++; 
      } else { 
      double ddot2 = -ABM + dot(AB, S); 
      if (ddot2<0) { 
       newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt); 
       new_nbpts++; 
      }  
      } 
      S = E; 
     } 

     memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D)); 
     nbpts = new_nbpts; 
     } 

    //private: 
    Point2D pts[128]; 
    int nbpts; 
    float dummy; 
    }; 


    __global__ void cut_poly(float *a, Polygon* polygons, int N) 
    { 
     int idx = blockIdx.x * blockDim.x + threadIdx.x; 
     if (idx>=N/2) return; 

     Polygon pol; 
     pol.addPts(Point2D(0.,0.)); 
     pol.addPts(Point2D(1.,0.)); 
     pol.addPts(Point2D(1.,1.)); 
     pol.addPts(Point2D(0.,1.)); 

     Point2D curPt(a[2*idx], a[2*idx+1]); 

     for (int i=0; i<N/2; i++) { 
     Point2D other_pt(a[2*i], a[2*i+1]); 
     pol.cut(curPt, other_pt); 
     } 
     pol.dummy = a[idx]; 

     polygons[idx] = pol; 
    } 



    int main(int argc, unsigned char* argv[]) 
    { 

     const int N = 100; 
     float a_h[N], *a_d; 
     Polygon p_h[N/2], *p_d; 

     size_t size = N * sizeof(float); 
     size_t size_pol = N/2 * sizeof(Polygon); 

     cudaError_t err = cudaMalloc((void **) &a_d, size); 
     cudaError_t err2 = cudaMalloc((void **) &p_d, size_pol); 

     for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001; 
     cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice); 

     int block_size = 4; 
     int n_blocks = N/block_size + (N%block_size == 0 ? 0:1); 
     cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N); 

     cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost); 
     cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost); 

     for (int i=0; i<N/2; i++) 
     printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts); 

     cudaFree(a_d); 
     cudaFree(p_d); 


     return 0; 
    }

相同的代碼可以正常工作：

#include <stdio.h> 
#include <iostream> 
#include <stdlib.h> 

class Point2D { 
public: 
Point2D(double xx=0, double yy=0):x(xx),y(yy){}; 
double x, y; 
}; 

double dot(const Point2D &A, const Point2D &B) { 
return A.x*B.x + A.y*B.y; 
} 
Point2D operator*(double a, const Point2D &P) { 
return Point2D(a*P.x, a*P.y); 
} 
Point2D operator+(Point2D A, const Point2D &B) { 
return Point2D(A.x + B.x, A.y + B.y); 
} 
Point2D operator-(Point2D A, const Point2D &B) { 
return Point2D(A.x - B.x, A.y - B.y); 
} 
Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD 
    Point2D M = 0.5*(C+D); 
    return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A); 
} 

class Polygon { 
public: 
    Polygon():nbpts(0){}; 
    void addPts(Point2D pt) { 
    pts[nbpts] = pt; 
    nbpts++; 
    }; 
    Polygon& operator=(const Polygon& rhs) { 
    nbpts = rhs.nbpts; 
    dummy = rhs.dummy; 
    memcpy(pts, rhs.pts, nbpts*sizeof(Point2D)); 
    return *this; 
    } 
    void cut(const Point2D &inside_pt, const Point2D &outside_pt) { 

    int new_nbpts = 0; 
    Point2D newpts[128]; 
    Point2D AB(outside_pt-inside_pt); 
    Point2D M(0.5*(outside_pt+inside_pt)); 
    double ABM = dot(AB, M); 

    Point2D S = pts[nbpts-1]; 

    for (int i=0; i<nbpts; i++) { 

     Point2D E = pts[i]; 

     double ddot = -ABM + dot(AB, E); 
     if (ddot<0) { // E inside clip edge 
     double ddot2 = -ABM + dot(AB, S); 
     if (ddot2>0) { 
      newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt); 
      new_nbpts++; 
     } 
     newpts[new_nbpts] = E; 
     new_nbpts++; 
     } else { 
     double ddot2 = -ABM + dot(AB, S); 
     if (ddot2<0) { 
      newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt); 
      new_nbpts++; 
     } 
     } 
     S = E; 
    } 

    memcpy(pts, newpts, std::min(128, new_nbpts)*sizeof(Point2D)); 
    /*for (int i=0; i<128; i++) { 
     pts[i] = newpts[i]; 
    }*/ 
    nbpts = new_nbpts; 
    } 

//private: 
Point2D pts[128]; 
int nbpts; 
float dummy; 
}; 


void cut_poly(int idx, float *a, Polygon* polygons, int N) 
{ 
    if (idx>=N/2) return; 

    Polygon pol; 
    pol.addPts(Point2D(0.,0.)); 
    pol.addPts(Point2D(1.,0.)); 
    pol.addPts(Point2D(1.,1.)); 
    pol.addPts(Point2D(0.,1.)); 

    Point2D curPt(a[2*idx], a[2*idx+1]); 

    for (int i=0; i<N/2; i++) { 
    if (idx==i) continue; 
    Point2D other_pt(a[2*i], a[2*i+1]); 
    pol.cut(curPt, other_pt); 
    } 
    pol.dummy = a[idx]; 

    polygons[idx] = pol; 
} 



int main(int argc, unsigned char* argv[]) 
{ 

    const int N = 100; // Number of elements in arrays 
    float a_h[N], *a_d; // Pointer to host & device arrays 
    Polygon p_h[N/2], *p_d; 

    for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001; 

    for (int idx=0; idx<N; idx++) 
    cut_poly(idx, a_h, p_h, N); 

    for (int i=0; i<N/2; i++) 
    printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts); 

    return 0; 
}

來源

2012-11-29 WhitAngl

在我們檢查整個代碼塊之前，你已經把它縮小到了什麼程度？ – Bart

是的 - 我昨天試過這個問題：http://stackoverflow.com/questions/13619025/cuda-strange-bug但顯然，縮小代碼是一個壞主意，因爲錯誤不再展示，但行爲算法現在完全不同了。：s – WhitAngl

在我們完成代碼之前，請檢查'cudaMemcpy（）'調用的錯誤返回值。 – tera

嗯，我想你可以忽略我的大部分意見。我錯誤地使用了我在CUDA 3.2中創建的機器，並且它在內核啓動失敗時表現出不同的行爲。當我切換到CUDA 4.1和CUDA 5.0時，事情開始變得有意義。我很抱歉在那裏混亂。

無論如何，在經過那個之後，我很快注意到你的CPU和GPU實現是有區別的。特別是在這裏（看CPU的代碼）：

void cut_poly(int idx, float *a, Polygon* polygons, int N) 
{ 
    if (idx>=N/2) return; 

    Polygon pol; 
    pol.addPts(Point2D(0.,0.)); 
    pol.addPts(Point2D(1.,0.)); 
    pol.addPts(Point2D(1.,1.)); 
    pol.addPts(Point2D(0.,1.)); 

    Point2D curPt(a[2*idx], a[2*idx+1]); 

    for (int i=0; i<N/2; i++) { 
    if (idx==i) continue;  /* NOTE THIS LINE MISSING FROM YOUR GPU CODE */ 
    Point2D other_pt(a[2*i], a[2*i+1]); 
    pol.cut(curPt, other_pt); 
    } 
    pol.dummy = a[idx]; 

    polygons[idx] = pol; 
}

參考線我已經添加的註釋以上，如果在cut_poly內核添加的代碼，準確的線到相應點在你的GPU代碼，那麼無論如何，GPU代碼會產生與CPU代碼相同的打印結果。

我會做的另一個觀察是，你是不必要地只用4個線程運行塊。在編寫代碼時，沒有什麼問題，但是一旦你爲了「生產」目的而運行它，你很可能會想要一個更高的數字，比如256，並且一定要選擇一個數字爲了獲得最佳性能，爲32的整數倍。

爲了迴應評論中發佈的問題，我認爲數據已被正確複製，但很可能您沒有在主機上正確訪問它。（我不知道你是如何確定「我的數組沒有正確返回到主機」）。你的大多數班級定義只有__device__。因此，很難訪問主機上的班級結構（例如Polygon班級中的Point2D pts班級）。我在這裏將修改後的代碼，我想表明的數據被傳輸回主持人：

#include <stdio.h> 
    #include <iostream> 
    #include <stdlib.h> 
// #include <cuda.h> 

#define cudaCheckErrors(msg) \ 
    do { \ 
     cudaError_t __err = cudaGetLastError(); \ 
     if (__err != cudaSuccess) { \ 
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 
       msg, cudaGetErrorString(__err), \ 
       __FILE__, __LINE__); \ 
      fprintf(stderr, "*** FAILED - ABORTING\n"); \ 
      exit(1); \ 
     } \ 
    } while (0) 


    class Point2D { 
    public: 
    __host__ __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){}; 
    double x, y; 
    }; 

    __host__ __device__ double dot(const Point2D &A, const Point2D &B) { 
    return A.x*B.x + A.y*B.y; 
    } 
    __host__ __device__ Point2D operator*(double a, const Point2D &P) { 
    return Point2D(a*P.x, a*P.y); 
    } 
    __host__ __device__ Point2D operator+(Point2D A, const Point2D &B) { 
    return Point2D(A.x + B.x, A.y + B.y); 
    } 
    __host__ __device__ Point2D operator-(Point2D A, const Point2D &B) { 
    return Point2D(A.x - B.x, A.y - B.y); 
    } 
    __host__ __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD 
     Point2D M = 0.5*(C+D); 
     return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A); 
    } 

    class Polygon { 
    public: 
     __host__ __device__ Polygon():nbpts(0){}; 
     __host__ __device__ void addPts(Point2D pt) { 
     pts[nbpts] = pt; 
     nbpts++; 
     }; 
     __host__ __device__ Polygon& operator=(const Polygon& rhs) { 
     nbpts = rhs.nbpts; 
     dummy = rhs.dummy; 
     memcpy(pts, rhs.pts, nbpts*sizeof(Point2D)); 
     return *this; 
     } 
     __host__ __device__ Point2D getpoint(unsigned i){ 
     if (i<128) return pts[i]; 
     else return pts[0]; 
     } 
     __host__ __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) { 

     int new_nbpts = 0; 
     Point2D newpts[128]; 
     Point2D AB(outside_pt-inside_pt); 
     Point2D M(0.5*(outside_pt+inside_pt)); 
     double ABM = dot(AB, M); 

     Point2D S = pts[nbpts-1]; 

     for (int i=0; i<nbpts; i++) { 

      Point2D E = pts[i]; 

      double ddot = -ABM + dot(AB, E); 
      if (ddot<0) { // E inside clip edge 
      double ddot2 = -ABM + dot(AB, S); 
      if (ddot2>0) { 
       newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt); 
       new_nbpts++; 
      } 
      newpts[new_nbpts] = E; 
      new_nbpts++; 
      } else { 
      double ddot2 = -ABM + dot(AB, S); 
      if (ddot2<0) { 
       newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt); 
       new_nbpts++; 
      } 
      } 
      S = E; 
     } 

     memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D)); 
     nbpts = new_nbpts; 
     } 

    //private: 
    Point2D pts[128]; 
    int nbpts; 
    float dummy; 
    }; 


    __global__ void cut_poly(float *a, Polygon* polygons, int N) 
    { 
     int idx = blockIdx.x * blockDim.x + threadIdx.x; 
     if (idx>=N/2) return; 

     Polygon pol; 
     pol.addPts(Point2D(0.,0.)); 
     pol.addPts(Point2D(1.,0.)); 
     pol.addPts(Point2D(1.,1.)); 
     pol.addPts(Point2D(0.,1.)); 

     Point2D curPt(a[2*idx], a[2*idx+1]); 

     for (int i=0; i<N/2; i++) { 
     if (idx==i) continue; 
     Point2D other_pt(a[2*i], a[2*i+1]); 
     pol.cut(curPt, other_pt); 
     } 
     pol.dummy = pol.getpoint(0).x; 

     polygons[idx] = pol; 
    } 



    int main(int argc, unsigned char* argv[]) 
    { 

     const int N = 100; 
     float a_h[N], *a_d; 
     Polygon p_h[N/2], *p_d; 

     size_t size = N * sizeof(float); 
     size_t size_pol = N/2 * sizeof(Polygon); 

     cudaMalloc((void **) &a_d, size); 
     cudaCheckErrors("cm1"); 
     cudaMalloc((void **) &p_d, size_pol); 
     cudaCheckErrors("cm2"); 

     for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001; 
     cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice); 
     cudaCheckErrors("cmcp1"); 

     int block_size = 128; 
     int n_blocks = N/block_size + (N%block_size == 0 ? 0:1); 
     cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N); 
     cudaCheckErrors("kernel"); 

     cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost); 
     cudaCheckErrors("cmcp2"); 
     cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost); 
     cudaCheckErrors("cmcp3"); 

     for (int i=0; i<N/2; i++) 
     printf("%f \t %f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].getpoint(0).x, p_h[i].nbpts); 

     cudaFree(a_d); 
     cudaFree(p_d); 


     return 0; 
    }

我會建議使用發佈這些東西的新問題。

來源

2012-11-30 20:51:51

Arrrgg！非常感謝！我正在並行調試串行和並行實現，並且錯過了我在串行代碼中所做的更正...！非常感謝您發現錯誤並花費時間！ – WhitAngl

事實上，這個代碼仍然存在一個問題，我無法弄清楚：返回給主機的多邊形的所有頂點的座標都設置爲0（在'pol [i] .pts [... ]'）而不是它們的實際值（儘管現在頂點的數量是正確的並且在變量nbpoints中）。：s我檢查了我的Polygon :: operator =被正確調用。這只是我的數組沒有正確返回到主機。 – WhitAngl

請注意，如果我在「dummy」變量中輸出多邊形的頂點座標之一，那麼我會得到正確的答案。 – WhitAngl

Cuda版本在串行工作時不工作

回答

相關問題