2015-04-20 67 views
1

我正試圖在CUDA中實現蠻力距離計算算法。使用CUDA計算不同集合中點之間的所有對距離

#define VECTOR_DIM 128 
thrust::device_vector<float> feature_data_1; 
feature_data_1.resize(VECTOR_DIM * 1000); // 1000 128 dimensional points 
thrust::device_vector<float> feature_data_2; 
feature_data_2.resize(VECTOR_DIM * 2000); // 2000 128 dimensional points 

現在我想做的是計算從第一矩陣中的每個向量L2距離(的平方差之和)在第二矩陣中的每個向量。

因此,如果陣列1是大小1000的和陣列2是大小2000的,其結果將是在尺寸上的1000*2000浮點矩陣。

我想知道是否有辦法單獨使用Thrust算法來實現這一點。

+0

應該有可能。但是,您已經制作了一個數據存儲佈局,它是一個結構陣列(AoS)。這對GPU的良好性能並不是特別有利(無論是CUDA還是Thrust)。如果你想高效地完成這項工作,你幾乎可以肯定地將你的數據重新排列成與SoA接近的東西。 –

+0

我意識到,一邊看着你的一些其他職位。我現在正在重構。我會更新線程。 – Luca

+1

我認爲你可以注意到:|| || xy ||^2 = || x ||^2 + || y ||^2-2 * ',其中''表示' x'和'y'。如果你假定'x'和'y'向量的行主排序爲'X'和'Y'矩陣,那麼你可以使用類似[使用CUDA減少矩陣行](http://stackoverflow.com/questions/) 17862078/reduce-matrix-rows-with-cuda)來計算所有需要的「|| x ||^2」和「|| y ||^2」。然後可以使用'cublas gemm()'將標量積''計算爲矩陣 - 矩陣乘法'X * Y^T'。 – JackOLantern

回答

3

計算在兩​​個不同集合在CUDA點之間的所有點對距離可以通過觀察來解決該

||x-y||^2=||x||^2+||y||^2-2*<x,y> 

其中|| ||l2規範和<x,y>表示xy之間的標量積。

的規範||x||||y||可以通過Reduce matrix rows with CUDA啓發的方法來計算,而標量積<x,y>然後可以如使用cublas<t>gemm()的矩陣的矩陣乘法X*Y^T來計算。

下面是一個完整的實現。請注意,爲了計算規範|| ||,報告了兩種方法,一種使用cuBLAScublas<t>gemv,另一種使用Thurst的transform。對於問題大小的興趣的,我已經經歷我的GT540M上以下時序:

Approach nr. 1 0.12ms 
Approach nr. 2 0.59ms 

include <cublas_v2.h> 

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/generate.h> 
#include <thrust/reduce.h> 
#include <thrust/functional.h> 
#include <thrust/random.h> 
#include <thrust/sequence.h> 

#include <stdio.h> 
#include <iostream> 

#include "Utilities.cuh" 
#include "TimingGPU.cuh" 

#define BLOCK_SIZE_X 16 
#define BLOCK_SIZE_Y 16 

/***********************************************************/ 
/* SQUARED ABSOLUTE VALUE FUNCTOR - NEEDED FOR APPROACH #1 */ 
/***********************************************************/ 
struct abs2 { 
    __host__ __device__ double operator()(const float &x) const { return x * x; } 
}; 

// --- Required for approach #2 
__device__ float *vals; 

/******************************************/ 
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */ 
/******************************************/ 
struct row_reduction { 

    const int Ncols; // --- Number of columns 

    row_reduction(int _Ncols) : Ncols(_Ncols) {} 

    __device__ float operator()(float& x, int& y) { 
     float temp = 0.f; 
     for (int i = 0; i<Ncols; i++) 
      temp += vals[i + (y*Ncols)] * vals[i + (y*Ncols)]; 
     return temp; 
    } 
}; 

/************************************************/ 
/* KERNEL FUNCTION TO ASSEMBLE THE FINAL RESULT */ 
/************************************************/ 
__global__ void assemble_final_result(const float * __restrict__ d_norms_x_2, const float * __restrict__ d_norms_y_2, float * __restrict__ d_dots, 
             const int NX, const int NY) { 

    const int i = threadIdx.x + blockIdx.x * gridDim.x; 
    const int j = threadIdx.y + blockIdx.y * gridDim.y; 

    if ((i < NY) && (j < NX)) d_dots[i * NX+ j] = d_norms_x_2[j] + d_norms_y_2[i] - 2 * d_dots[i * NX+ j]; 

} 

/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    //const int Ndims = 128;  // --- Number of rows 
    //const int NX = 1000;  // --- Number of columns 
    //const int NY = 2000;  // --- Number of columns 

    const int Ndims = 3;  // --- Number of rows 
    const int NX = 4;  // --- Number of columns 
    const int NY = 5;  // --- Number of columns 

    // --- Random uniform integer distribution between 10 and 99 
    thrust::default_random_engine rng; 
    thrust::uniform_int_distribution<int> dist(10, 99); 

    // --- Matrices allocation and initialization 
    thrust::device_vector<float> d_X(Ndims * NX); 
    thrust::device_vector<float> d_Y(Ndims * NY); 
    for (size_t i = 0; i < d_X.size(); i++) d_X[i] = (float)dist(rng); 
    for (size_t i = 0; i < d_Y.size(); i++) d_Y[i] = (float)dist(rng); 

    TimingGPU timerGPU; 

    // --- cuBLAS handle creation 
    cublasHandle_t handle; 
    cublasSafeCall(cublasCreate(&handle)); 

    /**********************************************/ 
    /* CALCULATING THE NORMS OF THE ELEMENTS OF X */ 
    /**********************************************/ 
    thrust::device_vector<float> d_norms_x_2(NX); 

    // --- Approach nr. 1 
    //timerGPU.StartCounter(); 
    thrust::device_vector<float> d_X_2(Ndims * NX); 
    thrust::transform(d_X.begin(), d_X.end(), d_X_2.begin(), abs2()); 

    thrust::device_vector<float> d_ones(Ndims, 1.f); 

    float alpha = 1.f; 
    float beta = 0.f; 
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NX, &alpha, thrust::raw_pointer_cast(d_X_2.data()), Ndims, 
           thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_x_2.data()), 1)); 

    //printf("Timing for approach #1 = %f\n", timerGPU.GetCounter()); 

    // --- Approach nr. 2 
    //timerGPU.StartCounter(); 
// float *s_vals = thrust::raw_pointer_cast(&d_X[0]); 
// gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *))); 
// thrust::transform(d_norms_x_2.begin(), d_norms_x_2.end(), thrust::counting_iterator<int>(0), d_norms_x_2.begin(), row_reduction(Ndims)); 

    //printf("Timing for approach #2 = %f\n", timerGPU.GetCounter()); 

    /**********************************************/ 
    /* CALCULATING THE NORMS OF THE ELEMENTS OF Y */ 
    /**********************************************/ 
    thrust::device_vector<float> d_norms_y_2(NX); 

    thrust::device_vector<float> d_Y_2(Ndims * NX); 
    thrust::transform(d_Y.begin(), d_Y.end(), d_Y_2.begin(), abs2()); 

    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NY, &alpha, thrust::raw_pointer_cast(d_Y_2.data()), Ndims, 
           thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_y_2.data()), 1)); 


    /***********************************/ 
    /* CALCULATING THE SCALAR PRODUCTS */ 
    /***********************************/ 
    thrust::device_vector<float> d_dots(NX * NY); 

    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NY, Ndims, &alpha, 
           thrust::raw_pointer_cast(d_X.data()), Ndims, thrust::raw_pointer_cast(d_Y.data()), Ndims, &beta, 
           thrust::raw_pointer_cast(d_dots.data()), NX)); 

    /*****************************/ 
    /* ASSEMBLE THE FINAL RESULT */ 
    /*****************************/ 

    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y); 
    dim3 dimGrid(iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y)); 
    assemble_final_result<<<dimGrid, dimBlock>>>(thrust::raw_pointer_cast(d_norms_x_2.data()), thrust::raw_pointer_cast(d_norms_y_2.data()), 
               thrust::raw_pointer_cast(d_dots.data()), NX, NY); 

    for(int i = 0; i < NX * NY; i++) std::cout << d_dots[i] << "\n"; 

    return 0; 
} 

Utilities.cuUtilities.cuh文件將被保留下來here和這裏不再贅述。 TimingGPU.cuTimingGPU.cuh保持爲here並且也被省略。

+0

你可以評論這條線是幹什麼的嗎? (推杆,CUBLAS_OP_T,Ndims,NX,&alpha,thrust :: raw_pointer_cast(d_X_2.data()),Ndims, thrust :: raw_pointer_cast(d_ones.data()),1,&beta,thrust :: raw_pointer_cast (d_norms_x_2.data()),1)); – Luca