使用CUDA計算不同集合中點之間的所有對距離

我正試圖在CUDA中實現蠻力距離計算算法。使用CUDA計算不同集合中點之間的所有對距離

#define VECTOR_DIM 128 
thrust::device_vector<float> feature_data_1; 
feature_data_1.resize(VECTOR_DIM * 1000); // 1000 128 dimensional points 
thrust::device_vector<float> feature_data_2; 
feature_data_2.resize(VECTOR_DIM * 2000); // 2000 128 dimensional points

現在我想做的是計算從第一矩陣中的每個向量L2距離（的平方差之和）在第二矩陣中的每個向量。

因此，如果陣列1是大小1000的和陣列2是大小2000的，其結果將是在尺寸上的1000*2000浮點矩陣。

我想知道是否有辦法單獨使用Thrust算法來實現這一點。

來源

2015-04-20 Luca

應該有可能。但是，您已經制作了一個數據存儲佈局，它是一個結構陣列（AoS）。這對GPU的良好性能並不是特別有利（無論是CUDA還是Thrust）。如果你想高效地完成這項工作，你幾乎可以肯定地將你的數據重新排列成與SoA接近的東西。 –

我意識到，一邊看着你的一些其他職位。我現在正在重構。我會更新線程。 – Luca

我認爲你可以注意到：|| || xy ||^2 = || x ||^2 + || y ||^2-2 * '，其中''表示' x'和'y'。如果你假定'x'和'y'向量的行主排序爲'X'和'Y'矩陣，那麼你可以使用類似[使用CUDA減少矩陣行]（http://stackoverflow.com/questions/） 17862078/reduce-matrix-rows-with-cuda）來計算所有需要的「|| x ||^2」和「|| y ||^2」。然後可以使用'cublas gemm（）'將標量積''計算爲矩陣 - 矩陣乘法'X * Y^T'。 – JackOLantern

計算在兩個不同集合在CUDA點之間的所有點對距離可以通過觀察來解決該

||x-y||^2=||x||^2+||y||^2-2*<x,y>

其中|| ||是l2規範和<x,y>表示x和y之間的標量積。

的規範||x||和||y||可以通過Reduce matrix rows with CUDA啓發的方法來計算，而標量積<x,y>然後可以如使用cublas<t>gemm()的矩陣的矩陣乘法X*Y^T來計算。

下面是一個完整的實現。請注意，爲了計算規範|| ||，報告了兩種方法，一種使用cuBLAScublas<t>gemv，另一種使用Thurst的transform。對於問題大小的興趣的，我已經經歷我的GT540M上以下時序：

Approach nr. 1 0.12ms 
Approach nr. 2 0.59ms 

include <cublas_v2.h> 

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/generate.h> 
#include <thrust/reduce.h> 
#include <thrust/functional.h> 
#include <thrust/random.h> 
#include <thrust/sequence.h> 

#include <stdio.h> 
#include <iostream> 

#include "Utilities.cuh" 
#include "TimingGPU.cuh" 

#define BLOCK_SIZE_X 16 
#define BLOCK_SIZE_Y 16 

/***********************************************************/ 
/* SQUARED ABSOLUTE VALUE FUNCTOR - NEEDED FOR APPROACH #1 */ 
/***********************************************************/ 
struct abs2 { 
    __host__ __device__ double operator()(const float &x) const { return x * x; } 
}; 

// --- Required for approach #2 
__device__ float *vals; 

/******************************************/ 
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */ 
/******************************************/ 
struct row_reduction { 

    const int Ncols; // --- Number of columns 

    row_reduction(int _Ncols) : Ncols(_Ncols) {} 

    __device__ float operator()(float& x, int& y) { 
     float temp = 0.f; 
     for (int i = 0; i<Ncols; i++) 
      temp += vals[i + (y*Ncols)] * vals[i + (y*Ncols)]; 
     return temp; 
    } 
}; 

/************************************************/ 
/* KERNEL FUNCTION TO ASSEMBLE THE FINAL RESULT */ 
/************************************************/ 
__global__ void assemble_final_result(const float * __restrict__ d_norms_x_2, const float * __restrict__ d_norms_y_2, float * __restrict__ d_dots, 
             const int NX, const int NY) { 

    const int i = threadIdx.x + blockIdx.x * gridDim.x; 
    const int j = threadIdx.y + blockIdx.y * gridDim.y; 

    if ((i < NY) && (j < NX)) d_dots[i * NX+ j] = d_norms_x_2[j] + d_norms_y_2[i] - 2 * d_dots[i * NX+ j]; 

} 

/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    //const int Ndims = 128;  // --- Number of rows 
    //const int NX = 1000;  // --- Number of columns 
    //const int NY = 2000;  // --- Number of columns 

    const int Ndims = 3;  // --- Number of rows 
    const int NX = 4;  // --- Number of columns 
    const int NY = 5;  // --- Number of columns 

    // --- Random uniform integer distribution between 10 and 99 
    thrust::default_random_engine rng; 
    thrust::uniform_int_distribution<int> dist(10, 99); 

    // --- Matrices allocation and initialization 
    thrust::device_vector<float> d_X(Ndims * NX); 
    thrust::device_vector<float> d_Y(Ndims * NY); 
    for (size_t i = 0; i < d_X.size(); i++) d_X[i] = (float)dist(rng); 
    for (size_t i = 0; i < d_Y.size(); i++) d_Y[i] = (float)dist(rng); 

    TimingGPU timerGPU; 

    // --- cuBLAS handle creation 
    cublasHandle_t handle; 
    cublasSafeCall(cublasCreate(&handle)); 

    /**********************************************/ 
    /* CALCULATING THE NORMS OF THE ELEMENTS OF X */ 
    /**********************************************/ 
    thrust::device_vector<float> d_norms_x_2(NX); 

    // --- Approach nr. 1 
    //timerGPU.StartCounter(); 
    thrust::device_vector<float> d_X_2(Ndims * NX); 
    thrust::transform(d_X.begin(), d_X.end(), d_X_2.begin(), abs2()); 

    thrust::device_vector<float> d_ones(Ndims, 1.f); 

    float alpha = 1.f; 
    float beta = 0.f; 
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NX, &alpha, thrust::raw_pointer_cast(d_X_2.data()), Ndims, 
           thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_x_2.data()), 1)); 

    //printf("Timing for approach #1 = %f\n", timerGPU.GetCounter()); 

    // --- Approach nr. 2 
    //timerGPU.StartCounter(); 
// float *s_vals = thrust::raw_pointer_cast(&d_X[0]); 
// gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *))); 
// thrust::transform(d_norms_x_2.begin(), d_norms_x_2.end(), thrust::counting_iterator<int>(0), d_norms_x_2.begin(), row_reduction(Ndims)); 

    //printf("Timing for approach #2 = %f\n", timerGPU.GetCounter()); 

    /**********************************************/ 
    /* CALCULATING THE NORMS OF THE ELEMENTS OF Y */ 
    /**********************************************/ 
    thrust::device_vector<float> d_norms_y_2(NX); 

    thrust::device_vector<float> d_Y_2(Ndims * NX); 
    thrust::transform(d_Y.begin(), d_Y.end(), d_Y_2.begin(), abs2()); 

    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NY, &alpha, thrust::raw_pointer_cast(d_Y_2.data()), Ndims, 
           thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_y_2.data()), 1)); 


    /***********************************/ 
    /* CALCULATING THE SCALAR PRODUCTS */ 
    /***********************************/ 
    thrust::device_vector<float> d_dots(NX * NY); 

    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NY, Ndims, &alpha, 
           thrust::raw_pointer_cast(d_X.data()), Ndims, thrust::raw_pointer_cast(d_Y.data()), Ndims, &beta, 
           thrust::raw_pointer_cast(d_dots.data()), NX)); 

    /*****************************/ 
    /* ASSEMBLE THE FINAL RESULT */ 
    /*****************************/ 

    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y); 
    dim3 dimGrid(iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y)); 
    assemble_final_result<<<dimGrid, dimBlock>>>(thrust::raw_pointer_cast(d_norms_x_2.data()), thrust::raw_pointer_cast(d_norms_y_2.data()), 
               thrust::raw_pointer_cast(d_dots.data()), NX, NY); 

    for(int i = 0; i < NX * NY; i++) std::cout << d_dots[i] << "\n"; 

    return 0; 
}

的Utilities.cu和Utilities.cuh文件將被保留下來here和這裏不再贅述。 TimingGPU.cu和TimingGPU.cuh保持爲here並且也被省略。

來源

2015-04-22 21:00:21 JackOLantern

你可以評論這條線是幹什麼的嗎？（推杆，CUBLAS_OP_T，Ndims，NX，＆alpha，thrust :: raw_pointer_cast（d_X_2.data（）），Ndims， thrust :: raw_pointer_cast（d_ones.data（）），1，＆beta，thrust :: raw_pointer_cast （d_norms_x_2.data（）），1））; – Luca

使用CUDA計算不同集合中點之間的所有對距離

回答

相關問題