2015-03-03 110 views
2

正如標題所說,我想對向量逐元素地套用一個函數。我想知道cuBLAS庫中是否有任何函數可以這樣做?cuBLAS中是否有函數可以將sigmoid函數逐元素地套用到向量上?

+2

我不認爲單個CUBLAS呼叫就能將sigmoid函數逐元素地應用於向量。用[thrust](https://github.com/thrust/thrust/wiki/Quick-Start-Guide)來處理會很簡單。編寫一個CUDA內核來做這件事也相當容易。 – 2015-03-03 01:24:28

回答

3

我不知道有合適的CUBLAS函數可以完成這項任務。但是,您可以輕鬆編寫自己的代碼,將sigmoid函數或其他任何單參數函數逐元素地應用於向量。請注意,在大多數情況下,此類代碼將受內存帶寬限制而非計算限制。請參閱下面的CUDA程序以獲取可運行的示例,尤其是sigmoid_kernel()。該程序的輸出應該是這樣的:

source[0]= 0.0000000000000000e+000 source[99999]= 9.9999000000000005e-001 
result[0]= 5.0000000000000000e-001 result[99999]= 7.3105661250612963e-001 

#include <stdlib.h> 
#include <stdio.h> 
#include <math.h> 

#define DEFAULT_LEN 100000 

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line number and cudaGetErrorString() text to stderr and
// terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the temporary carries a macro-specific
// name so that an expression mentioning a caller variable named `err` is not
// captured by the macro's own local.
// Nothing (not even whitespace) may follow the continuation backslashes.
#define CUDA_SAFE_CALL(call)                                              \
do {                                                                      \
    cudaError_t cuda_safe_call_err_ = (call);                             \
    if (cudaSuccess != cuda_safe_call_err_) {                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(cuda_safe_call_err_));                \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Place it directly after a <<<...>>> launch. It performs two checks and, on
// either failure, prints file/line/error text to stderr and exits with
// EXIT_FAILURE:
//   1. cudaGetLastError()      - errors reported at launch time
//   2. cudaDeviceSynchronize() - errors raised while the kernel executed
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
// Because of the synchronization this macro blocks the host until the device
// has finished its outstanding work.
// Nothing (not even whitespace) may follow the continuation backslashes.
#define CHECK_LAUNCH_ERROR()                                              \
do {                                                                      \
    /* Check synchronous errors, i.e. pre-launch */                       \
    cudaError_t check_launch_err_ = cudaGetLastError();                   \
    if (cudaSuccess != check_launch_err_) {                               \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(check_launch_err_));                  \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
    /* Check asynchronous errors, i.e. the kernel failed while running */ \
    check_launch_err_ = cudaDeviceSynchronize();                          \
    if (cudaSuccess != check_launch_err_) {                               \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(check_launch_err_));                  \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)

// Logistic sigmoid: returns 1 / (1 + e^(-a)) in double precision.
// Marked __host__ __device__ so the identical expression can be evaluated on
// the CPU (e.g. to build a reference result) as well as inside kernels.
// For very negative `a`, exp(-a) overflows to +inf and the quotient is 0;
// for very positive `a`, exp(-a) underflows to 0 and the quotient is 1.
__host__ __device__ __forceinline__ double sigmoid (double a)
{
    return 1.0/(1.0 + exp (-a));
}

// Applies sigmoid() element-wise: dst[i] = sigmoid(src[i]) for 0 <= i < len.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads (grid-stride
// loop), so any grid size covers the whole array.
// Shared memory: none.
// Preconditions: src and dst each hold at least `len` doubles and do not
// overlap (both are declared __restrict__). A non-positive `len` is a no-op.
//
// Index arithmetic is done in size_t: with 32-bit int, `i += stride` could
// overflow (undefined behavior) when `len` is close to INT_MAX.
__global__ void sigmoid_kernel (const double * __restrict__ src,
                                double * __restrict__ dst, int len)
{
    if (len <= 0) return;  // also keeps the size_t conversion below safe

    const size_t count  = (size_t)len;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first  = (size_t)blockDim.x * blockIdx.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}

/*
 * Demo driver: fills a host array with i * 1e-5, copies it to the device,
 * runs sigmoid_kernel over it, copies the result back and prints the first
 * and last element of both the input and the output.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE if a host allocation
 * fails. CUDA failures terminate the process inside CUDA_SAFE_CALL /
 * CHECK_LAUNCH_ERROR.
 */
int main (void)
{
    /* Upper bound on the number of blocks launched. The kernel loops with a
       stride of the total thread count, so a capped grid still processes
       every element. NOTE(review): presumably chosen to stay under the
       65535-block grid limit of older GPUs - confirm. */
    const int maxThreadBlocks = 65520;

    double *source, *result;
    double *d_a = 0, *d_b = 0;

    int len = DEFAULT_LEN;

    /* Allocate memory on host */
    source = (double *)malloc (len * sizeof (source[0]));
    if (!source) return EXIT_FAILURE;
    result = (double *)malloc (len * sizeof (result[0]));
    if (!result) {
        free (source);  /* do not leak the first buffer on this early exit */
        return EXIT_FAILURE;
    }

    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;

    /* spot check of source data */
    printf ("source[0]=% 23.16e source[%d]=% 23.16e\n",
            source[0], len-1, source[len-1]);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));

    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                cudaMemcpyHostToDevice));

    /* Compute execution configuration: ceil(len / blockSize) blocks, capped */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1))/dimBlock.x;
    if (threadBlocks > maxThreadBlocks) threadBlocks = maxThreadBlocks;
    dim3 dimGrid(threadBlocks);

    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();

    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                cudaMemcpyDeviceToHost));

    /* spot check of results */
    printf ("result[0]=% 23.16e result[%d]=% 23.16e\n",
            result[0], len-1, result[len-1]);

    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);

    return EXIT_SUCCESS;
}
相關問題