2
正如標題所說,我想對向量中的每個元素套用一個函數(逐元素操作)。我想知道 cuBLAS 庫中是否有任何函數可以做到這一點?cuBLAS 中是否有函數可以將 sigmoid 函數逐元素地應用於向量?
正如標題所說,我想對向量中的每個元素套用一個函數(逐元素操作)。我想知道 cuBLAS 庫中是否有任何函數可以做到這一點?cuBLAS 中是否有函數可以將 sigmoid 函數逐元素地應用於向量?
我不知道有哪個合適的 cuBLAS 函數可以完成這項任務。但是,您可以輕鬆編寫自己的代碼,將 sigmoid 函數或其他任何單參數函數逐元素地應用於向量。請注意,在大多數情況下,此類代碼將受記憶體頻寬限制而非計算限制。請參閱下面的 CUDA 程序以獲取可運行的示例,尤其是 sigmoid_kernel()。該程序的輸出應該是這樣的:
source[0]= 0.0000000000000000e+000 source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001 result[99999]= 7.3105661250612963e-001
。
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
/* Number of vector elements main() allocates and processes */
#define DEFAULT_LEN 100000
// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line number and cudaGetErrorString() text to stderr and
// terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the status is kept in a local with a
// collision-resistant name, so an argument expression that itself refers to a
// caller variable named `err` is evaluated against the caller's variable
// rather than the macro's own (uninitialized) local.
#define CUDA_SAFE_CALL(call)                                              \
do {                                                                      \
    cudaError_t cuda_safe_call_status_ = (call);                          \
    if (cudaSuccess != cuda_safe_call_status_) {                          \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__,                                      \
                 cudaGetErrorString(cuda_safe_call_status_));             \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Intended to be placed directly after a kernel launch. It performs two checks
// and, on either failure, prints file/line and the CUDA error string to stderr
// and terminates the process with EXIT_FAILURE:
//   1. cudaGetLastError()       - errors reported at launch time
//   2. cudaDeviceSynchronize()  - errors raised while the kernel executed
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
// Note that the synchronize call blocks the host until the device is idle.
#define CHECK_LAUNCH_ERROR()                                              \
do {                                                                      \
    /* Check synchronous errors, i.e. pre-launch */                       \
    cudaError_t launch_status_ = cudaGetLastError();                      \
    if (cudaSuccess != launch_status_) {                                  \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(launch_status_)); \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */             \
    launch_status_ = cudaDeviceSynchronize();                             \
    if (cudaSuccess != launch_status_) {                                  \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(launch_status_)); \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
/*
 * Logistic sigmoid of one value: 1 / (1 + e^(-a)).
 * Device-only helper; all arithmetic is done in double precision.
 */
__device__ __forceinline__ double sigmoid (double a)
{
    const double decay = exp (-a);
    const double denom = 1.0 + decay;
    return 1.0 / denom;
}
/*
 * Element-wise sigmoid: dst[i] = sigmoid(src[i]) for every i in [0, len).
 *
 * src  input vector (read-only); declared __restrict__, so it must not
 *      alias dst
 * dst  output vector, written once per element
 * len  number of elements; a value <= 0 results in no work
 *
 * Uses a grid-stride loop over the x dimension, so any 1-D launch
 * configuration covers the whole vector regardless of grid size.
 * Index arithmetic is done in 64 bits: with an int index, `i += stride`
 * could overflow (undefined behaviour) when len is within one stride of
 * INT_MAX.
 */
__global__ void sigmoid_kernel (const double * __restrict__ src,
                                double * __restrict__ dst, int len)
{
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first  = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    for (long long i = first; i < len; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}
/*
 * Demo driver: fills a host vector of DEFAULT_LEN doubles with i * 1e-5,
 * copies it to the device, applies sigmoid_kernel, copies the result back
 * and prints the first and last element of both input and output.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE if a host allocation
 * fails. Any CUDA error terminates the process from inside CUDA_SAFE_CALL /
 * CHECK_LAUNCH_ERROR.
 */
int main (void)
{
    double *source, *result;
    double *d_a = 0, *d_b = 0;
    int len = DEFAULT_LEN;

    /* Allocate memory on host */
    source = (double *)malloc (len * sizeof (source[0]));
    if (!source) return EXIT_FAILURE;
    result = (double *)malloc (len * sizeof (result[0]));
    if (!result) {
        /* release the first buffer so this error path does not leak it */
        free (source);
        return EXIT_FAILURE;
    }

    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;

    /* spot check of source data */
    printf ("source[0]=% 23.16e source[%d]=% 23.16e\n",
            source[0], len-1, source[len-1]);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));

    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                cudaMemcpyHostToDevice));

    /* Compute execution configuration: one thread per element, rounded up
       to whole blocks of 256 threads */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1))/dimBlock.x;
    /* Cap the grid size; the loop inside sigmoid_kernel strides over any
       elements beyond the launched thread count.
       NOTE(review): 65520 presumably keeps the grid under the 65535 x-dimension
       limit of older GPUs - confirm before changing. */
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);

    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();

    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                cudaMemcpyDeviceToHost));

    /* spot check of results */
    printf ("result[0]=% 23.16e result[%d]=% 23.16e\n",
            result[0], len-1, result[len-1]);

    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);
    return EXIT_SUCCESS;
}
我不認爲可以用單個 cuBLAS 調用將 sigmoid 函數逐元素地應用於向量。用[thrust](https://github.com/thrust/thrust/wiki/Quick-Start-Guide)來處理會很簡單。編寫一個CUDA內核來做這件事也相當簡單。 – 2015-03-03 01:24:28