我想調用兩個設備功能從CUDA內核函數:我可以從CUDA內核函數調用__device__函數嗎?
編輯:爲了避免混淆,該函數定義在不同的文件作爲內核的定義,我提供的完整代碼:
Complete code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<fstream>
#include<string>
#include<iterator>
using namespace std;
#define POLYNOMIAL 0x04C11DB7L //Standard CRC-32 polynomial
#define M 62352 //Number of bits in the bloom filter
#define K 4 //Number of bits set per mapping in filter
typedef unsigned short int word16;
typedef unsigned int word32;
__device__ static word32 CrcTable[256]; //Table of 8-bit CRC32 remainders
__device__ char BFilter[M/8]; //Bloom filter array of M/8 bytes
word32 NumBytes; //Number of bytes in Bloom filter
void gen_crc_table(void);
__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size);
__device__ void mapBloom(word32 hash);
__device__ word32 crc32;
__device__ int retCode;
__global__ void mapBloomKernel(const char* d_wordList, int* sizeOfWords)
{
//access thread id
const unsigned int bid = blockIdx.x;
const unsigned int tid = threadIdx.x;
const unsigned int index = bid * blockDim.x + tid;
const char *current_word = &(*(d_wordList+(index*30)));
for(int i=0; i<K; i++)
{
crc32 = update_crc(i, d_wordList+(index*30), sizeOfWords[index]);
mapBloom(crc32);
}
}
/*
Main Function
*/
int main()
{
FILE *fp1;
FILE *fp2;
word32 i;
cout<<"-----------------------------------------------"<<endl;
cout<<"-- Program to implement a general Bloom filter --\n";
cout<<"-----------------------------------------------"<<endl;
//Determine number of bytes in Bloom Filter
NumBytes = M/8;
if((M%8)!=0)
{
cout<<"*** ERROR - M value must be dibisible by 8 \n";
exit(1);
}
//Initialize the CRC32 table
gen_crc_table();
//Clear the Bloom filter
for(i = 0; i<NumBytes; i++)
{
BFilter[i] = 0x00;
}
fp1 = fopen("word_list_10000.txt","r");
if(fp1 == NULL)
{
cout<<"ERROR in opening input file #1 ***\n";
exit(1);
}
fp2 = fopen("bloom_query.txt","r");
if(fp2 == NULL)
{
cout<<"ERROR in opening input file #2 ***\n";
exit(1);
}
//determine the number of words in list:
std::ifstream f("word_list_10000.txt");
std::istream_iterator<std::string> beg(f), end;
int number_of_words = distance(beg,end);
cout<<"Number of words in file: "<<number_of_words<<endl;
cout<<"size of char: "<<sizeof(char)<<endl;
cout<<"Reading to array!: "<<endl;
ifstream file("word_list_10000.txt");
const int text_length = 30;
char *wordList = new char[10000 * text_length];
int *sizeOfWords = new int[10000];
for(int i=0; i<number_of_words; i++)
{
file>>wordList + (i*text_length);
sizeOfWords[i] = strlen(wordList + (i*text_length));
cout<<wordList + (i*text_length)<<endl;
}
char *dev_wordList;
char *dev_sizeOfWords;
cudaMalloc((void**)&dev_wordList, 30*number_of_words*sizeof(char));
cudaMalloc((void**)&dev_sizeOfWords, number_of_words * sizeof(char));
cudaMemcpy(dev_wordList, wordList, 30 * number_of_words * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_sizeOfWords, sizeOfWords, number_of_words * sizeof(char), cudaMemcpyHostToDevice);
unsigned int crc_size = sizeof(word32) * 256;
unsigned int bfilter_size = sizeof(char) * M/8;
static word32* d_CrcTable;
char* d_BFilter;
cudaMalloc((void**)&d_CrcTable, crc_size);
cudaMalloc((void**)&d_BFilter, bfilter_size);
//copy host arrays CrcTable & BFilter to device memory
cudaMemcpy(d_CrcTable, CrcTable, crc_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_BFilter, BFilter, bfilter_size, cudaMemcpyHostToDevice);
//Setup execution parameters
int n_blocks = (number_of_words + 255)/256;
int threads_per_block = 256;
dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);
mapBloomKernel<<<grid, threads>>>(dev_wordList, sizeOfWords);
fclose(fp1);
//Output results header
cout<<"----------------------------------------------------------\n";
cout<<"Matching strings are... \n";
/*
...
...
...
*/
fclose(fp2);
}
/*
* Function to initialize CRC32 table
*/
void gen_crc_table(void)
{
register word32 crc_accum;
register word16 i, j;
//Initialize the CRC32 8-bit look-up table
for(i=0; i<256; i++)
{
crc_accum = ((word32) i<<24);
for(j=0; j<8; j++)
{
if(crc_accum & 0x80000000L)
crc_accum = (crc_accum << 1) ^POLYNOMIAL;
else
crc_accum = (crc_accum << 1);
}
CrcTable[i] = crc_accum;
//cout<<CrcTable[i]<<endl;
}
}
/*
* Function to generate CRC32
*/
__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size)
{
register word32 i, j;
for(j=0; j<data_blk_size; j++)
{
i = ((int) (crc_accum >>24)^*data_blk_ptr++) & 0xFF;
crc_accum = (crc_accum << 8)^CrcTable[i];
}
crc_accum = ~crc_accum;
return crc_accum;
}
/*
* Function to map hash into Bloom filter
*/
__device__ void mapBloom(word32 hash)
{
int tempInt;
int bitNum;
int byteNum;
unsigned char mapBit;
tempInt = hash % M;
byteNum = tempInt/8;
bitNum = tempInt % 8;
mapBit = 0x80;
mapBit = mapBit >> bitNum;
//Map the bit into Bloom filter
BFilter[byteNum] = BFilter[byteNum] | mapBit;
}
/*
* Function to test for a Bloom filter match
*/
__device__ int testBloom(word32 hash)
{
int tempInt;
int bitNum;
int byteNum;
unsigned char testBit;
int retCode;
tempInt = hash % M;
byteNum = tempInt/8;
bitNum = tempInt % 8;
testBit = 0x80;
testBit = testBit >> bitNum;
if (BFilter[byteNum] & testBit)
retCode = 1;
else
retCode = 0;
return retCode;
}
命令行中使用編譯:
/OUT:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.exe" /INCREMENTAL:NO
/NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" "cudart.lib"
"kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib"
"ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST
/ManifestFile:"Debug\CUDA_Bloom_filter_v0.exe.intermediate.manifest" /ALLOWISOLATION
/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG
/PDB:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pdb"
/SUBSYSTEM:CONSOLE
/PGD:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pgd" /TLBID:1
/DYNAMICBASE /NXCOMPAT /MACHINE:X86 /ERRORREPORT:QUEUE
全輸出:
7 IntelliSense: expected an expression e:\...\kernel.cu 145 18 CUDA_Bloom_filter_v0
Error 6 error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing
Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --
cl-version 2010 -ccbin "F:\Installed\Microsoft Visual Studio 2010\VC\bin" -I"C:\Program Files\NVIDIA
GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing
Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart
static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd "
-o Debug\kernel.cu.obj
"E:\...\kernel.cu"" exited with code 2. C:\...\CUDA 5.5.targets 592 10 CUDA_Bloom_filter_v0
Error 5 error : **External calls are not supported** (found non-inlined call to _Z10update_crcjPKcj) E:\...\kernel.cu 40 1 CUDA_Bloom_filter_v0
函數定義在哪裏? – 2014-09-25 12:27:10
實際上,你可以*只*從內核函數(和其他'__device__'函數)調用'__device__'函數。所以這個問題一定在別的地方。您的目標是哪種計算能力? – Angew 2014-09-25 12:35:21
我懷疑(預)費米版本.. – 2014-09-25 12:37:25