我可以從CUDA內核函數調用__device__函數嗎？

我想調用兩個設備功能從CUDA內核函數：我可以從CUDA內核函數調用__device__函數嗎？

編輯：爲了避免混淆，該函數定義在不同的文件作爲內核的定義，我提供的完整代碼：

Complete code: 



#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
#include<iostream> 
#include<fstream> 
#include<string> 
#include<iterator> 
using namespace std; 


#define POLYNOMIAL 0x04C11DB7L //Standard CRC-32 polynomial 
#define M 62352 //Number of bits in the bloom filter 
#define K 4 //Number of bits set per mapping in filter 

typedef unsigned short int word16; 
typedef unsigned int word32; 

__device__ static word32 CrcTable[256]; //Table of 8-bit CRC32 remainders 
__device__ char BFilter[M/8];   //Bloom filter array of M/8 bytes 
word32 NumBytes;    //Number of bytes in Bloom filter 

void gen_crc_table(void); 
__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size); 
__device__ void mapBloom(word32 hash);   
__device__ word32 crc32; 
__device__ int retCode; 

__global__ void mapBloomKernel(const char* d_wordList, int* sizeOfWords) 
{ 
    //access thread id 
    const unsigned int bid = blockIdx.x; 
    const unsigned int tid = threadIdx.x; 
    const unsigned int index = bid * blockDim.x + tid; 

    const char *current_word = &(*(d_wordList+(index*30))); 
    for(int i=0; i<K; i++) 
    { 
     crc32 = update_crc(i, d_wordList+(index*30), sizeOfWords[index]); 
     mapBloom(crc32); 
    } 

} 

/* 
    Main Function 
*/ 

int main() 
{ 
    FILE *fp1;     
    FILE *fp2;     
    word32 i; 

    cout<<"-----------------------------------------------"<<endl; 
    cout<<"-- Program to implement a general Bloom filter --\n"; 
    cout<<"-----------------------------------------------"<<endl; 

    //Determine number of bytes in Bloom Filter 
    NumBytes = M/8; 
    if((M%8)!=0) 
    { 
     cout<<"*** ERROR - M value must be dibisible by 8 \n"; 
     exit(1); 
    } 

    //Initialize the CRC32 table 
    gen_crc_table(); 

    //Clear the Bloom filter 
    for(i = 0; i<NumBytes; i++) 
    { 
     BFilter[i] = 0x00; 
    } 

    fp1 = fopen("word_list_10000.txt","r"); 
    if(fp1 == NULL) 
    { 
     cout<<"ERROR in opening input file #1 ***\n"; 
     exit(1); 
    } 

    fp2 = fopen("bloom_query.txt","r"); 
    if(fp2 == NULL) 
    { 
     cout<<"ERROR in opening input file #2 ***\n"; 
     exit(1); 
    } 

    //determine the number of words in list: 

    std::ifstream f("word_list_10000.txt"); 
    std::istream_iterator<std::string> beg(f), end; 
    int number_of_words = distance(beg,end); 

    cout<<"Number of words in file: "<<number_of_words<<endl; 
    cout<<"size of char: "<<sizeof(char)<<endl; 

    cout<<"Reading to array!: "<<endl; 
    ifstream file("word_list_10000.txt"); 

    const int text_length = 30; 

    char *wordList = new char[10000 * text_length]; 
    int *sizeOfWords = new int[10000]; 

    for(int i=0; i<number_of_words; i++) 
    { 
     file>>wordList + (i*text_length); 
     sizeOfWords[i] = strlen(wordList + (i*text_length)); 
     cout<<wordList + (i*text_length)<<endl; 
    } 

     char *dev_wordList; 
     char *dev_sizeOfWords; 

     cudaMalloc((void**)&dev_wordList, 30*number_of_words*sizeof(char)); 
     cudaMalloc((void**)&dev_sizeOfWords, number_of_words * sizeof(char)); 
     cudaMemcpy(dev_wordList, wordList, 30 * number_of_words * sizeof(char), cudaMemcpyHostToDevice); 
     cudaMemcpy(dev_sizeOfWords, sizeOfWords, number_of_words * sizeof(char), cudaMemcpyHostToDevice); 


    unsigned int crc_size = sizeof(word32) * 256; 
    unsigned int bfilter_size = sizeof(char) * M/8; 

    static word32* d_CrcTable; 
    char* d_BFilter; 

    cudaMalloc((void**)&d_CrcTable, crc_size); 
    cudaMalloc((void**)&d_BFilter, bfilter_size); 

    //copy host arrays CrcTable & BFilter to device memory 

    cudaMemcpy(d_CrcTable, CrcTable, crc_size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_BFilter, BFilter, bfilter_size, cudaMemcpyHostToDevice); 

    //Setup execution parameters 
    int n_blocks = (number_of_words + 255)/256; 
    int threads_per_block = 256; 

    dim3 grid(n_blocks, 1, 1); 
    dim3 threads(threads_per_block, 1, 1); 

    mapBloomKernel<<<grid, threads>>>(dev_wordList, sizeOfWords); 

    fclose(fp1); 

    //Output results header 
    cout<<"----------------------------------------------------------\n"; 
    cout<<"Matching strings are... \n"; 

    /* 


    ... 
    ... 
    ... 

    */ 

    fclose(fp2); 
} 



/* 
* Function to initialize CRC32 table 
*/ 

void gen_crc_table(void) 
{ 
    register word32 crc_accum; 
    register word16 i, j; 
    //Initialize the CRC32 8-bit look-up table 
    for(i=0; i<256; i++) 
    { 
     crc_accum = ((word32) i<<24); 
     for(j=0; j<8; j++) 
     { 
      if(crc_accum & 0x80000000L) 
       crc_accum = (crc_accum << 1) ^POLYNOMIAL; 
      else 
       crc_accum = (crc_accum << 1); 
     } 
     CrcTable[i] = crc_accum; 
     //cout<<CrcTable[i]<<endl; 
    } 
} 

/* 
* Function to generate CRC32 
*/ 

__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size) 
{ 
    register word32 i, j; 
    for(j=0; j<data_blk_size; j++) 
    { 
     i = ((int) (crc_accum >>24)^*data_blk_ptr++) & 0xFF; 
     crc_accum = (crc_accum << 8)^CrcTable[i]; 
    } 
    crc_accum = ~crc_accum; 

    return crc_accum; 
} 

/* 
* Function to map hash into Bloom filter 
*/ 

__device__ void mapBloom(word32 hash) 
{ 
    int tempInt; 
    int bitNum; 
    int byteNum; 
    unsigned char mapBit; 
    tempInt = hash % M; 
    byteNum = tempInt/8; 
    bitNum = tempInt % 8; 

    mapBit = 0x80; 
    mapBit = mapBit >> bitNum; 

    //Map the bit into Bloom filter 
    BFilter[byteNum] = BFilter[byteNum] | mapBit; 
} 

/* 
* Function to test for a Bloom filter match 
*/ 

__device__ int testBloom(word32 hash) 
{ 
    int tempInt; 
    int bitNum; 
    int byteNum; 
    unsigned char testBit; 
    int retCode; 
    tempInt = hash % M; 
    byteNum = tempInt/8; 
    bitNum = tempInt % 8; 

    testBit = 0x80; 
    testBit = testBit >> bitNum; 
    if (BFilter[byteNum] & testBit) 
     retCode = 1; 
    else 
     retCode = 0; 
    return retCode; 
}

命令行中使用編譯：

/OUT:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.exe" /INCREMENTAL:NO 
/NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" "cudart.lib" 
"kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib" 
"ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST 
/ManifestFile:"Debug\CUDA_Bloom_filter_v0.exe.intermediate.manifest" /ALLOWISOLATION 
/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG 
/PDB:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pdb" 
/SUBSYSTEM:CONSOLE 
/PGD:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pgd" /TLBID:1 
/DYNAMICBASE /NXCOMPAT /MACHINE:X86 /ERRORREPORT:QUEUE

全輸出：

7 IntelliSense: expected an expression e:\...\kernel.cu 145 18 CUDA_Bloom_filter_v0 
    Error 6 error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env -- 
cl-version 2010 -ccbin "F:\Installed\Microsoft Visual Studio 2010\VC\bin" -I"C:\Program Files\NVIDIA 
GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart 
static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " 
-o Debug\kernel.cu.obj 
"E:\...\kernel.cu"" exited with code 2. C:\...\CUDA 5.5.targets 592 10 CUDA_Bloom_filter_v0 
    Error 5 error : **External calls are not supported** (found non-inlined call to _Z10update_crcjPKcj) E:\...\kernel.cu 40 1 CUDA_Bloom_filter_v0

來源

2014-09-25 Alex Iacob

函數定義在哪裏？ – 2014-09-25 12:27:10

實際上，你可以*只*從內核函數（和其他'__device__'函數）調用'__device__'函數。所以這個問題一定在別的地方。您的目標是哪種計算能力？ – Angew 2014-09-25 12:35:21

我懷疑（預）費米版本.. – 2014-09-25 12:37:25

編譯器感到困惑，因爲你的函數原型（向前聲明）是這樣的：

__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size);

，但你的定義是這樣的：

__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size) 
{

你的函數定義預計，第二個參數是類型爲char *。但是您傳遞了const char *參數（並且您的前向聲明的類型爲const char *）。

這是一個基本的C/C++編碼錯誤。

您的前向聲明應該符合您的定義。由於沒有，編譯器在其他地方尋找匹配函數並且找不到它。

此問題的解決方法是讓你的函數定義匹配：

           add const here 
               v 
__device__ word32 update_crc(word32 crc_accum, const char *data_blk_ptr, word32 data_blk_size) 
{

需要注意的是，當我與此修復程序編譯代碼，還有一些非常重要的警告：

t573.cu(73): warning: a __device__ variable "BFilter" cannot be directly written in a host function 

t573.cu(185): warning: a __device__ variable "CrcTable" cannot be directly written in a host function

這些不應該被忽視。例如，採取的第一個警告，你有這樣的變量：

__device__ char BFilter[M/8];   //Bloom filter array of M/8 bytes

您不能將此變量直接在主機代碼寫（在main）：

//Clear the Bloom filter 
for(i = 0; i<NumBytes; i++) 
{ 
    BFilter[i] = 0x00; 
}

而是使用一個函數像cudaMemcpyToSymbol()

來源

2014-09-25 15:37:42

感謝您提供完整且有用的答案！ – 2014-09-25 19:33:37

我可以從CUDA內核函數調用device函數嗎？

回答

我可以從CUDA內核函數調用__device__函數嗎？

回答

相關問題

我可以從CUDA內核函數調用device函數嗎？