我要編寫一個cuda代碼,它在一組數據字符串內搜索關鍵字字符串集合,並返回關鍵字 - 數據字符串對的布爾數組數組。數據字符串:此刻,10000(可能會有所不同)字符串,每個字符串最多有250個字符。
關鍵字字符串:目前,100(可能會有所不同)字符串,每個字符串最多有100個字符。
每個字符串的長度是已知的。
我的問題是,在這種情況下,以下哪種方法可能更適合。
第一:
gridDim.x =>關鍵字字符串
gridDim.y =>#數據串
blockDim =>(最大串大小(250在這種情況下),1,1)
樸素的#算法將用於搜索
每個線程都會將關鍵字和數據的字符從全局mem加載到共享mem。
每個線程將負責天真搜索算法中的窗口之一。
結果將被寫入布爾數組。
因此,每個塊將負責關鍵字數據對。
第二:
gridDim =>(#數據串,1,1)
blockDim =>(關鍵字字符串的#,1,1)
在每個塊中,數據串將被載入到共享MEM。
在這種情況下,每個線程將負責關鍵字數據對而不是塊。
每個線程都會在數據字符串中搜索相應的關鍵字。
樸素算法在這種情況下不是必需的,可以使用Boyer-Moore。
對於大文件內部的搜索,由於數據的長度比關鍵字的長度大得多,所以使用第一種方法。但是在這種情況下,我不確定第一個方案是否更好。另一方面,對於第二種方法,合併關鍵字可能是一個問題,因爲長度不固定。關鍵字的大小有一個上限。所以,填充可能會緩解整合,但會消耗更多的內存。
無論如何,如果你曾經在類似的案例中工作過,或者知道比我上面描述的更好的方法,請幫助我。
預先感謝您。關於Cuda字符串搜索的建議
所以,我已經實施了這兩種情況。
代碼方法1:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = blockIdx.y;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
__shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
__shared__ bool isFound;
if (dataIndex < SEARCHITEMSIZE && keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = false;
}
else
{
isFound = false;
sData[threadIdx.x] = dataPtr[dataIndex*MAXDATASTRINGSIZE + threadIdx.x];
if (threadIdx.x < keywordLength)
sKeyword[threadIdx.x] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + threadIdx.x];
__syncthreads();
if (threadIdx.x <= dataLength - keywordLength)
{
for (int i = 0; i < keywordLength && !isFound; i++)
{
if (sData[threadIdx.x + i] != sKeyword[i])
break;
if (i == keywordLength - 1)
isFound = true;
}
}
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = isFound;
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
dim3 dimGrid(SEARCHITEMSIZE,SEARCHTERMSIZE);
searchKeywordKernel << < dimGrid, MAXDATASTRINGSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
代碼方法2:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * __restrict__ dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = threadIdx.x;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
if (dataIndex < SEARCHITEMSIZE)
{
int my_tid = keywordIndex;
while (my_tid < dataLength)
{
sData[my_tid] = dataPtr[dataIndex*MAXDATASTRINGSIZE + my_tid];
my_tid += blockDim.x;
}
__syncthreads();
if (keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = false;
}
else
{
bool isFound = true;
for (int i = 0; i <= dataLength - keywordLength; i++)
{
for (int j = 0; j < keywordLength; j++)
{
if (sData[i + j] != keywordPtr[j*SEARCHTERMSIZE + keywordIndex])
{
isFound = false;
break;
}
}
if (isFound)
break;
}
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
////////////////////Traverse Keyword Array////////////////////////////
char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE];
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < MAXKEYWORDSTRINGSSIZE; j++)
keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];
////////////////////Traverse Keyword Array////////////////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr_T, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel << < SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result_T, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < SEARCHITEMSIZE; j++)
result[j*SEARCHTERMSIZE + i] = result_T[i*SEARCHITEMSIZE + j];
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] keywordPtr_T;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] result_T;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
第二條本辦法給比第一種方法更好的結果。然而,第二種方法的表現取決於關鍵字的數量。如果關鍵字的數量是192的倍數,則gpu具有比cpu更高的性能(malloc + memcpy + kernel的CPU時間<時間)。
我該怎麼做才能克服這種依賴性?
增加線程數量並傳遞多個數據字符串而不是每個塊中都有一個可行嗎?