使用Cuda搜索字符串的建议
我将编写一个cuda代码,在一组数据字符串中搜索一组关键字字符串,并返回一个布尔值数组以查找关键字数据字符串对。使用Cuda搜索字符串的建议,cuda,parallel-processing,Cuda,Parallel Processing,我将编写一个cuda代码,在一组数据字符串中搜索一组关键字字符串,并返回一个布尔值数组以查找关键字数据字符串对。数据字符串:目前,10000个(可能不同)字符串,每个字符串最多有250个字符 关键字字符串:目前,100个(可能不同)字符串,每个字符串最多有100个字符 每个字符串的长度都是已知的。 我的问题是,以下哪种方法可能更适合这种情况 第一: gridDim.x=>#个关键字字符串 gridDim.y=>#个数据字符串 blockDim=>(最大字符串大小(本例中为250),1,1) 将使
数据字符串:目前,10000个(可能不同)字符串,每个字符串最多有250个字符
关键字字符串:目前,100个(可能不同)字符串,每个字符串最多有100个字符
每个字符串的长度都是已知的。
我的问题是,以下哪种方法可能更适合这种情况
第一:
gridDim.x=>#个关键字字符串
gridDim.y=>#个数据字符串
blockDim=>(最大字符串大小(本例中为250),1,1)
将使用朴素算法进行搜索
每个线程将把关键字和数据的字符从全局mem加载到共享mem
每个线程将负责朴素搜索算法中的一个窗口
结果将写入布尔数组
因此,每个块将负责关键字数据对
第二名:
gridDim=>(#个数据字符串,1,1)
blockDim=>(#个关键字字符串,1,1)
在每个块中,数据字符串将加载到共享mem
在这种情况下,每个线程将负责关键字数据对而不是块
每个线程将在数据字符串中搜索相应的关键字。
在这种情况下,不需要朴素的算法,可以使用Boyer Moore。
对于大型文件中的搜索,由于数据的长度远大于关键字的长度,所以使用第一种方法。但在这种情况下,我不确定第一种方法是否更好。另一方面,对于第二种方法,合并关键字可能是一个问题,因为长度不是固定的。关键字的大小有一个上限。因此,填充可能会简化合并,但会消耗更多内存。
无论如何,如果您曾经处理过类似的案例,或者知道比我上面描述的更好的方法,请帮助我。
先谢谢你 所以,我已经实现了这两种情况
进近1的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = blockIdx.y;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
__shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
__shared__ bool isFound;
if (dataIndex < SEARCHITEMSIZE && keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = false;
}
else
{
isFound = false;
sData[threadIdx.x] = dataPtr[dataIndex*MAXDATASTRINGSIZE + threadIdx.x];
if (threadIdx.x < keywordLength)
sKeyword[threadIdx.x] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + threadIdx.x];
__syncthreads();
if (threadIdx.x <= dataLength - keywordLength)
{
for (int i = 0; i < keywordLength && !isFound; i++)
{
if (sData[threadIdx.x + i] != sKeyword[i])
break;
if (i == keywordLength - 1)
isFound = true;
}
}
resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = isFound;
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
dim3 dimGrid(SEARCHITEMSIZE,SEARCHTERMSIZE);
searchKeywordKernel << < dimGrid, MAXDATASTRINGSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括“stdio.h”
#包括“iostream”
#包括“chrono”
#包括“cstdlib”
#定义SEARCHTERMSIZE 100
#定义SEARCHITEMSIZE 65000
#定义MAXDATASTRINGSIZE 250
#定义MAXKEYWORDSTRINGSSIZE 50
使用名称空间std;
__全局搜索关键字内核(bool*resultPtr、const char*dataPtr、const short*datalength、const char*keywordPtr、const short*keywordlength)
{
int dataIndex=blockIdx.x;
int关键字索引=blockIdx.y;
int dataLength=dataLength[dataIndex];
int keywordLength=keywordLength[keywordIndex];
__共享字符sData[MAXDATASTRINGSIZE];
__共享字符sKeyword[MAXKEYWORDSTRINGSSIZE];
__找到了共享对象;
if(dataIndex cout我建议blockDim=(16,16,1)
和gridDim=(#of data strings/16,#of keyword strings/16,1)
。在您的情况下,在共享内存中可以理想地容纳数十个字符串的情况下,这样的块网格划分将导致最小的全局内存访问,同时不会引入计算开销
填充不是一个好的选择,除非每个字符串的长度都非常接近最大值(例如,最大值的80%)。如果保留每个字符串的偏移量数组(CPU擅长生成偏移量),则合并全局内存读取就很简单了
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50
using namespace std;
__global__ void searchKeywordKernel(bool* resultPtr, const char * __restrict__ dataPtr, const short* dataLengths, const char * keywordPtr, const short* keywordLengths)
{
int dataIndex = blockIdx.x;
int keywordIndex = threadIdx.x;
int dataLength = dataLengths[dataIndex];
int keywordLength = keywordLengths[keywordIndex];
__shared__ char sData[MAXDATASTRINGSIZE];
if (dataIndex < SEARCHITEMSIZE)
{
int my_tid = keywordIndex;
while (my_tid < dataLength)
{
sData[my_tid] = dataPtr[dataIndex*MAXDATASTRINGSIZE + my_tid];
my_tid += blockDim.x;
}
__syncthreads();
if (keywordIndex < SEARCHTERMSIZE)
{
if (dataLength < keywordLength)
{
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = false;
}
else
{
bool isFound = true;
for (int i = 0; i <= dataLength - keywordLength; i++)
{
for (int j = 0; j < keywordLength; j++)
{
if (sData[i + j] != keywordPtr[j*SEARCHTERMSIZE + keywordIndex])
{
isFound = false;
break;
}
}
if (isFound)
break;
}
resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* dataPtr = (char*)malloc(sizeof(char)*MAXDATASTRINGSIZE*SEARCHITEMSIZE);
short* dataLengths = new short[SEARCHITEMSIZE];
short temp;
short tempChar;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
dataLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" << endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keywordPtr = (char*)malloc(sizeof(char)*MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE);
short* keywordLengths = new short[SEARCHTERMSIZE]; //lenghts, not the start positions
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
for (int k = 0; k < temp; k++)
{
tempChar = rand() % 26;
keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar; //97->a, 98->b, 122->z
}
keywordLengths[i] = temp;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
////////////////////Traverse Keyword Array////////////////////////////
char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE];
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < MAXKEYWORDSTRINGSSIZE; j++)
keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];
////////////////////Traverse Keyword Array////////////////////////////
char* d_dataPtr;
short* d_dataLengths;
char* d_keywordPtr;
short* d_keywordLengths;
bool* d_resultPtr;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE);
cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE);
cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE);
cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE);
cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordPtr, keywordPtr_T, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel << < SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cudaMemcpy(result_T, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < SEARCHITEMSIZE; j++)
result[j*SEARCHTERMSIZE + i] = result_T[i*SEARCHITEMSIZE + j];
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
if (dataLengths[j] < keywordLengths[i])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
else
{
for (int k = 0; k <= dataLengths[j] - keywordLengths[i]; k++)
{
cpuResult[i*SEARCHITEMSIZE + j] = true;
for (int l = 0; l < keywordLengths[i]; l++)
{
if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
{
cpuResult[i*SEARCHITEMSIZE + j] = false;
break;
}
}
if (cpuResult[i*SEARCHTERMSIZE + j])
break;
}
}
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernelRes = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i])
{
kernelRes = false;
break;
}
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel computation: " << kernelRes << endl;
cout << "Before Deleting arrays" << endl;
delete[] dataPtr;
delete[] keywordPtr;
delete[] keywordPtr_T;
delete[] dataLengths;
delete[] keywordLengths;
delete[] result;
delete[] result_T;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_dataPtr);
cudaFree(d_keywordPtr);
cudaFree(d_dataLengths);
cudaFree(d_keywordLengths);
cudaFree(d_resultPtr);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}