使用Cuda搜索字符串的建议

使用Cuda搜索字符串的建议,cuda,parallel-processing,Cuda,Parallel Processing,我将编写一个cuda代码,在一组数据字符串中搜索一组关键字字符串,并返回一个布尔值数组以查找关键字数据字符串对。数据字符串:目前,10000个(可能不同)字符串,每个字符串最多有250个字符 关键字字符串:目前,100个(可能不同)字符串,每个字符串最多有100个字符 每个字符串的长度都是已知的。 我的问题是,以下哪种方法可能更适合这种情况 第一: gridDim.x=>#个关键字字符串 gridDim.y=>#个数据字符串 blockDim=>(最大字符串大小(本例中为250),1,1) 将使

我将编写一个cuda代码,在一组数据字符串中搜索一组关键字字符串,并返回一个布尔值数组以查找关键字数据字符串对。

数据字符串:目前,10000个(可能不同)字符串,每个字符串最多有250个字符

关键字字符串:目前,100个(可能不同)字符串,每个字符串最多有100个字符

每个字符串的长度都是已知的。

我的问题是,以下哪种方法可能更适合这种情况

第一:
gridDim.x=>#个关键字字符串
gridDim.y=>#个数据字符串
blockDim=>(最大字符串大小(本例中为250),1,1)
将使用朴素算法进行搜索
每个线程将把关键字和数据的字符从全局mem加载到共享mem
每个线程将负责朴素搜索算法中的一个窗口
结果将写入布尔数组
因此,每个块将负责关键字数据对

第二:
gridDim=>(#个数据字符串,1,1)
blockDim=>(#个关键字字符串,1,1)
在每个块中,数据字符串将加载到共享mem
在这种情况下,每个线程将负责关键字数据对而不是块
每个线程将在数据字符串中搜索相应的关键字。
在这种情况下,不需要朴素的算法,可以使用Boyer Moore。

对于在大型文件中搜索的情形,由于数据的长度远大于关键字的长度,通常使用第一种方法。但在本例中,我不确定第一种方法是否更好。另一方面,对于第二种方法,关键字的合并访问(coalesced access)可能是一个问题,因为关键字的长度不固定。不过关键字的长度有上限,因此填充(padding)可以简化合并访问,但会消耗更多内存。

无论如何,如果您曾经处理过类似的案例,或者知道比我上面描述的更好的方法,请帮助我。
先谢谢你

所以,我已经实现了这两种情况
方法1的代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"

#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50

using namespace std;

/*
 * Approach 1: one thread block per (data string, keyword) pair; each thread
 * tests candidate start offsets of the naive substring search.
 *
 * Launch layout:
 *   gridDim.x  -> data strings   (blockIdx.x selects the data string)
 *   gridDim.y  -> keywords       (blockIdx.y selects the keyword)
 *   blockDim.x -> any size >= 1; offsets are distributed block-strided
 *                 (the host launches MAXDATASTRINGSIZE threads).
 * Static shared memory: MAXDATASTRINGSIZE + MAXKEYWORDSTRINGSSIZE chars + 1 bool.
 *
 * Parameters:
 *   resultPtr      out, indexed [keywordIndex*SEARCHITEMSIZE + dataIndex];
 *                  true when the keyword occurs in the data string.
 *   dataPtr        data strings, row stride MAXDATASTRINGSIZE.
 *   dataLengths    length of each data string (must be <= MAXDATASTRINGSIZE).
 *   keywordPtr     keywords, row stride MAXKEYWORDSTRINGSSIZE.
 *   keywordLengths length of each keyword (must be <= MAXKEYWORDSTRINGSSIZE).
 *
 * An empty keyword is reported as found.
 */
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short*  dataLengths, const char *  keywordPtr, const short*  keywordLengths)
{
    const int dataIndex = blockIdx.x;
    const int keywordIndex = blockIdx.y;

    __shared__ char sData[MAXDATASTRINGSIZE];
    __shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
    __shared__ bool isFound;

    // Block-uniform guard, evaluated BEFORE touching the length arrays so an
    // oversized grid can never read out of bounds. The whole block leaves
    // together, so the barriers below are never reached by a partial block.
    if (dataIndex >= SEARCHITEMSIZE || keywordIndex >= SEARCHTERMSIZE)
        return;

    const int dataLength = dataLengths[dataIndex];
    const int keywordLength = keywordLengths[keywordIndex];
    const int resultIndex = keywordIndex*SEARCHITEMSIZE + dataIndex;

    // Also block-uniform: both lengths depend only on blockIdx.
    if (dataLength < keywordLength)
    {
        if (threadIdx.x == 0)
            resultPtr[resultIndex] = false;
        return;
    }

    if (threadIdx.x == 0)
        isFound = false;

    // Stage both strings in shared memory (block-strided, so any blockDim.x works).
    for (int i = threadIdx.x; i < dataLength; i += blockDim.x)
        sData[i] = dataPtr[dataIndex*MAXDATASTRINGSIZE + i];
    for (int i = threadIdx.x; i < keywordLength; i += blockDim.x)
        sKeyword[i] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + i];
    __syncthreads();

    // Each thread checks its own window(s) of the naive search.
    const int lastStart = dataLength - keywordLength;
    for (int start = threadIdx.x; start <= lastStart; start += blockDim.x)
    {
        int matched = 0;
        while (matched < keywordLength && sData[start + matched] == sKeyword[matched])
            matched++;
        if (matched == keywordLength)
        {
            // Several threads may store here, but all store the same value.
            isFound = true;
            break;
        }
    }

    // Wait until every window has been examined before publishing the result;
    // without this barrier a thread that saw isFound == false could overwrite
    // another thread's "true" in global memory.
    __syncthreads();
    if (threadIdx.x == 0)
        resultPtr[resultIndex] = isFound;
}


#ifndef CUDA_CHECK
// Reports file/line and the CUDA error string, then exits, when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaCheckErr_));                       \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/*
 * Benchmark driver for approach 1.
 *
 * Generates SEARCHITEMSIZE random lowercase data strings and SEARCHTERMSIZE
 * random lowercase keywords, runs searchKeywordKernel with one block per
 * (data string, keyword) pair, recomputes the same table on the CPU and
 * prints whether both agree. Result layout on both sides is
 * [keywordIndex*SEARCHITEMSIZE + dataIndex].
 */
int main()
{
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;
    typedef chrono::duration<int, milli> millisecs_t;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();

    // new[] (not malloc) so that the delete[] at the end is the matching
    // deallocation; value-initialised so padding bytes past each length are defined.
    char* dataPtr = new char[MAXDATASTRINGSIZE*SEARCHITEMSIZE]();
    short* dataLengths = new short[SEARCHITEMSIZE];
    short temp;
    short tempChar;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        dataLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" << endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();

    char* keywordPtr = new char[MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE]();
    short* keywordLengths = new short[SEARCHTERMSIZE]; // lengths, not the start positions
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        keywordLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
    //////////Search Keyword Init/////////////////

    char* d_dataPtr;
    short* d_dataLengths;
    char* d_keywordPtr;
    short* d_keywordLengths;
    bool* d_resultPtr;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    CUDA_CHECK(cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE));
    CUDA_CHECK(cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE));
    CUDA_CHECK(cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE));
    CUDA_CHECK(cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE));
    CUDA_CHECK(cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE));

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    CUDA_CHECK(cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_keywordPtr, keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    // One block per (data string, keyword) pair, one thread per start offset.
    dim3 dimGrid(SEARCHITEMSIZE, SEARCHTERMSIZE);
    searchKeywordKernel<<<dimGrid, MAXDATASTRINGSIZE>>>(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
    CUDA_CHECK(cudaGetLastError());   // launch-configuration errors surface here

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));   // in-kernel faults surface here
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    CUDA_CHECK(cudaMemcpy(result, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost));

    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            // Naive search. When the data string is shorter than the keyword
            // the window loop runs zero times and the pair is simply "not
            // found"; the remaining data strings must still be processed.
            bool found = false;
            const int lastStart = dataLengths[j] - keywordLengths[i];
            for (int k = 0; k <= lastStart && !found; k++)
            {
                found = true;
                for (int l = 0; l < keywordLengths[i]; l++)
                {
                    if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
                    {
                        found = false;
                        break;
                    }
                }
            }
            cpuResult[i*SEARCHITEMSIZE + j] = found;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////

    bool kernelRes = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
        {
            kernelRes = false;
            break;
        }
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel computation: " << kernelRes << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] dataPtr;
    delete[] keywordPtr;
    delete[] dataLengths;
    delete[] keywordLengths;
    delete[] result;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    CUDA_CHECK(cudaFree(d_dataPtr));
    CUDA_CHECK(cudaFree(d_keywordPtr));
    CUDA_CHECK(cudaFree(d_dataLengths));
    CUDA_CHECK(cudaFree(d_keywordLengths));
    CUDA_CHECK(cudaFree(d_resultPtr));
    cout << "After Freeing device memory" << endl;

    CUDA_CHECK(cudaDeviceReset());
    system("pause");
    return 0;
}
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括“stdio.h”
#包括“iostream”
#包括“chrono”
#包括“cstdlib”
#定义SEARCHTERMSIZE 100
#定义SEARCHITEMSIZE 65000
#定义MAXDATASTRINGSIZE 250
#定义MAXKEYWORDSTRINGSSIZE 50
使用名称空间std;
__全局搜索关键字内核(bool*resultPtr、const char*dataPtr、const short*datalength、const char*keywordPtr、const short*keywordlength)
{
int dataIndex=blockIdx.x;
int关键字索引=blockIdx.y;
int dataLength=dataLength[dataIndex];
int keywordLength=keywordLength[keywordIndex];
__共享字符sData[MAXDATASTRINGSIZE];
__共享字符sKeyword[MAXKEYWORDSTRINGSSIZE];
__找到了共享对象;
if(dataIndexcout

我建议使用 blockDim=(16,16,1)、gridDim=(数据字符串数/16, 关键字字符串数/16, 1)。在您的情况下,共享内存在理想情况下可以容纳数十个字符串,这样的块/网格划分可以使全局内存访问次数最少,同时不会引入额外的计算开销。

填充不是一个好的选择,除非每个字符串的长度都非常接近最大值(例如,最大值的80%)。如果保留每个字符串的偏移量数组(CPU擅长生成偏移量),则合并全局内存读取就很简单了

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>

#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50

using namespace std;

/*
 * Approach 2: one thread block per data string, one thread per keyword.
 *
 * Launch layout:
 *   gridDim.x  -> data strings (blockIdx.x selects the data string)
 *   blockDim.x -> keywords     (threadIdx.x selects the keyword); threads
 *                 with threadIdx.x >= SEARCHTERMSIZE only help load the data.
 * Static shared memory: MAXDATASTRINGSIZE chars.
 *
 * Parameters:
 *   resultPtr      out, indexed [dataIndex*SEARCHTERMSIZE + keywordIndex];
 *                  true when the keyword occurs in the data string.
 *   dataPtr        data strings, row stride MAXDATASTRINGSIZE.
 *   dataLengths    length of each data string (must be <= MAXDATASTRINGSIZE).
 *   keywordPtr     keywords stored transposed: character j of keyword k is at
 *                  [j*SEARCHTERMSIZE + k], so that neighbouring threads read
 *                  neighbouring bytes.
 *   keywordLengths length of each keyword.
 */
__global__ void searchKeywordKernel(bool* resultPtr, const char  * __restrict__ dataPtr, const short*  dataLengths, const char *  keywordPtr, const short*  keywordLengths)
{
    const int dataIndex = blockIdx.x;
    const int keywordIndex = threadIdx.x;
    __shared__ char sData[MAXDATASTRINGSIZE];

    // Block-uniform guard, checked before dataLengths is read; the whole
    // block returns together so the barrier below stays safe.
    if (dataIndex >= SEARCHITEMSIZE)
        return;

    const int dataLength = dataLengths[dataIndex];

    // Cooperative, block-strided copy of the data string into shared memory.
    for (int pos = threadIdx.x; pos < dataLength; pos += blockDim.x)
        sData[pos] = dataPtr[dataIndex*MAXDATASTRINGSIZE + pos];
    __syncthreads();

    // Surplus threads have no keyword; they leave only after the barrier.
    if (keywordIndex >= SEARCHTERMSIZE)
        return;

    const int keywordLength = keywordLengths[keywordIndex];

    // Naive search. The match state is recomputed for every window; keeping a
    // single flag across windows would report only matches at offset 0.
    // When dataLength < keywordLength the loop body never runs -> not found.
    bool isFound = false;
    for (int start = 0; start + keywordLength <= dataLength && !isFound; start++)
    {
        int matched = 0;
        while (matched < keywordLength &&
               sData[start + matched] == keywordPtr[matched*SEARCHTERMSIZE + keywordIndex])
            matched++;
        isFound = (matched == keywordLength);
    }
    resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}


#ifndef CUDA_CHECK
// Reports file/line and the CUDA error string, then exits, when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaCheckErr_));                       \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/*
 * Benchmark driver for approach 2.
 *
 * Generates SEARCHITEMSIZE random lowercase data strings and SEARCHTERMSIZE
 * random lowercase keywords, uploads the keywords transposed, runs
 * searchKeywordKernel with one block per data string and one thread per
 * keyword, recomputes the table on the CPU and prints whether both agree.
 *
 * The kernel writes results as [dataIndex*SEARCHTERMSIZE + keywordIndex];
 * they are transposed on the host to the CPU reference layout
 * [keywordIndex*SEARCHITEMSIZE + dataIndex] before comparison.
 */
int main()
{
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;
    typedef chrono::duration<int, milli> millisecs_t;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();

    // new[] (not malloc) so that the delete[] at the end is the matching
    // deallocation; value-initialised so padding bytes past each length are defined.
    char* dataPtr = new char[MAXDATASTRINGSIZE*SEARCHITEMSIZE]();
    short* dataLengths = new short[SEARCHITEMSIZE];
    short temp;
    short tempChar;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        dataLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" << endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();

    char* keywordPtr = new char[MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE]();
    short* keywordLengths = new short[SEARCHTERMSIZE]; // lengths, not the start positions
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        keywordLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
    //////////Search Keyword Init/////////////////

    ////////////////////Transpose Keyword Array////////////////////////////

    // Character j of keyword i goes to [j*SEARCHTERMSIZE + i], the layout the
    // kernel indexes with keywordPtr[j*SEARCHTERMSIZE + keywordIndex].
    char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE];
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < MAXKEYWORDSTRINGSSIZE; j++)
            keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];

    ////////////////////Transpose Keyword Array////////////////////////////

    char* d_dataPtr;
    short* d_dataLengths;
    char* d_keywordPtr;
    short* d_keywordLengths;
    bool* d_resultPtr;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    CUDA_CHECK(cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE));
    CUDA_CHECK(cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE));
    CUDA_CHECK(cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE));
    CUDA_CHECK(cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE));
    CUDA_CHECK(cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE));

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    CUDA_CHECK(cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_keywordPtr, keywordPtr_T, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    // One block per data string, one thread per keyword.
    searchKeywordKernel<<<SEARCHITEMSIZE, SEARCHTERMSIZE>>>(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
    CUDA_CHECK(cudaGetLastError());   // launch-configuration errors surface here

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));   // in-kernel faults surface here
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    CUDA_CHECK(cudaMemcpy(result_T, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost));

    // result_T is data-major (as written by the kernel); convert it to the
    // keyword-major layout used by cpuResult below.
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < SEARCHITEMSIZE; j++)
            result[i*SEARCHITEMSIZE + j] = result_T[j*SEARCHTERMSIZE + i];

    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            // Naive search. When the data string is shorter than the keyword
            // the window loop runs zero times and the pair is simply "not
            // found"; the remaining data strings must still be processed.
            bool found = false;
            const int lastStart = dataLengths[j] - keywordLengths[i];
            for (int k = 0; k <= lastStart && !found; k++)
            {
                found = true;
                for (int l = 0; l < keywordLengths[i]; l++)
                {
                    if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
                    {
                        found = false;
                        break;
                    }
                }
            }
            cpuResult[i*SEARCHITEMSIZE + j] = found;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////

    bool kernelRes = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
        {
            kernelRes = false;
            break;
        }
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel computation: " << kernelRes << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] dataPtr;
    delete[] keywordPtr;
    delete[] keywordPtr_T;
    delete[] dataLengths;
    delete[] keywordLengths;
    delete[] result;
    delete[] result_T;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    CUDA_CHECK(cudaFree(d_dataPtr));
    CUDA_CHECK(cudaFree(d_keywordPtr));
    CUDA_CHECK(cudaFree(d_dataLengths));
    CUDA_CHECK(cudaFree(d_keywordLengths));
    CUDA_CHECK(cudaFree(d_resultPtr));
    cout << "After Freeing device memory" << endl;

    CUDA_CHECK(cudaDeviceReset());
    system("pause");
    return 0;
}