C++ 在核函数中使用cuPrint打印字符串向量的元素_C++_String_Cuda_Parallel Processing

C++ 在核函数中使用cuPrint打印字符串向量的元素

c++ string cuda parallel-processing

C++ 在核函数中使用cuPrint打印字符串向量的元素,c++,string,cuda,parallel-processing,C++,String,Cuda,Parallel Processing,我试图使用cuPrint函数打印作为内核函数参数传递的字符串向量的元素内核的代码 __global__ void testKernel(string wordList[10000]) { //access thread id const unsigned int bid = blockIdx.x; const unsigned int tid = threadIdx.x; const unsigned int index = bid * blockDim.x +

我试图使用 cuPrintf 函数，打印作为内核函数参数传入的字符串数组中的各个元素。

内核的代码

// Kernel from the question: every thread prints the word stored at its own
// flat global index via cuPrintf.
// NOTE(review): std::string is a host-side library type; nvcc rejects this
// kernel because the implicit ~basic_string call is a host function (this is
// the compile error the question reports).
__global__ void testKernel(string wordList[10000])
{
    // One thread per word: flat index across the 1D grid of 1D blocks.
    const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    cuPrintf("wordList[%d]: %s \n", index, wordList[index]);
}

//Reserve a device buffer with room for one string object per word
    string* d_wordList;
    cudaMalloc((void**)&d_wordList, number_of_words * sizeof(string));

    //Upload the host word list into that buffer
    //NOTE(review): this copies the raw bytes of the string objects - confirm this is intended
    cudaMemcpy(d_wordList, wordList, number_of_words * sizeof(string), cudaMemcpyHostToDevice);

    //Launch configuration: 256 threads per block, block count rounded up to cover every word
    int threads_per_block = 256;
    int n_blocks = (number_of_words + 255)/256;

    dim3 grid(n_blocks, 1, 1);
    dim3 threads(threads_per_block, 1, 1);

    //Run the kernel between cuPrintf setup and teardown, then flush its output to stdout
    cudaPrintfInit();
    testKernel<<<grid, threads>>>(d_wordList);
    cudaDeviceSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();

主函数中用于设置执行参数并启动内核的代码

// Kernel from the question: every thread prints the word stored at its own
// flat global index via cuPrintf.
// NOTE(review): std::string is a host-side library type; nvcc rejects this
// kernel because the implicit ~basic_string call is a host function (this is
// the compile error the question reports).
__global__ void testKernel(string wordList[10000])
{
    // One thread per word: flat index across the 1D grid of 1D blocks.
    const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    cuPrintf("wordList[%d]: %s \n", index, wordList[index]);
}

//Reserve a device buffer with room for one string object per word
    string* d_wordList;
    cudaMalloc((void**)&d_wordList, number_of_words * sizeof(string));

    //Upload the host word list into that buffer
    //NOTE(review): this copies the raw bytes of the string objects - confirm this is intended
    cudaMemcpy(d_wordList, wordList, number_of_words * sizeof(string), cudaMemcpyHostToDevice);

    //Launch configuration: 256 threads per block, block count rounded up to cover every word
    int threads_per_block = 256;
    int n_blocks = (number_of_words + 255)/256;

    dim3 grid(n_blocks, 1, 1);
    dim3 threads(threads_per_block, 1, 1);

    //Run the kernel between cuPrintf setup and teardown, then flush its output to stdout
    cudaPrintfInit();
    testKernel<<<grid, threads>>>(d_wordList);
    cudaDeviceSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();

//为单词列表分配设备内存
string* d_wordList;
cudaMalloc((void**)&d_wordList, sizeof(string)*number_of_words);
//将单词列表从主机复制到设备
cudaMemcpy(d_wordList, wordList, sizeof(string)*number_of_words, cudaMemcpyHostToDevice);
//设置执行参数
int n_blocks = (number_of_words + 255)/256;
int threads_per_block = 256;
dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);
cudaPrintfInit();
testKernel<<<grid, threads>>>(d_wordList);
cudaDeviceSynchronize();
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();

我得到一个错误：错误 44 error：不允许从 __global__ 函数（"testKernel"）调用 __host__ 函数（"std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string"）。D:…\kernel.cu 44 1 CUDA_BF_大单词列表
我错过了什么？
谢谢。
一般来说，不能在 CUDA 设备代码中使用 C++ 标准库（包括 `<string>`）中的函数。请改用 `char` 数组来保存字符串。

这里是一个示例：它将“字符串”作为以 null 结尾的 C 风格字符数组进行操作，并将其传递给内核。
我修改了代码，并使用字符数组代替字符串
内核的更新版本为：

// Prints the first 10 characters of one word per thread via cuPrintf.
//
// d_wordList is read as consecutive 20-byte slots, one slot per word
// (thread `index` reads bytes [index*20, index*20 + 9]).
// Expects a 1D grid of 1D blocks, one thread per word.
// NOTE(review): the kernel has no word count and therefore no bounds check;
// threads whose index is past the last word read beyond the copied data.
// Confirm the launch size against the buffer size.
__global__ void testKernel(char* d_wordList)
{
    //access thread id
    const unsigned int bid = blockIdx.x;
    const unsigned int tid = threadIdx.x;
    const unsigned int index = bid * blockDim.x + tid;

    // Offset of this thread's 20-byte slot inside the flat buffer.
    const unsigned int base = index * 20;

    //cuPrintf("Hello World from kernel! \n");

    // Characters are passed one by one with %c, exactly as in the original.
    cuPrintf("!! %c%c%c%c%c%c%c%c%c%c \n",
             d_wordList[base + 0],
             d_wordList[base + 1],
             d_wordList[base + 2],
             d_wordList[base + 3],
             d_wordList[base + 4],
             d_wordList[base + 5],
             d_wordList[base + 6],
             d_wordList[base + 7],
             d_wordList[base + 8],
             d_wordList[base + 9]);
}
我还想知道是否有更简单的方法来打印字符数组中的单词。（基本上，我需要打印并在以后使用每个内核函数一个字）
主函数的代码是：

// Each word occupies a fixed slot of text_length bytes.
const int text_length = 20;

// Host word list: storage for 10000 slots of text_length chars each.
char (*wordList)[text_length] = new char[10000][text_length];
char *dev_wordList;

// Read number_of_words words from the input stream, echoing each one.
// NOTE(review): operator>> into a char array does not limit the word length
// here - presumably every word fits in text_length - 1 characters; confirm.
for(int i=0; i<number_of_words; i++)
{
    file>>wordList[i];
    cout<<wordList[i]<<endl;
}

// Copy the flat word buffer (number_of_words slots) to the device.
cudaMalloc((void**)&dev_wordList, text_length * number_of_words * sizeof(char));
cudaMemcpy(dev_wordList, &(wordList[0][0]), text_length * number_of_words * sizeof(char), cudaMemcpyHostToDevice);

// Round trip: copy the device buffer back into a second host array and print it.
char (*resultWordList)[text_length] = new char[10000][text_length];
cudaMemcpy(resultWordList, dev_wordList, text_length * number_of_words * sizeof(char), cudaMemcpyDeviceToHost);

for(int i=0; i<number_of_words; i++)
    cout<<resultWordList[i]<<endl;

//Setup execution parameters
int n_blocks = (number_of_words + 255)/256;
int threads_per_block = 256;

dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);

// Run the kernel between cuPrintf setup and teardown, printing its output to stdout.
cudaPrintfInit();
testKernel<<<grid, threads>>>(dev_wordList);
cudaDeviceSynchronize();
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();

内核启动是正确的，每个线程显示一个单词。但我需要让这个程序处理全部10000个单词。我漏掉了什么？
我正在从一个txt文件中读取单词，就像这样：//构建一个字符串数组，其中包含文本文件中的单词 string wordList[10000]; if (file.is_open()) { for (int i=0; i<number_of_words; i++) { file>>wordList[i]; //cout<<wordList[i]<<endl; } }
我在回答中提供了一个指向示例代码的链接，该代码演示了如何操作C样式字符串。我假设您可以处理文件I/O，这不是CUDA特有的。
是的，处理文件I/O没有问题。谢谢！
发布您自己问题的答案并用它来问新问题可能不是一个好主意，Stack Overflow 并不是这样运作的。如果您有新问题，建议另外提出一个新问题。请注意，您最后的问题对我来说并不清楚。什么不起作用？您是否知道每块线程数的限制？您是否知道内核的printf在其可以生成的输出量方面受到限制？到底是什么不起作用？（请发布新问题）
好的，谢谢你的建议。我知道每个块的线程数限制，在我的例子中，每个块的线程数上限是512。问题是，对于较大的网格/线程数参数，内核没有任何输出，但这可能是cuPrintf函数的限制造成的。
我调查了这个问题，原因是cuPrintf最多只支持2048个线程。