Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/lua/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ Cuda内核返回向量_C++_C_Cuda_Parallel Processing_Nvidia - Fatal编程技术网

C++ Cuda内核返回向量

C++ Cuda内核返回向量,c++,c,cuda,parallel-processing,nvidia,C++,C,Cuda,Parallel Processing,Nvidia,我有一个单词列表,我的目标是在一个很长的短语中匹配每个单词。 我在匹配每个单词时没有问题,我唯一的问题是返回一个包含每个匹配信息的结构向量 代码: typedef struct { int A, B, C; } Match; __global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector<Match> * _matches) { int a, b, c; [...] //P

我有一个单词列表,我的目标是在一个很长的短语中匹配其中的每个单词。匹配单词本身没有问题,我唯一的问题是如何从内核返回一个结构体向量,其中包含每个匹配项的信息。

代码:

// One match record: three integer fields describing a single occurrence.
typedef struct {
    int A;
    int B;
    int C;
} Match;

// Kernel sketch from the question: searches the phrase for every word in the
// list and tries to record each hit in the vector behind `_matches`.
// NOTE(review): pseudo-code, does not compile as written. std::vector is a
// host container and cannot be used in device code; `_matches` is a pointer
// but is accessed with `.`; and `new Match{...}` yields a Match* while the
// vector stores Match values.
__global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector<Match> * _matches)
{
    int a, b, c;

    [...] //Parallel search for each word in the phrase

    if(match) //When an occurrence is found
    {
        _matches.push_back(new Match{ A = a, B = b, C = c }); //Open question: how should the match be handed back to the host?
    }
}

// Host-side sketch from the question: sets up the phrase and the word list,
// then launches Find with a pointer to a host vector for the results.
// NOTE(review): pseudo-code. `matches` is never initialised, and a pointer to
// a host std::vector is not usable inside a kernel.
main()
{
    [...]

    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc..."

    Words * wordList = {"the", "lazy"};

    vector<Match> * matches; //A std::vector cannot be passed to a kernel

    Find<<< X, Y >>>(myPhrase, wordList, matches);

    [...]

}
typedef结构{
int A,B,C;}匹配;
__全局无效查找(veryLongPhrase*\u短语、单词*\u单词列表、向量*\u匹配)
{
INTA、b、c;
[…]//对短语中的每个单词进行并行搜索
if(match)//当找到一个匹配项时
{
_matches.push_back(新匹配{A=A,B=B,C=C});//未知的东西来了,我该怎么办???
}
}
main()
{
[...]
veryLongPhrase*myPhrase=“敏捷的棕色狐狸跳过懒惰的狗等等。”
Words*wordList={“the”,“lazy”};
vector*matches;//显然我不能将向量传递给内核
查找>(我的短语、单词列表、匹配项);
[...]
}
我已经尝试过了,但是没有成功。你能给我一些建议吗?


非常感谢。

类似的功能应该可以使用(在浏览器中编码,未经测试):

下面是一个成功的例子:

$ cat t347.cu
#include <iostream>
#include <vector>

// N is the maximum number of structs to insert
#define N 10000

// One match record: three integer fields describing a single occurrence.
typedef struct {
    int A;
    int B;
    int C;
} Match;

// Number of insertions attempted so far; starts at zero and is read back by
// the host with cudaMemcpyFromSymbol.
__device__ int dev_count = 0;
// Fixed-capacity device-side result buffer holding at most N records.
__device__ Match dev_data[N];

// Device-side "push_back" onto the fixed-size dev_data buffer.
//
// Reserves a slot by atomically incrementing dev_count, then copies `mt`
// into that slot.
//
// Returns the index the record was stored at, or -1 when the reserved slot
// lies beyond the N-entry capacity. dev_count is incremented in either case,
// so after an overflow it is larger than the number of records stored.
__device__ int my_push_back(Match & mt) {
  // atomicAdd returns the value before the increment, so concurrent callers
  // each receive a distinct slot number.
  const int slot = atomicAdd(&dev_count, 1);
  if (slot >= N)
    return -1;
  dev_data[slot] = mt;
  return slot;
}

// Demo kernel: simulates "found" occurrences by letting the first 10 threads
// of every block append one fixed record {A=1, B=2, C=3} via my_push_back.
//
// Launch layout: only threadIdx.x is inspected, so any 1-D grid/block shape
// works; each block contributes min(blockDim.x, 10) records. No shared
// memory is used.
__global__ void Find()
{
    // Threads beyond the first 10 of a block have nothing to report.
    if (threadIdx.x >= 10)
        return;

    // Positional aggregate initialization: the C99 designated form
    // ({ .A = 1, ... }) is not standard C++ before C++20 and is rejected by
    // some host compilers.
    Match a = { 1, 2, 3 };
    my_push_back(a);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Launches Find, copies the collected records back into a std::vector and
// prints the record count followed by the fields of the last record.
//
// Returns 0 on success and 1 on a CUDA error or when more than N insertions
// were attempted (some records were dropped).
int main()
{

    Find<<< 2, 256 >>>();
    // A kernel launch returns nothing; launch errors are only visible here.
    if (!cudaOk(cudaGetLastError(), "Find launch")) return 1;

    // Blocking copy issued after the kernel, so the counter is final when read.
    int dsize = 0;
    if (!cudaOk(cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int)),
                "copy of dev_count")) return 1;

    // dev_count counts attempted insertions: exactly N still fits in the
    // buffer, only a value above N means records were dropped.
    if (dsize > N) { std::cerr << "overflow error\n"; return 1; }

    std::vector<Match> results(dsize);
    std::cout << "number of matches = " << dsize << std::endl;

    // Nothing to copy or print; indexing an empty vector would be undefined.
    if (dsize == 0) return 0;

    if (!cudaOk(cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match)),
                "copy of dev_data")) return 1;

    const Match &last = results.back();
    std::cout << "A  =  " << last.A << std:: endl;
    std::cout << "B  =  " << last.B << std:: endl;
    std::cout << "C  =  " << last.C << std:: endl;

    return 0;
}
$ nvcc -arch=sm_11 -o t347 t347.cu
$ ./t347
number of matches = 20
A  =  1
B  =  2
C  =  3
$
$cat t347.cu
#包括
#包括
//N是要插入的最大结构数
#定义N 10000
类型定义结构{
int A,B,C;}匹配;
__设备匹配开发数据[N];
__设备计数=0;
__设备输入我的推送(匹配和mt){
int insert_pt=atomicAdd(&dev_count,1);
如果(插入\u pt();
int-dsize;
cudaMemcpyFromSymbol(&dsize,dev_count,sizeof(int));
如果(dsize>=N){printf(“溢出错误\N”);返回1;}
标准::矢量结果(dsize);
cudaMemcpyFromSymbol(&(结果[0]),开发数据,dsize*sizeof(匹配));

std::cout

thrust 的 partition(分区)算法似乎很适合解决这个问题。不过我应该指出,它是否会比 CPU 更快很难说。内存布局会非常糟糕(我假设你在 GPU 内存中把它存成 char**)。即使在理想情况下,分区也不是 GPU 上最快的操作。把数据拷入、拷出 GPU 的开销也会非常大(就内存拷贝所耗费的周期而言)。如果你的短语长度不超过几千个单词,我认为这不会让你的代码变快。

我的情况非常相似,这个解决方案非常有效。感谢 Robert 的回答。我只想澄清两件事:1)MS VS 编辑器总是对
atomicAdd
报错,因为它要到编译时才有定义。2)同样,编辑器也会对
cudaMemcpyFromSymbol
的第二个参数报错,因为函数原型声明的参数类型是
const void*
,而它不接受指向设备内存的指针。尝试屏蔽这些警告也没有效果。
nvcc -arch=sm_11 ...
$ cat t347.cu
#include <iostream>
#include <vector>

// N is the maximum number of structs to insert
#define N 10000

// One match record: three integer fields describing a single occurrence.
typedef struct {
    int A;
    int B;
    int C;
} Match;

// Number of insertions attempted so far; starts at zero and is read back by
// the host with cudaMemcpyFromSymbol.
__device__ int dev_count = 0;
// Fixed-capacity device-side result buffer holding at most N records.
__device__ Match dev_data[N];

// Device-side "push_back" onto the fixed-size dev_data buffer.
//
// Reserves a slot by atomically incrementing dev_count, then copies `mt`
// into that slot.
//
// Returns the index the record was stored at, or -1 when the reserved slot
// lies beyond the N-entry capacity. dev_count is incremented in either case,
// so after an overflow it is larger than the number of records stored.
__device__ int my_push_back(Match & mt) {
  // atomicAdd returns the value before the increment, so concurrent callers
  // each receive a distinct slot number.
  const int slot = atomicAdd(&dev_count, 1);
  if (slot >= N)
    return -1;
  dev_data[slot] = mt;
  return slot;
}

// Demo kernel: simulates "found" occurrences by letting the first 10 threads
// of every block append one fixed record {A=1, B=2, C=3} via my_push_back.
//
// Launch layout: only threadIdx.x is inspected, so any 1-D grid/block shape
// works; each block contributes min(blockDim.x, 10) records. No shared
// memory is used.
__global__ void Find()
{
    // Threads beyond the first 10 of a block have nothing to report.
    if (threadIdx.x >= 10)
        return;

    // Positional aggregate initialization: the C99 designated form
    // ({ .A = 1, ... }) is not standard C++ before C++20 and is rejected by
    // some host compilers.
    Match a = { 1, 2, 3 };
    my_push_back(a);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Launches Find, copies the collected records back into a std::vector and
// prints the record count followed by the fields of the last record.
//
// Returns 0 on success and 1 on a CUDA error or when more than N insertions
// were attempted (some records were dropped).
int main()
{

    Find<<< 2, 256 >>>();
    // A kernel launch returns nothing; launch errors are only visible here.
    if (!cudaOk(cudaGetLastError(), "Find launch")) return 1;

    // Blocking copy issued after the kernel, so the counter is final when read.
    int dsize = 0;
    if (!cudaOk(cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int)),
                "copy of dev_count")) return 1;

    // dev_count counts attempted insertions: exactly N still fits in the
    // buffer, only a value above N means records were dropped.
    if (dsize > N) { std::cerr << "overflow error\n"; return 1; }

    std::vector<Match> results(dsize);
    std::cout << "number of matches = " << dsize << std::endl;

    // Nothing to copy or print; indexing an empty vector would be undefined.
    if (dsize == 0) return 0;

    if (!cudaOk(cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match)),
                "copy of dev_data")) return 1;

    const Match &last = results.back();
    std::cout << "A  =  " << last.A << std:: endl;
    std::cout << "B  =  " << last.B << std:: endl;
    std::cout << "C  =  " << last.C << std:: endl;

    return 0;
}
$ nvcc -arch=sm_11 -o t347 t347.cu
$ ./t347
number of matches = 20
A  =  1
B  =  2
C  =  3
$