C++ 如何使用 Thrust 根据索引累加数组？_C++_Cuda_Thrust

C++ 如何使用 Thrust 根据索引累加数组？

c++ cuda

C++ 如何使用 Thrust 根据索引累加数组？,c++,cuda,thrust,C++,Cuda,Thrust,我正在尝试根据索引累加数组。我的输入是两个长度相同的向量：第一个向量是索引，第二个向量是值。我的目标是根据索引对值进行累加。我已经有实现类似功能的 C++ 代码，但我刚接触 Thrust 编程。我能用 Thrust 的设备端代码实现这一点吗？应该使用哪个函数？我没有找到类似 “map” 的函数。它会比 CPU（主机）代码更高效吗？下面是我的 C++ 版本的最小示例代码。 int a[10]={1,2,3,4,5,1,1,3,4,4}; vector<int> key(a,a+10); double b[10]={1,2,3,4,

我正在尝试根据索引累加数组。我的输入是两个长度相同的向量：第一个向量是索引，第二个向量是值。我的目标是根据索引对值进行累加。我已经有实现类似功能的 C++ 代码，但我刚接触 Thrust 编程。我能用 Thrust 的设备端代码实现这一点吗？应该使用哪个函数？我没有找到类似 “map” 的函数。它会比 CPU（主机）代码更高效吗？下面是我的 C++ 版本的最小示例代码。

int a[10]={1,2,3,4,5,1,1,3,4,4};
vector<int> key(a,a+10);
double b[10]={1,2,3,4,5,1,2,3,4,5};
vector<double> val(b,b+10);

unordered_map<size_t,double> M;
for (size_t i = 0;i< 10 ;i++)
{
    M[key[i]] = M[key[i]]+val[i];
}

int a[10]={1,2,3,4,5,1,1,3,4,4};
vector<int> key(a,a+10);
double b[10]={1,2,3,4,5,1,2,3,4,5};
vector<double> val(b,b+10);
unordered_map<size_t,double> M;
for (size_t i = 0;i< 10 ;i++)
{
    M[key[i]] = M[key[i]]+val[i];
}

如评论中所指出的，实现这一点的标准方法是对数据（键、值）重新排序，使相同的键聚集在一起。您可以使用 `sort_by_key` 完成排序，然后用 `reduce_by_key` 求解。

也可以采用一种不太符合 Thrust 风格的方式，在不重新排序的情况下解决问题：把一个使用原子操作的函子传给 `for_each`。

以下说明了这两个方面：

$ cat t27.cu
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/for_each.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
#include <unordered_map>
#include <vector>

// this functor only needed for the non-reordering case
// requires compilation for a cc6.0 or higher GPU e.g. -arch=sm_60
struct my_func {
  double *r;
  my_func(double *_r) : r(_r) {};
  template <typename T>
  __host__ __device__
  void operator()(T t) {
    atomicAdd(r+thrust::get<0>(t)-1, thrust::get<1>(t));  // assumes consecutive keys starting at 1
  }
};

// Sums values by key three ways and prints one line of per-key totals for each:
// a host std::unordered_map reference, Thrust for_each with atomics (no
// reordering), and Thrust sort_by_key followed by reduce_by_key.
int main(){

  int a[10]={1,2,3,4,5,1,1,3,4,4};
  double b[10]={1,2,3,4,5,1,2,3,4,5};
  const int size_a = sizeof(a)/sizeof(a[0]);  // element count, derived once instead of repeating 10
  const int num_keys = 5;  // assumes the keys are exactly 1..num_keys, for illustration
  std::vector<int> key(a,a+size_a);
  std::vector<double> val(b,b+size_a);

  // host reference result
  std::unordered_map<size_t,double> M;
  for (size_t i = 0; i < key.size(); ++i)
  {
    M[key[i]] += val[i];  // operator[] value-initializes a first-seen key to 0.0
  }
  for (int i = 1; i <= num_keys; i++) std::cout << M[i] << " ";
  std::cout << std::endl;

  thrust::device_vector<int>    d_a(a, a+size_a);
  thrust::device_vector<double> d_b(b, b+size_a);
  thrust::device_vector<double> d_r(num_keys); // per-key sums
  thrust::device_vector<int> d_k(num_keys);    // unique keys written by reduce_by_key
  // method 1, without reordering
  // NOTE(review): relies on d_r starting at zero; device_vector(n) is expected to value-initialize its elements - confirm.
  thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_a.begin(), d_b.begin())), size_a, my_func(thrust::raw_pointer_cast(d_r.data())));
  thrust::host_vector<double> r = d_r;
  thrust::copy(r.begin(), r.end(), std::ostream_iterator<double>(std::cout, " "));
  std::cout << std::endl;
  // clear the sums so method 2 is shown to produce them independently
  thrust::fill(d_r.begin(), d_r.end(), 0.0);
  // method 2, with reordering: reduce_by_key only merges adjacent equal keys, so sort first
  thrust::sort_by_key(d_a.begin(), d_a.end(), d_b.begin());
  thrust::reduce_by_key(d_a.begin(), d_a.end(), d_b.begin(), d_k.begin(), d_r.begin());
  thrust::copy(d_r.begin(), d_r.end(), r.begin());
  thrust::copy(r.begin(), r.end(), std::ostream_iterator<double>(std::cout, " "));
  std::cout << std::endl;
}
$ nvcc -o t27 t27.cu -std=c++14 -arch=sm_70
$ ./t27
4 2 6 13 5
4 2 6 13 5
4 2 6 13 5
$

$cat t27.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//此函子仅适用于非重新排序情况
//需要编译cc6.0或更高的GPU，例如-arch=sm_60
结构我的函数{
双*r；
my_func（double*_r）：r（_r）{}；
模板
__主机设备__
void运算符（）（T）{
atomicAdd（r+推力：：get（t）-1，推力：：get（t））；//假设连续键从1开始
}
};
int main（）{
int a[10]={1,2,3,4,5,1,1,3,4,4}；
std：：向量键（a，a+10）；
双b[10]={1,2,3,4,5,1,2,3,4,5}；
std：：向量val（b，b+10）；
std：：无序映射M；
对于（大小i=0；i<10；i++）
{
M[key[i]]=M[key[i]]+val[i]；
}
for (int i = 1; i < 6; i++) std::cout
如评论中所指出的，实现这一点的标准方法是对数据（键、值）重新排序，使相同的键聚集在一起。您可以使用 `sort_by_key` 完成排序，然后用 `reduce_by_key` 求解。
也可以采用一种不太符合 Thrust 风格的方式，在不重新排序的情况下解决问题：把一个使用原子操作的函子传给 `for_each`。
以下示例演示了这两种方法：
$ cat t27.cu
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <unordered_map>
#include <vector>

// this functor only needed for the non-reordering case
// requires compilation for a cc6.0 or higher GPU e.g. -arch=sm_60
struct my_func {
  double *r;
  my_func(double *_r) : r(_r) {};
  template <typename T>
  __host__ __device__
  void operator()(T t) {
    atomicAdd(r+thrust::get<0>(t)-1, thrust::get<1>(t));  // assumes consecutive keys starting at 1
  }
};

// Sums values by key three ways and prints one line of per-key totals for each:
// a host std::unordered_map reference, Thrust for_each with atomics (no
// reordering), and Thrust sort_by_key followed by reduce_by_key.
int main(){

  int a[10]={1,2,3,4,5,1,1,3,4,4};
  double b[10]={1,2,3,4,5,1,2,3,4,5};
  const int size_a = sizeof(a)/sizeof(a[0]);  // element count, derived once instead of repeating 10
  const int num_keys = 5;  // assumes the keys are exactly 1..num_keys, for illustration
  std::vector<int> key(a,a+size_a);
  std::vector<double> val(b,b+size_a);

  // host reference result
  std::unordered_map<size_t,double> M;
  for (size_t i = 0; i < key.size(); ++i)
  {
    M[key[i]] += val[i];  // operator[] value-initializes a first-seen key to 0.0
  }
  for (int i = 1; i <= num_keys; i++) std::cout << M[i] << " ";
  std::cout << std::endl;

  thrust::device_vector<int>    d_a(a, a+size_a);
  thrust::device_vector<double> d_b(b, b+size_a);
  thrust::device_vector<double> d_r(num_keys); // per-key sums
  thrust::device_vector<int> d_k(num_keys);    // unique keys written by reduce_by_key
  // method 1, without reordering
  // NOTE(review): relies on d_r starting at zero; device_vector(n) is expected to value-initialize its elements - confirm.
  thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_a.begin(), d_b.begin())), size_a, my_func(thrust::raw_pointer_cast(d_r.data())));
  thrust::host_vector<double> r = d_r;
  thrust::copy(r.begin(), r.end(), std::ostream_iterator<double>(std::cout, " "));
  std::cout << std::endl;
  // clear the sums so method 2 is shown to produce them independently
  thrust::fill(d_r.begin(), d_r.end(), 0.0);
  // method 2, with reordering: reduce_by_key only merges adjacent equal keys, so sort first
  thrust::sort_by_key(d_a.begin(), d_a.end(), d_b.begin());
  thrust::reduce_by_key(d_a.begin(), d_a.end(), d_b.begin(), d_k.begin(), d_r.begin());
  thrust::copy(d_r.begin(), d_r.end(), r.begin());
  thrust::copy(r.begin(), r.end(), std::ostream_iterator<double>(std::cout, " "));
  std::cout << std::endl;
}
$ nvcc -o t27 t27.cu -std=c++14 -arch=sm_70
$ ./t27
4 2 6 13 5
4 2 6 13 5
4 2 6 13 5
$

$cat t27.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//此函子仅适用于非重新排序情况
//需要编译cc6.0或更高的GPU，例如-arch=sm_60
结构我的函数{
双*r；
my_func（double*_r）：r（_r）{}；
模板
__主机设备__
void运算符（）（T）{
atomicAdd（r+推力：：get（t）-1，推力：：get（t））；//假设连续键从1开始
}
};
int main（）{
int a[10]={1,2,3,4,5,1,1,3,4,4}；
std：：向量键（a，a+10）；
双b[10]={1,2,3,4,5,1,2,3,4,5}；
std：：向量val（b，b+10）；
std：：无序映射M；
对于（大小i=0；i<10；i++）
{
M[key[i]]=M[key[i]]+val[i]；
}
for (int i = 1; i < 6; i++) std::cout
Thrust 提供了高效的 reduce_by_key 实现，但它要求输入按键的顺序分组，而您的数据并未排序。因此，您可以选择先按键排序再归约，或者改用其他方法。GPU 不擅长随机访问，很难充分利用内存带宽，而内存带宽正是这类操作性能的关键。
Thrust 提供了高效的 reduce_by_key 实现，但它要求输入按键的顺序分组，而您的数据并未排序。因此，您可以选择先按键排序再归约，或者改用其他方法。GPU 不擅长随机访问，很难充分利用内存带宽，而内存带宽正是这类操作性能的关键。
哇！非常感谢你的回答！我需要一些时间来测试性能。
哇！非常感谢你的回答！我需要一些时间来测试性能。