Cuda 如何使用无排序的推力将相等的元素组合在一起_Cuda_Thrust

Cuda 如何在不排序的情况下使用 Thrust 将相等的元素组合在一起

cuda

Cuda 如何使用无排序的推力将相等的元素组合在一起,cuda,thrust,Cuda,Thrust,我有一个元素数组，每个元素只定义“等于”操作符。换句话说，对于这种类型的元素，没有定义排序既然我不能像在推力直方图示例中那样使用推力：：排序，那么如何使用推力将相等的元素组合在一起呢例如：我的数组最初是 a e t b c a c e t a 其中相同的字符表示相同的元素在精化之后，数组应该是 a a a t t b c c e e 但也可能是这样 a a a c c t t e e b 或任何其他排列谢谢我们发现您的真正目标是消除float4元素向量中的重复项。为了应用，需

我有一个元素数组，每个元素只定义“等于”操作符。换句话说，对于这种类型的元素，没有定义排序

既然我不能像 Thrust 直方图示例中那样使用 thrust::sort，那么如何使用 Thrust 将相等的元素组合在一起呢？

例如：

我的数组最初是

a e t b c a c e t a

其中相同的字符表示相同的元素

在精化之后，数组应该是

a a a t t b c c e e

但也可能是这样

a a a c c t t e e b

或任何其他排列

谢谢

我们发现您的真正目标是消除

float4

元素向量中的重复项。为了应用 thrust::unique，需要先对元素进行排序

所以你需要一个四维数据的排序方法。这可以使用空间填充曲线来完成。我以前使用过 z 阶曲线（又称莫顿码）对 3D 数据进行排序。有一些高效的 CUDA 实现，但是快速用 Google 搜索并没有找到针对 4D 情况的现成实现

我找到了一篇论文，其中列出了使用z阶曲线对n维数据点进行排序的通用算法：（参见算法1：浮点莫顿顺序算法）。这个算法也有一个简单的例子

对于4D数据，循环可以展开，但可能有更简单、更有效的算法可用

因此（未完全实现的）操作顺序将如下所示：

#include <thrust/device_vector.h>
#include <thrust/unique.h>
#include <thrust/sort.h>

// Four-component dot product of two float4 values.
inline __host__ __device__ float dot(const float4& a, const float4& b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

// Binary predicate for thrust::unique: two points are treated as the same
// element when their Euclidean distance is below 0.1.
//
// The comparison is done on squared quantities (|a - b|^2 < 0.1^2) so no
// square root is needed. The previous version tested dot(a, b) directly,
// which measures alignment/magnitude rather than distance: a point did not
// compare equal to itself unless it was near the origin, and orthogonal
// points compared equal.
//
// NOTE(review): a tolerance test is not transitive, so which representative
// survives thrust::unique depends on the order produced by the preceding sort.
struct identity_4d
{
  __host__ __device__
  bool operator()(const float4& a, const float4& b) const
  {
    // based on the norm function you provided in the discussion
    const float4 d = make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
    return dot(d, d) < (0.1f * 0.1f);
  }
};

// Comparison functor meant to order float4 points along a z-order curve;
// it is passed to thrust::sort as the "less than" predicate.
//
// NOTE(review): this is an unimplemented placeholder. The body contains no
// return statement, so invoking it flows off the end of a non-void function
// (undefined behavior). A real strict-weak-ordering comparison of p and q
// must be supplied before this functor is used.
struct z_order_4d
{
  __host__ __device__
  bool operator()(const float4& p, const float4& q) const
  {
    // you need to implement the z-order algorithm here
    // ...
  }
};

// Outline of the sort-then-unique approach for removing near-duplicate
// float4 elements on the device.
int main()
{
  const int N = 100;
  thrust::device_vector<float4> data(N);
  // fill the data
  // ...

  // Bring (near-)equal points next to each other so unique can see them.
  thrust::sort(data.begin(), data.end(), z_order_4d());

  // thrust::unique only compacts the surviving elements to the front and
  // returns the new logical end; everything past that iterator is left in an
  // unspecified state, so it has to be erased to actually drop the duplicates.
  data.erase(thrust::unique(data.begin(), data.end(), identity_4d()), data.end());

  return 0;
}

#包括
#包括
#包括
内联\uuuuuu主机\uuuuuuuu设备\uuuuuuu浮点点（常量浮点4&a、常量浮点4&b）
{
返回a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w；
}
结构标识
{
__主机设备__
布尔运算符（）（常量浮点4&a、常量浮点4&b）常量
{
//基于您在讨论中提供的norm函数
返回点（a、b）<（0.1f*0.1f）；
}
};
结构z_顺序4d
{
__主机设备__
布尔运算符（）（常量浮点4&p，常量浮点4&q）常量
{
//您需要在这里实现z顺序算法
// ...
}
};
int main（）
{
常数int N=100；
推力：：设备_矢量数据（N）；
//填写数据
// ...
推力：：排序（data.begin（），data.end（），z_order_4d（））；
唯一（data.begin（），data.end（），identity_4d（））；
}

我建议您遵循@m.s.在发布的答案中所述的方法。正如我在评论中所说，元素排序是一种非常有用的机制，有助于降低此类问题的复杂性

然而，提出的问题是，是否有可能在不进行排序的情况下对类似的元素进行分组。有了一个天生的并行处理器，比如GPU，我花了一些时间思考如何在不进行排序的情况下实现它

如果我们既有大量的对象，又有大量独特的对象类型，那么我认为有可能为这个问题带来某种程度的并行性，但是我在这里概述的方法仍然会有糟糕的、分散的内存访问模式。对于只有少量不同或唯一对象类型的情况，我在这里讨论的算法没有什么值得推荐的。这只是一种可能的方法。很可能还有其他更好的方法：

起点是为每个元素开发一组“链表”，指示左侧的匹配邻居和右侧的匹配邻居。这是通过我的

search_functor

和

thrust::for_each

在整个数据集上实现的。这一步相当并行，对于大型数据集也具有相当高的内存访问效率，但它确实需要从开始到结束对整个数据集进行最坏情况下的遍历（我称之为副作用，即无法使用排序；我们必须将每个元素与其他元素进行比较，直到找到匹配项）。两个链表的生成允许我们避免所有对所有的比较

一旦我们有了从步骤1构建的列表（右邻居和左邻居），使用

thrust::count

计算唯一对象的数量就很容易了

然后，我们使用

thrust::copy_if

流压缩获得每个唯一元素的起始索引（即数据集中每种类型的唯一元素的最左边索引）

下一步是计算每个唯一元素的实例数。这一步是执行列表遍历，每个元素列表一个线程。如果我有少数独特的元素，这将无法有效地利用GPU。此外，列表遍历将导致糟糕的访问模式

在我们计算了每种类型对象的数量后，我们可以通过对每种类型对象的数量进行

thrust::exclusive_scan

为输出列表中的每种对象类型构建一系列起始索引

最后，我们可以将每个输入元素复制到输出列表中的适当位置。因为我们还没有办法对元素进行分组或排序，所以我们必须再次求助于列表遍历。同样，如果唯一对象类型的数量很小，并且内存访问模式也很糟糕，那么这将是GPU的低效使用

下面是一个完整的示例，使用您的字符样本数据集。为了帮助澄清我们打算对没有固有顺序的对象进行分组的想法，我创建了一个有点任意的对象定义（

my_obj

），它定义了

`==` 比较运算符，但没有定义 `<` 运算符

$cat t707.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
模板
我的班
{
T元素；
整数指数；
公众：
__主机设备我的对象（）：元素（0），索引（0）{}；
__主机设备我的对象（ta）：元素（a），索引（0）{}；
__主机设备我的对象（ta，intidx）：元素（a），索引（idx）{}；
__主机设备__
不明白{
重新
$ cat t707.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/for_each.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <iostream>


// Example object type that supports equality comparison only (no ordering).
//
// Holds a value of type T together with an integer index. Equality is decided
// by the wrapped value alone; the index is carried along but never compared.
//
// Accessors and operator== are const-qualified so that const objects (and
// objects reached through const references/pointers) can be read and compared.
template <typename T>
class my_obj
{
  T element;
  int index;
  public:
  // Default: value 0, index 0.
  __host__ __device__ my_obj() : element(0), index(0) {}
  // Wrap a value with index 0.
  __host__ __device__ my_obj(T a) : element(a), index(0) {}
  // Wrap a value with an explicit index.
  __host__ __device__ my_obj(T a, int idx) : element(a), index(idx) {}
  // Returns a copy of the wrapped value.
  __host__ __device__
  T get() const {
    return element;}
  // Replaces the wrapped value.
  __host__ __device__
  void set(T a) {
    element = a;}
  // Returns the stored index.
  __host__ __device__
  int get_idx() const {
    return index;}
  // Replaces the stored index.
  __host__ __device__
  void set_idx(int idx) {
    index = idx;}
  // True when both objects wrap equal values (index is ignored).
  __host__ __device__
  bool operator ==(const my_obj &e2) const
  {
    return (e2.get() == this->get());
  }
};

// For one position idx, finds the nearest later position holding an equal
// object and records the link in both directions:
//   rn[idx]   = position of that match (right neighbour of idx)
//   ln[match] = idx                     (left neighbour of the match)
// If no later equal object exists, nothing is written.
template <typename T>
struct search_functor
{
    my_obj<T> *data;   // objects being searched
    int end;           // number of objects in data
    int *rn;           // right-neighbour links, one per object
    int *ln;           // left-neighbour links, one per object
    search_functor(my_obj<T> *_a, int *_rn, int *_ln, int len)
      : data(_a), end(len), rn(_rn), ln(_ln) {}
    __host__ __device__
    void operator()(int idx){
      // Advance until the first equal object to the right, or the end.
      int match = idx + 1;
      while (match < end && !(data[idx] == data[match]))
        ++match;
      if (match < end) {
        ln[match] = idx;
        rn[idx] = match;
      }
    }
};

// Copies one chain of linked objects from data into a contiguous run of
// result. The tuple argument carries (first output position, first input
// position); successive input positions are obtained by following rn until
// the -1 terminator is reached.
template <typename T>
struct copy_functor
{
    my_obj<T> *data;     // input objects
    my_obj<T> *result;   // output objects
    int *rn;             // right-neighbour links, -1 ends a chain
    copy_functor(my_obj<T> *_in, my_obj<T> *_out, int *_rn)
      : data(_in), result(_out), rn(_rn) {}
    __host__ __device__
    void operator()(const thrust::tuple<int, int> &t1) const {
      int dst = thrust::get<0>(t1);
      int src = thrust::get<1>(t1);
      // The chain head is always copied; then keep following links.
      do {
        result[dst++] = data[src];
        src = rn[src];
      } while (src != -1);
    }
};

// Walks one chain of linked positions starting at idx1, following rn until
// the -1 terminator. Every visited position p gets ot[p] = idx2, and the
// number of visited positions (including idx1 itself) is returned.
struct count_functor
{
    int *rn;   // right-neighbour links, -1 ends a chain
    int *ot;   // per-position tag written during the walk
    count_functor(int *_rn, int *_ot) : rn(_rn), ot(_ot) {}
    __host__ __device__
    int operator()(int idx1, int idx2){
      int visited = 0;
      int node = idx1;
      // The starting position is always tagged and counted.
      do {
        ot[node] = idx2;
        ++visited;
        node = rn[node];
      } while (node != -1);
      return visited;
    }
};

using namespace thrust::placeholders;

// Demo driver: groups equal my_obj<char> elements on the device without
// sorting the objects themselves, then prints the grouped values and their
// original indices.
int main(){

  // data setup
  char data[] = { 'a' ,  'e' ,  't' ,  'b' ,  'c' ,  'a' ,  'c' , 'e' ,  't' ,  'a' };
  int sz = sizeof(data)/sizeof(char);
  for (int i = 0; i < sz; i++) std::cout << data[i] << ",";
  std::cout << std::endl;
  thrust::host_vector<my_obj<char> > h_data(sz);
  for (int i = 0; i < sz; i++) { h_data[i].set(data[i]); h_data[i].set_idx(i); }
  thrust::device_vector<my_obj<char> > d_data = h_data;


  // create left and right neighbor indices (-1 means "no neighbor")
  thrust::device_vector<int> ln(d_data.size(), -1);
  thrust::device_vector<int> rn(d_data.size(), -1);
  thrust::for_each(thrust::counting_iterator<int>(0),
                   thrust::counting_iterator<int>(0) + sz,
                   search_functor<char>(thrust::raw_pointer_cast(d_data.data()),
                                        thrust::raw_pointer_cast(rn.data()),
                                        thrust::raw_pointer_cast(ln.data()),
                                        sz));
  // determine number of unique objects: an element with no left neighbor is
  // the first occurrence of its value
  int uni_objs = thrust::count(ln.begin(), ln.end(), -1);
  // determine the number of instances of each unique object
    // get object starting indices
  thrust::device_vector<int> uni_obj_idxs(uni_objs);
  thrust::copy_if(thrust::counting_iterator<int>(0),
                  thrust::counting_iterator<int>(0) + sz,
                  ln.begin(), uni_obj_idxs.begin(), (_1 == -1));
    // count each object list
  thrust::device_vector<int> num_objs(uni_objs);
  thrust::device_vector<int> obj_type(d_data.size());
  thrust::transform(uni_obj_idxs.begin(), uni_obj_idxs.end(),
                    thrust::counting_iterator<int>(0), num_objs.begin(),
                    count_functor(thrust::raw_pointer_cast(rn.data()),
                                  thrust::raw_pointer_cast(obj_type.data())));

  // at this point, we have built object lists that have allowed us to identify a unique, orderable "type" for each object
  // the sensible thing to do would be to employ a sort_by_key on obj_type and an index sequence at this point
  // and use the reordered index sequence to reorder the original objects, thus grouping them
  // however...  without sorting...

  // build output vector indices
  thrust::device_vector<int> copy_start(num_objs.size());
  thrust::exclusive_scan(num_objs.begin(), num_objs.end(), copy_start.begin());
  // copy (by object type) input to output
  thrust::device_vector<my_obj<char> > d_result(d_data.size());
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(copy_start.begin(), uni_obj_idxs.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(copy_start.end(), uni_obj_idxs.end())),
                   copy_functor<char>(thrust::raw_pointer_cast(d_data.data()),
                                      thrust::raw_pointer_cast(d_result.data()),
                                      thrust::raw_pointer_cast(rn.data())));

  // display results
  // Bring the whole result to the host in one transfer; indexing a
  // device_vector element-by-element from host code issues a separate
  // device-to-host copy for every access.
  thrust::host_vector<my_obj<char> > h_result = d_result;
  std::cout << "Grouped: " << std::endl;
  for (int i = 0; i < sz; i++)
    std::cout << h_result[i].get() << ",";
  std::cout << std::endl;
  for (int i = 0; i < sz; i++)
    std::cout << h_result[i].get_idx() << ",";
  std::cout << std::endl;
  return 0;
}
$ nvcc -o t707 t707.cu
$ ./t707
a,e,t,b,c,a,c,e,t,a,
Grouped:
a,a,a,e,e,t,t,b,c,c,
0,5,9,1,7,2,8,3,4,6,
$