Sorting CUDA推力排序或CUB:：DeviceRadixSort_Sorting_Cuda_Thrust_Cub

Sorting CUDA推力排序或CUB:：DeviceRadixSort

sorting cuda

Sorting CUDA推力排序或CUB:：DeviceRadixSort,sorting,cuda,thrust,cub,Sorting,Cuda,Thrust,Cub,我有一个由float4数组表示的粒子池，其中w分量是粒子在[0,1]范围内的当前寿命我需要根据粒子的寿命按降序对这个数组进行排序，这样我就可以保留一个精确的计数器，以显示当前有多少粒子处于“活动”状态（寿命大于0）。我需要这个计数器，因为当我需要激活更多粒子（随机发生）时，它将允许我索引到阵列中的正确点我的粒子数组存储在设备内存中，看起来我应该能够对数组进行排序，而不必将数组传输到主机内存我没有太多的运气在网上找到例子，显示我如何可以做到这一点，无论是推力或幼兽。此外，我对使用推力犹豫不决

我有一个由float4数组表示的粒子池，其中w分量是粒子在[0,1]范围内的当前寿命

我需要根据粒子的寿命按降序对这个数组进行排序，这样就可以维护一个精确的计数器，记录当前有多少粒子处于“活动”状态（寿命大于0）。我需要这个计数器，因为当需要激活更多粒子时（这是随机发生的），它能让我索引到数组中的正确位置

我的粒子数组存储在设备内存中，看起来我应该能够对数组进行排序，而不必将数组传输到主机内存

我在网上没能找到说明如何用 Thrust 或 CUB 做到这一点的示例。此外，我对使用 Thrust 有些犹豫：由于需要基于 w 分量进行排序，我不知道如何防止它退化为归并排序（比基数排序慢得多）。至于 CUB，我完全没有找到说明该如何做的资料

我还希望继续把寿命存储在 w 分量中，因为这会让代码其他部分的处理轻松得多

有没有一个简单的方法可以做到这一点？感谢您的帮助。

在 CUB 或 Thrust 中，我们可以仅在

.w

的“键”上进行排序，在值仅为线性递增索引的情况下进行键值排序：

0, 1, 2, 3, ...

然后，我们可以使用索引序列的结果重排一步对原始

float4

数组重新排序（有效地按

.w

排序）。这样既能保持基数排序的速度（无论使用 CUB 还是 Thrust），又可能相当高效，因为

float4

数量只需要移动/重新排列一次，而不是在排序操作期间连续移动

下面是一个针对 32M 个元素的完整 Thrust 示例：先演示一种“普通”的 Thrust 排序，即使用函子指定在

.w

元素（

sort_f4_w

）上的排序，随后再演示上述方法。在我的特定环境（Fedora 20、CUDA 7、Quadro5000）上，第二种方法大约快 5 倍：

$ cat t686.cu
#include <iostream>
#include <vector_types.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/equal.h>

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Microsecond wall-clock helper built on gettimeofday().
//
// Returns the current time of day, expressed in microseconds, minus `start`.
// Calling it with 0 yields a timestamp; passing that timestamp back in a later
// call yields the elapsed microseconds between the two calls.
unsigned long long dtime_usec(unsigned long long start){
  struct timeval now;
  gettimeofday(&now, 0);
  // Fold the seconds and microseconds fields into one microsecond count.
  const unsigned long long now_usec = (now.tv_sec*USECPSEC) + now.tv_usec;
  return now_usec - start;
}

#define DSIZE (32*1048576)

// Less-than comparator for float4 values that looks only at the .w component;
// x, y and z do not take part in the comparison. Callable from host and
// device code.
struct sort_f4_w
{
  __host__ __device__
  bool operator()(const float4 &lhs, const float4 &rhs) const
  {
    const float lhs_key = lhs.w;
    const float rhs_key = rhs.w;
    return lhs_key < rhs_key;
  }
};
// Functor to extract the .w element from a float4.
//
// argument_type / result_type are declared directly instead of inheriting
// from thrust::unary_function: that base class is deprecated and removed in
// newer Thrust/CCCL releases, while the nested typedefs give iterator
// adaptors the same type information on both old and new versions.
struct f4_to_fw
{
  typedef float4 argument_type;
  typedef float  result_type;

  // Returns the .w component of `a`; usable from host and device code.
  __host__ __device__
  float operator()(const float4 &a) const {
    return a.w;}
};
// Functor to extract the .x element from a float4.
//
// argument_type / result_type are declared directly instead of inheriting
// from thrust::unary_function: that base class is deprecated and removed in
// newer Thrust/CCCL releases, while the nested typedefs give iterator
// adaptors the same type information on both old and new versions.
struct f4_to_fx
{
  typedef float4 argument_type;
  typedef float  result_type;

  // Returns the .x component of `a`; usable from host and device code.
  __host__ __device__
  float operator()(const float4 &a) const {
    return a.x;}
};


// Checks that two device vectors hold the same sequence of .x values.
//
// Only the .x component of each float4 is compared (via f4_to_fx); y, z and w
// are ignored. The comparison covers the first d1.size() elements of d2.
// Returns false, instead of reading past the end of d2, when d2 holds fewer
// elements than d1.
bool validate(thrust::device_vector<float4> &d1, thrust::device_vector<float4> &d2){
  // The three-iterator thrust::equal walks d2 for as many elements as d1
  // contains, so a shorter d2 would be read out of bounds.
  if (d2.size() < d1.size()) return false;
  f4_to_fx get_x;
  return thrust::equal(thrust::make_transform_iterator(d1.begin(), get_x),
                       thrust::make_transform_iterator(d1.end(),   get_x),
                       thrust::make_transform_iterator(d2.begin(), get_x));
}


// Benchmark driver comparing two ways of ordering DSIZE float4 elements by
// their .w component on the device:
//   t1: a comparator-based sort applied directly to the float4 array;
//   t2: extract the .w keys, sort (key, index) pairs, then gather the float4
//       elements once through the sorted indices.
// The measurement loop runs twice; the first pass is a warm-up and only the
// timings of the second pass are printed. Returns EXIT_FAILURE if a CUDA
// error is reported at one of the synchronization points.
int main(){
  unsigned long long t1_time = 0, t2_time = 0;

  // Build the input directly in the host_vector. A separate new[] staging
  // array would double the host footprint and had no matching delete[].
  thrust::host_vector<float4> h_data(DSIZE);
  for (int i = 0; i < DSIZE; i++){
    float4 &p = h_data[i];
    p.x = i;
    p.y = i;
    p.z = i;
    p.w = rand()/(float)RAND_MAX;}

  // do once as a warm-up run, then report timings on second run
  for (int i = 0; i < 2; i++){
    thrust::device_vector<float4> d_data1 = h_data;
    thrust::device_vector<float4> d_data2 = h_data;

  // first time sort using typical thrust approach
  // The stable variants are used for both approaches: the random float keys
  // can repeat, and validate() compares .x element by element, so elements
  // with equal keys must come out in the same relative order in both results.
    t1_time = dtime_usec(0);
    thrust::stable_sort(d_data1.begin(), d_data1.end(), sort_f4_w());
    cudaError_t err = cudaDeviceSynchronize();
    t1_time = dtime_usec(t1_time);
    if (err != cudaSuccess){
      std::cerr << "CUDA error after comparator sort: " << cudaGetErrorString(err) << std::endl;
      return EXIT_FAILURE;}

  // now extract keys and create index values, sort, then rearrange
    t2_time = dtime_usec(0);
    thrust::device_vector<float> keys(DSIZE);
    thrust::device_vector<int> vals(DSIZE);
    thrust::copy(thrust::make_transform_iterator(d_data2.begin(), f4_to_fw()),
                 thrust::make_transform_iterator(d_data2.end(), f4_to_fw()),
                 keys.begin());
    thrust::sequence(vals.begin(), vals.end());
    thrust::stable_sort_by_key(keys.begin(), keys.end(), vals.begin());
    // Gather: result[j] = d_data2[vals[j]], so each float4 moves exactly once.
    thrust::device_vector<float4> result(DSIZE);
    thrust::copy(thrust::make_permutation_iterator(d_data2.begin(), vals.begin()),
                 thrust::make_permutation_iterator(d_data2.begin(), vals.end()),
                 result.begin());
    err = cudaDeviceSynchronize();
    t2_time = dtime_usec(t2_time);
    if (err != cudaSuccess){
      std::cerr << "CUDA error after key/index sort: " << cudaGetErrorString(err) << std::endl;
      return EXIT_FAILURE;}

    if (!validate(d_data1, result)){
      std::cout << "Validation failure " << std::endl;
      }
    }
  std::cout << "thrust t1 time: " << t1_time/(float)USECPSEC << "s, t2 time: " << t2_time/(float)USECPSEC << std::endl;
}


$ nvcc -o t686 t686.cu
$ ./t686
thrust t1 time: 0.731456s, t2 time: 0.149959
$

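$ cat t686.cu
#include <iostream>
#include <vector_types.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/equal.h>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#define DSIZE (32*1048576)
struct sort_f4_w
{
  __host__ __device__
  bool operator()(const float4 &a, const float4 &b) const {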
    return (a.w < b.w);}  // ……此后的代码在提取时丢失（完整程序见上文的 t686.cu），仅残留片段：std::cout。以下为评论内容：在 CUB 或 Thrust 中，我们可以仅对 .w
这一“键”进行排序，即做键值排序，其中的值只是线性递增的索引（CUB 和 Thrust 都提供特殊的迭代器（fancy iterator）来自动生成索引序列）。然后，我们可以利用排序后得到的索引序列，对 float4
数组做一步重排，从而实现按 .w
排序。这样既能保持基数排序的速度（无论使用 CUB 还是 Thrust），又可能相当高效，因为 float4
数据只需移动/重排一次，而不是在排序过程中被反复移动。@RobertCrovella 您能提供一个这样的代码示例吗？我不确定该如何指定 w 分量作为键。这样我也可以把您的回答标记为答案。有没有可能不先创建主机向量、再创建设备向量就做到这一点？例如，我的 float4 数组已经存储在设备内存中（并且需要每帧/每次迭代执行一次）。是的，不需要（单独的）主机向量（h_data
）在我的例子中，除了它是一个方便的