Sorting 如何使用Thrust对矩阵的行进行排序？_Sorting_Cuda_Thrust

Sorting 如何使用Thrust对矩阵的行进行排序？

sorting cuda

Sorting 如何使用Thrust对矩阵的行进行排序？,sorting,cuda,thrust,Sorting,Cuda,Thrust,我有一个5000x500矩阵，我想用cuda分别对每一行进行排序。我可以使用arrayfire，但这只是在thrust::sort外面套一个for循环，效率应该不高 for (size_type w = 0; w

我有一个5000x500矩阵，我想用cuda分别对每一行进行排序。我可以使用arrayfire，但这只是在thrust::sort外面套一个for循环，效率应该不高。

for (size_type w = 0; w …（原代码片段在此处被截断）


有没有一种方法可以在Thrust中把多个操作（排序）融合在一起，从而使它们并行运行？事实上，我所寻找的是一种把for循环的各次迭代融合在一起的通用方法。
我可以想到两种可能性，其中一种已经由@JaredHoberock提出。我不知道有什么通用方法可以在Thrust中融合循环迭代，但第二种方法是更通用的方法。我的猜测是，在这种情况下，第一种方法是两种方法中速度更快的。
1. 使用矢量化排序。如果嵌套for循环要排序的各个区域互不重叠，则可以使用前面讨论过的2次背对背稳定排序（stable sort）操作来进行矢量化排序。

2. Thrust v1.8（随CUDA 7 RC提供，或可直接从Thrust的GitHub仓库下载）支持嵌套调用Thrust算法，方法是在传递给另一个Thrust算法的自定义函子中包含Thrust算法调用。如果您使用thrust::for_each操作来选择需要执行的各个排序，那么只要在传递给thrust::for_each的函子中包含thrust::sort操作，就可以通过一次Thrust算法调用来运行这些排序。

下面是3种方法之间的全面比较：

原始的循环排序方法

矢量化/批量排序

嵌套排序

在每种情况下，我们都对相同的16000组数据进行排序，每组1000个整数。

$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

#define NSORTS 16000
#define DSIZE 1000

// Count of values handed out so far by my_mod().
int my_mod_start = 0;

// Stateful generator: the k-th call (k counted from 0) yields k / DSIZE, so
// every run of DSIZE consecutive calls produces the same segment number.
int my_mod(){
  const int issued = my_mod_start;
  my_mod_start = issued + 1;
  return issued / DSIZE;
}

// Returns true when d1 and d2 hold the same number of elements and every
// corresponding pair of elements compares equal; false otherwise.
// The size test comes first because thrust::equal is given only d2.begin():
// it reads d1.size() elements from d2, which would run past the end of a
// shorter d2, and it would accept a longer d2 whose prefix happens to match.
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2){
  if (d1.size() != d2.size()) return false;
  return thrust::equal(d1.begin(), d1.end(), d2.begin());
}


// Functor that sorts one fixed-length segment of a flat device array in place.
//   data  : device pointer to the first element of the whole array
//   dsize : number of ints in each segment
// Calling f(k) sorts the half-open range [data + k*dsize, data + (k+1)*dsize)
// with thrust::sort under the thrust::device execution policy.
struct sort_functor
{
  thrust::device_ptr<int> data;
  int dsize;
  __host__ __device__
  void operator()(int start_idx)
  {
    // Form the element offsets in 64-bit arithmetic: an int product
    // dsize*start_idx would overflow once the array exceeds INT_MAX elements.
    const long long first = static_cast<long long>(dsize) * start_idx;
    const long long last = first + dsize;
    thrust::sort(thrust::device, data + first, data + last);
  }
};



#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Reads the current time with gettimeofday, converts it to microseconds and
// returns that value minus `start`.
// Pass 0 to obtain a timestamp; pass an earlier timestamp to obtain the
// number of microseconds elapsed since it was taken.
unsigned long long dtime_usec(unsigned long long start){
  timeval now;
  gettimeofday(&now, 0);
  unsigned long long usec = now.tv_sec*USECPSEC;  // whole seconds -> microseconds
  usec += now.tv_usec;                            // plus the sub-second part
  return usec - start;
}

// Prints a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success, false on failure.
static bool cuda_ok(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Times three ways of sorting NSORTS independent segments of DSIZE ints each
// (all starting from the same random data) and cross-checks the results:
//   1. one host-side thrust::sort call per segment,
//   2. two back-to-back stable_sort_by_key passes over the whole array,
//   3. a single thrust::for_each whose functor calls thrust::sort per segment.
// Returns 0 once all three timings are printed (a result mismatch is reported
// on stdout but does not change the exit status); returns 1 if one of the
// checked CUDA runtime calls fails.
int main(){
  // Raise the device malloc heap limit.
  // NOTE(review): presumably sized for temporary storage needed by the
  // device-side thrust::sort calls of the nested variant -- confirm the 16x factor.
  if (!cuda_ok(cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16*DSIZE*NSORTS)), "cudaDeviceSetLimit")) return 1;
  thrust::host_vector<int> h_data(DSIZE*NSORTS);
  thrust::generate(h_data.begin(), h_data.end(), rand);
  thrust::device_vector<int> d_data = h_data;

  // first time a loop
  thrust::device_vector<int> d_result1 = d_data;
  thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
  unsigned long long mytime = dtime_usec(0);   // timestamp at start
  for (int i = 0; i < NSORTS; i++)
    thrust::sort(r1ptr+(i*DSIZE), r1ptr+((i+1)*DSIZE));
  // Synchronize before reading the clock, and surface any execution error
  // instead of printing a timing for work that failed.
  if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (loop)")) return 1;
  mytime = dtime_usec(mytime);                 // elapsed microseconds
  std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;

  //vectorized sort
  thrust::device_vector<int> d_result2 = d_data;
  // Segment id of every element, filled by the my_mod generator.
  thrust::host_vector<int> h_segments(DSIZE*NSORTS);
  thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
  thrust::device_vector<int> d_segments = h_segments;
  mytime = dtime_usec(0);
  // Pass 1: sort all values, carrying each value's segment id along with it.
  thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
  // Pass 2: stable sort by segment id regroups the values per segment while
  // keeping the value order established by pass 1 inside each segment.
  thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
  if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (vectorized)")) return 1;
  mytime = dtime_usec(mytime);
  std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;
  //nested sort
  thrust::device_vector<int> d_result3 = d_data;
  sort_functor f = {d_result3.data(), DSIZE};
  // One index per segment; for_each hands each index to the functor.
  thrust::device_vector<int> idxs(NSORTS);
  thrust::sequence(idxs.begin(), idxs.end());
  mytime = dtime_usec(0);
  thrust::for_each(idxs.begin(), idxs.end(), f);
  if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (nested)")) return 1;
  mytime = dtime_usec(mytime);
  std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
  return 0;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$

$cat t617.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#1.6万吨
#定义DSIZE 1000
int my_mod_start=0；
int my_mod（）{
返回（my_mod_start++）/DSIZE；
}
bool验证（推力：装置矢量和d1，推力：装置矢量和d2）{
返回推力：：相等（d1.begin（），d1.end（），d2.begin（））；
}
结构排序函数
{
推力：设备ptr数据；
int-dsize；
__主机设备__
void运算符（）（int start_idx）
{
推力：：排序（推力：：设备，数据+（dsize*start\u idx），数据+（dsize*（start\u idx+1））；
}
};
#包括
#包括
#定义USECPSEC 10000000ull
无符号长时间dtime\u usec（无符号长时间启动）{
蒂梅瓦尔电视；
gettimeofday（&tv，0）；
返回（（tv.tv_sec*USECPSEC）+tv.tv_usec）-开始；
}
int main（）{
cudaDeviceSetLimit（cudaLimitMallocHeapSize，（16*DSIZE*传感器））；
推力：主机向量h_数据（DSIZE*NSORTS）；
生成（h_data.begin（），h_data.end（），rand）；
推力：：设备矢量d_数据=h_数据；
//第一次循环
推力：：设备向量d_结果1=d_数据；
推力：：设备\u ptr r1ptr=推力：：设备\u指针\u转换（d_result1.data（））；
无符号long long mytime=dtime\u usec（0）；
for (int i = 0; i …（机器翻译的代码副本在此处被截断）
你能采用这种方法吗？我会尝试在thrust::for_each调用的内部嵌套一个thrust::sort调用。我正在尝试理解这两种方法……谢谢。好吧！我放弃了。我会用简单的方法来做。非常感谢！你不仅回答了我的问题，还向我展示了如何做很多事情。我必须补充一点，我看到了这一点：并且意识到普通的thrust排序比任何其他方法都要快（除了手工编写的并行排序方法）。我的数据是无符号的，并且数据的最高16位全为零。因此，我只是把行号放进最高的16位，然后对整个数组排序。嗨，你知道如何沿矩阵的行进行argsort吗？也就是求出矩阵每行元素的排序顺序（索引），而不是排序后的矩阵。
$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

#define NSORTS 16000
#define DSIZE 1000

// Count of values handed out so far by my_mod().
int my_mod_start = 0;

// Stateful generator: the k-th call (k counted from 0) yields k / DSIZE, so
// every run of DSIZE consecutive calls produces the same segment number.
int my_mod(){
  const int issued = my_mod_start;
  my_mod_start = issued + 1;
  return issued / DSIZE;
}

// Returns true when d1 and d2 hold the same number of elements and every
// corresponding pair of elements compares equal; false otherwise.
// The size test comes first because thrust::equal is given only d2.begin():
// it reads d1.size() elements from d2, which would run past the end of a
// shorter d2, and it would accept a longer d2 whose prefix happens to match.
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2){
  if (d1.size() != d2.size()) return false;
  return thrust::equal(d1.begin(), d1.end(), d2.begin());
}


// Functor that sorts one fixed-length segment of a flat device array in place.
//   data  : device pointer to the first element of the whole array
//   dsize : number of ints in each segment
// Calling f(k) sorts the half-open range [data + k*dsize, data + (k+1)*dsize)
// with thrust::sort under the thrust::device execution policy.
struct sort_functor
{
  thrust::device_ptr<int> data;
  int dsize;
  __host__ __device__
  void operator()(int start_idx)
  {
    // Form the element offsets in 64-bit arithmetic: an int product
    // dsize*start_idx would overflow once the array exceeds INT_MAX elements.
    const long long first = static_cast<long long>(dsize) * start_idx;
    const long long last = first + dsize;
    thrust::sort(thrust::device, data + first, data + last);
  }
};



#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Reads the current time with gettimeofday, converts it to microseconds and
// returns that value minus `start`.
// Pass 0 to obtain a timestamp; pass an earlier timestamp to obtain the
// number of microseconds elapsed since it was taken.
unsigned long long dtime_usec(unsigned long long start){
  timeval now;
  gettimeofday(&now, 0);
  unsigned long long usec = now.tv_sec*USECPSEC;  // whole seconds -> microseconds
  usec += now.tv_usec;                            // plus the sub-second part
  return usec - start;
}

// Prints a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success, false on failure.
static bool cuda_ok(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Times three ways of sorting NSORTS independent segments of DSIZE ints each
// (all starting from the same random data) and cross-checks the results:
//   1. one host-side thrust::sort call per segment,
//   2. two back-to-back stable_sort_by_key passes over the whole array,
//   3. a single thrust::for_each whose functor calls thrust::sort per segment.
// Returns 0 once all three timings are printed (a result mismatch is reported
// on stdout but does not change the exit status); returns 1 if one of the
// checked CUDA runtime calls fails.
int main(){
  // Raise the device malloc heap limit.
  // NOTE(review): presumably sized for temporary storage needed by the
  // device-side thrust::sort calls of the nested variant -- confirm the 16x factor.
  if (!cuda_ok(cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16*DSIZE*NSORTS)), "cudaDeviceSetLimit")) return 1;
  thrust::host_vector<int> h_data(DSIZE*NSORTS);
  thrust::generate(h_data.begin(), h_data.end(), rand);
  thrust::device_vector<int> d_data = h_data;

  // first time a loop
  thrust::device_vector<int> d_result1 = d_data;
  thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
  unsigned long long mytime = dtime_usec(0);   // timestamp at start
  for (int i = 0; i < NSORTS; i++)
    thrust::sort(r1ptr+(i*DSIZE), r1ptr+((i+1)*DSIZE));
  // Synchronize before reading the clock, and surface any execution error
  // instead of printing a timing for work that failed.
  if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (loop)")) return 1;
  mytime = dtime_usec(mytime);                 // elapsed microseconds
  std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;

  //vectorized sort
  thrust::device_vector<int> d_result2 = d_data;
  // Segment id of every element, filled by the my_mod generator.
  thrust::host_vector<int> h_segments(DSIZE*NSORTS);
  thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
  thrust::device_vector<int> d_segments = h_segments;
  mytime = dtime_usec(0);
  // Pass 1: sort all values, carrying each value's segment id along with it.
  thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
  // Pass 2: stable sort by segment id regroups the values per segment while
  // keeping the value order established by pass 1 inside each segment.
  thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
  if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (vectorized)")) return 1;
  mytime = dtime_usec(mytime);
  std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;
  //nested sort
  thrust::device_vector<int> d_result3 = d_data;
  sort_functor f = {d_result3.data(), DSIZE};
  // One index per segment; for_each hands each index to the functor.
  thrust::device_vector<int> idxs(NSORTS);
  thrust::sequence(idxs.begin(), idxs.end());
  mytime = dtime_usec(0);
  thrust::for_each(idxs.begin(), idxs.end(), f);
  if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (nested)")) return 1;
  mytime = dtime_usec(mytime);
  std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
  return 0;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$