Sorting 如何使用cuda计算矩阵每行中元素的顺序?

Sorting 如何使用cuda计算矩阵每行中元素的顺序?,sorting,cuda,thrust,Sorting,Cuda,Thrust,我正在寻找如何使用cuda/沿矩阵的行或列进行argsort 这意味着给定一个矩阵,如: A = [[ 3.4257, -1.2345, 0.6232, -0.1354], [-1.6639, 0.1557, -0.1763, 1.0257], [0.6863, 0.0992, 1.4487, 0.0157]]. 我需要计算每行中元素的顺序,因此输出为: index = [[1, 3, 2, 0], [0, 2, 1, 3],

我正在寻找如何使用 CUDA/Thrust 沿矩阵的行或列进行 argsort。

这意味着给定一个矩阵,如:

A = [[ 3.4257, -1.2345,  0.6232, -0.1354], 
     [-1.6639,  0.1557, -0.1763,  1.0257], 
     [0.6863,  0.0992,  1.4487,  0.0157]].
我需要计算每行中元素的顺序,因此输出为:

index = [[1, 3, 2, 0],
         [0, 2, 1, 3],
         [3, 1, 0, 2]]

我怎样才能做到这一点呢?

这可以通过 thrust::sort 来实现。我们需要一组行索引和一组列索引。行索引用于保证排序按行分段进行(不同行的元素不会互相混在一起);列索引在排序之后就是我们想要的结果。

用 zip_iterator 将值、行索引、列索引压缩(zip)在一起。创建一个排序函子,先按行索引比较,行相同时再按值比较。排序后重新排列的列索引就是输出。

$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

using namespace thrust::placeholders;

// Comparator for zipped (value, row index, column index) tuples.
// Orders ascending by row index (tuple element 1) and, for equal row
// indices, ascending by value (tuple element 0). Tuple element 2 is never
// inspected here.
struct my_sort_functor
{
  // T1/T2: any types for which thrust::get<0> and thrust::get<1> are valid.
  // Returns true when t1 must be ordered before t2.
  // const-qualified so the comparator can also be invoked through a const
  // object or const reference.
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(const T1 &t1, const T2 &t2) const {
    if  (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;   // lower row index first
    if  (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;  // higher row index last
    return thrust::get<0>(t1) < thrust::get<0>(t2);              // same row: compare values
  }
};

// mt: matrix element type; it: integer type used for the row/column indices.
typedef float mt;
typedef int it;

// Row-wise argsort demo. Builds a row index and a column index for every
// element of a row-major matrix, sorts (value, row, column) triples by row
// and then by value, and prints the reordered column indices separated by
// commas.
int main(){

  const int rows = 3;
  const int cols = 4;
  const int count = rows*cols;
  // Row-major rows x cols input matrix.
  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};

  thrust::device_vector<mt> d_values(A, A+count);
  thrust::device_vector<it> d_row(d_values.size());
  thrust::device_vector<it> d_col(d_values.size());

  // Start both index vectors as 0,1,2,... and then derive row = i / cols
  // and column = i % cols in place.
  thrust::sequence(d_row.begin(), d_row.end());
  thrust::sequence(d_col.begin(), d_col.end());
  thrust::transform(d_row.begin(), d_row.end(), d_row.begin(), _1/cols);
  thrust::transform(d_col.begin(), d_col.end(), d_col.begin(), _1%cols);

  // Sorting the zipped triples permutes all three vectors together.
  auto zipped_begin = thrust::make_zip_iterator(
      thrust::make_tuple(d_values.begin(), d_row.begin(), d_col.begin()));
  auto zipped_end = zipped_begin + count;
  thrust::sort(zipped_begin, zipped_end, my_sort_functor());

  // Copy the permuted column indices back to the host and print them.
  thrust::host_vector<it> h_col = d_col;
  for (int k = 0; k < count; ++k) std::cout << h_col[k] << ",";
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$
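$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

using namespace thrust::placeholders;

struct my_sort_functor
{
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(const T1 &t1, const T2 &t2){
    if  (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;
    if  (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;
    if  (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    return false;
  }
};

typedef float mt;
typedef int it;

int main(){

  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> row_idx(d_A.size());
  thrust::device_vector<it> col_idx(d_A.size());
  thrust::sequence(row_idx.begin(), row_idx.end());
  thrust::sequence(col_idx.begin(), col_idx.end());
  thrust::transform(row_idx.begin(), row_idx.end(), row_idx.begin(), _1/cols);
  thrust::transform(col_idx.begin(), col_idx.end(), col_idx.begin(), _1%cols);
  auto my_zip_iterator = thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(), row_idx.begin(), col_idx.begin()));
  thrust::sort(my_zip_iterator, my_zip_iterator+rows*cols, my_sort_functor());
  thrust::host_vector<it> h_col_idx = col_idx;
  thrust::copy_n(h_col_idx.begin(), rows*cols, std::ostream_iterator<it>(std::cout, ","));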
  std::cout << std::endl;
}

@RobertCrovella:这是按该行的值升序排列得到的正确 argsort:
[-1.2345, -0.1354, 0.6232, 3.4257]
而且由于 Thrust 没有提供按键分段排序(segmented sort by key),所以没有直接的方法可以做到这一点。是的,你或许可以借助稳定排序(stable sort)拼凑出一个方案,但并没有现成的直接实现。
是的,是我弄混了。
$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <iostream>

using namespace thrust::placeholders;
// mt: matrix element type; it: integer type of the flat element indices.
typedef float mt;
typedef int it;

// Comparator over flat indices into a row-major matrix.
// An index i belongs to row i / cols. Indices are ordered ascending by row
// and, within the same row, ascending by the value d[i].
struct my_sort_functor
{
  mt *d;    // matrix values, indexed by flat index (not owned)
  it cols;  // number of columns per row
  // _d: pointer to the matrix values; _cols: number of columns per row.
  __host__ __device__
  my_sort_functor(mt *_d, it _cols) : d(_d), cols(_cols) {}
  // Returns true when flat index t1 must be ordered before flat index t2.
  // const-qualified so the comparator can also be invoked through a const
  // object or const reference.
  __host__ __device__
  bool operator()(const it &t1, const it &t2) const {
    const it row1 = t1/cols;
    const it row2 = t2/cols;
    if  (row1 != row2) return row1 < row2;  // different rows: row order decides
    return d[t1] < d[t2];                   // same row: compare the values
  }
};

// Row-wise argsort demo using a single index vector. Sorts the flat element
// indices with a comparator that looks up the matrix values, reduces every
// sorted flat index to its column (index % cols), and prints the columns
// separated by commas.
int main(){

  const int rows = 3;
  const int cols = 4;
  const int total = rows*cols;
  // Row-major rows x cols input matrix.
  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};

  thrust::device_vector<mt> d_values(A, A+total);
  // The comparator reads the values through a raw pointer.
  mt *raw_values = thrust::raw_pointer_cast(d_values.data());

  // order starts as 0,1,2,... and is sorted by (row, value).
  thrust::device_vector<it> order(d_values.size());
  thrust::sequence(order.begin(), order.end());
  thrust::sort(order.begin(), order.end(), my_sort_functor(raw_values, cols));

  // Turn each flat index into its column index.
  thrust::transform(order.begin(), order.end(), order.begin(), _1%cols);

  thrust::host_vector<it> h_order = order;
  for (int k = 0; k < total; ++k) std::cout << h_order[k] << ",";
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$
$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>

using namespace thrust::placeholders;
// mt: matrix element type; it: unsigned type holding a packed (row, col) key.
typedef float mt;
typedef unsigned it;

// Comparator over packed keys of the form (row << 16) + col.
// Keys are ordered ascending by row (upper bits) and, within the same row,
// ascending by the matrix value d[row*cols + col].
struct my_sort_functor
{
  mt *d;    // matrix values in row-major order (not owned)
  it cols;  // number of columns per row
  // _d: pointer to the matrix values; _cols: number of columns per row.
  __host__ __device__
  my_sort_functor(mt *_d, it _cols) : d(_d), cols(_cols) {}
  // Returns true when packed key t1 must be ordered before packed key t2.
  // const-qualified so the comparator can also be invoked through a const
  // object or const reference.
  __host__ __device__
  bool operator()(const it &t1, const it &t2) const {
    const it row1 = t1>>16;
    const it row2 = t2>>16;
    if  (row1 != row2) return row1 < row2;  // different rows: row order decides
    // Same row from here on, so both elements share one row offset.
    const it base = row1*cols;
    const it col1 = t1&65535;               // low 16 bits hold the column
    const it col2 = t2&65535;
    return d[base+col1] < d[base+col2];
  }
};

// Converts a flat row-major index into a packed key (row << 16) + col,
// where row = index / cols and col = index - row*cols.
struct my_transform_functor
{
  it cols;  // number of columns per row
  // _cols: number of columns per row.
  __host__ __device__
  my_transform_functor(it _cols) : cols(_cols) {}
  // Returns the packed key for flat index t1.
  // const-qualified so the functor can also be invoked through a const
  // object or const reference.
  __host__ __device__
    it operator()(const it &t1) const {
      const it row = t1/cols;
      const it col = t1 - row*cols;  // remainder, reusing the quotient above
      return (row << 16) + col;
    }
};

// Row-wise argsort demo using packed keys. Every flat index is converted to
// (row << 16) + col, the keys are sorted by (row, value), and the low 16 bits
// (the column) of every sorted key are printed, separated by commas.
int main(){

  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  // The packed key keeps the column in the low 16 bits and the row in the
  // bits above, so both dimensions must be below 65536. Checked at compile
  // time instead of being only an unverified assumption.
  static_assert(rows > 0 && rows < 65536, "rows must be in [1, 65535] for 16-bit key packing");
  static_assert(cols > 0 && cols < 65536, "cols must be in [1, 65535] for 16-bit key packing");
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> idx(d_A.size());
  // idx starts as 0,1,2,... and is then rewritten in place as packed keys.
  thrust::sequence(idx.begin(), idx.end());
  thrust::transform(idx.begin(), idx.end(), idx.begin(), my_transform_functor(cols));
  thrust::sort(idx.begin(), idx.end(), my_sort_functor(thrust::raw_pointer_cast(d_A.data()), cols));
  thrust::host_vector<it> h_idx = idx;
  // Masking with 65535 extracts the column from each packed key.
  for (int i = 0; i < rows*cols; i++) std::cout << (h_idx[i]&65535) << ",";
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$