Sorting 使用CUDA/Thrust对多个数组进行排序_Sorting_Vector_Cuda_Gpu_Thrust

Sorting 使用CUDA/Thrust对多个数组进行排序

sorting vector cuda

Sorting 使用CUDA/Thrust对多个数组进行排序,sorting,vector,cuda,gpu,thrust,Sorting,Vector,Cuda,Gpu,Thrust,我有一个需要在GPU上排序的大数组。该数组本身由多个较小的子数组拼接而成，这些子数组满足以下条件：给定 i < j，子数组 i 中的元素都小于子数组 j 中的元素

我有一个需要在GPU上排序的大数组。该数组本身由多个较小的子数组拼接而成，这些子数组满足以下条件：给定 i < j，子数组 i 中的元素都小于子数组 j 中的元素。我知道我可以简单地对整个数组使用

thrust::sort

，但我想知道是否有可能启动多个并发排序，每个子阵列一个。我希望通过这样做来提高性能。我的假设是，排序多个较小的数组比排序一个包含所有元素的大型数组要快

如果有人能给我一个这样做的方法或纠正我的假设，我将不胜感激。

在 Thrust 中执行多个并发排序（“矢量化”排序）的方法是：为每个子数组做标记（给每个元素附加一个子数组键），并提供一个自定义比较函子（functor）；该函子除了像普通的 Thrust 排序函子那样比较元素值之外，还会按键对各子数组进行排序

另一种可能的方法是使用背对背

thrust::stable_sort_by_key

，如前所述

正如你所指出的，在你的案例中，另一种方法就是做一个普通的排序，因为这最终是你的目标

然而，我认为任何一种 Thrust 分段排序方法都不太可能比单纯的整体排序有显著的加速，尽管您可以尝试一下。Thrust 有一条基数排序（radix sort）快速路径，它会在某些情况下被使用，而单纯的整体排序在您的情况下很可能就会走这条路径。（在其他情况下，例如当您提供自定义函子时，Thrust 通常会使用较慢的归并排序（merge sort）方法。）

如果子数组的大小在一定范围内，我认为使用 CUB 的块基数排序（block radix sort，每个子数组对应一个线程块），可能会得到更好的结果（性能方面）

下面是一个使用特定大小的示例（因为您没有给出大小范围和其他详细信息），将 Thrust 的“整体排序”与带函子的 Thrust 分段排序以及 CUB 块排序方法进行比较。对于这种特殊情况，CUB 排序最快：

$ cat t1.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/scan.h>
#include <thrust/equal.h>
#include <cstdlib>
#include <iostream>


#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Benchmark geometry shared by the host driver and the CUB kernel instantiation.
constexpr int num_blocks = 2048;            // number of sub-arrays (also used as the kernel grid size)
constexpr int items_per  = 4;               // elements held by each thread in the block sort
constexpr int nTPB       = 512;             // threads per block
constexpr int block_size = nTPB*items_per;  // elements per sub-array; must be a whole-number multiple of nTPB;
using mt = float;                           // element type being sorted

// Wall-clock helper based on gettimeofday().
// Returns the current time in microseconds minus `start`; call it with 0 to
// obtain a timestamp, then pass that timestamp back in to get the elapsed time.
unsigned long long dtime_usec(unsigned long long start){

  struct timeval now;
  gettimeofday(&now, nullptr);
  const unsigned long long stamp = (now.tv_sec*USECPSEC) + now.tv_usec;
  return stamp - start;
}

// Comparator for the "segmented" thrust::sort over zipped (value, segment-key) tuples.
// Tuple element 1 is the segment key, element 0 is the value: elements are ordered
// by key first and by value within the same key.
//
// The comparison is strictly "less than". The previous version returned true when
// both key and value were equal, which is not a strict weak ordering (comp(a, a)
// must be false) and is therefore not a valid comparator for thrust::sort.
struct my_sort_functor
{
        template <typename T, typename T2>
        __host__ __device__
        bool operator()(T t1, T2 t2) const {
                // Different segments: the segment key alone decides.
                if (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;
                if (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;
                // Same segment: strict comparison on the value (false for equal values).
                return thrust::get<0>(t1) < thrust::get<0>(t2);}
};

// from: https://nvlabs.github.io/cub/example_block_radix_sort_8cu-example.html#_a0
#define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_radix_sort.cuh>
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
// NOTE(review): neither of these flags is referenced anywhere in the visible code;
// they look like leftovers from the CUB example this section was adapted from —
// confirm nothing else uses them before removing.
bool g_verbose = false;
bool g_uniform_keys;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
// Sorts independent tiles of keys, one tile per thread block, using cub::BlockRadixSort.
//
// Template parameters:
//   Key               element type to sort
//   BLOCK_THREADS     threads per block; the launch must use exactly this block size
//   ITEMS_PER_THREAD  keys held in registers by each thread
//
// Parameters:
//   d_in   input keys; block b reads the tile starting at element b * TILE_SIZE
//   d_out  output keys; block b writes its sorted tile to the same offset
//
// Preconditions (not checked here): the kernel has no tail guard, so both buffers
// must hold gridDim.x * BLOCK_THREADS * ITEMS_PER_THREAD elements, i.e. every tile
// is full. Shared memory is statically allocated as the union of the load and sort
// temporary storage.
template <
    typename    Key,
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD>
__launch_bounds__ (BLOCK_THREADS)
__global__ void BlockSortKernel(
    Key         *d_in,          // Tile of input
    Key         *d_out)         // Tile of output
{
    enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
    // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
    typedef cub::BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
    // Specialize BlockRadixSort type for our thread block
    typedef cub::BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
    // Shared memory: load and sort never need their scratch space at the same time,
    // so the two share one allocation.
    __shared__ union TempStorage
    {
        typename BlockLoadT::TempStorage        load;
        typename BlockRadixSortT::TempStorage   sort;
    } temp_storage;
    // Per-thread tile items
    Key items[ITEMS_PER_THREAD];
    // Our current block's offset, computed in 64 bits so that
    // blockIdx.x * TILE_SIZE cannot wrap for very large inputs.
    size_t block_offset = static_cast<size_t>(blockIdx.x) * TILE_SIZE;
    // Load items into a blocked arrangement
    BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
    // Barrier for smem reuse: every thread must be done with the load storage
    // before the sort overwrites the same union.
    __syncthreads();
    // Sort keys; the result is left in a striped arrangement across threads
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
    // Store output in striped fashion
    cub::StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what){
        if (err == cudaSuccess) return true;
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return false;
}

// Benchmark driver: times three ways of sorting num_blocks sub-arrays of
// block_size elements each and checks that all three produce the same result.
//   1. a plain thrust::sort over the whole array,
//   2. a "segmented" thrust::sort over (value, segment-key) tuples with my_sort_functor,
//   3. BlockSortKernel, one thread block per sub-array.
// Returns 0 on success and EXIT_FAILURE when a CUDA call fails or when the
// results disagree.
int main(){
        const int ds = num_blocks*block_size;
        thrust::host_vector<mt>      data(ds);
        thrust::host_vector<int>     keys(ds);
        for (int i = block_size; i < ds; i+=block_size) keys[i] = 1; // mark beginning of blocks
        thrust::device_vector<int> d_keys = keys;
        // Sub-array k only receives values in [k*block_size, (k+1)*block_size), so the
        // concatenation of the individually sorted sub-arrays equals the globally sorted array.
        for (int i = 0; i < ds; i++) data[i] = (rand()%block_size) + (i/block_size)*block_size;  // populate data
        thrust::device_vector<mt>  d_data = data;
        thrust::inclusive_scan(d_keys.begin(), d_keys.end(), d_keys.begin());  // fill out keys array  000111222...

        // --- 1. ordinary sort of the whole array ---
        thrust::device_vector<mt> d1 = d_data;  // make a copy of unsorted data
        if (!cuda_ok(cudaDeviceSynchronize(), "sync before ordinary sort")) return EXIT_FAILURE;
        unsigned long long os = dtime_usec(0);
        thrust::sort(d1.begin(), d1.end());  // ordinary sort
        if (!cuda_ok(cudaDeviceSynchronize(), "ordinary sort")) return EXIT_FAILURE;
        os = dtime_usec(os);

        // --- 2. segmented sort via zip iterator + custom comparator ---
        thrust::device_vector<mt> d2 = d_data;  // make a copy of unsorted data
        if (!cuda_ok(cudaDeviceSynchronize(), "sync before segmented sort")) return EXIT_FAILURE;
        unsigned long long ss = dtime_usec(0);
        thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(d2.begin(), d_keys.begin())), thrust::make_zip_iterator(thrust::make_tuple(d2.end(), d_keys.end())), my_sort_functor());
        if (!cuda_ok(cudaDeviceSynchronize(), "segmented sort")) return EXIT_FAILURE;
        ss = dtime_usec(ss);
        if (!thrust::equal(d1.begin(), d1.end(), d2.begin())) {std::cout << "oops1" << std::endl; return EXIT_FAILURE;}
        std::cout << "ordinary thrust sort: " << os/(float)USECPSEC << "s " << "segmented sort: " << ss/(float)USECPSEC << "s" << std::endl;

        // --- 3. CUB block radix sort, one block per sub-array ---
        thrust::device_vector<mt> d3(ds);
        if (!cuda_ok(cudaDeviceSynchronize(), "sync before cub sort")) return EXIT_FAILURE;
        unsigned long long cs = dtime_usec(0);
        BlockSortKernel<mt, nTPB, items_per><<<num_blocks, nTPB>>>(thrust::raw_pointer_cast(d_data.data()),  thrust::raw_pointer_cast(d3.data()));
        // A bad launch configuration is only reported through cudaGetLastError();
        // faults during execution surface at the following synchronize.
        if (!cuda_ok(cudaGetLastError(), "BlockSortKernel launch")) return EXIT_FAILURE;
        if (!cuda_ok(cudaDeviceSynchronize(), "BlockSortKernel execution")) return EXIT_FAILURE;
        cs = dtime_usec(cs);
        if (!thrust::equal(d1.begin(), d1.end(), d3.begin())) {std::cout << "oops2" << std::endl; return EXIT_FAILURE;}
        std::cout << "cub sort: " << cs/(float)USECPSEC << "s" << std::endl;
        return 0;
}
$ nvcc -o t1 t1.cu
$ ./t1
ordinary thrust sort: 0.001652s segmented sort: 0.00263s
cub sort: 0.000265s
$

$cat t1.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义USECPSEC 10000000ull
const int num_blocks=2048；
const int items_per=4；
常数int nTPB=512；
const int block_size=每*nTPB的项目数；//必须是nTPB的整数倍；
类型定义浮动mt；
无符号长时间dtime\u usec（无符号长时间启动）{
蒂梅瓦尔电视；
gettimeofday（&tv，0）；
返回（（tv.tv_sec*USECPSEC）+tv.tv_usec）-开始；
}
结构我的排序函子
{
模板
__主机设备__
布尔运算符（）（t1，T2）{
if（推力：：get（t1）<推力：：get（t2））返回true；
if（推力：：获取（t1）>推力：：获取（t2））返回false；
if（推力：：获取（t1）>推力：：获取（t2））返回false；
返回true；}
};
//发件人：https://nvlabs.github.io/cub/example_block_radix_sort_8cu-example.html#_a0
#定义CUB_STDERR
#包括
#包括
#包括。没有必要问我这个问题；根据你问题中的信息，我无法回答
我不声明此代码或我发布的任何其他代码的正确性。任何使用我发布的任何代码的人都要自担风险。我只是声称我试图解决原始帖子中的问题，并提供一些解释。我并不是说我的代码是无缺陷的，也不是说它适合任何特定的用途。使用它（或不使用）的风险由您自己承担。
在 Thrust 中没有明显的方法可以做到这一点。在 Thrust 中进行多个并发排序（“矢量化”排序）的方法是：为子数组做标记，并提供一个自定义 functor；该 functor 除了像普通的 Thrust 排序 functor 那样比较元素之外，还会按键对子数组进行排序。您可以使用 thrust::sort_by_key
来帮助实现这一点，但这不是强制性的。然而，我认为它不太可能比单纯的整体排序有明显的加速，尽管您可以尝试一下。如果子数组的大小在一定范围内，我认为使用块基数排序（CUB 的 block radix sort，每个子数组一个线程块）可能会得到更好的结果（性能方面）。@RobertCrovella: 如何使用 sort_by_key？很抱歉，那是一条有误导性的评论。我当时想到的是如上文所述的背对背（back-to-back）stable_sort_by_key。使用 sort_by_key 方法的优点是，它可以让 Thrust 使用基数排序快速路径；我认为排序 functor 方法做不到这一点。此外，sort_by_key 方法实际上并不会利用子数组之间已有的顺序。如果您的目的是进行并行（即矢量化）排序，那么即使子数组之间没有这种顺序，它也能正常工作。事实上，现在想来，可以说我建议的两种方法都没有利用子数组之间已有的顺序，它们只是矢量化的排序。