Sorting CUDA中按键排序（小）数组_Sorting_Cuda_Parallel Processing_Reduce_Cub

Sorting CUDA中按键排序（小）数组

sorting cuda parallel-processing

Sorting CUDA中按键排序（小）数组,sorting,cuda,parallel-processing,reduce,cub,Sorting,Cuda,Parallel Processing,Reduce,Cub,我正在尝试编写一个函数，它接受一块未排序的键/值对，例如 <7, 4> <2, 8> <3, 1> <2, 2> <1, 5> <7, 1> <3, 8> <7, 2> 这对于较小的数据集来说效果很好，但是对于较大的数据集（尽管仍然在单个块的大小内），单次调用就无法做到这一点尝试在同一个函数中结合排序和归约是否明智？显然，该函数需要多次调用，但是否可以根据其大小确定需要调用多少次以耗尽所有数据或

我正在尝试编写一个函数，它接受一块未排序的键/值对，例如

<7, 4>
<2, 8>
<3, 1>
<2, 2>
<1, 5>
<7, 1>
<3, 8>
<7, 2>

这对于较小的数据集来说效果很好，但是对于较大的数据集（尽管仍然在单个块的大小内），单次调用就无法做到这一点

尝试在同一个函数中结合排序和归约是否明智？显然，该函数需要多次调用，但是否可以根据其大小确定需要调用多少次以耗尽所有数据

或者我应该用这样的东西单独进行还原：

__device__ int interReduce(int2 *sdata, int tid) {
  int index = tid;
  while (sdata[index].x == sdata[tid].x) {
    index--;
    if (index < 0)
      break;
  }
  if (index+1 != tid) {
    atomicAdd(&sdata[index+1].y, sdata[tid].y);
    sdata[tid].x = 99;
    sdata[tid].y = 99;
    return 1;
  }
  return 0;
}

\uuuuu设备\uuuuuu中断（int2*sdata，inttid）{
综合指数=tid；
while（sdata[index].x==sdata[tid].x）{
索引--；
如果（指数<0）
打破
}
如果（指数+1！=tid）{
原子添加（&sdata[index+1].y，sdata[tid].y）；
sdata[tid].x=99；
sdata[tid].y=99；
返回1；
}
返回0；
}

我正试图找到最有效的解决方案，但我在CUDA和并行算法方面的经验有限。

你可以用它来实现

使用后接

下面是一个例子：

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#define N 12
typedef thrust::device_vector<int>::iterator dintiter;
int main(){

  thrust::device_vector<int> keys(N);
  thrust::device_vector<int> values(N);
  thrust::device_vector<int> new_keys(N);
  thrust::device_vector<int> new_values(N);
  thrust::sequence(keys.begin(), keys.end());
  thrust::sequence(values.begin(), values.end());

  keys[3] = 1;
  keys[9] = 1;
  keys[8] = 2;
  keys[7] = 4;

  thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
  thrust::pair<dintiter, dintiter> new_end;
  new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin());

  std::cout << "results  values:" << std::endl;
  thrust::copy(new_values.begin(), new_end.second, std::ostream_iterator<int>( std::cout, " "));
  std::cout << std::endl << "results keys:" << std::endl;
  thrust::copy(new_keys.begin(), new_end.first, std::ostream_iterator<int>( std::cout, " "));
  std::cout << std::endl;

  return 0;
}

#包括
#包括
#包括
#包括
#包括
#包括
#定义N 12
typedef推力：：设备向量：：迭代器dintiter；
int main（）{
推力：：设备_矢量键（N）；
推力：设备_矢量值（N）；
推力：：设备矢量新按键（N）；
推力：：装置矢量新值（N）；
顺序（keys.begin（），keys.end（））；
序列（values.begin（），values.end（））；
键[3]=1；
键[9]=1；
键[8]=2；
键[7]=4；
推力：：按键排序（keys.begin（）、keys.end（）、values.begin（））；
推力：配对新的_端；
new_end=推力：：按_键减少_（keys.begin（），keys.end（），values.begin（），new_keys.begin（），new_values.begin（））；
std：：cout从您的帖子中，似乎您需要按键对许多小数组进行排序。引用您自己的话：
这对于较小的数据集来说效果很好，但是对于较大的数据集（尽管仍然在单个块的大小内），单次调用就无法做到这一点
下面你会发现一个完整的例子围绕我的答案和使用
#包括
#包括
#包括
#包括“Utilities.cuh”
使用名称空间cub；
/**********************************/
/*CUB BLOCKSORT内核没有共享*/
/**********************************/
模板
__全局无效BlockSortKernel（浮点*d_值、整数*d_键、浮点*d_值结果、整数*d_键结果）
{
//---专门化BlockLoad、BlockStore和BlockRadixSort集合类型
typedef cub:：BlockLoad BlockLoadIntT；
typedef cub:：BlockLoad blockloadfloat；
typedef cub:：BlockStore BlockStoreIntT；
typedef cub:：BlockStore blockstorefloat；
typedef cub:：BlockRadixSort BlockRadixSort；
//---为集体分配类型安全、可重复利用的共享内存
__共享联合{
typename BlockLoadInt:：TempStorage loadInt；
typename BlockLoadFloat:：TempStorage loadFloat；
typename BlockStoreInt:：TempStorage storeInt；
typename BlockStoreFloat:：TempStorage storeFloat；
typename BlockRadixSortT:：TempStorage排序；
}临时储存；
//---获取此块的连续键段（跨线程阻塞）
int thread_key[每个线程的项目数]；
浮动线程_值[每个线程的项目_]；
int block_offset=blockIdx.x*（块线程*每个线程的项目）；
BlockLoadIntT（temp_storage.loadInt）.Load（d_键+块偏移，线程键）；
blockloadfloat（temp_storage.loadFloat）.Load（d_值+block_偏移量，线程_值）；
__同步线程（）；
//---对键进行集体排序
BlockRadixSortT（temp_storage.sort）.SortBlockedToStriped（线程键、线程值）；
__同步线程（）；
//---存储已排序的段
BlockStoreIntT（temp_storage.storeInt）.Store（d_key_result+block_offset，thread_key）；
blockstorefloat（temp\u storage.storeFloat）.Store（d\u值\u结果+块\u偏移，线程\u值）；
}
/*******************************/
/*CUB块排序内核共享*/
/*******************************/
模板
__全局\uuuuu无效共享\u块内核（浮点*d\u值、整数*d\u键、浮点*d\u值\u结果、整数*d\u键\u结果）
{
//---共享内存分配
__共享\uuuuufloat SharedMemoryArrayValue[块线程*每个线程的项目]；
__shared_uuuint sharedMemoryArrayKeys[块线程*每个线程的项目]；
//---专门化BlockStore和BlockRadixSort集合类型
typedef cub:：BlockRadixSort BlockRadixSort；
//---为集体分配类型安全、可重复利用的共享内存
__共享类型名BlockRadixSortT:：临时存储临时存储；
int block_offset=blockIdx.x*（块线程*每个线程的项目）；
//---将数据加载到共享内存
对于（int k=0；k__device__ int interReduce(int2 *sdata, int tid) {
  int index = tid;
  while (sdata[index].x == sdata[tid].x) {
    index--;
    if (index < 0)
      break;
  }
  if (index+1 != tid) {
    atomicAdd(&sdata[index+1].y, sdata[tid].y);
    sdata[tid].x = 99;
    sdata[tid].y = 99;
    return 1;
  }
  return 0;
}

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#define N 12
typedef thrust::device_vector<int>::iterator dintiter;
int main(){

  thrust::device_vector<int> keys(N);
  thrust::device_vector<int> values(N);
  thrust::device_vector<int> new_keys(N);
  thrust::device_vector<int> new_values(N);
  thrust::sequence(keys.begin(), keys.end());
  thrust::sequence(values.begin(), values.end());

  keys[3] = 1;
  keys[9] = 1;
  keys[8] = 2;
  keys[7] = 4;

  thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
  thrust::pair<dintiter, dintiter> new_end;
  new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin());

  std::cout << "results  values:" << std::endl;
  thrust::copy(new_values.begin(), new_end.second, std::ostream_iterator<int>( std::cout, " "));
  std::cout << std::endl << "results keys:" << std::endl;
  thrust::copy(new_keys.begin(), new_end.first, std::ostream_iterator<int>( std::cout, " "));
  std::cout << std::endl;

  return 0;
}

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/**********************************/
/* CUB BLOCKSORT KERNEL NO SHARED */
/**********************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
    typedef cub::BlockLoad      <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadIntT;
    typedef cub::BlockLoad      <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadFloatT;
    typedef cub::BlockStore     <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreIntT;
    typedef cub::BlockStore     <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreFloatT;
    typedef cub::BlockRadixSort <int ,   BLOCK_THREADS, ITEMS_PER_THREAD, float>                 BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ union {
        typename BlockLoadIntT      ::TempStorage loadInt;
        typename BlockLoadFloatT    ::TempStorage loadFloat;
        typename BlockStoreIntT     ::TempStorage storeInt;
        typename BlockStoreFloatT   ::TempStorage storeFloat;
        typename BlockRadixSortT    ::TempStorage sort;
    } temp_storage;

    // --- Obtain this block's segment of consecutive keys (blocked across threads)
    int   thread_keys[ITEMS_PER_THREAD];
    float thread_values[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    BlockLoadIntT(temp_storage.loadInt).Load(d_keys   + block_offset, thread_keys);
    BlockLoadFloatT(temp_storage.loadFloat).Load(d_values + block_offset, thread_values);
    __syncthreads(); 

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(thread_keys, thread_values);
    __syncthreads(); 

    // --- Store the sorted segment
    BlockStoreIntT(temp_storage.storeInt).Store(d_keys_result   + block_offset, thread_keys);
    BlockStoreFloatT(temp_storage.storeFloat).Store(d_values_result + block_offset, thread_values);

}

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValues[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize BlockStore and BlockRadixSort collective types
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD, float>  BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValues[threadIdx.x * ITEMS_PER_THREAD + k] = d_values[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys[threadIdx.x * ITEMS_PER_THREAD + k]   = d_keys[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)  [ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys   + (threadIdx.x * ITEMS_PER_THREAD))),
                                                       *static_cast<float(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayValues + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_result[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValues[threadIdx.x * ITEMS_PER_THREAD + k];
        d_keys_result  [block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys  [threadIdx.x * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_values = (float *)malloc(N * sizeof(float));
    int *h_keys     = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_values[i] = rand() % RANGE;
        h_keys[i]   = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys[k * numElemsPerArray + i], h_values[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_result1 = (float *)malloc(N * sizeof(float));
    float *h_values_result2 = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_values;            gpuErrchk(cudaMalloc((void **)&d_values,         N * sizeof(float)));
    int   *d_keys;              gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_result1;    gpuErrchk(cudaMalloc((void **)&d_values_result1, N * sizeof(float)));
    float *d_values_result2;    gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;      gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;      gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel no shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_values, d_keys, d_values_result1, d_keys_result1); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_result1, d_values_result1, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel no shared\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_result1[k * numElemsPerArray + i]);

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_values, d_keys, d_values_result2, d_keys_result2); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_result2, d_values_result2, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result2,   d_keys_result2,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel shared\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result2[k * numElemsPerArray + i], h_values_result2[k * numElemsPerArray + i]);

    return 0;
}

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize BlockStore and BlockRadixSort collective types
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD, int>    BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [threadIdx.x * ITEMS_PER_THREAD + k] = d_valuesA[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayValuesB [threadIdx.x * ITEMS_PER_THREAD + k] = d_valuesB[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys    [threadIdx.x * ITEMS_PER_THREAD + k] = d_keys   [block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k] =                          threadIdx.x * ITEMS_PER_THREAD + k ;
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys     + (threadIdx.x * ITEMS_PER_THREAD))),
                                                       *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_resultA[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesA[sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k]];
        d_values_resultB[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesB[sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k]];
        d_keys_result   [block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys                             [threadIdx.x * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA    = (float *)malloc(N * sizeof(float));
    float *h_valuesB    = (float *)malloc(N * sizeof(float));
    int *h_keys         = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA  = (float *)malloc(N * sizeof(float));
    float *h_values_resultB  = (float *)malloc(N * sizeof(float));
    float *h_values_result2  = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1    = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2    = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;           gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;           gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;              gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA;    gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB;    gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    float *d_values_result2;    gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;      gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;      gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    return 0;
}

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCKSIZE_X, int BLOCKSIZE_Y, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys    [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];

    // --- Specialize BlockStore and BlockRadixSort collective types
    typedef cub::BlockRadixSort <int , BLOCKSIZE_X, ITEMS_PER_THREAD, int, 4, false, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCKSIZE_Y> BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_valuesA[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryArrayValuesB [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_valuesB[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys    [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_keys   [block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] =                          (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k ;
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys     + ((threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD))),
                                                       *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + ((threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_resultA[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesA[sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k]];
        d_values_resultB[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesB[sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k]];
        d_keys_result   [block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys                             [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int blockSize_x       = 2;
    const int blockSize_y       = 4;
    const int numElemsPerArray  = blockSize_x * blockSize_y;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = numElemsPerArray / (blockSize_x * blockSize_y);

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA    = (float *)malloc(N * sizeof(float));
    float *h_valuesB    = (float *)malloc(N * sizeof(float));
    int *h_keys         = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA  = (float *)malloc(N * sizeof(float));
    float *h_values_resultB  = (float *)malloc(N * sizeof(float));
    float *h_values_result2  = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1    = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2    = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;           gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;           gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;              gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA;    gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB;    gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    float *d_values_result2;    gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;      gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;      gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<blockSize_x, blockSize_y, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1); 
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());    
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++) 
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    return 0;
}