Sorting CUDA中按键排序(小)数组
标签:sorting, cuda, parallel-processing, reduce, cub
我正在尝试编写一个函数,它接受一块未排序的键/值对,例如:
<7, 4>
<2, 8>
<3, 1>
<2, 2>
<1, 5>
<7, 1>
<3, 8>
<7, 2>
这对于较小的数据集来说效果很好,但是对于较大的数据集(尽管仍然在单个块的大小内),单次调用就无法做到这一点
尝试在同一个函数中结合排序和归约是否明智?显然,该函数需要多次调用,但是否可以根据其大小确定需要调用多少次以耗尽所有数据
或者,我是否应该用类似下面的函数单独进行归约:
/**
 * Folds the value of a duplicate-key element into the first element of its run.
 *
 * Starting at sdata[tid], the function walks towards index 0 while the key (.x)
 * equals this thread's key; the lowest matching slot is the head of the run. A
 * thread that is not the head atomically adds its value (.y) to the head and then
 * overwrites its own pair with the placeholder 99.
 *
 * Preconditions:
 *  - Must be called by every thread of the block: it contains a block-wide barrier.
 *  - Only adjacent equal keys are merged, so equal keys are expected to be contiguous
 *    (presumably sdata has already been sorted by key -- confirm at the call site).
 *  - NOTE(review): assumes sdata is visible to exactly the calling block (e.g. shared
 *    memory); the barrier does not order accesses across blocks.
 *
 * @param sdata key/value pairs (x = key, y = value)
 * @param tid   index of the calling thread's element in sdata
 * @return 1 if this thread's element was merged into the head of its run, 0 otherwise
 */
__device__ int interReduce(int2 *sdata, int tid) {
    const int key = sdata[tid].x;

    // Locate the head of the run of equal keys that contains sdata[tid].
    int head = tid;
    while (head > 0 && sdata[head - 1].x == key) {
        --head;
    }

    // All threads must finish scanning before any of them replaces a key with the
    // placeholder; otherwise a thread further up the run can read 99 in the middle
    // of its run, stop early and treat a non-head element as the head.
    __syncthreads();

    if (head == tid) {
        return 0;   // this element is the head: it only receives contributions
    }

    atomicAdd(&sdata[head].y, sdata[tid].y);
    sdata[tid].x = 99;   // placeholder marking the slot as consumed
    sdata[tid].y = 99;
    return 1;
}
\uuuuu设备\uuuuuu中断(int2*sdata,inttid){
综合指数=tid;
while(sdata[index].x==sdata[tid].x){
索引--;
如果(指数<0)
打破
}
如果(指数+1!=tid){
原子添加(&sdata[index+1].y,sdata[tid].y);
sdata[tid].x=99;
sdata[tid].y=99;
返回1;
}
返回0;
}
我正试图找到最有效的解决方案,但我在CUDA和并行算法方面的经验有限。
你可以用 Thrust 来实现这一点。
先使用 thrust::sort_by_key,然后使用 thrust::reduce_by_key。
下面是一个例子:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#define N 12
typedef thrust::device_vector<int>::iterator dintiter;

// Thrust demo: sort the pairs by key, combine the values of each run of equal keys
// with reduce_by_key, then print the compacted values followed by the compacted keys.
int main(){
    // Inputs and reduction outputs, all N elements long, held on the device.
    thrust::device_vector<int> keys(N), values(N);
    thrust::device_vector<int> new_keys(N), new_values(N);

    // Fill both arrays with 0..N-1, then overwrite a few keys to create duplicates.
    thrust::sequence(values.begin(), values.end());
    thrust::sequence(keys.begin(), keys.end());
    keys[7] = 4;
    keys[8] = 2;
    keys[9] = 1;
    keys[3] = 1;

    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());

    // The returned pair holds the ends of the compacted key and value ranges.
    const thrust::pair<dintiter, dintiter> reducedEnd =
        thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(),
                              new_keys.begin(), new_values.begin());

    std::ostream_iterator<int> out(std::cout, " ");

    std::cout << "results values:" << std::endl;
    thrust::copy(new_values.begin(), reducedEnd.second, out);
    std::cout << std::endl << "results keys:" << std::endl;
    thrust::copy(new_keys.begin(), reducedEnd.first, out);
    std::cout << std::endl;
    return 0;
}
#包括
#包括
#包括
#包括
#包括
#包括
#定义N 12
typedef推力::设备向量::迭代器dintiter;
int main(){
推力::设备_矢量键(N);
推力:设备_矢量值(N);
推力::设备矢量新按键(N);
推力::装置矢量新值(N);
顺序(keys.begin(),keys.end());
序列(values.begin(),values.end());
键[3]=1;
键[9]=1;
键[8]=2;
键[7]=4;
推力::按键排序(keys.begin()、keys.end()、values.begin());
推力:配对新的_端;
new_end=推力::按_键减少_(keys.begin(),keys.end(),values.begin(),new_keys.begin(),new_values.begin());
从您的帖子来看,您似乎需要按键对许多小数组进行排序。引用您自己的话:
这对于较小的数据集来说效果很好,但是对于较大的数据集(尽管仍然在单个块的大小内),单次调用就无法做到这一点
下面是围绕我的回答给出的一个完整示例,其中使用了 cub::BlockRadixSort。
#包括
#包括
#包括
#包括“Utilities.cuh”
使用名称空间cub;
/**********************************/
/*CUB BLOCKSORT内核没有共享*/
/**********************************/
模板
__全局无效BlockSortKernel(浮点*d_值、整数*d_键、浮点*d_值结果、整数*d_键结果)
{
//---专门化BlockLoad、BlockStore和BlockRadixSort集合类型
typedef cub::BlockLoad BlockLoadIntT;
typedef cub::BlockLoad blockloadfloat;
typedef cub::BlockStore BlockStoreIntT;
typedef cub::BlockStore blockstorefloat;
typedef cub::BlockRadixSort BlockRadixSort;
//---为集体分配类型安全、可重复利用的共享内存
__共享联合{
typename BlockLoadInt::TempStorage loadInt;
typename BlockLoadFloat::TempStorage loadFloat;
typename BlockStoreInt::TempStorage storeInt;
typename BlockStoreFloat::TempStorage storeFloat;
typename BlockRadixSortT::TempStorage排序;
}临时储存;
//---获取此块的连续键段(跨线程阻塞)
int thread_key[每个线程的项目数];
浮动线程_值[每个线程的项目_];
int block_offset=blockIdx.x*(块线程*每个线程的项目);
BlockLoadIntT(temp_storage.loadInt).Load(d_键+块偏移,线程键);
blockloadfloat(temp_storage.loadFloat).Load(d_值+block_偏移量,线程_值);
__同步线程();
//---对键进行集体排序
BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(线程键、线程值);
__同步线程();
//---存储已排序的段
BlockStoreIntT(temp_storage.storeInt).Store(d_key_result+block_offset,thread_key);
blockstorefloat(temp\u storage.storeFloat).Store(d\u值\u结果+块\u偏移,线程\u值);
}
/*******************************/
/*CUB块排序内核共享*/
/*******************************/
模板
__全局\uuuuu无效共享\u块内核(浮点*d\u值、整数*d\u键、浮点*d\u值\u结果、整数*d\u键\u结果)
{
//---共享内存分配
__共享\uuuuufloat SharedMemoryArrayValue[块线程*每个线程的项目];
__shared_uuuint sharedMemoryArrayKeys[块线程*每个线程的项目];
//---专门化BlockStore和BlockRadixSort集合类型
typedef cub::BlockRadixSort BlockRadixSort;
//---为集体分配类型安全、可重复利用的共享内存
__共享类型名BlockRadixSortT::临时存储临时存储;
int block_offset=blockIdx.x*(块线程*每个线程的项目);
//---将数据加载到共享内存
/**
 * Folds the value of a duplicate-key element into the first element of its run.
 *
 * Starting at sdata[tid], the function walks towards index 0 while the key (.x)
 * equals this thread's key; the lowest matching slot is the head of the run. A
 * thread that is not the head atomically adds its value (.y) to the head and then
 * overwrites its own pair with the placeholder 99.
 *
 * Preconditions:
 *  - Must be called by every thread of the block: it contains a block-wide barrier.
 *  - Only adjacent equal keys are merged, so equal keys are expected to be contiguous
 *    (presumably sdata has already been sorted by key -- confirm at the call site).
 *  - NOTE(review): assumes sdata is visible to exactly the calling block (e.g. shared
 *    memory); the barrier does not order accesses across blocks.
 *
 * @param sdata key/value pairs (x = key, y = value)
 * @param tid   index of the calling thread's element in sdata
 * @return 1 if this thread's element was merged into the head of its run, 0 otherwise
 */
__device__ int interReduce(int2 *sdata, int tid) {
    const int key = sdata[tid].x;

    // Locate the head of the run of equal keys that contains sdata[tid].
    int head = tid;
    while (head > 0 && sdata[head - 1].x == key) {
        --head;
    }

    // All threads must finish scanning before any of them replaces a key with the
    // placeholder; otherwise a thread further up the run can read 99 in the middle
    // of its run, stop early and treat a non-head element as the head.
    __syncthreads();

    if (head == tid) {
        return 0;   // this element is the head: it only receives contributions
    }

    atomicAdd(&sdata[head].y, sdata[tid].y);
    sdata[tid].x = 99;   // placeholder marking the slot as consumed
    sdata[tid].y = 99;
    return 1;
}
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#define N 12
typedef thrust::device_vector<int>::iterator dintiter;

// Thrust demo: sort the pairs by key, combine the values of each run of equal keys
// with reduce_by_key, then print the compacted values followed by the compacted keys.
int main(){
    // Inputs and reduction outputs, all N elements long, held on the device.
    thrust::device_vector<int> keys(N), values(N);
    thrust::device_vector<int> new_keys(N), new_values(N);

    // Fill both arrays with 0..N-1, then overwrite a few keys to create duplicates.
    thrust::sequence(values.begin(), values.end());
    thrust::sequence(keys.begin(), keys.end());
    keys[7] = 4;
    keys[8] = 2;
    keys[9] = 1;
    keys[3] = 1;

    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());

    // The returned pair holds the ends of the compacted key and value ranges.
    const thrust::pair<dintiter, dintiter> reducedEnd =
        thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(),
                              new_keys.begin(), new_values.begin());

    std::ostream_iterator<int> out(std::cout, " ");

    std::cout << "results values:" << std::endl;
    thrust::copy(new_values.begin(), reducedEnd.second, out);
    std::cout << std::endl << "results keys:" << std::endl;
    thrust::copy(new_keys.begin(), reducedEnd.first, out);
    std::cout << std::endl;
    return 0;
}
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"
using namespace cub;
/**********************************/
/* CUB BLOCKSORT KERNEL NO SHARED */
/**********************************/
/**
 * Sorts each block's segment of key/value pairs by key with CUB block collectives,
 * keeping the items in per-thread local arrays.
 *
 * Block b processes the BLOCK_THREADS * ITEMS_PER_THREAD consecutive elements that
 * start at b * (BLOCK_THREADS * ITEMS_PER_THREAD) and writes the sorted pairs to the
 * same offsets of the result arrays.
 *
 * Launch requirements: BLOCK_THREADS threads per block (1-D), one block per segment.
 * There is no bounds check, so every array must hold gridDim.x complete segments.
 *
 * NOTE(review): BlockLoad/BlockStore are specialized on pointer types (int*, float*),
 * which matches older CUB releases; newer releases take the item type instead --
 * confirm against the CUB version in use.
 *
 * @param d_values         input values
 * @param d_keys           input keys
 * @param d_values_result  output values, reordered to follow their keys
 * @param d_keys_result    output keys, ascending within each segment
 */
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
    typedef cub::BlockLoad      <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadIntT;
    typedef cub::BlockLoad      <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadFloatT;
    typedef cub::BlockStore     <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreIntT;
    typedef cub::BlockStore     <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreFloatT;
    typedef cub::BlockRadixSort <int ,   BLOCK_THREADS, ITEMS_PER_THREAD, float>                 BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives.
    //     The members of the union alias the same storage, so a barrier is required
    //     between any two collectives that use it.
    __shared__ union {
        typename BlockLoadIntT    ::TempStorage loadInt;
        typename BlockLoadFloatT  ::TempStorage loadFloat;
        typename BlockStoreIntT   ::TempStorage storeInt;
        typename BlockStoreFloatT ::TempStorage storeFloat;
        typename BlockRadixSortT  ::TempStorage sort;
    } temp_storage;

    // --- Obtain this block's segment of consecutive keys (blocked across threads)
    int   thread_keys[ITEMS_PER_THREAD];
    float thread_values[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    BlockLoadIntT(temp_storage.loadInt).Load(d_keys + block_offset, thread_keys);
    __syncthreads();    // loadInt and loadFloat share storage
    BlockLoadFloatT(temp_storage.loadFloat).Load(d_values + block_offset, thread_values);
    __syncthreads();

    // --- Collectively sort the keys. Sort() leaves the items in a blocked
    //     arrangement, which is what BlockStore::Store() expects; the striped result
    //     of SortBlockedToStriped() would be written out interleaved.
    BlockRadixSortT(temp_storage.sort).Sort(thread_keys, thread_values);
    __syncthreads();

    // --- Store the sorted segment
    BlockStoreIntT(temp_storage.storeInt).Store(d_keys_result + block_offset, thread_keys);
    __syncthreads();    // storeInt and storeFloat share storage
    BlockStoreFloatT(temp_storage.storeFloat).Store(d_values_result + block_offset, thread_values);
}
/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
/**
 * Sorts each block's segment of key/value pairs by key with cub::BlockRadixSort,
 * keeping the items in shared memory instead of per-thread local arrays.
 *
 * Block b processes the BLOCK_THREADS * ITEMS_PER_THREAD consecutive elements that
 * start at b * (BLOCK_THREADS * ITEMS_PER_THREAD); thread t owns the
 * ITEMS_PER_THREAD consecutive slots starting at t * ITEMS_PER_THREAD.
 *
 * Launch requirements: BLOCK_THREADS threads per block (1-D), one block per segment.
 * There is no bounds check, so every array must hold gridDim.x complete segments.
 *
 * @param d_values         input values
 * @param d_keys           input keys
 * @param d_values_result  output values, reordered to follow their keys
 * @param d_keys_result    output keys, ascending within each segment
 */
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValues[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys  [BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD, float> BlockRadixSortT;

    // --- Allocate type-safe shared memory for the collective
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    const int block_offset  = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
    const int thread_offset = threadIdx.x * ITEMS_PER_THREAD;

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValues[thread_offset + k] = d_values[block_offset + thread_offset + k];
        sharedMemoryArrayKeys  [thread_offset + k] = d_keys  [block_offset + thread_offset + k];
    }
    __syncthreads();

    // --- View this thread's slice of shared memory as the fixed-size arrays CUB expects
    int   (&thread_keys)  [ITEMS_PER_THREAD] = *static_cast<int  (*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys   + thread_offset));
    float (&thread_values)[ITEMS_PER_THREAD] = *static_cast<float(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayValues + thread_offset));

    // --- Collectively sort the keys. Sort() keeps the blocked arrangement, so slot
    //     thread_offset + k holds the element of that rank; SortBlockedToStriped()
    //     would leave the segment interleaved whenever ITEMS_PER_THREAD > 1.
    BlockRadixSortT(temp_storage).Sort(thread_keys, thread_values);
    __syncthreads();

    // --- Write the sorted data back to global memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_result[block_offset + thread_offset + k] = sharedMemoryArrayValues[thread_offset + k];
        d_keys_result  [block_offset + thread_offset + k] = sharedMemoryArrayKeys  [thread_offset + k];
    }
}
/********/
/* MAIN */
/********/
/**
 * Host driver: fills numArrays arrays of numElemsPerArray random key/value pairs,
 * sorts every array by key on the GPU with BlockSortKernel and with
 * shared_BlockSortKernel (one block per array), and prints the input and both
 * results. All host and device buffers are released before returning.
 */
int main() {
    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;
    const int RANGE             = N * numElemsPerThread;
    // Threads per block; used for both the template argument and the launch so the
    // two can never disagree.
    const int threadsPerBlock   = numElemsPerArray / numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_values = (float *)malloc(N * sizeof(float));
    int   *h_keys   = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_values[i] = rand() % RANGE;
        h_keys[i]   = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys[k * numElemsPerArray + i], h_values[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_result1 = (float *)malloc(N * sizeof(float));
    float *h_values_result2 = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_values;         gpuErrchk(cudaMalloc((void **)&d_values,         N * sizeof(float)));
    int   *d_keys;           gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_result1; gpuErrchk(cudaMalloc((void **)&d_values_result1, N * sizeof(float)));
    float *d_values_result2; gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;   gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;   gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel no shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    BlockSortKernel<threadsPerBlock, numElemsPerThread><<<numArrays, threadsPerBlock>>>(d_values, d_keys, d_values_result1, d_keys_result1);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_result1, d_values_result1, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel no shared\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_result1[k * numElemsPerArray + i]);

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<threadsPerBlock, numElemsPerThread><<<numArrays, threadsPerBlock>>>(d_values, d_keys, d_values_result2, d_keys_result2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_result2, d_values_result2, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result2,   d_keys_result2,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel shared\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result2[k * numElemsPerArray + i], h_values_result2[k * numElemsPerArray + i]);

    // --- Release device and host memory
    gpuErrchk(cudaFree(d_values));
    gpuErrchk(cudaFree(d_keys));
    gpuErrchk(cudaFree(d_values_result1));
    gpuErrchk(cudaFree(d_values_result2));
    gpuErrchk(cudaFree(d_keys_result1));
    gpuErrchk(cudaFree(d_keys_result2));
    free(h_values);
    free(h_keys);
    free(h_values_result1);
    free(h_values_result2);
    free(h_keys_result1);
    free(h_keys_result2);

    return 0;
}
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"
using namespace cub;
/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
/**
 * Sorts each block's segment by key and reorders two value arrays accordingly.
 *
 * Only the keys and a helper array of original positions are sorted with
 * cub::BlockRadixSort; the sorted positions are then used to gather both value
 * arrays from shared memory, so any number of payload arrays can follow one sort.
 *
 * Block b processes the BLOCK_THREADS * ITEMS_PER_THREAD consecutive elements that
 * start at b * (BLOCK_THREADS * ITEMS_PER_THREAD); thread t owns the
 * ITEMS_PER_THREAD consecutive slots starting at t * ITEMS_PER_THREAD.
 *
 * Launch requirements: BLOCK_THREADS threads per block (1-D), one block per segment.
 * There is no bounds check, so every array must hold gridDim.x complete segments.
 *
 * @param d_valuesA, d_valuesB               input value arrays
 * @param d_keys                             input keys
 * @param d_values_resultA, d_values_resultB output values, reordered to follow their keys
 * @param d_keys_result                      output keys, ascending within each segment
 */
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA [BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB [BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys    [BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type (keys paired with positions)
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD, int> BlockRadixSortT;

    // --- Allocate type-safe shared memory for the collective
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    const int block_offset  = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
    const int thread_offset = threadIdx.x * ITEMS_PER_THREAD;

    // --- Load data to shared memory and record each element's position in the segment
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [thread_offset + k] = d_valuesA[block_offset + thread_offset + k];
        sharedMemoryArrayValuesB [thread_offset + k] = d_valuesB[block_offset + thread_offset + k];
        sharedMemoryArrayKeys    [thread_offset + k] = d_keys   [block_offset + thread_offset + k];
        sharedMemoryHelperIndices[thread_offset + k] = thread_offset + k;
    }
    __syncthreads();

    // --- View this thread's slices of shared memory as the fixed-size arrays CUB expects
    int (&thread_keys)   [ITEMS_PER_THREAD] = *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys     + thread_offset));
    int (&thread_indices)[ITEMS_PER_THREAD] = *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + thread_offset));

    // --- Collectively sort the keys. Sort() keeps the blocked arrangement, so slot
    //     thread_offset + k holds the element of that rank; SortBlockedToStriped()
    //     would leave the segment interleaved whenever ITEMS_PER_THREAD > 1.
    BlockRadixSortT(temp_storage).Sort(thread_keys, thread_indices);
    __syncthreads();

    // --- Write the sorted data to global memory, gathering the values through the
    //     sorted positions (these reads touch slots loaded by other threads)
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        const int source = sharedMemoryHelperIndices[thread_offset + k];
        d_values_resultA[block_offset + thread_offset + k] = sharedMemoryArrayValuesA[source];
        d_values_resultB[block_offset + thread_offset + k] = sharedMemoryArrayValuesB[source];
        d_keys_result   [block_offset + thread_offset + k] = sharedMemoryArrayKeys   [thread_offset + k];
    }
}
/********/
/* MAIN */
/********/
/**
 * Host driver: fills numArrays arrays of numElemsPerArray random keys with two
 * random value arrays, sorts every array by key on the GPU with
 * shared_BlockSortKernel (one block per array), and prints the input and the result.
 * All host and device buffers are released before returning.
 */
int main() {
    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;
    const int RANGE             = N * numElemsPerThread;
    // Threads per block; used for both the template argument and the launch so the
    // two can never disagree.
    const int threadsPerBlock   = numElemsPerArray / numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA = (float *)malloc(N * sizeof(float));
    float *h_valuesB = (float *)malloc(N * sizeof(float));
    int   *h_keys    = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA = (float *)malloc(N * sizeof(float));
    float *h_values_resultB = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;        gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;        gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;           gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA; gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB; gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    int   *d_keys_result1;   gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,    h_keys,    N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<threadsPerBlock, numElemsPerThread><<<numArrays, threadsPerBlock>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    // --- Release device and host memory
    gpuErrchk(cudaFree(d_valuesA));
    gpuErrchk(cudaFree(d_valuesB));
    gpuErrchk(cudaFree(d_keys));
    gpuErrchk(cudaFree(d_values_resultA));
    gpuErrchk(cudaFree(d_values_resultB));
    gpuErrchk(cudaFree(d_keys_result1));
    free(h_valuesA);
    free(h_valuesB);
    free(h_keys);
    free(h_values_resultA);
    free(h_values_resultB);
    free(h_keys_result1);

    return 0;
}
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"
using namespace cub;
/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
/**
 * Sorts each block's segment by key and reorders two value arrays accordingly,
 * for a two-dimensional thread block.
 *
 * Only the keys and a helper array of original positions are sorted with
 * cub::BlockRadixSort; the sorted positions are then used to gather both value
 * arrays from shared memory.
 *
 * Block b processes the BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD consecutive
 * elements that start at b times that count. Threads are linearized row-major
 * (threadIdx.y * BLOCKSIZE_X + threadIdx.x) and each owns ITEMS_PER_THREAD
 * consecutive slots.
 *
 * Launch requirements: a BLOCKSIZE_X x BLOCKSIZE_Y thread block, one block per
 * segment. There is no bounds check, so every array must hold gridDim.x complete
 * segments.
 *
 * @param d_valuesA, d_valuesB               input value arrays
 * @param d_keys                             input keys
 * @param d_values_resultA, d_values_resultB output values, reordered to follow their keys
 * @param d_keys_result                      output keys, ascending within each segment
 */
template <int BLOCKSIZE_X, int BLOCKSIZE_Y, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys    [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type (keys paired with positions)
    typedef cub::BlockRadixSort <int , BLOCKSIZE_X, ITEMS_PER_THREAD, int, 4, false, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCKSIZE_Y> BlockRadixSortT;

    // --- Allocate type-safe shared memory for the collective
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    const int block_offset  = blockIdx.x * (BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD);
    // First slot owned by this thread, using a row-major linear thread index
    const int thread_offset = (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD;

    // --- Load data to shared memory and record each element's position in the segment
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [thread_offset + k] = d_valuesA[block_offset + thread_offset + k];
        sharedMemoryArrayValuesB [thread_offset + k] = d_valuesB[block_offset + thread_offset + k];
        sharedMemoryArrayKeys    [thread_offset + k] = d_keys   [block_offset + thread_offset + k];
        sharedMemoryHelperIndices[thread_offset + k] = thread_offset + k;
    }
    __syncthreads();

    // --- View this thread's slices of shared memory as the fixed-size arrays CUB expects
    int (&thread_keys)   [ITEMS_PER_THREAD] = *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys     + thread_offset));
    int (&thread_indices)[ITEMS_PER_THREAD] = *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + thread_offset));

    // --- Collectively sort the keys. Sort() keeps the blocked arrangement, so slot
    //     thread_offset + k holds the element of that rank; SortBlockedToStriped()
    //     would leave the segment interleaved whenever ITEMS_PER_THREAD > 1.
    BlockRadixSortT(temp_storage).Sort(thread_keys, thread_indices);
    __syncthreads();

    // --- Write the sorted data to global memory, gathering the values through the
    //     sorted positions (these reads touch slots loaded by other threads)
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        const int source = sharedMemoryHelperIndices[thread_offset + k];
        d_values_resultA[block_offset + thread_offset + k] = sharedMemoryArrayValuesA[source];
        d_values_resultB[block_offset + thread_offset + k] = sharedMemoryArrayValuesB[source];
        d_keys_result   [block_offset + thread_offset + k] = sharedMemoryArrayKeys   [thread_offset + k];
    }
}
/********/
/* MAIN */
/********/
/**
 * Host driver for the 2-D block variant: fills numArrays arrays of
 * blockSize_x * blockSize_y random keys with two random value arrays, sorts every
 * array by key on the GPU with shared_BlockSortKernel (one blockSize_x x blockSize_y
 * block per array), and prints the input and the result. All host and device
 * buffers are released before returning.
 */
int main() {
    const int blockSize_x       = 2;
    const int blockSize_y       = 4;
    const int numElemsPerArray  = blockSize_x * blockSize_y;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = numElemsPerArray / (blockSize_x * blockSize_y);
    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA = (float *)malloc(N * sizeof(float));
    float *h_valuesB = (float *)malloc(N * sizeof(float));
    int   *h_keys    = (int *)  malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA = (float *)malloc(N * sizeof(float));
    float *h_values_resultB = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;        gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;        gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;           gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA; gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB; gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    int   *d_keys_result1;   gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,    h_keys,    N * sizeof(int),   cudaMemcpyHostToDevice));
    // The kernel is specialized for a blockSize_x x blockSize_y block and indexes with
    // threadIdx.y, so the launch must use that 2-D shape rather than a flat 1-D block.
    const dim3 blockDims(blockSize_x, blockSize_y);
    shared_BlockSortKernel<blockSize_x, blockSize_y, numElemsPerThread><<<numArrays, blockDims>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    // --- Release device and host memory
    gpuErrchk(cudaFree(d_valuesA));
    gpuErrchk(cudaFree(d_valuesB));
    gpuErrchk(cudaFree(d_keys));
    gpuErrchk(cudaFree(d_values_resultA));
    gpuErrchk(cudaFree(d_values_resultB));
    gpuErrchk(cudaFree(d_keys_result1));
    free(h_valuesA);
    free(h_valuesB);
    free(h_keys);
    free(h_values_resultA);
    free(h_values_resultB);
    free(h_keys_result1);

    return 0;
}