Sorting:如何使用 Thrust 对矩阵的行进行排序?
我有一个 5000x500 的矩阵,我想用 CUDA 分别对每一行进行排序。我可以使用 ArrayFire,但它只是在 thrust::sort 外面套了一层 for 循环,效率应该不高。(标签:sorting、cuda、thrust)
for(尺寸类型 w=0; w<…(原文的代码片段在此处被截断)
有没有一种方法可以将这些排序操作融合在一起,从而使它们并行运行?事实上,我所寻找的是一种把 for 循环的各次迭代融合起来的通用方法。我可以想到两种可能性,其中一种已经由 @JaredHoberock 提出。我不知道在 Thrust 中有什么融合 for 循环迭代的通用方法,但第二种方法是更通用的做法。我的猜测是,在这种情况下,第一种方法是两种方法中速度更快的。
如果使用 thrust::for_each 操作来选择需要执行的各个排序,就可以只用一次 Thrust 算法调用来运行这些排序:做法是在传递给 thrust::for_each 的函子中包含 thrust::sort 操作。
$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>
#define NSORTS 16000
#define DSIZE 1000
// Number of times my_mod() has been called so far; shared global state.
int my_mod_start = 0;

// Generator that returns (number of earlier calls) / DSIZE using integer
// division, then advances the call counter. Each run of DSIZE consecutive
// calls therefore yields the same value. Despite the name, no modulo is taken.
int my_mod()
{
    const int segment = my_mod_start / DSIZE;
    ++my_mod_start;
    return segment;
}
// Reports whether two device vectors hold identical contents.
//
// Returns true only when both vectors have the same number of elements and
// every pair of corresponding elements compares equal.
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2)
{
    // thrust::equal receives only the start of the second range, so a shorter
    // d2 would be read past its end; reject mismatched sizes up front.
    if (d1.size() != d2.size()) {
        return false;
    }
    return thrust::equal(d1.begin(), d1.end(), d2.begin());
}
// Function object that sorts one fixed-length segment of a flat int array.
// Intended to be invoked once per segment index (e.g. via thrust::for_each).
struct sort_functor
{
    thrust::device_ptr<int> data;  // start of the flat array; segments lie back to back
    int dsize;                     // number of elements in each segment

    // Sorts elements [dsize*start_idx, dsize*(start_idx+1)) of `data` in place
    // using thrust::sort with the thrust::device execution policy.
    // NOTE(review): how thrust::device behaves when called from device code
    // (parallel child launch vs. sequential in the calling thread) depends on
    // the Thrust version and build flags -- confirm for your toolchain.
    __host__ __device__
    void operator()(int start_idx) const
    {
        // Compute the offset in 64 bits: dsize * start_idx overflows a 32-bit
        // int once the flat array grows past 2^31 elements.
        const long long first = static_cast<long long>(dsize) * start_idx;
        thrust::sort(thrust::device, data + first, data + (first + dsize));
    }
};
#include <time.h>
#include <sys/time.h>
// Microseconds per second.
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() reading, expressed in microseconds,
// minus `start`. Call with 0 to take a timestamp; pass that timestamp back in
// later to obtain the elapsed microseconds.
unsigned long long dtime_usec(unsigned long long start)
{
    timeval now;
    gettimeofday(&now, 0);
    const unsigned long long whole_seconds_us = now.tv_sec * USECPSEC;
    const unsigned long long stamp = whole_seconds_us + now.tv_usec;
    return stamp - start;
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Benchmarks three ways of independently sorting NSORTS segments of DSIZE
// random ints each, stored back to back in one flat device array:
//   1. a host loop issuing one thrust::sort per segment,
//   2. a "vectorized" sort built from two stable_sort_by_key passes,
//   3. a thrust::for_each whose functor calls thrust::sort per segment.
// Prints the elapsed time of each method and checks methods 2 and 3 against
// method 1. Returns EXIT_SUCCESS when all results agree, EXIT_FAILURE otherwise.
int main()
{
    const size_t total = (size_t)DSIZE * NSORTS;
    bool all_match = true;

    // NOTE(review): presumably enlarges the device-side malloc heap so that
    // thrust::sort calls issued from device code can allocate scratch space
    // -- confirm against the Thrust version in use.
    check_cuda(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 16 * total),
               "cudaDeviceSetLimit");

    thrust::host_vector<int> h_data(total);
    thrust::generate(h_data.begin(), h_data.end(), rand);
    thrust::device_vector<int> d_data = h_data;

    // Method 1: one thrust::sort call per segment, issued from a host loop.
    thrust::device_vector<int> d_result1 = d_data;
    thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
    unsigned long long mytime = dtime_usec(0);
    for (size_t i = 0; i < NSORTS; i++)
        thrust::sort(r1ptr + (i * DSIZE), r1ptr + ((i + 1) * DSIZE));
    // Synchronize before reading the clock and surface any asynchronous error.
    check_cuda(cudaDeviceSynchronize(), "loop sort");
    mytime = dtime_usec(mytime);
    std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;

    // Method 2: vectorized sort. Every element is tagged with its segment id.
    thrust::device_vector<int> d_result2 = d_data;
    thrust::host_vector<int> h_segments(total);
    thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
    thrust::device_vector<int> d_segments = h_segments;
    mytime = dtime_usec(0);
    // Pass 1 orders all values globally, carrying the segment ids along.
    thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
    // Pass 2 regroups by segment id; stability keeps each segment's values sorted.
    thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
    check_cuda(cudaDeviceSynchronize(), "vectorized sort");
    mytime = dtime_usec(mytime);
    std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
    if (!validate(d_result1, d_result2)) {
        std::cout << "mismatch 1!" << std::endl;
        all_match = false;
    }

    // Method 3: nested sort. for_each hands each segment index to sort_functor.
    thrust::device_vector<int> d_result3 = d_data;
    sort_functor f = {d_result3.data(), DSIZE};
    thrust::device_vector<int> idxs(NSORTS);
    thrust::sequence(idxs.begin(), idxs.end());
    mytime = dtime_usec(0);
    thrust::for_each(idxs.begin(), idxs.end(), f);
    check_cuda(cudaDeviceSynchronize(), "nested sort");
    mytime = dtime_usec(mytime);
    std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
    if (!validate(d_result1, d_result3)) {
        std::cout << "mismatch 2!" << std::endl;
        all_match = false;
    }

    // Report validation failures through the exit status as well as stdout.
    return all_match ? EXIT_SUCCESS : EXIT_FAILURE;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$
$cat t617.cu
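#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>
#define NSORTS 16000
#define DSIZE 1000
int my_mod_start = 0;
int my_mod(){
return (my_mod_start++)/DSIZE;
}
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2){
return thrust::equal(d1.begin(), d1.end(), d2.begin());
}
struct sort_functor
{
thrust::device_ptr<int> data;
int dsize;
__host__ __device__
void operator()(int start_idx)
{
thrust::sort(thrust::device, data+(dsize*start_idx), data+(dsize*(start_idx+1)));
}
};
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
int main(){
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16*DSIZE*NSORTS));
thrust::host_vector<int> h_data(DSIZE*NSORTS);
thrust::generate(h_data.begin(), h_data.end(), rand);
thrust::device_vector<int> d_data = h_data;
// first time a loop
thrust::device_vector<int> d_result1 = d_data;
thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
unsigned long long mytime = dtime_usec(0);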
for (int i = 0; i < NSORTS; i++) …(此处代码清单被截断,完整代码见下文)
评论:
你能调整一下这种方法吗?我会尝试在对 thrust::for_each 的调用内部嵌套一个对 thrust::sort 的调用。
我正在尝试理解这两种方法……谢谢。
好吧!我放弃了,我会用简单的方法来做。非常感谢!你不仅回答了我的问题,还向我展示了很多事情该怎么做。
我必须补充一点:我看到了这一点(原文此处的链接已丢失),并且意识到普通的 thrust::sort 比任何其他方法都要快(手工编码的并行排序方法除外)。我的数据是无符号的,并且数据的最高 16 位全为零。因此,我只是把行号放进最高 16 位(MSB),然后对整个数组排序。
嗨,你知道如何沿矩阵的行做 argsort 吗?也就是求出每一行中元素的排序次序(索引),而不是得到排序后的矩阵。
$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>
#define NSORTS 16000
#define DSIZE 1000
// Number of times my_mod() has been called so far; shared global state.
int my_mod_start = 0;

// Generator that returns (number of earlier calls) / DSIZE using integer
// division, then advances the call counter. Each run of DSIZE consecutive
// calls therefore yields the same value. Despite the name, no modulo is taken.
int my_mod()
{
    const int segment = my_mod_start / DSIZE;
    ++my_mod_start;
    return segment;
}
// Reports whether two device vectors hold identical contents.
//
// Returns true only when both vectors have the same number of elements and
// every pair of corresponding elements compares equal.
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2)
{
    // thrust::equal receives only the start of the second range, so a shorter
    // d2 would be read past its end; reject mismatched sizes up front.
    if (d1.size() != d2.size()) {
        return false;
    }
    return thrust::equal(d1.begin(), d1.end(), d2.begin());
}
// Function object that sorts one fixed-length segment of a flat int array.
// Intended to be invoked once per segment index (e.g. via thrust::for_each).
struct sort_functor
{
    thrust::device_ptr<int> data;  // start of the flat array; segments lie back to back
    int dsize;                     // number of elements in each segment

    // Sorts elements [dsize*start_idx, dsize*(start_idx+1)) of `data` in place
    // using thrust::sort with the thrust::device execution policy.
    // NOTE(review): how thrust::device behaves when called from device code
    // (parallel child launch vs. sequential in the calling thread) depends on
    // the Thrust version and build flags -- confirm for your toolchain.
    __host__ __device__
    void operator()(int start_idx) const
    {
        // Compute the offset in 64 bits: dsize * start_idx overflows a 32-bit
        // int once the flat array grows past 2^31 elements.
        const long long first = static_cast<long long>(dsize) * start_idx;
        thrust::sort(thrust::device, data + first, data + (first + dsize));
    }
};
#include <time.h>
#include <sys/time.h>
// Microseconds per second.
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() reading, expressed in microseconds,
// minus `start`. Call with 0 to take a timestamp; pass that timestamp back in
// later to obtain the elapsed microseconds.
unsigned long long dtime_usec(unsigned long long start)
{
    timeval now;
    gettimeofday(&now, 0);
    const unsigned long long whole_seconds_us = now.tv_sec * USECPSEC;
    const unsigned long long stamp = whole_seconds_us + now.tv_usec;
    return stamp - start;
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Benchmarks three ways of independently sorting NSORTS segments of DSIZE
// random ints each, stored back to back in one flat device array:
//   1. a host loop issuing one thrust::sort per segment,
//   2. a "vectorized" sort built from two stable_sort_by_key passes,
//   3. a thrust::for_each whose functor calls thrust::sort per segment.
// Prints the elapsed time of each method and checks methods 2 and 3 against
// method 1. Returns EXIT_SUCCESS when all results agree, EXIT_FAILURE otherwise.
int main()
{
    const size_t total = (size_t)DSIZE * NSORTS;
    bool all_match = true;

    // NOTE(review): presumably enlarges the device-side malloc heap so that
    // thrust::sort calls issued from device code can allocate scratch space
    // -- confirm against the Thrust version in use.
    check_cuda(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 16 * total),
               "cudaDeviceSetLimit");

    thrust::host_vector<int> h_data(total);
    thrust::generate(h_data.begin(), h_data.end(), rand);
    thrust::device_vector<int> d_data = h_data;

    // Method 1: one thrust::sort call per segment, issued from a host loop.
    thrust::device_vector<int> d_result1 = d_data;
    thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
    unsigned long long mytime = dtime_usec(0);
    for (size_t i = 0; i < NSORTS; i++)
        thrust::sort(r1ptr + (i * DSIZE), r1ptr + ((i + 1) * DSIZE));
    // Synchronize before reading the clock and surface any asynchronous error.
    check_cuda(cudaDeviceSynchronize(), "loop sort");
    mytime = dtime_usec(mytime);
    std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;

    // Method 2: vectorized sort. Every element is tagged with its segment id.
    thrust::device_vector<int> d_result2 = d_data;
    thrust::host_vector<int> h_segments(total);
    thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
    thrust::device_vector<int> d_segments = h_segments;
    mytime = dtime_usec(0);
    // Pass 1 orders all values globally, carrying the segment ids along.
    thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
    // Pass 2 regroups by segment id; stability keeps each segment's values sorted.
    thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
    check_cuda(cudaDeviceSynchronize(), "vectorized sort");
    mytime = dtime_usec(mytime);
    std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
    if (!validate(d_result1, d_result2)) {
        std::cout << "mismatch 1!" << std::endl;
        all_match = false;
    }

    // Method 3: nested sort. for_each hands each segment index to sort_functor.
    thrust::device_vector<int> d_result3 = d_data;
    sort_functor f = {d_result3.data(), DSIZE};
    thrust::device_vector<int> idxs(NSORTS);
    thrust::sequence(idxs.begin(), idxs.end());
    mytime = dtime_usec(0);
    thrust::for_each(idxs.begin(), idxs.end(), f);
    check_cuda(cudaDeviceSynchronize(), "nested sort");
    mytime = dtime_usec(mytime);
    std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
    if (!validate(d_result1, d_result3)) {
        std::cout << "mismatch 2!" << std::endl;
        all_match = false;
    }

    // Report validation failures through the exit status as well as stdout.
    return all_match ? EXIT_SUCCESS : EXIT_FAILURE;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$