CUDA:使用带步长的 Thrust 计数迭代器(thrust::counting_iterator)
标签:cuda, thrust
我正在寻找一种使用 `thrust::counting_iterator` 函数的方法,以便并行化以下 for 循环:
// Sequential loop to be parallelized: stride takes the values 0, M, 2*M, ...
// for as long as stride < N * M.
for (int stride = 0 ; stride < N * M ; stride+=M) // N iterations
{
// Body of the loop
}
for(int-stride=0;stride
下面是代码的外观:
struct函子()
{
__主机\设备\无效运算符()(常量int i)
{
//循环体
}
}
推力::计数迭代器it1(0);
推力:计数迭代器it2=it1+N*M;
推力:对于每一个(it1,it2,函子());
我知道 `counting_iterator` 每次只会把迭代器增加 1,那么有没有办法让它每次增加 M?
下面的代码是 Thrust 的 strided_range 示例与 arbitrary_transformation 示例的组合。
下面,我正在考虑一个例子,其中转换是
D[i] = A[i] + B[i] * C[i]
代码如下:
#include <thrust/for_each.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <iostream>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
// Operation applied to one zipped 4-tuple: components 0, 1 and 2 are read and
// component 3 receives component0 + component1 * component2.
struct arbitrary_functor
{
    template <typename Tuple>
    __host__ __device__
    void operator()(Tuple elems)
    {
        // D[i] = A[i] + B[i] * C[i];
        thrust::get<3>(elems) =
            thrust::get<0>(elems) +
            thrust::get<1>(elems) * thrust::get<2>(elems);
    }
};
int main(void)
{
// allocate storage
thrust::device_vector<float> A(5);
thrust::device_vector<float> B(5);
thrust::device_vector<float> C(5);
thrust::device_vector<float> D(5);
// initialize input vectors
A[0] = 3; B[0] = 6; C[0] = 2;
A[1] = 4; B[1] = 7; C[1] = 5;
A[2] = 0; B[2] = 2; C[2] = 7;
A[3] = 8; B[3] = 1; C[3] = 4;
A[4] = 2; B[4] = 8; C[4] = 3;
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> posA(A.begin(), A.end(), STRIDE);
strided_range<Iterator> posB(B.begin(), B.end(), STRIDE);
strided_range<Iterator> posC(C.begin(), C.end(), STRIDE);
strided_range<Iterator> posD(D.begin(), D.end(), STRIDE);
// apply the transformation
thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(posA.begin(), posB.begin(), posC.begin(), posD.begin())),
thrust::make_zip_iterator(thrust::make_tuple(posA.end(), posB.end(), posC.end(), posD.end())),
arbitrary_functor());
// print the output
for(int i = 0; i < 5; i++)
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
}
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//印刷用
#包括
#包括
#定义步幅2
模板
类步距
{
公众:
typedef typename推力::迭代器_差异::类型差异_类型;
结构跨步函数:公共推力::一元函数
{
差异式步幅;
步幅函子(差分型步幅)
:步幅(步幅){}
__主机设备__
差分类型运算符()(常数差分类型&i)常数
{
返回步幅*i;
}
};
typedef typename推力::计数迭代器计数迭代器;
typedef typename推力::transform_迭代器TransformIterator;
typedef typename推力::置换迭代器置换迭代器;
//跨步范围迭代器的类型
typedef置换迭代器;
//为范围[第一个,最后一个]构建跨步范围
步幅范围(迭代器优先、迭代器最后、差分类型步幅)
:第一(第一)、最后(最后)、大步(大步){}
迭代器开始(void)常量
{
返回置换迭代器(第一个,TransformIterator(CountingIterator(0),stride_函子(stride));
}
迭代器结束(void)常量
{
return begin()+((last-first)+(stride-1))/stride;
}
受保护的:
迭代器优先;
迭代器last;
差异式步幅;
};
结构任意函数
{
模板
__主机设备__
void运算符()(元组t)
{
//D[i]=A[i]+B[i]*C[i];
推力::get(t)=推力::get(t)+推力::get(t)*推力::get(t);
}
};
内部主(空)
{
//分配存储
推力:装置_矢量A(5);
推力:装置_矢量B(5);
推力:装置_矢量C(5);
推力:装置_矢量D(5);
//初始化输入向量
A[0]=3;B[0]=6;C[0]=2;
A[1]=4;B[1]=7;C[1]=5;
A[2]=0;B[2]=2;C[2]=7;
A[3]=8;B[3]=1;C[3]=4;
A[4]=2;B[4]=8;C[4]=3;
typedef推力::设备向量::迭代器迭代器;
步幅范围posA(A.开始(),A.结束(),步幅);
步幅位置B(B.开始(),B.结束(),步幅);
步幅范围posC(C.开始(),C.结束(),步幅);
步幅范围位置(D.开始(),D.结束(),步幅);
//应用转换
推力::for_each(推力::make_zip_迭代器(推力::make_元组(posA.begin()、posB.begin()、posC.begin()、posD.begin()),
推力::make_zip_迭代器(推力::make_元组(posA.end(),posB.end(),posC.end(),posD.end()),
任意_函子());
//打印输出
对于(int i=0;i<5;i++)
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
}
为什么不在你的函子中将 i 变量乘以 M?
如果编译时已知 M,则可能是:
// Variant that keeps a unit-step counting range of N values and derives the
// strided index inside the functor by multiplying with M.
struct functor
{
    __host__ __device__
    void operator()(const int my_i)
    {
        const int i = my_i * M;  // strided index for this invocation
        // Body of the loop
    }
};

// N invocations: my_i = 0, 1, ..., N - 1
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2(it1 + N);
thrust::for_each(it1, it2, functor());
您还可以将计数迭代器包装在转换迭代器中,转换迭代器接受计数迭代器并将其乘以M:
// Variant where the functor receives the already-scaled index: the counting
// range is wrapped in a transform_iterator that multiplies each value by M.
struct functor
{
    // i: the counting value multiplied by M.
    __host__ __device__ void operator() (const int i)
    {
        // Body of the loop
    }
};
// _1 is the placeholder for the transform's single argument.
using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
// Both iterators are built with the qualified thrust::make_transform_iterator,
// so the call does not rely on argument-dependent lookup.
thrust::for_each(thrust::make_transform_iterator(it1, _1 * M),
                 thrust::make_transform_iterator(it2, _1 * M),
                 functor());
结构函子
{
__主机\设备\无效运算符()(常量int i)
{
//循环体
}
};
使用命名空间推力::占位符;
推力::计数迭代器it1(0);
推力:计数迭代器it2=it1+N;
推力::对于每个(生成转换迭代器(it1,_1*M),推力::生成转换迭代器(it2,_1*M),函子());
最后一个例子使用了 Thrust 占位符(placeholders),不过它也可以用一个额外的简单函子等价地实现,该函子返回其实参乘以函子自身所保存的参数(即 M)的结果。
下面是一个完整的示例,显示了所有3种方法:
$ cat t492.cu
#include <stdio.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#define N 5
#define M 4
using namespace thrust::placeholders;
// Prints the value it is invoked with, unmodified.
struct my_functor_1
{
    __host__ __device__
    void operator()(const int i)
    {
        printf("functor 1 value: %d\n", i);
    }
};
// Multiplies the incoming index by the compile-time constant M and prints
// the product.
struct my_functor_2
{
    __host__ __device__
    void operator()(const int my_i)
    {
        printf("functor 2 value: %d\n", my_i*M);
    }
};
// Multiplies the incoming index by a factor chosen at construction time and
// prints the product.
struct my_functor_3
{
    int my_M;  // multiplier applied to every index

    // The parameter is named `stride` because identifiers that start with an
    // underscore followed by an uppercase letter are reserved for the
    // implementation.
    my_functor_3(int stride) : my_M(stride) {}

    // my_i: unscaled index; the printed value is my_i * my_M.
    __host__ __device__ void operator() (const int my_i)
    {
        int i = my_i * my_M;
        printf("functor 3 value: %d\n", i);
    }
};
int main(){
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
thrust::for_each(thrust::host, it1, it2, my_functor_1());
thrust::for_each(thrust::host, it1, it2, my_functor_2());
thrust::for_each(thrust::host, it1, it2, my_functor_3(M));
thrust::for_each(thrust::host, thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), my_functor_1());
return 0;
}
$ nvcc -arch=sm_20 -o t492 t492.cu
$ ./t492
functor 1 value: 0
functor 1 value: 1
functor 1 value: 2
functor 1 value: 3
functor 1 value: 4
functor 2 value: 0
functor 2 value: 4
functor 2 value: 8
functor 2 value: 12
functor 2 value: 16
functor 3 value: 0
functor 3 value: 4
functor 3 value: 8
functor 3 value: 12
functor 3 value: 16
functor 1 value: 0
functor 1 value: 4
functor 1 value: 8
functor 1 value: 12
functor 1 value: 16
$
$cat t492.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义n5
#定义M4
使用命名空间推力::占位符;
结构my_函子_1
{
__主机\设备\无效运算符()(常量int i)
{
printf(“函子1值:%d\n”,i);
}
};
结构my_函子_2
{
__主机\uuuuuuuuu设备\uuuuuu无效运算符()(const int my\u i)
{
int i=my_i*M;
printf(“函子2值:%d\n”,i);
}
};
结构my_函子_3
{
int我的M;
my_函子_3(int_M):my_M(_M){};
__主机\uuuuuuuuu设备\uuuuuu无效运算符()(const int my\u i)
{
int i=我的i*我的M;
printf(“函子3值:%d\n”,i);
}
};
int main(){
推力::计数迭代器it1(0);
推力:计数迭代器it2=it1+N;
推力::for_each(推力::host,it1,it2,my_functor_1());
推力::for_each(推力::host,it1,it2,my_functor_2());
推力::for_each(推力::host,it1,it2,my_函子_3(M));
推力::for_each(推力::主机,推力::make_transform_迭代器(it1,_1*M),推力::make_transform_迭代器(it2,_1*M),my_functor_1());
返回0;
}
$nvcc-arch=sm_20-o t492 t492.cu
美元/t492
函子1值:0
函子1值:1
函子1值:2
函子1值:3
函子1值:4
函子2值:0
函子2值:4
函子2值:8
函子2值:12
函子2值:16
函子3值:0
函子3值:4
函子3值:8
函子3值:12
函子3值:16
函子1值:0
函子1值:4
函子1值:8
函子1值:12
函子1值:16
$
第三种方法(使用make_transform_iterator)正是我想要的!谢谢
// Variant where the functor receives the already-scaled index: the counting
// range is wrapped in a transform_iterator that multiplies each value by M.
struct functor
{
    // i: the counting value multiplied by M.
    __host__ __device__ void operator() (const int i)
    {
        // Body of the loop
    }
};
// _1 is the placeholder for the transform's single argument.
using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
// Both iterators are built with the qualified thrust::make_transform_iterator,
// so the call does not rely on argument-dependent lookup.
thrust::for_each(thrust::make_transform_iterator(it1, _1 * M),
                 thrust::make_transform_iterator(it2, _1 * M),
                 functor());
$ cat t492.cu
#include <stdio.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#define N 5
#define M 4
using namespace thrust::placeholders;
// Prints the value it is invoked with, unmodified.
struct my_functor_1
{
    __host__ __device__
    void operator()(const int i)
    {
        printf("functor 1 value: %d\n", i);
    }
};
// Multiplies the incoming index by the compile-time constant M and prints
// the product.
struct my_functor_2
{
    __host__ __device__
    void operator()(const int my_i)
    {
        printf("functor 2 value: %d\n", my_i*M);
    }
};
// Multiplies the incoming index by a factor chosen at construction time and
// prints the product.
struct my_functor_3
{
    int my_M;  // multiplier applied to every index

    // The parameter is named `stride` because identifiers that start with an
    // underscore followed by an uppercase letter are reserved for the
    // implementation.
    my_functor_3(int stride) : my_M(stride) {}

    // my_i: unscaled index; the printed value is my_i * my_M.
    __host__ __device__ void operator() (const int my_i)
    {
        int i = my_i * my_M;
        printf("functor 3 value: %d\n", i);
    }
};
int main(){
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
thrust::for_each(thrust::host, it1, it2, my_functor_1());
thrust::for_each(thrust::host, it1, it2, my_functor_2());
thrust::for_each(thrust::host, it1, it2, my_functor_3(M));
thrust::for_each(thrust::host, thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), my_functor_1());
return 0;
}
$ nvcc -arch=sm_20 -o t492 t492.cu
$ ./t492
functor 1 value: 0
functor 1 value: 1
functor 1 value: 2
functor 1 value: 3
functor 1 value: 4
functor 2 value: 0
functor 2 value: 4
functor 2 value: 8
functor 2 value: 12
functor 2 value: 16
functor 3 value: 0
functor 3 value: 4
functor 3 value: 8
functor 3 value: 12
functor 3 value: 16
functor 1 value: 0
functor 1 value: 4
functor 1 value: 8
functor 1 value: 12
functor 1 value: 16
$