CUDA:使用带步长的 thrust::counting_iterator

CUDA:使用带步长的 thrust::counting_iterator(标签:cuda、thrust)。我正在寻找一种使用 thrust::counting_iterator 的方法,以便并行化以下 for 循环: for (int stride = 0; stride < N * M; stride += M) // N iterations { // Body of the loop }

我正在寻找一种使用 thrust::counting_iterator 的方法,以便并行化以下 for 循环:

// Serial loop to be parallelized: stride starts at 0 and advances by M while
// it is below N * M, i.e. it takes the values 0, M, 2*M, ...
for (int stride = 0 ; stride < N * M ; stride+=M) // N iterations
{
    // Body of the loop
}
我设想的代码大致如下:

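struct functor
{
    __host__ __device__ void operator()(const int i)
    {
        // Body of the loop
    }
};
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N * M;
thrust::for_each(it1, it2, functor());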

我知道 counting_iterator 每次只将迭代器递增 1,那么有没有办法让它每次递增 M?

这是 Thrust 的 strided_range 示例与 arbitrary_transformation 示例的组合。

下面考虑一个示例,其中的变换为:

D[i] = A[i] + B[i] * C[i]
代码如下:

#include <thrust/for_each.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <iostream>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>

#include <thrust/fill.h>

// for printing
#include <thrust/copy.h>
#include <ostream>

#define STRIDE 2

template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

// Element-wise operation applied to each zipped 4-tuple (a, b, c, d):
// stores a + b * c into the fourth slot, i.e. D[i] = A[i] + B[i] * C[i].
struct arbitrary_functor
{
    template <typename ZippedElements>
    __host__ __device__
    void operator()(ZippedElements elems)
    {
        // Slots 0..2 are read, slot 3 is written.
        thrust::get<3>(elems) =
            thrust::get<0>(elems) + thrust::get<1>(elems) * thrust::get<2>(elems);
    }
};


int main(void)
{
    // allocate storage
    thrust::device_vector<float> A(5);
    thrust::device_vector<float> B(5);
    thrust::device_vector<float> C(5);
    thrust::device_vector<float> D(5);

    // initialize input vectors
    A[0] = 3;  B[0] = 6;  C[0] = 2; 
    A[1] = 4;  B[1] = 7;  C[1] = 5; 
    A[2] = 0;  B[2] = 2;  C[2] = 7; 
    A[3] = 8;  B[3] = 1;  C[3] = 4; 
    A[4] = 2;  B[4] = 8;  C[4] = 3; 

    typedef thrust::device_vector<float>::iterator Iterator;
    strided_range<Iterator> posA(A.begin(), A.end(), STRIDE);
    strided_range<Iterator> posB(B.begin(), B.end(), STRIDE);
    strided_range<Iterator> posC(C.begin(), C.end(), STRIDE);
    strided_range<Iterator> posD(D.begin(), D.end(), STRIDE);

    // apply the transformation
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(posA.begin(), posB.begin(), posC.begin(), posD.begin())),
                 thrust::make_zip_iterator(thrust::make_tuple(posA.end(), posB.end(), posC.end(), posD.end())),
                 arbitrary_functor());

    // print the output
    for(int i = 0; i < 5; i++)
    std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
}
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//印刷用
#包括
#包括
#定义步幅2
模板
类步距
{
公众:
typedef typename推力::迭代器_差异::类型差异_类型;
结构跨步函数:公共推力::一元函数
{
差异式步幅;
步幅函子(差分型步幅)
:步幅(步幅){}
__主机设备__
差分类型运算符()(常数差分类型&i)常数
{
返回步幅*i;
}
};
typedef typename推力::计数迭代器计数迭代器;
typedef typename推力::transform_迭代器TransformIterator;
typedef typename推力::置换迭代器置换迭代器;
//跨步范围迭代器的类型
typedef置换迭代器;
//为范围[第一个,最后一个]构建跨步范围
步幅范围(迭代器优先、迭代器最后、差分类型步幅)
:第一(第一)、最后(最后)、大步(大步){}
迭代器开始(void)常量
{
返回置换迭代器(第一个,TransformIterator(CountingIterator(0),stride_函子(stride));
}
迭代器结束(void)常量
{
return begin()+((last-first)+(stride-1))/stride;
}
受保护的:
迭代器优先;
迭代器last;
差异式步幅;
};
结构任意函数
{
模板
__主机设备__
void运算符()(元组t)
{
//D[i]=A[i]+B[i]*C[i];
推力::get(t)=推力::get(t)+推力::get(t)*推力::get(t);
}
};
内部主(空)
{
//分配存储
推力:装置_矢量A(5);
推力:装置_矢量B(5);
推力:装置_矢量C(5);
推力:装置_矢量D(5);
//初始化输入向量
A[0]=3;B[0]=6;C[0]=2;
A[1]=4;B[1]=7;C[1]=5;
A[2]=0;B[2]=2;C[2]=7;
A[3]=8;B[3]=1;C[3]=4;
A[4]=2;B[4]=8;C[4]=3;
typedef推力::设备向量::迭代器迭代器;
步幅范围posA(A.开始(),A.结束(),步幅);
步幅位置B(B.开始(),B.结束(),步幅);
步幅范围posC(C.开始(),C.结束(),步幅);
步幅范围位置(D.开始(),D.结束(),步幅);
//应用转换
推力::for_each(推力::make_zip_迭代器(推力::make_元组(posA.begin()、posB.begin()、posC.begin()、posD.begin()),
推力::make_zip_迭代器(推力::make_元组(posA.end(),posB.end(),posC.end(),posD.end()),
任意_函子());
//打印输出
对于(int i=0;i<5;i++)

为什么不在你的函子中直接将变量 i 乘以 M 呢?

如果 M 在编译时已知,可以这样写:

// Functor that receives the plain loop counter and scales it by the
// compile-time constant M to obtain the strided index.
struct functor
{
    __host__ __device__
    void operator()(const int my_i)
    {
        // Strided index for this iteration.
        const int i = my_i * M;
        // Body of the loop
    }
};

// Iterate over the N loop counters 0 .. N-1; the functor does the scaling.
thrust::counting_iterator<int> first(0);
thrust::counting_iterator<int> last = first + N;
thrust::for_each(first, last, functor());
您还可以将计数迭代器包装在转换迭代器(transform_iterator)中,由后者把计数迭代器的值乘以 M:

// Functor invoked once per index it is handed; the scaling by M happens in
// the iterator that feeds it, not here.
struct functor
{
    __host__ __device__
    void operator()(const int i)
    {
        // Body of the loop
    }
};

// _1 comes from thrust::placeholders; `_1 * M` builds a functor that
// multiplies its argument by M.
using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
// Both ends are wrapped the same way, so functor receives the counter
// values 0 .. N-1 each multiplied by M. make_transform_iterator is qualified
// explicitly on both calls rather than relying on argument-dependent lookup.
thrust::for_each(thrust::make_transform_iterator(it1, _1 * M),
                 thrust::make_transform_iterator(it2, _1 * M),
                 functor());
结构函子 { __主机\设备\无效运算符()(常量int i) { //循环体 } }; 使用命名空间推力::占位符; 推力::计数迭代器it1(0); 推力:计数迭代器it2=it1+N; 推力::对于每个(生成转换迭代器(it1,_1*M),推力::生成转换迭代器(it2,_1*M),函子());
最后一个例子使用了 Thrust 占位符(thrust::placeholders),不过也可以用一个额外的简单函子等价地实现:该函子返回其参数与所保存的乘数的乘积。

下面是一个完整的示例,显示了所有3种方法:

$ cat t492.cu
#include <stdio.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#define N 5
#define M 4
using namespace thrust::placeholders;

// Prints the index it is handed, unmodified.
struct my_functor_1
{
    __host__ __device__
    void operator()(const int i)
    {
        printf("functor 1 value: %d\n", i);
    }
};

// Prints the incoming counter multiplied by the compile-time constant M.
struct my_functor_2
{
    __host__ __device__
    void operator()(const int my_i)
    {
        printf("functor 2 value: %d\n", my_i * M);
    }
};

// Prints the incoming counter multiplied by a multiplier chosen at run time.
struct my_functor_3
{
   // Multiplier applied to every counter value.
   int my_M;

   // The parameter is named `m` rather than `_M`: identifiers that begin with
   // an underscore followed by an uppercase letter are reserved for the
   // implementation.
   my_functor_3(int m) : my_M(m) {}

   __host__ __device__ void operator() (const int my_i)
   {
      int i = my_i * my_M;
      printf("functor 3 value: %d\n", i);
   }
};


int main(){
  thrust::counting_iterator<int> it1(0);
  thrust::counting_iterator<int> it2 = it1 + N;
  thrust::for_each(thrust::host, it1, it2, my_functor_1());
  thrust::for_each(thrust::host, it1, it2, my_functor_2());
  thrust::for_each(thrust::host, it1, it2, my_functor_3(M));
  thrust::for_each(thrust::host, thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), my_functor_1());
  return 0;
}


$ nvcc -arch=sm_20 -o t492 t492.cu
$ ./t492
functor 1 value: 0
functor 1 value: 1
functor 1 value: 2
functor 1 value: 3
functor 1 value: 4
functor 2 value: 0
functor 2 value: 4
functor 2 value: 8
functor 2 value: 12
functor 2 value: 16
functor 3 value: 0
functor 3 value: 4
functor 3 value: 8
functor 3 value: 12
functor 3 value: 16
functor 1 value: 0
functor 1 value: 4
functor 1 value: 8
functor 1 value: 12
functor 1 value: 16
$
$cat t492.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义n5
#定义M4
使用命名空间推力::占位符;
结构my_函子_1
{
__主机\设备\无效运算符()(常量int i)
{
printf(“函子1值:%d\n”,i);
}
};
结构my_函子_2
{
__主机\uuuuuuuuu设备\uuuuuu无效运算符()(const int my\u i)
{
int i=my_i*M;
printf(“函子2值:%d\n”,i);
}
};
结构my_函子_3
{
int我的M;
my_函子_3(int_M):my_M(_M){};
__主机\uuuuuuuuu设备\uuuuuu无效运算符()(const int my\u i)
{
int i=我的i*我的M;
printf(“函子3值:%d\n”,i);
}
};
int main(){
推力::计数迭代器it1(0);
推力:计数迭代器it2=it1+N;
推力::for_each(推力::host,it1,it2,my_functor_1());
推力::for_each(推力::host,it1,it2,my_functor_2());
推力::for_each(推力::host,it1,it2,my_函子_3(M));
推力::for_each(推力::主机,推力::make_transform_迭代器(it1,_1*M),推力::make_transform_迭代器(it2,_1*M),my_functor_1());
返回0;
}
$nvcc-arch=sm_20-o t492 t492.cu
美元/t492
函子1值:0
函子1值:1
函子1值:2
函子1值:3
函子1值:4
函子2值:0
函子2值:4
函子2值:8
函子2值:12
函子2值:16
函子3值:0
函子3值:4
函子3值:8
函子3值:12
函子3值:16
函子1值:0
函子1值:4
函子1值:8
函子1值:12
函子1值:16
$
第三种方法(使用make_transform_iterator)正是我想要的!谢谢
// Functor invoked once per index it is handed; the scaling by M happens in
// the iterator that feeds it, not here.
struct functor
{
    __host__ __device__
    void operator()(const int i)
    {
        // Body of the loop
    }
};

// _1 comes from thrust::placeholders; `_1 * M` builds a functor that
// multiplies its argument by M.
using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
// Both ends are wrapped the same way, so functor receives the counter
// values 0 .. N-1 each multiplied by M. make_transform_iterator is qualified
// explicitly on both calls rather than relying on argument-dependent lookup.
thrust::for_each(thrust::make_transform_iterator(it1, _1 * M),
                 thrust::make_transform_iterator(it2, _1 * M),
                 functor());
$ cat t492.cu
#include <stdio.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#define N 5
#define M 4
using namespace thrust::placeholders;

// Prints the index it is handed, unmodified.
struct my_functor_1
{
    __host__ __device__
    void operator()(const int i)
    {
        printf("functor 1 value: %d\n", i);
    }
};

// Prints the incoming counter multiplied by the compile-time constant M.
struct my_functor_2
{
    __host__ __device__
    void operator()(const int my_i)
    {
        printf("functor 2 value: %d\n", my_i * M);
    }
};

// Prints the incoming counter multiplied by a multiplier chosen at run time.
struct my_functor_3
{
   // Multiplier applied to every counter value.
   int my_M;

   // The parameter is named `m` rather than `_M`: identifiers that begin with
   // an underscore followed by an uppercase letter are reserved for the
   // implementation.
   my_functor_3(int m) : my_M(m) {}

   __host__ __device__ void operator() (const int my_i)
   {
      int i = my_i * my_M;
      printf("functor 3 value: %d\n", i);
   }
};


int main(){
  thrust::counting_iterator<int> it1(0);
  thrust::counting_iterator<int> it2 = it1 + N;
  thrust::for_each(thrust::host, it1, it2, my_functor_1());
  thrust::for_each(thrust::host, it1, it2, my_functor_2());
  thrust::for_each(thrust::host, it1, it2, my_functor_3(M));
  thrust::for_each(thrust::host, thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), my_functor_1());
  return 0;
}


$ nvcc -arch=sm_20 -o t492 t492.cu
$ ./t492
functor 1 value: 0
functor 1 value: 1
functor 1 value: 2
functor 1 value: 3
functor 1 value: 4
functor 2 value: 0
functor 2 value: 4
functor 2 value: 8
functor 2 value: 12
functor 2 value: 16
functor 3 value: 0
functor 3 value: 4
functor 3 value: 8
functor 3 value: 12
functor 3 value: 16
functor 1 value: 0
functor 1 value: 4
functor 1 value: 8
functor 1 value: 12
functor 1 value: 16
$