Cuda 使用带步长的 Thrust 计数迭代器（counting_iterator）_Cuda_Thrust

Cuda 使用带步长的 Thrust 计数迭代器（counting_iterator）

cuda

Cuda 使用带步长的 Thrust 计数迭代器,cuda,thrust,Cuda,Thrust,我正在寻找一种使用 thrust::counting_iterator 的方法，以便并行化以下 for 循环： for (int stride = 0 ; stride < N * M ; stride+=M) // N iterations { // Body of the loop }

我正在寻找一种使用 thrust::counting_iterator 的方法，以便并行化以下 for 循环：

// Sequential loop to be parallelized: `stride` takes the values 0, M, 2*M, ..., (N-1)*M.
for (int stride = 0 ; stride < N * M ; stride+=M) // N iterations
{
    // Body of the loop
}

for (int stride = 0; stride < N * M; stride += M) { /* 循环体 */ }


下面是代码的外观：
struct functor
{
    __host__ __device__ void operator()(const int i)
    {
        // 循环体
    }
};

thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N * M;
thrust::for_each(it1, it2, functor());

我知道 counting_iterator 每次只把迭代器加 1，那么有没有办法让它每次增加 M？
这是 Thrust 的 strided_range 示例与 arbitrary_transformation 示例的组合。
下面我以如下变换为例：
D[i] = A[i] + B[i] * C[i]

代码如下：
#include <thrust/for_each.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <iostream>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>

#include <thrust/fill.h>

// for printing
#include <thrust/copy.h>
#include <ostream>

#define STRIDE 2

template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

// Functor applied to a zipped tuple: combines slots 0, 1 and 2 and writes the
// result through slot 3, i.e. D = A + B * C for the element being visited.
struct arbitrary_functor
{
    template <typename ZippedRefs>
    __host__ __device__
    void operator()(ZippedRefs refs)
    {
        // slot 3 (D) <- slot 0 (A) + slot 1 (B) * slot 2 (C)
        thrust::get<3>(refs) = thrust::get<0>(refs)
                             + thrust::get<1>(refs) * thrust::get<2>(refs);
    }
};


int main(void)
{
    // allocate storage
    thrust::device_vector<float> A(5);
    thrust::device_vector<float> B(5);
    thrust::device_vector<float> C(5);
    thrust::device_vector<float> D(5);

    // initialize input vectors
    A[0] = 3;  B[0] = 6;  C[0] = 2; 
    A[1] = 4;  B[1] = 7;  C[1] = 5; 
    A[2] = 0;  B[2] = 2;  C[2] = 7; 
    A[3] = 8;  B[3] = 1;  C[3] = 4; 
    A[4] = 2;  B[4] = 8;  C[4] = 3; 

    typedef thrust::device_vector<float>::iterator Iterator;
    strided_range<Iterator> posA(A.begin(), A.end(), STRIDE);
    strided_range<Iterator> posB(B.begin(), B.end(), STRIDE);
    strided_range<Iterator> posC(C.begin(), C.end(), STRIDE);
    strided_range<Iterator> posD(D.begin(), D.end(), STRIDE);

    // apply the transformation
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(posA.begin(), posB.begin(), posC.begin(), posD.begin())),
                 thrust::make_zip_iterator(thrust::make_tuple(posA.end(), posB.end(), posC.end(), posD.end())),
                 arbitrary_functor());

    // print the output
    for(int i = 0; i < 5; i++)
    std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
}

#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//印刷用
#包括
#包括
#定义步幅2
模板
类步距
{
公众：
typedef typename推力：：迭代器_差异：：类型差异_类型；
结构跨步函数：公共推力：：一元函数
{
差异式步幅；
步幅函子（差分型步幅）
：步幅（步幅）{}
__主机设备__
差分类型运算符（）（常数差分类型&i）常数
{
返回步幅*i；
}
};
typedef typename推力：：计数迭代器计数迭代器；
typedef typename推力：：transform_迭代器TransformIterator；
typedef typename推力：：置换迭代器置换迭代器；
//跨步范围迭代器的类型
typedef置换迭代器；
//为范围[第一个，最后一个]构建跨步范围
步幅范围（迭代器优先、迭代器最后、差分类型步幅）
：第一（第一）、最后（最后）、大步（大步）{}
迭代器开始（void）常量
{
返回置换迭代器（第一个，TransformIterator（CountingIterator（0），stride_函子（stride））；
}
迭代器结束（void）常量
{
return begin（）+（（last-first）+（stride-1））/stride；
}
受保护的：
迭代器优先；
迭代器last；
差异式步幅；
};
结构任意函数
{
模板
__主机设备__
void运算符（）（元组t）
{
//D[i]=A[i]+B[i]*C[i]；
推力：：get（t）=推力：：get（t）+推力：：get（t）*推力：：get（t）；
}
};
内部主（空）
{
//分配存储
推力：装置_矢量A（5）；
推力：装置_矢量B（5）；
推力：装置_矢量C（5）；
推力：装置_矢量D（5）；
//初始化输入向量
A[0]=3；B[0]=6；C[0]=2；
A[1]=4；B[1]=7；C[1]=5；
A[2]=0；B[2]=2；C[2]=7；
A[3]=8；B[3]=1；C[3]=4；
A[4]=2；B[4]=8；C[4]=3；
typedef推力：：设备向量：：迭代器迭代器；
步幅范围posA（A.开始（），A.结束（），步幅）；
步幅位置B（B.开始（），B.结束（），步幅）；
步幅范围posC（C.开始（），C.结束（），步幅）；
步幅范围位置（D.开始（），D.结束（），步幅）；
//应用转换
推力：：for_each（推力：：make_zip_迭代器（推力：：make_元组（posA.begin（）、posB.begin（）、posC.begin（）、posD.begin（）），
推力：：make_zip_迭代器（推力：：make_元组（posA.end（），posB.end（），posC.end（），posD.end（）），
任意_函子（））；
//打印输出
对于（int i=0；i<5；i++）
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
}

为什么不在你的函子中把变量 i 乘以 M 呢？

如果 M 在编译时已知，可以这样写：
// Functor that receives the plain counter value and rebuilds the strided
// offset itself by multiplying with M.
struct functor
{
   __host__ __device__
   void operator() (const int k)
   {
      // offset handled by this invocation: 0, M, 2*M, ...
      int i = k * M;
      // Body of the loop
   }
};

// invoke the functor once for each counter value in [0, N)
thrust::counting_iterator<int> first(0);
thrust::for_each(first, first + N, functor());

您还可以把 counting_iterator 包装在 transform_iterator 中，由后者把每个计数值乘以 M：
// Functor that receives the already-multiplied index: 0, M, 2*M, ...
struct functor 
{
   __host__ __device__ void operator() (const int i)
   {
      // Body of the loop
   }
};

using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
// The placeholder expression `_1 * M` multiplies each counter value by M before
// it reaches the functor. Both range ends are built with the fully qualified
// thrust::make_transform_iterator so neither call depends on argument-dependent lookup.
thrust::for_each (thrust::make_transform_iterator(it1, _1 * M) , thrust::make_transform_iterator(it2, _1 * M) , functor());

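struct functor
{
   __host__ __device__ void operator() (const int i)
   {
      // 循环体
   }
};

using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
thrust::for_each(thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), functor());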

最后一个例子使用了占位符（thrust::placeholders），不过也可以用一个额外的简单函子等价地实现，该函子返回其参数与给定乘数的乘积。
下面是一个完整的示例，展示了全部 3 种方法：
$ cat t492.cu
#include <stdio.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#define N 5
#define M 4
using namespace thrust::placeholders;

// Prints the value it is invoked with, unchanged.
struct my_functor_1
{
  __host__ __device__
  void operator() (const int value)
  {
    printf("functor 1 value: %d\n", value);
  }
};

// Prints the incoming counter value multiplied by the compile-time constant M.
struct my_functor_2
{
   __host__ __device__
   void operator() (const int my_i)
   {
      printf("functor 2 value: %d\n", my_i * M);
   }
};

// Prints the incoming counter value multiplied by a factor chosen at run time.
struct my_functor_3
{
   int my_M;   // multiplier applied to every incoming value

   // The parameter is named `multiplier` because identifiers that start with an
   // underscore followed by an uppercase letter are reserved for the implementation.
   my_functor_3(int multiplier) : my_M(multiplier) {}

   __host__ __device__ void operator() (const int my_i)
   {
      int i = my_i *my_M;
      printf("functor 3 value: %d\n", i);
   }
};


int main(){
  thrust::counting_iterator<int> it1(0);
  thrust::counting_iterator<int> it2 = it1 + N;
  thrust::for_each(thrust::host, it1, it2, my_functor_1());
  thrust::for_each(thrust::host, it1, it2, my_functor_2());
  thrust::for_each(thrust::host, it1, it2, my_functor_3(M));
  thrust::for_each(thrust::host, thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), my_functor_1());
  return 0;
}


$ nvcc -arch=sm_20 -o t492 t492.cu
$ ./t492
functor 1 value: 0
functor 1 value: 1
functor 1 value: 2
functor 1 value: 3
functor 1 value: 4
functor 2 value: 0
functor 2 value: 4
functor 2 value: 8
functor 2 value: 12
functor 2 value: 16
functor 3 value: 0
functor 3 value: 4
functor 3 value: 8
functor 3 value: 12
functor 3 value: 16
functor 1 value: 0
functor 1 value: 4
functor 1 value: 8
functor 1 value: 12
functor 1 value: 16
$

$cat t492.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义n5
#定义M4
使用命名空间推力：：占位符；
结构my_函子_1
{
__主机\设备\无效运算符（）（常量int i）
{
printf（“函子1值：%d\n”，i）；
}
};
结构my_函子_2
{
__主机\uuuuuuuuu设备\uuuuuu无效运算符（）（const int my\u i）
{
int i=my_i*M；
printf（“函子2值：%d\n”，i）；
}
};
结构my_函子_3
{
int我的M；
my_函子_3（int_M）：my_M（_M）{}；
__主机\uuuuuuuuu设备\uuuuuu无效运算符（）（const int my\u i）
{
int i=我的i*我的M；
printf（“函子3值：%d\n”，i）；
}
};
int main（）{
推力：：计数迭代器it1（0）；
推力：计数迭代器it2=it1+N；
推力：：for_each（推力：：host，it1，it2，my_functor_1（））；
推力：：for_each（推力：：host，it1，it2，my_functor_2（））；
推力：：for_each（推力：：host，it1，it2，my_函子_3（M））；
推力：：for_each（推力：：主机，推力：：make_transform_迭代器（it1，_1*M），推力：：make_transform_迭代器（it2，_1*M），my_functor_1（））；
返回0；
}
$nvcc-arch=sm_20-o t492 t492.cu
美元/t492
函子1值：0
函子1值：1
函子1值：2
函子1值：3
函子1值：4
函子2值：0
函子2值：4
函子2值：8
函子2值：12
函子2值：16
函子3值：0
函子3值：4
函子3值：8
函子3值：12
函子3值：16
函子1值：0
函子1值：4
函子1值：8
函子1值：12
函子1值：16
$
第三种方法（使用make_transform_iterator）正是我想要的！谢谢
// Functor that receives the already-multiplied index: 0, M, 2*M, ...
struct functor 
{
   __host__ __device__ void operator() (const int i)
   {
      // Body of the loop
   }
};

using namespace thrust::placeholders;
thrust::counting_iterator<int> it1(0);
thrust::counting_iterator<int> it2 = it1 + N;
// The placeholder expression `_1 * M` multiplies each counter value by M before
// it reaches the functor. Both range ends are built with the fully qualified
// thrust::make_transform_iterator so neither call depends on argument-dependent lookup.
thrust::for_each (thrust::make_transform_iterator(it1, _1 * M) , thrust::make_transform_iterator(it2, _1 * M) , functor());

$ cat t492.cu
#include <stdio.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#define N 5
#define M 4
using namespace thrust::placeholders;

// Prints the value it is invoked with, unchanged.
struct my_functor_1
{
  __host__ __device__
  void operator() (const int value)
  {
    printf("functor 1 value: %d\n", value);
  }
};

// Prints the incoming counter value multiplied by the compile-time constant M.
struct my_functor_2
{
   __host__ __device__
   void operator() (const int my_i)
   {
      printf("functor 2 value: %d\n", my_i * M);
   }
};

// Prints the incoming counter value multiplied by a factor chosen at run time.
struct my_functor_3
{
   int my_M;   // multiplier applied to every incoming value

   // The parameter is named `multiplier` because identifiers that start with an
   // underscore followed by an uppercase letter are reserved for the implementation.
   my_functor_3(int multiplier) : my_M(multiplier) {}

   __host__ __device__ void operator() (const int my_i)
   {
      int i = my_i *my_M;
      printf("functor 3 value: %d\n", i);
   }
};


int main(){
  thrust::counting_iterator<int> it1(0);
  thrust::counting_iterator<int> it2 = it1 + N;
  thrust::for_each(thrust::host, it1, it2, my_functor_1());
  thrust::for_each(thrust::host, it1, it2, my_functor_2());
  thrust::for_each(thrust::host, it1, it2, my_functor_3(M));
  thrust::for_each(thrust::host, thrust::make_transform_iterator(it1, _1 * M), thrust::make_transform_iterator(it2, _1 * M), my_functor_1());
  return 0;
}


$ nvcc -arch=sm_20 -o t492 t492.cu
$ ./t492
functor 1 value: 0
functor 1 value: 1
functor 1 value: 2
functor 1 value: 3
functor 1 value: 4
functor 2 value: 0
functor 2 value: 4
functor 2 value: 8
functor 2 value: 12
functor 2 value: 16
functor 3 value: 0
functor 3 value: 4
functor 3 value: 8
functor 3 value: 12
functor 3 value: 16
functor 1 value: 0
functor 1 value: 4
functor 1 value: 8
functor 1 value: 12
functor 1 value: 16
$