CUDA：对 3 个不同大小的向量进行 Thrust 复杂变换_Cuda_Thrust

CUDA：对 3 个不同大小的向量进行 Thrust 复杂变换

cuda

CUDA：对 3 个不同大小的向量进行 Thrust 复杂变换,cuda,thrust,Cuda,Thrust,您好，我在 C++ 中有一个循环，我试图将它转换为 Thrust 实现，但得到的结果不一致……有什么想法吗？多谢各位。 C++ 代码 for (i=0;i<n;i++) for (j=0;j<n;j++) values[i]=values[i]+(binv[i*n+j]*d[j]); thrust::fill(values.begin(), values.end(), 0); thrust::transform(make_zip_iterator(make_tuple(

您好，我在 C++ 中有一个循环，我试图将它转换为 Thrust 实现，但得到的结果不一致……有什么想法吗？多谢各位。

C++代码

// Serial reference: for each i, add the products binv[i*n + j] * d[j]
// (j = 0 .. n-1) onto values[i].
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        values[i] += binv[i*n + j] * d[j];
    }
}

// Zero the output vector before the transform writes into it.
thrust::fill(values.begin(), values.end(), 0);
// Attempted Thrust version of the nested loop. Element k of the zipped input
// is (values[k / n], binv[k], d[k % n]); function1 combines the three and the
// result is written back through the same values[k / n] permutation.
// NOTE(review): the two permuted sequences below have their end iterators
// advanced by n, while binv.end() is presumably n*n past binv.begin() (the
// serial loop indexes binv[i*n+j]) -- confirm which length is intended.
// NOTE(review): n consecutive outputs target the same values[k / n] slot, and
// thrust::transform does not accumulate across elements, so this likely cannot
// reproduce the serial sum -- confirm.
thrust::transform(make_zip_iterator(make_tuple(
                // first input: values[k / n]
                thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))),
                // second input: binv[k]
                binv.begin(),
                // third input: d[k % n]
                thrust::make_permutation_iterator(d.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n))))),
                // end of the zipped input range
                make_zip_iterator(make_tuple(
                thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))) + n,
                binv.end(),
                thrust::make_permutation_iterator(d.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n))) + n)),
                // output: written to values[k / n]
                thrust::make_permutation_iterator(values.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n))),
                // operation: get<0> + get<1> * get<2>
                function1()
                );

Thrust 函数对象（functor）

/**
 * Unary functor mapping a flat index to idx / n.
 *
 * For an array indexed as i*n + j (as in the serial loop), idx / n recovers i.
 * The caller must construct it with a non-zero n; n is not validated here.
 */
struct IndexDivFunctor: thrust::unary_function<int, int>
{
  int n;  // divisor applied to every index

  IndexDivFunctor(int n_) : n(n_) {}

  // const-qualified: the call does not modify the functor, so it can also be
  // invoked on const instances.
  __host__ __device__
  int operator()(int idx) const
  {
    return idx / n;
  }
};

/**
 * Unary functor mapping a flat index to idx % n.
 *
 * For an array indexed as i*n + j (as in the serial loop), idx % n recovers j.
 * The caller must construct it with a non-zero n; n is not validated here.
 */
struct IndexModFunctor: thrust::unary_function<int, int>
{
  int n;  // modulus applied to every index

  IndexModFunctor(int n_) : n(n_) {}

  // const-qualified: the call does not modify the functor, so it can also be
  // invoked on const instances.
  __host__ __device__
  int operator()(int idx) const
  {
    return idx % n;
  }
};


/**
 * Functor over a 3-element tuple (a, b, c) that returns a + b * c as a double.
 */
struct function1
{
  // const-qualified: the functor has no state, so the call can also be made
  // on const instances.
  template <typename Tuple>
  __host__ __device__
  double operator()(Tuple v) const
  {
    return thrust::get<0>(v) + thrust::get<1>(v) * thrust::get<2>(v);
  }
};

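struct IndexDivFunctor: thrust::unary_function<int, int>
{
  int n;
  IndexDivFunctor(int n_) : n(n_) {}
  __host__ __device__
  int operator()(int idx)
  {
    return idx / n;
  }
};
struct IndexModFunctor: thrust::unary_function<int, int>
{
  int n;
  IndexModFunctor(int n_) : n(n_) {}
  __host__ __device__
  int operator()(int idx)
  {
    return idx % n;
  }
};
struct function1
{
  template <typename Tuple>
  __host__ __device__
  double operator()(Tuple v)
  {
    return thrust::get<0>(v) + thrust::get<1>(v) * thrust::get<2>(v);
  }
};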

您的结果差异有多大？是完全不同的答案，还是只在最后几位数字上有所不同？该循环是只执行一次，还是属于某种迭代过程的一部分？

由于精度有限，浮点运算（尤其是反复对某些值做加法或乘法的运算）不满足结合律。此外，如果启用了快速数学（fast-math）优化，运算结果可能不符合 IEEE 标准。

作为入门，请参阅维基百科中关于浮点数的章节：

首先是一些一般性评论。你的循环

// Serial reference: for each i, add the products B[i*n + j] * d[j]
// (j = 0 .. n-1) onto v[i].
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        v[i] += B[i*n + j] * d[j];
    }
}

计算的是矩阵-向量乘积的累加 $v \leftarrow v + Bd$。可以按如下步骤用 Thrust 来计算这一矩阵向量乘法：

  // Computes v = v + B*d with Thrust, where B is stored as a flat array of
  // n*n elements indexed as B[i*n + j].
  typedef thrust::device_vector<int> iVec;
  typedef thrust::device_vector<double> dVec;

  typedef thrust::counting_iterator<int> countIt;
  // Note: despite the typedef names, columnIt yields k / n (the index i in
  // B[i*n + j]) and rowIt yields k % n (the index j).
  typedef thrust::transform_iterator<IndexDivFunctor, countIt> columnIt;
  typedef thrust::transform_iterator<IndexModFunctor, countIt> rowIt;

  // Assuming the following allocations on the device
  dVec B(n*n), v(n), d(n);

  // cv_begin[k] = k / n: identical for the n consecutive elements B[i*n + 0..n-1],
  // so it serves as the segment key for reduce_by_key below.
  columnIt cv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n));
  columnIt cv_end   = cv_begin + (n*n);

  // rv_begin[k] = k % n: selects the element of d that multiplies B[k].
  rowIt rv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n));

  // temp[k] = B[k] * d[k % n]. The two-input form of thrust::transform takes
  // its length from [B.begin(), B.end()), so no end iterator is needed for the
  // permuted view of d and no user-defined functor is required.
  dVec temp(n*n);
  thrust::transform(B.begin(),
                    B.end(),
                    thrust::make_permutation_iterator(d.begin(), rv_begin),
                    temp.begin(),
                    thrust::multiplies<double>());

  // Sum each run of n products sharing the same key: Bd[i] = sum_j temp[i*n + j].
  iVec outkey(n);
  dVec Bd(n);
  thrust::reduce_by_key(cv_begin, cv_end, temp.begin(), outkey.begin(), Bd.begin());
  // Accumulate onto the existing contents of v: v[i] = v[i] + Bd[i].
  thrust::transform(v.begin(), v.end(), Bd.begin(), v.begin(), thrust::plus<double>());

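typedef thrust::device_vector<int> iVec;
typedef thrust::device_vector<double> dVec;
typedef thrust::counting_iterator<int> countIt;
typedef thrust::transform_iterator<IndexDivFunctor, countIt> columnIt;
typedef thrust::transform_iterator<IndexModFunctor, countIt> rowIt;
// Assuming the following allocations on the device
dVec B(n*n), v(n), d(n);
// transformation iterators mapping to vector rows and columns
columnIt cv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n));
columnIt cv_end   = cv_begin + (n*n);
rowIt rv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n));
rowIt rv_end   = rv_begin + (n*n);
dVec temp(n*n);
thrust::transform(make_zip_iterator(
                    make_tuple(
                      B.begin(),
                      thrust::make_permutation_iterator(d.begin(),rv_begin) ) ),
                  make_zip_iterator(
                    make_tuple(
                      B.end(),
                      thrust::make_permutation_iterator(d.end(),rv_end) ) ),
                  temp.begin(),
                  functor());
iVec outkey(n);
dVec Bd(n);
thrust::reduce_by_key(cv_begin, cv_end, temp.begin(), outkey.begin(), Bd.begin());
thrust::transform(v.begin(), v.end(), Bd.begin(), v.begin(), thrust::plus<double>());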

当然，与 CUBLAS 中专门为矩阵-向量乘法设计的例程（如

dgemv

）相比，这是一种效率极低的计算方法。

谢谢您的回答。但问题不在于浮点精度，结果完全不同，尽管我只运行了一次。为什么你认为是这样？我归因于精度，是因为根据我的经验，这是最常见的差异来源。当然，除非存在某个简单的 bug，但我在你的代码中没有看到。你怎么确定问题不在这里？差异有多大？你在哪种 GPU 上运行？我在 GTX 460 上运行，使用 arch20，向量是双精度的。会不会是因为 values 向量写回了自身？如果主机和设备上用的都是双精度值，这应该不是问题。我仍然想知道普通代码和转换后代码之间的差异有多大：是完全不同，还是相差较小？

  // Computes v = v + B*d with Thrust, where B is stored as a flat array of
  // n*n elements indexed as B[i*n + j].
  typedef thrust::device_vector<int> iVec;
  typedef thrust::device_vector<double> dVec;

  typedef thrust::counting_iterator<int> countIt;
  // Note: despite the typedef names, columnIt yields k / n (the index i in
  // B[i*n + j]) and rowIt yields k % n (the index j).
  typedef thrust::transform_iterator<IndexDivFunctor, countIt> columnIt;
  typedef thrust::transform_iterator<IndexModFunctor, countIt> rowIt;

  // Assuming the following allocations on the device
  dVec B(n*n), v(n), d(n);

  // cv_begin[k] = k / n: identical for the n consecutive elements B[i*n + 0..n-1],
  // so it serves as the segment key for reduce_by_key below.
  columnIt cv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexDivFunctor(n));
  columnIt cv_end   = cv_begin + (n*n);

  // rv_begin[k] = k % n: selects the element of d that multiplies B[k].
  rowIt rv_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), IndexModFunctor(n));

  // temp[k] = B[k] * d[k % n]. The two-input form of thrust::transform takes
  // its length from [B.begin(), B.end()), so no end iterator is needed for the
  // permuted view of d and no user-defined functor is required.
  dVec temp(n*n);
  thrust::transform(B.begin(),
                    B.end(),
                    thrust::make_permutation_iterator(d.begin(), rv_begin),
                    temp.begin(),
                    thrust::multiplies<double>());

  // Sum each run of n products sharing the same key: Bd[i] = sum_j temp[i*n + j].
  iVec outkey(n);
  dVec Bd(n);
  thrust::reduce_by_key(cv_begin, cv_end, temp.begin(), outkey.begin(), Bd.begin());
  // Accumulate onto the existing contents of v: v[i] = v[i] + Bd[i].
  thrust::transform(v.begin(), v.end(), Bd.begin(), v.begin(), thrust::plus<double>());