CUDA：嵌套调用的 thrust::fill 对变化的输入值不起作用_Cuda_Gpu_Thrust

CUDA：嵌套调用的 thrust::fill 对变化的输入值不起作用

cuda

CUDA：嵌套调用的 thrust::fill 对变化的输入值不起作用,cuda,gpu,thrust,Cuda,Gpu,Thrust,我写了一段最小化的测试代码，用数组 a 的每个元素去填充数组 c。结果表明：当以常量作为输入值调用嵌套的 thrust::fill 时，目标数组会被正确地填充为该值。但是，如果输入值是变化的（即取自数组 a 的各个元素），目标数组可能只会被第一个或最后一个值填满。 #include <thrust/inner_product.h> #include <thrust/functional.h> #include <thrust/device_vector.h> #include &

我写了一段最小化的测试代码，用数组 a 的每个元素去填充数组 c。

结果表明：当以常量作为输入值调用嵌套的 thrust::fill 时，目标数组会被正确地填充为该值。

但是，如果输入值是变化的（即取自数组 a 的各个元素），目标数组可能只会被第一个或最后一个值填满。

#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>

#include <thrust/execution_policy.h>

#include <iostream>
#include <cmath>
#include <boost/concept_check.hpp>

// Functor applied once per index by thrust::transform (see main). Each call
// fills the whole range [cv_, cv_ + N_) with av_[idx] through a nested
// thrust::fill, prints the range, and returns element idx of it.
//
// NOTE(review): every invocation writes the same N_ elements of cv_. When the
// invocations run concurrently, the fills race with one another, so the
// printed and returned values depend on which write lands last. With a
// constant fill value all invocations write the same value, which presumably
// is why that variant appears to work.
struct bFuntor 
{
    // Stores the two raw pointers and the element count; nothing is copied.
    // main passes pointers obtained from thrust::device_vector storage.
    bFuntor(int* av__, int* cv__, const int& N__) : av_(av__), cv_(cv__), N_(N__) {};

    // idx: index into av_ selecting the fill value, and into cv_ selecting the
    //      element that is returned.
    // NOTE(review): declared __host__ __device__, but av_ and cv_ are
    // dereferenced directly below; with device pointers a host-side call
    // would presumably be invalid -- confirm before calling on the host.
    __host__ __device__
    int operator()(const int& idx)
    {
      // Wrap the begin and one-past-the-end of the cv_ range as device_ptr.
      thrust::device_ptr<int> cv_dpt = thrust::device_pointer_cast(cv_);
      thrust::device_ptr<int> cv_dpt1 = thrust::device_pointer_cast(cv_+N_);

      // NOTE(review): thrust::detail is an implementation namespace; the
      // device_ptr values above could presumably be passed to thrust::fill
      // directly -- confirm against the Thrust version in use.
      thrust::detail::normal_iterator<thrust::device_ptr<int>> c0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt);
      thrust::detail::normal_iterator<thrust::device_ptr<int>> c1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt1);

      // ** this thrust::fill with varied values does not work
      // (each invocation writes a different value, av_[idx], into the same range)
      thrust::fill(thrust::device,c0,c1,av_[idx]);

      // ** this thrust::fill with constant works
//       thrust::fill(thrust::device,c0,c1,10);

      // Dump the current contents of cv_ next to the value this invocation
      // asked for; other invocations may have overwritten cv_ in the meantime.
      printf("fill result:\n");
      for (int i=0; i<N_; i++)
        printf("fill value: %d -> return value: %d \n",av_[idx],cv_[i]);
      printf("\n");

      // Element idx of the shared range, read through the device_ptr.
      return cv_dpt[idx];
    }

    int* av_;  // source of fill values, read as av_[idx]
    int* cv_;  // start of the range that every invocation fills
    int N_;    // number of elements in the cv_ range
};

int main(void)
{
      int N = 2;
      std::vector<int> av = {0,1};
      std::vector<int> cv = {-1,-2};

      thrust::device_vector<int> av_d(N);
      thrust::device_vector<int> cv_d(N);
      av_d = av; cv_d = cv; 

      // call with nested manner
      thrust::transform(thrust::counting_iterator<int>(0),
            thrust::counting_iterator<int>(N),
            cv_d.begin(),
            bFuntor(thrust::raw_pointer_cast(av_d.data()),
            thrust::raw_pointer_cast(cv_d.data()),
                  N));    

      return 0;
}

输出情况（第一段对应变化的输入值，第二段对应恒定输入值 10）：

fill result:
fill value: 0 -> return value: 1 
fill value: 1 -> return value: 1 
fill value: 0 -> return value: 1 
fill value: 1 -> return value: 1

fill result:
fill value: 10 -> return value: 10 
fill value: 10 -> return value: 10 
fill value: 10 -> return value: 10 
fill value: 10 -> return value: 10

这是 Thrust 本身的问题吗？还是说 thrust::fill 不应该这样使用？

这是一个数据竞争的例子：

// Excerpt of bFuntor::operator() from the question, shortened to the part that
// races: cv_ and N_ are the same for every invocation, so each call builds the
// identical range [c0, c1) and fills it with its own av_[idx].
int operator()(const int& idx)
{
  // Begin and one-past-the-end of the shared cv_ range.
  thrust::device_ptr<int> cv_dpt = thrust::device_pointer_cast(cv_);
  thrust::device_ptr<int> cv_dpt1 = thrust::device_pointer_cast(cv_+N_);

  thrust::detail::normal_iterator<thrust::device_ptr<int>> c0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt);
  thrust::detail::normal_iterator<thrust::device_ptr<int>> c1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt1);


  // Conflicting write: a different value per invocation into the same range.
  thrust::fill(thrust::device,c0,c1,av_[idx]);

  // ... (remainder of the original body omitted in this excerpt)
}

这里，对函子的每次调用都将尝试用不同的值填充相同的迭代器范围c0到c1。显然，当多个函子调用并行发生时，这将产生问题。

这与您在上一个问题中得到解释的问题完全相同。在这种情况下，我不使用 thrust::transform 并编写其函子。它消除了函子中不存在的变量初始化错误的风险。至于 thrust::fill 的接口，它的第四个参数是一个 const T& 值，只要我遵守这个接口，它就应该能工作，不是吗？你是在要求两个不同的并行数据操作用不同的值去填充同一个数组。你认为会发生什么？我已经把 c 数组从 GPU 的全局内存改为 bFuntor 的 operator() 中的本地内存。这样每个线程都有各自独立的 c 数组，并行的 thrust::fill 填充的是各自不同的本地 c 数组。这样在任何情况下都能正常工作。我想把这个问题标记为已解答。非常感谢你！