CUDA Thrust：为什么不管 __CUDA_ARCH__ 如何，总是执行主机代码？_Cuda_Thrust

CUDA Thrust：为什么不管 __CUDA_ARCH__ 如何，总是执行主机代码？

cuda

CUDA Thrust：为什么不管 __CUDA_ARCH__ 如何，总是执行主机代码？,cuda,thrust,Cuda,Thrust,我尝试在代码中定义两个分支：一个用于 CUDA 执行，另一个不使用 CUDA（考虑到将来使用 OMP）。但是当我使用宏 __CUDA_ARCH__ 时，似乎总是执行主机代码。但我认为 Thrust 默认使用 CUDA（因而会走设备代码分支）。我的代码哪里有问题？代码如下： #include <thrust/transform.h> #include <thrust/functional.h>

我尝试在代码中定义两个分支：一个用于 CUDA 执行，另一个不使用 CUDA（考虑到将来使用 OMP）。但是当我使用宏 __CUDA_ARCH__ 时，似乎总是执行主机代码。但我认为 Thrust 默认使用 CUDA（因而会走设备代码分支）。我的代码哪里有问题？代码如下：

#include <stdio.h>
#include <iostream>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>

/// Unary functor for thrust::transform that multiplies its input by a constant.
///
/// The call operator is compiled for both host and device. __CUDA_ARCH__ is
/// defined only while nvcc compiles device code, so the two preprocessor
/// branches produce different results depending on where the functor runs.
/// Which branch executes is therefore decided by where Thrust dispatches the
/// algorithm, not by the functor itself.
struct my_op
{
    /// @param init_const multiplier applied to every transformed element.
    /// Marked explicit so an int is never silently converted into a functor.
    explicit my_op(int init_const) : constanta(init_const) {}

    /// @param x input element.
    /// @return 2 * x * constanta in device code, x * constanta in host code.
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta;   // device code path
#else
        return x * constanta;       // host code path
#endif
    }

private:
    int constanta;  // multiplier fixed at construction
};

/// Reproduction from the question: transforms the sequence 10..16 with my_op
/// and prints the results.
///
/// The output range `data` is a plain host array, so thrust::transform runs on
/// the host and my_op takes its host branch.
int main()
{
    int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;

    int init_value = 1;
    my_op op(init_value);

    thrust::transform(first, last, data, op);

    // Standard range-for; `for each (... in ...)` is a non-standard MSVC extension.
    for (int el : data)
        std::cout << el << " ";

    std::cout << '\n';
}

#包括
#包括
#包括
#包括
结构我的
{                                                             
my_op（int init_const）：康斯坦塔（init_const）{
__主机\uuuuu\uuuu设备\uuuuu int运算符（）（常量int&x）常量
{                                                     
#如果已定义（uuu CUDA_uuarch_uuuuu）
返回2*x*constanta；//从未执行-为什么？
#否则
返回x*constanta；//始终执行
#恩迪夫
}                                                     
私人：
康斯坦塔国际酒店；
};                                                            
int main（）
{                                                             
int data[7]={0,0,0,0,0,0,0}；
推力：首先计算迭代器（10）；
推力：：计数迭代器last=first+7；
int init_值=1；
my_op op（初始值）；
推力：：转换（第一，最后，数据，op）；
对于每个（数据中的整数）
std::cout
Thrust 之所以选择主机路径，是因为传给 thrust::transform 操作的数据项之一位于主机内存中：
 thrust::transform(first, last, data, op); 
                                ^^^^

如果您希望在设备上运行 Thrust 算法，一般来说，传入和传出该算法的所有容器数据也必须驻留在设备内存中。
下面是对您代码的一个修改，它演示了：如果把 data 替换为驻留在设备上的容器，Thrust 就会走设备路径：
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>

/// Unary functor for thrust::transform that multiplies its input by a constant.
///
/// The call operator is compiled for both host and device. __CUDA_ARCH__ is
/// defined only while nvcc compiles device code, so the two preprocessor
/// branches produce different results depending on where the functor runs.
/// In this program the output range is a thrust::device_vector, and the
/// recorded output (20 22 24 ...) shows the device branch being taken.
struct my_op
{
    /// @param init_const multiplier applied to every transformed element.
    /// Marked explicit so an int is never silently converted into a functor.
    explicit my_op(int init_const) : constanta(init_const) {}

    /// @param x input element.
    /// @return 2 * x * constanta in device code, x * constanta in host code.
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta;   // device code path
#else
        return x * constanta;       // host code path
#endif
    }

private:
    int constanta;  // multiplier fixed at construction
};

/// Transforms the sequence 10..16 with my_op into device memory and prints
/// the results.
///
/// The output range is a thrust::device_vector (replacing the host array
/// `int data[7]` of the original question), so thrust::transform runs on the
/// device.
int main()
{
    // Single source of truth for the element count (was a repeated literal 7).
    const int count = 7;

    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + count;
    thrust::device_vector<int> d_data(count);

    const int init_value = 1;
    my_op op(init_value);

    thrust::transform(first, last, d_data.begin(), op);

    for (int i = 0; i < count; ++i) {
        // Indexing a device_vector from host code fetches that one element
        // from device memory.
        const int value = d_data[i];
        std::cout << value << " ";
    }

    std::cout << '\n';
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$

$cat t13.cu
#包括
#包括
#包括
#包括
#包括
结构我的
{
my_op（int init_const）：康斯坦塔（init_const）{
__主机\uuuuu\uuuu设备\uuuuu int运算符（）（常量int&x）常量
{
#如果已定义（uuu CUDA_uuarch_uuuuu）
返回2*x*constanta；//从未执行-为什么？
#否则
返回x*constanta；//始终执行
#恩迪夫
}
私人：
康斯坦塔国际酒店；
};
int main（）
{
//int data[7]={0,0,0,0,0,0,0}；
推力：首先计算迭代器（10）；
推力：：计数迭代器last=first+7；
推力：设备矢量数据（7）；
int init_值=1；
my_op op（初始值）；
转换（第一个，最后一个，d_data.begin（），op）；
对于（int el=0；el<7；el++）{
int dat=d_数据[el]；
std::cout
Thrust 之所以选择主机路径，是因为传给 thrust::transform 操作的数据项之一位于主机内存中：
 thrust::transform(first, last, data, op); 
                                ^^^^

如果您希望在设备上运行 Thrust 算法，一般来说，传入和传出该算法的所有容器数据也必须驻留在设备内存中。
下面是对您代码的一个修改，它演示了：如果把 data 替换为驻留在设备上的容器，Thrust 就会走设备路径：
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>

/// Unary functor for thrust::transform that multiplies its input by a constant.
///
/// The call operator is compiled for both host and device. __CUDA_ARCH__ is
/// defined only while nvcc compiles device code, so the two preprocessor
/// branches produce different results depending on where the functor runs.
/// In this program the output range is a thrust::device_vector, and the
/// recorded output (20 22 24 ...) shows the device branch being taken.
struct my_op
{
    /// @param init_const multiplier applied to every transformed element.
    /// Marked explicit so an int is never silently converted into a functor.
    explicit my_op(int init_const) : constanta(init_const) {}

    /// @param x input element.
    /// @return 2 * x * constanta in device code, x * constanta in host code.
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta;   // device code path
#else
        return x * constanta;       // host code path
#endif
    }

private:
    int constanta;  // multiplier fixed at construction
};

/// Transforms the sequence 10..16 with my_op into device memory and prints
/// the results.
///
/// The output range is a thrust::device_vector (replacing the host array
/// `int data[7]` of the original question), so thrust::transform runs on the
/// device.
int main()
{
    // Single source of truth for the element count (was a repeated literal 7).
    const int count = 7;

    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + count;
    thrust::device_vector<int> d_data(count);

    const int init_value = 1;
    my_op op(init_value);

    thrust::transform(first, last, d_data.begin(), op);

    for (int i = 0; i < count; ++i) {
        // Indexing a device_vector from host code fetches that one element
        // from device memory.
        const int value = d_data[i];
        std::cout << value << " ";
    }

    std::cout << '\n';
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$

$cat t13.cu
#包括
#包括
#包括
#包括
#包括
结构我的
{
my_op（int init_const）：康斯坦塔（init_const）{
__主机\uuuuu\uuuu设备\uuuuu int运算符（）（常量int&x）常量
{
#如果已定义（uuu CUDA_uuarch_uuuuu）
返回2*x*constanta；//从未执行-为什么？
#否则
返回x*constanta；//始终执行
#恩迪夫
}
私人：
康斯坦塔国际酒店；
};
int main（）
{
//int data[7]={0,0,0,0,0,0,0}；
推力：首先计算迭代器（10）；
推力：：计数迭代器last=first+7；
推力：设备矢量数据（7）；
int init_值=1；
my_op op（初始值）；
转换（第一个，最后一个，d_data.begin（），op）；
对于（int el=0；el<7；el++）{
int dat=d_数据[el]；
标准：：cout