__带有推力执行策略的CUDA_ARCH__标志_Cuda_Gpu_Thrust

使用Thrust执行策略时的__CUDA_ARCH__宏

cuda

__带有推力执行策略的CUDA_ARCH__标志,cuda,gpu,thrust,Cuda,Gpu,Thrust,我有一个\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu设备函数，它是调用推力库的“排序”函数的包装器。在这个包装器中，我使用\uuuu CUDA\u ARCH\uuu标志将执行策略从主机调用时设置为“推力：：设备”，从设备调用时设置为“推力：：顺序”。以下代码段生成运行时错误- #ifndef __CUDA_ARCH__ thrust::stable_sort(thrust::device, dat

我有一个

__host__ __device__

函数，它是对Thrust库“sort”函数的封装。在这个封装函数中，我使用

__CUDA_ARCH__

宏，在从主机调用时将执行策略设置为“thrust::device”，在从设备调用时设置为“thrust::seq”。以下代码段会产生运行时错误：

#ifndef __CUDA_ARCH__
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
#else
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
#endif

#ifndef __CUDA_ARCH__
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
#else
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
#endif

错误是-

Unexpected Standard exception: What() is: merge_sort: failed on 2nd step: invalid device function

根据我的理解，__CUDA_ARCH__可用于条件编译。请帮我理解为什么会抛出这个错误。

您似乎遇到了Thrust的一个已知问题。简而言之，Thrust在某些算法（包括排序）的底层使用了CUB的功能。在代码中使用

__CUDA_ARCH__

宏（用它包裹使用了CUB的Thrust算法调用）会干扰CUB代码，因为CUB代码期望自己能在所有编译路径上使用这个宏。

一种可能的解决方法是“自行调度”：

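$ cat t142.cu
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
  __host__ __device__ bool operator()(T &t1, T &t2){
    return (t1 > t2);}
};

template <typename T>
__host__ __device__
void my_sort_wrapper(T *data, size_t num){
  int hostdev = 0;  // 0=device code
#ifndef __CUDA_ARCH__
  hostdev = 1;  // 1=host code
#endif
  if (hostdev == 0) thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
  else thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

template <typename T>
__global__ void my_dev_sort(T *data, size_t num){
  my_sort_wrapper(data, num);
}
typedef int mytype;
const size_t sz = 10;
int main(){
  mytype *d_data;
  cudaMalloc(&d_data, sz*sizeof(mytype));
  cudaMemset(d_data, 0, sz*sizeof(mytype));
  my_sort_wrapper(d_data, sz);
  my_dev_sort<<<1,1>>>(d_data, sz);
  cudaDeviceSynchronize();
}
$ nvcc t142.cu -o t142
$ cuda-memcheck ./t142
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$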

通过这种实现，使用

__CUDA_ARCH__

宏不会干扰Thrust算法的编译。

另一种可能的解决方法是对这两种情况都使用

thrust::device

策略（不做调度，只调用Thrust算法）。除了CUDA动态并行的情况外，

thrust::device

在设备代码中使用时将“退化”为

thrust::seq

我预计这些建议只有在Thrust算法的底层实现使用了CUB功能时才有必要/相关。

如果你不喜欢这种行为，可以向Thrust项目提交一个问题（issue）。

看来你遇到了Thrust的一个已知问题。简而言之，Thrust在某些算法（包括排序）的底层使用了CUB的功能。在代码中使用

__CUDA_ARCH__

宏（用它包裹使用了CUB的Thrust算法调用）会干扰CUB代码，因为CUB代码期望自己能在所有编译路径上使用这个宏。

一种可能的解决方法是“自行调度”：

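$ cat t142.cu
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
  __host__ __device__ bool operator()(T &t1, T &t2){
    return (t1 > t2);}
};

template <typename T>
__host__ __device__
void my_sort_wrapper(T *data, size_t num){
  int hostdev = 0;  // 0=device code
#ifndef __CUDA_ARCH__
  hostdev = 1;  // 1=host code
#endif
  if (hostdev == 0) thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
  else thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

template <typename T>
__global__ void my_dev_sort(T *data, size_t num){
  my_sort_wrapper(data, num);
}
typedef int mytype;
const size_t sz = 10;
int main(){
  mytype *d_data;
  cudaMalloc(&d_data, sz*sizeof(mytype));
  cudaMemset(d_data, 0, sz*sizeof(mytype));
  my_sort_wrapper(d_data, sz);
  my_dev_sort<<<1,1>>>(d_data, sz);
  cudaDeviceSynchronize();
}
$ nvcc t142.cu -o t142
$ cuda-memcheck ./t142
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$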

通过这种实现，使用

__CUDA_ARCH__

宏不会干扰Thrust算法的编译。

另一种可能的解决方法是对这两种情况都使用

thrust::device

策略（不做调度，只调用Thrust算法）。除了CUDA动态并行的情况外，

thrust::device

在设备代码中使用时将“退化”为

thrust::seq

我预计这些建议只有在Thrust算法的底层实现使用了CUB功能时才有必要/相关。

如果您不喜欢这种行为，可以向Thrust项目提交一个问题（issue）。

不幸的是，我们无法在Thrust中解决这个问题。这里的问题是，NVCC编译器需要在主机编译期间看到所有

__global__

函数模板实例化（例如，未定义

__CUDA_ARCH__

），否则内核将被视为未使用并丢弃。有关更多详细信息，请参阅

正如Robert所建议的，像这样的解决方法应该很好：

#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

// Comparison functor implementing "greater than", so that sorting with it
// yields descending order. Callable from both host and device code.
//
// The operands are taken by const reference and the call operator is a const
// member: the comparator never modifies its inputs, and this lets it be invoked
// on a const functor object and with const or temporary operands.
template <typename T>
struct customGreater {
  // Returns true when t1 compares strictly greater than t2 (uses T's operator>).
  __host__ __device__ bool operator()(const T &t1, const T &t2) const {
    return t1 > t2;
  }
};

// Collapse "is __CUDA_ARCH__ defined?" into a macro that is always defined,
// with value 1 when __CUDA_ARCH__ is defined and 0 otherwise. Code can then
// branch on it with an ordinary `if` instead of hiding statements behind the
// preprocessor.
#if defined(__CUDA_ARCH__)
  #define DEVICE_COMPILATION 1
#else
  #define DEVICE_COMPILATION 0
#endif

// Stable-sorts [data, data + num) in descending order (customGreater<T>),
// choosing the Thrust execution policy from the compilation pass:
//   - device pass (DEVICE_COMPILATION == 1): thrust::seq, i.e. sort sequentially
//     inside the calling thread;
//   - host pass   (DEVICE_COMPILATION == 0): thrust::device, i.e. dispatch the
//     sort to the GPU.
//
// Parameters:
//   data - pointer to the first element; when called from the host it must be
//          memory the device can access, since the sort then runs on the GPU.
//   num  - number of elements to sort.
//
// The selection is deliberately an ordinary run-time `if` on a constant rather
// than #if/#ifdef (or `if constexpr`): both thrust::stable_sort calls have to
// remain visible in every compilation pass, so that the kernels Thrust
// instantiates for the thrust::device path are seen during host compilation
// and are not discarded as unused.
template <typename T>
__host__ __device__
void my_sort(T *data, size_t num){
  if (DEVICE_COMPILATION)
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
  else
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

// Kernel entry point that lets my_sort be exercised from device code.
// It simply forwards its arguments; every launched thread issues the same
// my_sort call on the same [data, data + num) range.
template <typename T>
__global__ void my_dev_sort(T *data, size_t num)
{
  my_sort<T>(data, num);
}
typedef int mytype;
const size_t sz = 10;
// Demo driver: allocates a small zero-filled managed buffer, sorts it once via
// a host-side call to my_sort and once from inside a kernel, then releases it.
// Returns 0 when every CUDA runtime call succeeded and 1 otherwise.
int main(){
  // Reports a failed CUDA runtime call on stderr; returns true on success.
  auto succeeded = [](cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
  };

  mytype *d_data = nullptr;
  if (!succeeded(cudaMallocManaged(&d_data, sz*sizeof(mytype)), "cudaMallocManaged"))
    return 1;

  // Finish the memset before my_sort runs: depending on the policy it selects
  // on the host, my_sort may read the managed buffer directly from host code.
  bool ok = succeeded(cudaMemset(d_data, 0, sz*sizeof(mytype)), "cudaMemset")
         && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after memset)");

  if (ok) {
    my_sort(d_data, sz);
    my_dev_sort<<<1,1>>>(d_data, sz);
    // A launch reports configuration errors only through cudaGetLastError();
    // faults during execution surface at the following synchronization.
    ok = succeeded(cudaGetLastError(), "my_dev_sort launch")
      && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after kernel)");
  }

  // Release the buffer only after the kernel that uses it has completed.
  ok = succeeded(cudaFree(d_data), "cudaFree") && ok;
  return ok ? 0 : 1;
}

#包括
#包括
#包括
样板
结构自定义更大{
__主机设备布尔运算符（）（T&t1、T&t2）{
返回（t1>t2）；}
};
#如果已定义（uuu CUDA_uuarch_uuuuu）
#定义设备\u编译1
#否则
#定义设备\u编译0
#恩迪夫
样板
__主机设备__
作废我的排序（T*数据，大小\u T num）{
if（设备编译）
推力：：稳定_排序（推力：：设备，数据，数据+num，customGreater（））；
其他的
推力：：稳定_排序（推力：：seq，data，data+num，customGreater（））；
}
样板
__全局\无效我的\开发\排序（T*数据，大小\数量）{
my_排序（数据，num）；
}
typedef int-mytype；
const size_t sz=10；
int main（）{
mytype*d_数据；
cudaMallocManaged（&d_）数据，sz*sizeof（mytype
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

// Comparison functor implementing "greater than", so that sorting with it
// yields descending order. Callable from both host and device code.
//
// The operands are taken by const reference and the call operator is a const
// member: the comparator never modifies its inputs, and this lets it be invoked
// on a const functor object and with const or temporary operands.
template <typename T>
struct customGreater {
  // Returns true when t1 compares strictly greater than t2 (uses T's operator>).
  __host__ __device__ bool operator()(const T &t1, const T &t2) const {
    return t1 > t2;
  }
};

// Collapse "is __CUDA_ARCH__ defined?" into a macro that is always defined,
// with value 1 when __CUDA_ARCH__ is defined and 0 otherwise. Code can then
// branch on it with an ordinary `if` instead of hiding statements behind the
// preprocessor.
#if defined(__CUDA_ARCH__)
  #define DEVICE_COMPILATION 1
#else
  #define DEVICE_COMPILATION 0
#endif

// Stable-sorts [data, data + num) in descending order (customGreater<T>),
// choosing the Thrust execution policy from the compilation pass:
//   - device pass (DEVICE_COMPILATION == 1): thrust::seq, i.e. sort sequentially
//     inside the calling thread;
//   - host pass   (DEVICE_COMPILATION == 0): thrust::device, i.e. dispatch the
//     sort to the GPU.
//
// Parameters:
//   data - pointer to the first element; when called from the host it must be
//          memory the device can access, since the sort then runs on the GPU.
//   num  - number of elements to sort.
//
// The selection is deliberately an ordinary run-time `if` on a constant rather
// than #if/#ifdef (or `if constexpr`): both thrust::stable_sort calls have to
// remain visible in every compilation pass, so that the kernels Thrust
// instantiates for the thrust::device path are seen during host compilation
// and are not discarded as unused.
template <typename T>
__host__ __device__
void my_sort(T *data, size_t num){
  if (DEVICE_COMPILATION)
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
  else
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

// Kernel entry point that lets my_sort be exercised from device code.
// It simply forwards its arguments; every launched thread issues the same
// my_sort call on the same [data, data + num) range.
template <typename T>
__global__ void my_dev_sort(T *data, size_t num)
{
  my_sort<T>(data, num);
}
typedef int mytype;
const size_t sz = 10;
// Demo driver: allocates a small zero-filled managed buffer, sorts it once via
// a host-side call to my_sort and once from inside a kernel, then releases it.
// Returns 0 when every CUDA runtime call succeeded and 1 otherwise.
int main(){
  // Reports a failed CUDA runtime call on stderr; returns true on success.
  auto succeeded = [](cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
  };

  mytype *d_data = nullptr;
  if (!succeeded(cudaMallocManaged(&d_data, sz*sizeof(mytype)), "cudaMallocManaged"))
    return 1;

  // Finish the memset before my_sort runs: depending on the policy it selects
  // on the host, my_sort may read the managed buffer directly from host code.
  bool ok = succeeded(cudaMemset(d_data, 0, sz*sizeof(mytype)), "cudaMemset")
         && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after memset)");

  if (ok) {
    my_sort(d_data, sz);
    my_dev_sort<<<1,1>>>(d_data, sz);
    // A launch reports configuration errors only through cudaGetLastError();
    // faults during execution surface at the following synchronization.
    ok = succeeded(cudaGetLastError(), "my_dev_sort launch")
      && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after kernel)");
  }

  // Release the buffer only after the kernel that uses it has completed.
  ok = succeeded(cudaFree(d_data), "cudaFree") && ok;
  return ok ? 0 : 1;
}