linux下的nvcc抱怨：包含设备代码中不支持的向量_Linux_Cuda_Nvcc

linux下的nvcc抱怨：包含设备代码中不支持的向量

linux cuda

linux下的nvcc抱怨：包含设备代码中不支持的向量,linux,cuda,nvcc,Linux,Cuda,Nvcc,我有以下代码 #include <cuda.h> #include <cuda_runtime.h> #ifdef _MSC_VER #include <intrin.h> #else #include <x86intrin.h> #endif //A bitset for the variable assignments //The state for non existing variable 0 is stored as w

我有以下代码

#include <cuda.h>
#include <cuda_runtime.h>
#ifdef _MSC_VER
    #include <intrin.h>
#else
    #include <x86intrin.h>
#endif

//A bitset for the variable assignments
//The state for non existing variable 0 is stored as well, just to avoid +1/-1 adjustments
//
// This is the declaration as originally posted, i.e. the one nvcc rejects:
// the __m256i member below is part of the struct in both the host and the
// device compilation pass.
struct Atom_t {
    enum where { device, host};
    enum BoolOp {opXor, opOr, opAnd };
public:   //TODO make private later
    int VarCount;
    bool isValid;
    // Anonymous union: the same storage viewed as 32-bit words, 64-bit words
    // or 256-bit AVX vectors.
    // NOTE(review): the one-element arrays are indexed with a run-time count in
    // operator==, so the real storage presumably extends past the end of the
    // struct -- confirm against the allocation code.
    union {
        uint32_t raw[1]; //don't worry about alignment, the compiler will not use aligned read/writes anyway.
        uint64_t raw64[1];
        __m256i avxraw[1];   // host-only vector type; this is the member the device pass complains about
    };
public:
    // Declared for both host and device; defined out of line.
    __host__ __device__ friend bool operator==(const Atom_t& a, const Atom_t& b);
};

// Equality comparison for two Atom_t bitsets.
//
// Returns false immediately when the two operands report a different
// IntCount(); otherwise compares the payload.
//
// Device path (__CUDA_ARCH__ defined): the threads of one block share the
// work through a __shared__ flag, each thread checking the words at
// ThreadId(), ThreadId() + BlockDim(), ...  Because barriers are involved,
// every thread of the block has to call this operator together.
//
// Host path: compares the payload 256 bits at a time with AVX2.  In _DEBUG
// builds a scalar word-by-word comparison is computed first and used to
// cross-check the vectorised answer.
//
// NOTE(review): IntCount(), Avx2Count(), ThreadId(), BlockDim(),
// syncthreads(), print() and assert1() are not defined in this snippet;
// syncthreads() is presumably a wrapper around __syncthreads() -- confirm.
__host__ __device__ bool operator==(const Atom_t& a, const Atom_t& b) {
    const auto IntCount = a.IntCount();
    if (IntCount != b.IntCount()) { return false; }
#ifdef __CUDA_ARCH__
    __shared__ bool isDifferent;
    isDifferent = false;
    // Every thread must have finished clearing the flag before any thread is
    // allowed to set it; without this barrier a late "isDifferent = false"
    // can overwrite a mismatch another thread already reported.
    syncthreads();

    for (auto i = ThreadId(); i < IntCount; i += BlockDim()) {
        // Also stop as soon as some other thread has reported a mismatch.
        if (a.raw[i] != b.raw[i] || isDifferent) {
            isDifferent = true;
            break;
        }
    }
    // Make all mismatch reports visible before the flag is read.
    syncthreads();
    const bool foundDifference = isDifferent;
    // Keep the flag untouched until every thread has taken its copy, so an
    // immediately following comparison cannot clear it too early.
    syncthreads();
    return !foundDifference;
#else
#ifdef _DEBUG
    // Scalar reference result, only available in debug builds.
    auto result = true;
    for (auto i = 0; i < IntCount; i++) {
        if (a.raw[i] != b.raw[i]) { result = false; }
    }
#endif
    const auto AvxCount = a.Avx2Count();
    if (AvxCount != b.Avx2Count()) {
#ifdef _DEBUG
        // The cross-check is only meaningful when the scalar result exists;
        // outside _DEBUG it used to flag every genuine mismatch as an error.
        if (result) { print("Atom_t == is incorrect"); }
        assert1(!result);
#endif
        return false;
    }
    for (auto i = 0; i < AvxCount; i++) {
        // Byte-wise compare; movemask yields all ones (-1) only when all 32 bytes match.
        const auto packedCompare = _mm256_cmpeq_epi8(a.avxraw[i], b.avxraw[i]);
        const auto bitmask = _mm256_movemask_epi8(packedCompare);
        if (bitmask != -1) {
#ifdef _DEBUG
            if (result) { print("Atom_t == is incorrect"); }
            assert1(!result);
#endif
            return false;
        }
    }
#ifdef _DEBUG
    // The vectorised path found no difference; the scalar path must agree.
    assert(result);
#endif
    return true;
#endif
}

但是那不行。

找到了
问题不在于方法中的代码，而在于以下类型出现在了 CUDA 编译器的可见范围内：

__m256i

以下修补程序修复了此问题：

// Patched declaration from the answer: the AVX member is compiled out
// whenever __CUDA_ARCH__ is defined, so the device pass no longer sees the
// vector type.
// NOTE(review): the member is removed on one side only, so the union may end
// up with a different size/alignment in the two compilation passes -- confirm
// that sizeof(Atom_t) agrees wherever objects cross the host/device boundary.
struct Atom_t {
    enum where { device, host};
    enum BoolOp {opXor, opOr, opAnd };
public:   //TODO make private later
    int VarCount;
    bool isValid;
    union {
        uint32_t raw[1]; //don't worry about alignment, the compiler will not use aligned read/writes anyway.
        uint64_t raw64[1];
#ifndef __CUDA_ARCH__   //hide the vectorized datastruct from cuda's view
        __m256i avxraw[1];
#endif
    };

现在 nvcc 看不到矢量化的数据类型，也就不再报错了。

但是，向量不在设备代码中，只在主机代码中

错误是由以下行引起的：

__m256i avxraw[1];

在主机代码和设备代码编译轨迹中都可见

根据我的测试，这可能是一个可行的解决办法：

$ cat t32.cpp
#ifdef _MSC_VER
    #include <intrin.h>
#else
    #include <x86intrin.h>
#endif
#include <iostream>
typedef char dummy[sizeof(__m256i)];

// Stand-alone copy of Atom_t used to compare sizeof(Atom_t) with and without
// the AVX member.  Building with -DFOO swaps the __m256i member for a plain
// char buffer of the same size.
struct Atom_t {
    enum where { device, host};
    enum BoolOp {opXor, opOr, opAnd };
public:   //TODO make private later
    int VarCount;
    bool isValid;
    // Anonymous union: all members overlay the same storage.
    union {
        uint32_t raw[1]; 
        uint64_t raw64[1];
#ifndef FOO   //hide the vectorized datastruct from cuda's view
        __m256i avxraw[1];
#else
        // `dummy` is char[sizeof(__m256i)] (see the typedef above); alignas(32)
        // requests 32-byte alignment for this vector-free stand-in.
        alignas(32) dummy foo[1];
#endif
    };
};



// Prints sizeof(__m256i) followed by sizeof(Atom_t), one value per line.
int main() {
    const auto vectorBytes = sizeof(__m256i);
    const auto atomBytes = sizeof(Atom_t);
    std::cout << vectorBytes << std::endl
              << atomBytes << std::endl;
}
$ g++ t32.cpp -o t32
$ ./t32
32
64
$ g++ t32.cpp -o t32 -DFOO
$ ./t32
32
64

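$ cat t32.cpp
#ifdef _MSC_VER
    #include <intrin.h>
#else
    #include <x86intrin.h>
#endif
#include <iostream>
typedef char dummy[sizeof(__m256i)];
struct Atom_t {
    enum where { device, host};
    enum BoolOp {opXor, opOr, opAnd };
public:   //TODO make private later
    int VarCount;
    bool isValid;
    union {
        uint32_t raw[1];
        uint64_t raw64[1];
#ifndef FOO   //hide the vectorized datastruct from cuda's view
        __m256i avxraw[1];
#else
        alignas(32) dummy foo[1];
#endif
    };
};
int main(){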
std::cout
“但是，向量不在设备代码中，只在主机代码中。”错误是由以下这一行引起的：__m256i avxraw[1];
它在主机代码和设备代码的编译轨迹中都可见。由于其他原因，您的代码无法编译，因此很难提出解决方案。（我相信）此解决方案有可能使结构在主机代码和设备代码的编译轨迹中具有不同的大小。但如果您对此感到满意，那就太好了。在 CUDA 中，这通常被认为是一件坏事。这让我觉得有问题。sizeof(__m256i) 为 32，因此我预计这会影响联合体的大小，从而影响结构的大小。我还没有完全验证这一点。由于您发布的代码即使不考虑此问题也无法编译，因此很难用它来建议替代方案或检查类似的内容。已确认：结构的大小在主机和设备编译路径中并不相同。使用时风险自负。
$ cat t32.cpp
#ifdef _MSC_VER
    #include <intrin.h>
#else
    #include <x86intrin.h>
#endif
#include <iostream>
typedef char dummy[sizeof(__m256i)];

// Stand-alone copy of Atom_t used to compare sizeof(Atom_t) with and without
// the AVX member.  Building with -DFOO swaps the __m256i member for a plain
// char buffer of the same size.
struct Atom_t {
    enum where { device, host};
    enum BoolOp {opXor, opOr, opAnd };
public:   //TODO make private later
    int VarCount;
    bool isValid;
    // Anonymous union: all members overlay the same storage.
    union {
        uint32_t raw[1]; 
        uint64_t raw64[1];
#ifndef FOO   //hide the vectorized datastruct from cuda's view
        __m256i avxraw[1];
#else
        // `dummy` is char[sizeof(__m256i)] (see the typedef above); alignas(32)
        // requests 32-byte alignment for this vector-free stand-in.
        alignas(32) dummy foo[1];
#endif
    };
};



// Prints sizeof(__m256i) followed by sizeof(Atom_t), one value per line.
int main() {
    const auto vectorBytes = sizeof(__m256i);
    const auto atomBytes = sizeof(Atom_t);
    std::cout << vectorBytes << std::endl
              << atomBytes << std::endl;
}
$ g++ t32.cpp -o t32
$ ./t32
32
64
$ g++ t32.cpp -o t32 -DFOO
$ ./t32
32
64