在推力cuda中使用float4时出现内存问题_Cuda_Thrust

在推力cuda中使用float4时出现内存问题

cuda

在推力cuda中使用float4时出现内存问题,cuda,thrust,Cuda,Thrust,我在cuda中使用float4时遇到了内存问题将float4“buggyVariable”作为成员添加到函子中似乎会导致浮点数据左移1个浮点在CUDA_animateParticles中，我清楚地将Y设置为0，将Z设置为1 然而，当运行函子并在OpenGL中绘制它时。我得到了Xposition=1的粒子，这表明在函子中Y是1 我还用float2和float3进行了测试，它们似乎工作得很好因此，这似乎是内存对齐问题或bug 有人能解释一下吗？谢谢你的帮助 #include <thrus

我在cuda中使用float4时遇到了内存问题

将float4“buggyVariable”作为成员添加到函子中似乎会导致浮点数据左移1个浮点

在CUDA_animateParticles中，我清楚地将Y设置为0，将Z设置为1

然而，当运行函子并在OpenGL中绘制它时。我得到了Xposition=1的粒子，这表明在函子中Y是1

我还用float2和float3进行了测试，它们似乎工作得很好

因此，这似乎是内存对齐问题或bug

有人能解释一下吗？谢谢你的帮助

#include <thrust/sort.h>
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include "cutil_math.h"

struct animateParticles_functor
{
    float4 buggyVariable; //why does adding this variable cause following floats to get wrong values???
    float pex, pey, pez, pew;

    __host__ __device__
    animateParticles_functor( float x, float y, float z, float w) :
        pex(x), pey(y), pez(z), pew(w)
    {
    }

    template <typename Tuple>
    __host__ __device__
    void operator()(Tuple t)
    {
        if(pey > 0)
            thrust::get<0>(t) = make_float4(1, 0, 0, 0); //true if y is bugged
        else
            thrust::get<0>(t) = make_float4(0, 0, 0, 0); //false if its not bugged

        return;
    }
}

void CUDA_animateParticles(float4* cuda_devicePointer_vboPosition, float3* cuda_devicePointer_particleVelocitys, unsigned int numParticles, float4 particleEmitter)
{
    thrust::device_ptr<float4> d_pos(cuda_devicePointer_vboPosition);
    thrust::device_ptr<float3> d_vel(cuda_devicePointer_particleVelocitys);

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_pos, d_vel)),
        thrust::make_zip_iterator(thrust::make_tuple(d_pos + numParticles, d_vel + numParticles)),
        animateParticles_functor(0, 0, 1, 0) //notice that i set Z to 1 and not Y to 0
    );
}

#包括
#包括
#包括
#包括“cutil_math.h”
结构animateParticles_函子
{
float4-buggyVariable；//为什么添加此变量会导致以下浮点得到错误的值？？？
浮动pex、pey、pez、pew；
__主机设备__
animateParticles_函子（浮点x、浮点y、浮点z、浮点w）：
皮克斯（x），皮克斯（y），皮兹（z），皮尤（w）
{
}
模板
__主机设备__
void运算符（）（元组t）
{
如果（pey>0）
推力：：get（t）=make_float4（1，0，0，0）；//如果y被窃听，则为true
其他的
推力：：get（t）=make_float4（0，0，0，0）；//如果没有安装错误，则为false
返回；
}
}
无效CUDA_动画粒子（浮点4*CUDA_设备指针位置，浮点3*CUDA_设备指针位置，无符号整数粒子，浮点4粒子发射器）
{
推力：设备位置（cuda设备指针位置）；
推力：设备ptr d级（cuda设备指针）；
推力：每个(
推力：：make_zip_迭代器（推力：：make_元组（d_pos，d_vel）），
推力：：make_-zip_迭代器（推力：：make_元组（d_-pos+numParticles，d_-vel+numParticles）），
animateParticles_函子（0,0,1,0）//注意，我将Z设置为1，而不是Y设置为0
);
}

我认为MSVC有一些风格，

nvcc

和

cl.exe

不能在

sizeof（float4）

上达成一致

尝试将您对

float4

的使用替换为

my\u float4

：

struct my_float4
{
  float x, y, z, w;
};

无论使用哪种编译器，float4的大小不是很明显吗？对齐会使事情复杂化。