CUDA 中的 float1 与 float_C++_C_Cuda

CUDA 中的 float1 与 float

c++ c cuda

CUDA 中的 float1 与 float,c++,c,cuda,C++,C,Cuda,我注意到 CUDA 中有一个 float1 struct 类型。例如，在使用 float 数组与 float1 数组的情况下，它相比简单的 float 是否有性能优势？ struct __device_builtin__ float1 { float x; }; 在 float4 中，由于对齐方式为 4x4 bytes = 16 字节，因此根据具体情况，性能会有所提高。它是否仅用于带有 float1 参数的 __device__ 函数中的特殊用途？提前感谢。在 @talonmies 发表评论之后，我比较了使用 CUDA Thrust 并在 float 和 float1 之间切换的

我注意到cuda中有一个

float1

struct 类型。例如，在使用

float 数组

与

float1 数组

的情况下，它相比简单的

float

是否有性能优势？

// Single-component float vector type, reproduced here as quoted in the question.
// NOTE(review): __device_builtin__ is not defined in this snippet; it is
// presumably an attribute macro supplied by the CUDA toolkit headers — confirm there.
struct __device_builtin__ float1
{
    float x; // the sole component of the vector
};

在

float4

中，由于对齐方式为4x4bytes=16字节，因此根据具体情况，性能会有所提高。它是否仅用于带有

float1

参数的

__device__

函数中的特殊用途？

提前感谢。

在 @talonmies 发表评论之后，我比较了使用 CUDA Thrust 并在

float

和

float1

之间切换的向量范数计算。我考虑过GT210卡（cc 1.2）上的

N=1000000

元素数组。对于这两种情况，范数的计算似乎花费了完全相同的时间，即大约

3.4s

，因此没有性能改进。从下面的代码中可以看出，

float

使用起来可能比

float1

稍微舒适一些

最后，请注意，

float4

的优势来自对齐方式

__builtin_align__

，而不是

__device_builtin__
#include <cmath>
#include <cstdio>
#include <cstdlib>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>

/**
 * Unary functor returning the square of a scalar float.
 *
 * Callable from both host and device code. The call operator is const
 * because it reads no state; this lets the functor be invoked through a
 * const object or const reference as well as through a temporary.
 */
struct square
{
    /** @param x value to square. @return x * x. */
    __host__ __device__ float operator()(float x) const
    {
        return x * x;
    }
};

/**
 * Unary functor returning the square of the single component of a float1.
 *
 * Callable from both host and device code. The call operator is const
 * because it reads no state; this lets the functor be invoked through a
 * const object or const reference as well as through a temporary.
 */
struct square1
{
    /** @param x one-component vector. @return x.x * x.x as a plain float. */
    __host__ __device__ float operator()(float1 x) const
    {
        return x.x * x.x;
    }
};

void main() {

    const int N = 1000000;

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    thrust::device_vector<float> d_vec(N,3.f);

    cudaEventRecord(start, 0);
    float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time reduction:  %3.1f ms \n", time);

    printf("Result of reduction = %f\n",reduction);

    thrust::host_vector<float1>   h_vec1(N);
    for (int i=0; i<N; i++) h_vec1[i].x = 3.f;
    thrust::device_vector<float1> d_vec1=h_vec1;

    cudaEventRecord(start, 0);
    float reduction1 = sqrt(thrust::transform_reduce(d_vec1.begin(), d_vec1.end(), square1(), 0.0f, thrust::plus<float>()));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time reduction1:  %3.1f ms \n", time);

    printf("Result of reduction1 = %f\n",reduction1);

    getchar();

}

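#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
struct square
{
    __host__ __device__ float operator()(float x)
    {
        return x * x;
    }
};
struct square1
{
    __host__ __device__ float operator()(float1 x)
    {
        return x.x * x.x;
    }
};
void main() {
    const int N = 1000000;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    thrust::device_vector<float> d_vec(N,3.f);
    cudaEventRecord(start, 0);
    float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time reduction:  %3.1f ms \n", time);
    printf("Result of reduction = %f\n",reduction);
    thrust::host_vector<float1>   h_vec1(N);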
for (int i=0; 我记得在 StackOverflow 某篇帖子的评论中提到过，__device_builtin__
对性能没有影响，但我再也找不到那篇帖子了。我找到了这篇帖子：。我认为它只是为了让开发人员在为不同大小的元组生成多组可执行代码时，能够使用编译器技巧来保持源代码紧凑。@ArchaeaSoftware 您是否愿意将您的评论扩展成一个新的答案，以补充我的回答？它可能对未来的用户有用，我会为它投票。我同意您和 talonmies 所说的，尽管我还没有测试过您的代码。无论如何，看起来是合理的。