float4 is not faster than float in CUDA

EDIT: njuffa is right; this version was compiled with -G, which disables all optimizations. With the loads and stores vectorized, the new SASS is faster.

Based on the classic example, I wrote two versions of vector addition in CUDA. The problem is that the float4 version takes twice as long as the float version, even though it works on four times fewer elements. Profiling the two kernels clearly shows that the float4 version performs on average 4 loads and 4 stores per transaction, while the float version performs only one of each. This sounds like a noob question about non-coalesced float4 accesses, and the PTX below seems to confirm that, but I cannot figure out where the problem is.

I am using CUDA 7.0 RC and a Quadro K4000.

Any idea where to look?

Compile options?

The __aligned__ keyword?


__global__ void add_float(float *c, const float *a, const float *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}

__global__ void add_float4(float4 *c, const float4 *a, const float4 *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    float4 a1 = a[i];
    float4 b1 = b[i];

    float4 c1;
    c1.x = a1.x + b1.x;
    c1.y = a1.y + b1.y;
    c1.z = a1.z + b1.z;
    c1.w = a1.w + b1.w;

    c[i] = c1;
}
The PTX for the line

float4 a1 = a[i];
says:

The SASS objdump says:

    /*0108*/                   MOV R10, R0;                             /* 0x2800000000029de4 */
    /*0110*/                   ISET.LT.AND R11, R0, RZ, PT;             /* 0x108e0000fc02dc23 */
    /*0118*/                   MOV32I R13, 0x4;                         /* 0x1800000010035de2 */
    /*0120*/                   ISETP.LE.U32.AND P0, PT, R13, 0x20, PT;  /* 0x198ec00080d1dc03 */
    /*0128*/                   ISUB R12, 0x20, R13;                     /* 0x4800c00080d31e03 */
    /*0130*/                   SHL R11, R11, R13;                       /* 0x6000000034b2dc03 */
    /*0138*/                   SHR.U32 R14, R10, R12;                   /* 0x5800000030a39c03 */
                                                                        /* 0x22c2804282328047 */
    /*0148*/                   IADD R11, R11, R14;                      /* 0x4800000038b2dc03 */
    /*0150*/              @!P0 IADD R12, R13, -0x20;                    /* 0x4800ffff80d32003 */
    /*0158*/              @!P0 SHL R11, R10, R12;                       /* 0x6000000030a2e003 */
    /*0160*/                   SHL R10, R10, R13;                       /* 0x6000000034a29c03 */
    /*0168*/                   MOV R10, R10;                            /* 0x2800000028029de4 */
    /*0170*/                   MOV R11, R11;                            /* 0x280000002c02dde4 */
    /*0178*/                   IADD R8.CC, R8, R10;                     /* 0x4801000028821c03 */
                                                                        /* 0x228042c042828047 */
    /*0188*/                   IADD.X R9, R9, R11;                      /* 0x480000002c925c43 */
    /*0190*/                   MOV R8, R8;                              /* 0x2800000020021de4 */
    /*0198*/                   MOV R9, R9;                              /* 0x2800000024025de4 */
    /*01a0*/                   LD.E R10, [R8];                          /* 0x8400000000829c85 */
    /*01a8*/                   IADD R12.CC, R8, 0x4;                    /* 0x4801c00010831c03 */
    /*01b0*/                   IADD.X R13, R9, RZ;                      /* 0x48000000fc935c43 */
    /*01b8*/                   MOV R12, R12;                            /* 0x2800000030031de4 */
                                                                        /* 0x2202828042c2e287 */
    /*01c8*/                   MOV R13, R13;                            /* 0x2800000034035de4 */
    /*01d0*/                   LD.E R11, [R12];                         /* 0x8400000000c2dc85 */
    /*01d8*/                   IADD R12.CC, R8, 0x8;                    /* 0x4801c00020831c03 */
    /*01e0*/                   IADD.X R13, R9, RZ;                      /* 0x48000000fc935c43 */
    /*01e8*/                   MOV R12, R12;                            /* 0x2800000030031de4 */
    /*01f0*/                   MOV R13, R13;                            /* 0x2800000034035de4 */
    /*01f8*/                   LD.E R12, [R12];                         /* 0x8400000000c31c85 */
                                                                        /* 0x2282c202828042c7 */
    /*0208*/                   IADD R8.CC, R8, 0xc;                     /* 0x4801c00030821c03 */
    /*0210*/                   IADD.X R9, R9, RZ;                       /* 0x48000000fc925c43 */
    /*0218*/                   MOV R8, R8;                              /* 0x2800000020021de4 */
    /*0220*/                   MOV R9, R9;                              /* 0x2800000024025de4 */
    /*0228*/                   LD.E R8, [R8];                           /* 0x8400000000821c85 */
    /*0230*/                   IADD R14.CC, R2, 0xc;                    /* 0x4801c00030239c03 */
    /*0238*/                   IADD.X R15, R3, RZ;                      /* 0x48000000fc33dc43 */
                                                                        /* 0x22828042c2e28047 */
    /*0248*/                   MOV R14, R14;                            /* 0x2800000038039de4 */
    /*0250*/                   MOV R15, R15;                            /* 0x280000003c03dde4 */
    /*0258*/                   ST.E [R14], R8;                          /* 0x9400000000e21c85 */
    /*0260*/                   IADD R8.CC, R2, 0x8;                     /* 0x4801c00020221c03 */
    /*0268*/                   IADD.X R9, R3, RZ;                       /* 0x48000000fc325c43 */
    /*0270*/                   MOV R8, R8;                              /* 0x2800000020021de4 */
    /*0278*/                   MOV R9, R9;                              /* 0x2800000024025de4 */
                                                                        /* 0x22c2e2828042c2e7 */
    /*0288*/                   ST.E [R8], R12;                          /* 0x9400000000831c85 */
    /*0290*/                   IADD R8.CC, R2, 0x4;                     /* 0x4801c00010221c03 */
    /*0298*/                   IADD.X R9, R3, RZ;                       /* 0x48000000fc325c43 */
    /*02a0*/                   MOV R8, R8;                              /* 0x2800000020021de4 */
    /*02a8*/                   MOV R9, R9;                              /* 0x2800000024025de4 */
    /*02b0*/                   ST.E [R8], R11;                          /* 0x940000000082dc85 */
    /*02b8*/                   IADD R8.CC, R2, RZ;                      /* 0x48010000fc221c03 */
                                                                        /* 0x22820042e2828047 */
    /*02c8*/                   IADD.X R9, R3, RZ;                       /* 0x48000000fc325c43 */
    /*02d0*/                   MOV R8, R8;                              /* 0x2800000020021de4 */
    /*02d8*/                   MOV R9, R9;                              /* 0x2800000024025de4 */
    /*02e0*/                   ST.E [R8], R10;                          /* 0x9400000000829c85 */
Here is the rest:

void CudaTest()
{
    int size = 8192;

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    float *host_a = (float*)malloc(4 * size * sizeof(float));
    float *host_b = (float*)malloc(4 * size * sizeof(float));
    float *host_c = (float*)malloc(4 * size * sizeof(float));

    float4 *dev_a4 = 0;
    float4 *dev_b4 = 0;
    float4 *dev_c4 = 0;
    float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_c4 = (float4*)malloc(size * sizeof(float4));

    for (int i = 0; i < 4 * size; i++)
    {
        host_a[i] = rand() / (float)RAND_MAX;
        host_b[i] = rand() / (float)RAND_MAX;
    }

    for (int i = 0; i < size; i++)
    {
        host_a4[i].x = rand() / (float)RAND_MAX;
        host_a4[i].y = rand() / (float)RAND_MAX;
        host_a4[i].z = rand() / (float)RAND_MAX;
        host_a4[i].w = rand() / (float)RAND_MAX;
        host_b4[i].x = rand() / (float)RAND_MAX;
        host_b4[i].y = rand() / (float)RAND_MAX;
        host_b4[i].z = rand() / (float)RAND_MAX;
        host_b4[i].w = rand() / (float)RAND_MAX;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    CUDA_CALL(cudaSetDevice(0));

    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CALL(cudaMalloc((void**)&dev_c, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_a, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_b, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));

    // Copy input vectors from host memory to GPU buffers.
    CUDA_CALL(cudaMemcpy(dev_a, host_a, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b, host_b, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));

    int local = 256;
    int N = size / local;
    // Launch a kernel on the GPU with one thread for each element.
    add_float<<<4*N, local>>>(dev_c, dev_a, dev_b);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());

    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy output vector from GPU buffer to host memory.
    CUDA_CALL(cudaMemcpy(host_c, dev_c, 4 * size * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));
}
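
The post does not show the definition of the CUDA_CALL macro, and the host code contains no timing even though the question is about relative speed. Below is a minimal sketch of my own (not from the original post) of both: a typical abort-on-error macro and CUDA-event timing of the two launches. The helper TimeKernels and its parameter list are made up for illustration and assume the kernels and buffers defined above.

    // Hypothetical sketch (not part of the original post).
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // A typical abort-on-error macro, as CUDA_CALL might be defined.
    #define CUDA_CALL(call)                                               \
        do {                                                              \
            cudaError_t err_ = (call);                                    \
            if (err_ != cudaSuccess) {                                    \
                fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                        cudaGetErrorString(err_), __FILE__, __LINE__);    \
                exit(EXIT_FAILURE);                                       \
            }                                                             \
        } while (0)

    // Times the two kernel launches with CUDA events; could be called in
    // place of the plain launches inside CudaTest().
    void TimeKernels(float *dev_c, const float *dev_a, const float *dev_b,
                     float4 *dev_c4, const float4 *dev_a4, const float4 *dev_b4,
                     int N, int local)
    {
        cudaEvent_t start, stop;
        float ms_float = 0.0f, ms_float4 = 0.0f;
        CUDA_CALL(cudaEventCreate(&start));
        CUDA_CALL(cudaEventCreate(&stop));

        CUDA_CALL(cudaEventRecord(start));
        add_float<<<4 * N, local>>>(dev_c, dev_a, dev_b);
        CUDA_CALL(cudaEventRecord(stop));
        CUDA_CALL(cudaEventSynchronize(stop));
        CUDA_CALL(cudaEventElapsedTime(&ms_float, start, stop));

        CUDA_CALL(cudaEventRecord(start));
        add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
        CUDA_CALL(cudaEventRecord(stop));
        CUDA_CALL(cudaEventSynchronize(stop));
        CUDA_CALL(cudaEventElapsedTime(&ms_float4, start, stop));

        printf("add_float : %6.3f ms\nadd_float4: %6.3f ms\n", ms_float, ms_float4);

        CUDA_CALL(cudaEventDestroy(start));
        CUDA_CALL(cudaEventDestroy(stop));
    }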

Using the vector load/store instructions provided by the GPU hardware is considered a performance optimization applied by the compiler, since the code is fully functional with scalar loads and stores. When the code is compiled by nvcc with -G (typically used for debugging), all optimizations are turned off, including the vectorization of loads and stores.


To check for load/store vectorization, it is important to look at the actual machine code (SASS) that is executed, not at the PTX. PTX is merely an intermediate representation that is compiled into SASS by an optimizing compiler component called ptxas, which is invoked by the compiler driver nvcc. Run cuobjdump --dump-sass on the executable produced by nvcc to inspect the machine code.

What target architecture and CUDA version are you using to generate the PTX shown? With CUDA 6.5 I get the expected vectorized load and store instructions. PTX is only an intermediate format; to check whether vectorized loads and stores are present, inspect the machine code (SASS) extracted with cuobjdump --dump-sass.

I have added the SASS objdump for njuffa. Tyrone, I don't know what you are talking about. I am building for the Quadro K4000 with CUDA 7.0; the nvcc flags are compute_30,sm_30.

The SASS code above indicates that you are compiling with the -G flag, which causes all optimizations (including vectorization) to be disabled.

Thanks to njuffa for pointing this out. It now works as expected.
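
For completeness, here is a minimal sketch of my own (not from the original post) showing that the same kind of vectorized access can also be obtained while keeping plain float* parameters, by reinterpreting the pointers as float4*. It assumes the element count n is a multiple of 4 and that the buffers come from cudaMalloc, which returns suitably aligned pointers; built without -G, the compiler should then emit 128-bit loads and stores just as for the float4 kernel.

    // Hypothetical sketch (not part of the original post): vector addition over
    // plain float* buffers, loading and storing four floats at a time by
    // reinterpreting the pointers as float4*. Requires n % 4 == 0 and 16-byte
    // aligned pointers (cudaMalloc allocations satisfy this).
    __global__ void add_float_as_float4(float *c, const float *a, const float *b, int n)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n / 4) {
            float4 a1 = reinterpret_cast<const float4 *>(a)[i];
            float4 b1 = reinterpret_cast<const float4 *>(b)[i];
            float4 c1;
            c1.x = a1.x + b1.x;
            c1.y = a1.y + b1.y;
            c1.z = a1.z + b1.z;
            c1.w = a1.w + b1.w;
            reinterpret_cast<float4 *>(c)[i] = c1;
        }
    }

With the configuration from CudaTest(), it would be launched with one thread per four floats, e.g. add_float_as_float4<<<N, local>>>(dev_c, dev_a, dev_b, 4 * size);.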