
CUDA结构对齐正在减慢我的代码速度(可编译示例),cuda,memory-alignment,Cuda,Memory Alignment,我有一个模拟,计算带电粒子在电场和磁场中运动的三维矢量。我试图在CUDA中使用 __align__ 说明符来加速这个过程,认为全局内存的读取和写入可能是限制因素,但是使用 __align__ 最终反而减慢了速度(可能是因为它增加了总内存需求)。我还尝试使用float3和float4,但它们的性能相似。我已经创建了此代码的简化版本,并将其粘贴到下面以显示我的问题。下面的代码应该是可编译的,通过将第四行的CASE的定义更改为0、1或2,可以尝试上面描述的不同选项。

__align__
__align__


  • 我在内存合并方面的尝试为什么会减慢而不是加快我的代码?
  • 对于这样一个“令人尴尬的并行”代码,有没有其他明显的、我还没有做但可以做的事情,以获得比60倍更高的加速比?
  • 谢谢大家!



    // CASE selects which definition of VecGPU is compiled:
    //   CASE 0: Regular struct with 3 floats
    //   CASE 1: Aligned struct using __align__(16) with 3 floats
    //   CASE 2: float3
    #define CASE        0   // define to either 0, 1 or 2 as described above
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <Windows.h>
    #include <stdio.h>
    #include <math.h>
    #include <time.h>
    #include <malloc.h>
    #include <sys/stat.h>
    #define CEX         10  // x-value of electric field (dimensionless and arbitrary)
    #define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
    #define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
    #define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
    #define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
    #define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)
    #define FACTOR      15  // blocks launched per multiprocessor (grid = multiProcessorCount*FACTOR); tuned by hand for best speedup
    #define THREADS     256 // threads per block; tuned by hand for best speedup
    // Three-component float vector used by the CPU reference calculation.
    typedef struct {
        float x, y, z;
    } VecCPU;
    // VecGPU is the three-component vector type used by the GPU kernel.
    // Exactly one definition is compiled, chosen by CASE; any other value of
    // CASE is rejected at compile time instead of leaving VecGPU undefined.
    #if CASE==0
    // Fastest method seems to be a regular unaligned struct with 3 floats
    typedef struct {
        float x;
        float y;
        float z;
    } VecGPU;
    #elif CASE==1
    // This method seems to be less fast.  It is an attempt to align for memory coalescence.
    // __align__(16) is requested on a struct holding 3 floats.
    typedef struct __align__(16){
        float x;
        float y;
        float z;
    } VecGPU;
    #elif CASE==2
    // Using float3 seems to be about the same as defining our own vector3 structure
    typedef float3 VecGPU;
    #else
    #error "CASE must be defined as 0, 1 or 2"
    #endif
    VecCPU *pos_c, *vel_c;                  // host position and velocity arrays for the CPU calculation (allocated with malloc in main)
    __constant__ VecGPU *pos_d, *vel_d;     // pointers stored in constant memory; main points them at global-memory buffers via cudaMemcpyToSymbol
    /*
     * CPU reference particle mover.
     *
     * Advances every particle in the global arrays pos_c / vel_c by `ts` time
     * steps of size `dt`. Each step updates the velocity with the Boris method
     * (half electric kick, magnetic rotation, half electric kick) using the
     * constant fields CEX..CEZ and CBX..CBZ, then moves the position by vel*dt.
     *
     *   np - number of particles (entries of pos_c / vel_c that are updated)
     *   ts - number of time steps applied to each particle
     *   dt - time-step size
     *
     * Nothing is done when np <= 0 or ts <= 0.
     */
    void ParticleMoverCPU(int np, int ts, float dt){
        // The half electric kick and the rotation vector depend only on dt and
        // the field constants, so they are computed once. The kicks stay in
        // double because that is how the expression `CEX*0.5*dt` evaluates.
        const double halfEx = CEX*0.5*dt;
        const double halfEy = CEY*0.5*dt;
        const double halfEz = CEZ*0.5*dt;
        VecCPU tvec;
        tvec.x = CBX*0.5*dt;
        tvec.y = CBY*0.5*dt;
        tvec.z = CBZ*0.5*dt;
        const float tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);

        for (int n = 0; n < np; n++){
            for (int it = 0; it < ts; it++){
                VecCPU vminus, vprime, vplus;
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_c[n].x + halfEx;
                vminus.y = vel_c[n].y + halfEy;
                vminus.z = vel_c[n].z + halfEz;
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_c[n].x = vplus.x + halfEx;
                vel_c[n].y = vplus.y + halfEy;
                vel_c[n].z = vplus.z + halfEz;
                // ------ Update Particle positions -------------- //
                pos_c[n].x += vel_c[n].x*dt;
                pos_c[n].y += vel_c[n].y*dt;
                pos_c[n].z += vel_c[n].z*dt;
            }
        }
    }
    /*
     * GPU particle mover kernel.
     *
     * Applies the same per-step arithmetic as ParticleMoverCPU to the arrays
     * addressed by the constant-memory pointers pos_d / vel_d. Particles are
     * distributed over threads with a grid-stride loop, so any 1D launch
     * configuration covers all `np` particles.
     *
     *   np - number of particles
     *   ts - number of time steps applied to each particle
     *   dt - time-step size
     *
     * Expects a 1D grid of 1D blocks; uses no shared memory.
     */
    __global__ void ParticleMoverGPU(int np, int ts, float dt){
        // Loop-invariant quantities: half electric kick (kept in double, as the
        // expression `CEX*0.5*dt` evaluates) and the magnetic rotation vector.
        // NOTE(review): switching the literals to float (0.5f, 0.1f) would avoid
        // double-precision arithmetic on the device but changes rounding.
        const double halfEx = CEX*0.5*dt;
        const double halfEy = CEY*0.5*dt;
        const double halfEz = CEZ*0.5*dt;
        VecGPU tvec;
        tvec.x = CBX*0.5*dt;
        tvec.y = CBY*0.5*dt;
        tvec.z = CBZ*0.5*dt;
        const float tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);

        const int stride = blockDim.x*gridDim.x;
        for (int n = threadIdx.x + blockDim.x * blockIdx.x; n < np; n += stride){
            // Each particle is independent, so its state is read once, advanced
            // through all time steps in local variables, and written back once
            // instead of going through pos_d / vel_d on every step.
            VecGPU vel = vel_d[n];
            VecGPU pos = pos_d[n];
            for (int it = 0; it < ts; it++){
                VecGPU vminus, vprime, vplus;
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel.x + halfEx;
                vminus.y = vel.y + halfEy;
                vminus.z = vel.z + halfEz;
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel.x = vplus.x + halfEx;
                vel.y = vplus.y + halfEy;
                vel.z = vplus.z + halfEz;
                // ------ Update Particle positions -------------- //
                pos.x += vel.x*dt;
                pos.y += vel.y*dt;
                pos.z += vel.z*dt;
            }
            vel_d[n] = vel;
            pos_d[n] = pos;
        }
    }
    // Evaluates a CUDA runtime call; on failure prints the error to stderr and
    // makes main return 1. Only usable inside main because it expands to `return`.
    #define CHECK_CUDA(call)                                                    \
        do {                                                                    \
            cudaError_t err_ = (call);                                          \
            if (err_ != cudaSuccess) {                                          \
                fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                        cudaGetErrorString(err_));                              \
                return 1;                                                       \
            }                                                                   \
        } while (0)

    /*
     * Benchmark driver: runs the particle mover on the CPU and on the GPU with
     * identical (all-zero) initial conditions, times both, and prints the
     * speedup of the GPU kernel over the CPU loop.
     *
     * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
     * On the failure paths buffers are not released; the process exits right away.
     */
    int main(void){
        int np = 50000;                                         // Number of Particles
        const int ts = 1000;                                    // Number of Time-steps
        const float dt = 1E-3;                                  // Time-step value
        // ----------- CPU ----------- //
        pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
        vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity
        if (pos_c == NULL || vel_c == NULL){
            fprintf(stderr, "Host allocation failed\n");
            return 1;
        }
        for (int n = 0; n < np; n++){
            pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
            vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
        }
        printf("Starting CPU kernel\n");
        clock_t startCPU;
        float CPUtime;
        startCPU = clock();
        ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
        CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
        printf("CPU kernel finished\n");
        // Output final CPU computation time
        printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
        // ------------ GPU ----------- //
        CHECK_CUDA(cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1));    // Set memory preference to L1 (doesn't have much effect)
        cudaDeviceProp deviceProp;
        CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, 0));
        int blocks = deviceProp.multiProcessorCount;
        VecGPU *pos_g, *vel_g, *pos_l, *vel_l;
        pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for positions on the CPU
        vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for velocities on the CPU
        if (pos_g == NULL || vel_g == NULL){
            fprintf(stderr, "Host allocation failed\n");
            return 1;
        }
        CHECK_CUDA(cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np));      // allocate memory for positions on the GPU
        CHECK_CUDA(cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np));      // allocate memory for velocities on the GPU
        CHECK_CUDA(cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*)));   // copy memory address of position to the constant memory pointer pos_d
        CHECK_CUDA(cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*)));   // copy memory address of velocity to the constant memory pointer vel_d
        for (int n = 0; n < np; n++){
            pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
            vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
        }
        CHECK_CUDA(cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice));    // Copy positions to GPU global memory
        CHECK_CUDA(cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice));    // Copy velocities to GPU global memory
        printf("Starting GPU kernel\n");
        // start cuda timer (events must be created before they can be recorded)
        cudaEvent_t start, stop;
        CHECK_CUDA(cudaEventCreate(&start));
        CHECK_CUDA(cudaEventCreate(&stop));
        CHECK_CUDA(cudaEventRecord(start, 0));
        ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt);             // Launch GPU kernel
        CHECK_CUDA(cudaGetLastError());                                         // catch launch-configuration errors
        // stop cuda timer; wait for the stop event so the elapsed time is valid
        CHECK_CUDA(cudaEventRecord(stop, 0));
        CHECK_CUDA(cudaEventSynchronize(stop));
        float elapsedTime;
        CHECK_CUDA(cudaEventElapsedTime(&elapsedTime, start, stop));
        printf("GPU kernel finished\n");
        CHECK_CUDA(cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost));    // Copy positions from GPU memory back to CPU
        CHECK_CUDA(cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost));    // Copy velocities from GPU memory back to CPU
        // Output GPU computation time
        printf("GPUtime = %6.1f ms\n", elapsedTime);
        // Output speedup factor
        printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
        // free allocated memory
        CHECK_CUDA(cudaEventDestroy(start));
        CHECK_CUDA(cudaEventDestroy(stop));
        CHECK_CUDA(cudaFree(pos_l));
        CHECK_CUDA(cudaFree(vel_l));
        free(pos_g);
        free(vel_g);
        free(pos_c);
        free(vel_c);
        return 0;
    }


    __align__(16)


  • __constant__
  • 到处乱扔
  • 如果您没有,您应该使用适当的cuda错误检查。这只是我所说的一句话。我认为这段代码中没有任何API级别的错误
  • 你不清楚什么是“合并”。数据的对齐只会对内存事务合并的能力产生切向影响。更重要的是,warp中的相邻线程为给定内存事务生成的实际地址——它们是否指相邻内存位置?如果是这样,事情可能会很好地融合在一起。如果不是,可能不是。因此,您有一个“自然”占用12字节的数据结构,在一种情况下(较慢的一种),您告诉它占用16字节。这到底是做什么的?要回答这个问题,我们必须查看给定的交易:

        vminus.x = vel_d[n].x + CEX*0.5*dt;


    mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17    ...
    vel_d:  x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
             *           *           *           *           *       ...

  • 但我们可以做得更好。您有一个经典的AoS(结构阵列)数据存储方案,这对于GPU编程来说是非常糟糕的。标准的性能增强是将AoS转换为SoA存储。这意味着将

    mem idx:  0  1  2  3  4  5  6  7  8  9  ...
    vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9  ...
              *  *  *  *  *  *  *  *  *  *  ...

        vminus.x = vel_d_x[n] + CEX*0.5*dt;
        vminus.y = vel_d_y[n] + CEY*0.5*dt;
        vminus.z = vel_d_z[n] + CEZ*0.5*dt;
  • 下面的代码实现了上面的一些功能,包括GPU端的AoS->SoA转换,应该比任何情况都要快

    #define CEX 10      // 电场的x值(无量纲和任意)
    #define CEY 0.1     // 电场的y值(无量纲和任意)
    #define CEZ 0.1     // 电场的z值(无量纲和任意)
    #define CBX 0.1     // 磁场的x值(无量纲和任意)
    #define CBY 0.1     // 磁场的y值(无量纲和任意)
    #define CBZ 10      // 磁场的z值(无量纲和任意)
    #define THREADS 256 // 我一直在调整这些数字,直到获得最佳加速
    typedef float3 VecGPU;
    mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17    ...
    vel_d:  x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
             *           *           *           *           *       ...
    mem idx:  0  1  2  3  4  5  6  7  8  9  ...
    vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9  ...
              *  *  *  *  *  *  *  *  *  *  ...
        vminus.x = vel_d_x[n] + CEX*0.5*dt;
        vminus.y = vel_d_y[n] + CEY*0.5*dt;
        vminus.z = vel_d_z[n] + CEZ*0.5*dt;
