CUDA struct alignment is slowing down my code (compilable example)

I have a simulation that computes the 3D vectors of charged particles moving in electric and magnetic fields. I tried to speed it up in CUDA using the __align__ specifier, thinking that global memory reads and writes might be the limiting factor, but using __align__ actually slowed things down (probably because it increased the total memory requirement). I also tried float3 and float4, but their performance was similar.

I have created a simplified version of this code and pasted it below to show the problem. The code below should be compilable, and by changing the definition of CASE on the fourth line to 0, 1, or 2, the different options described above can be tried. Two functions, ParticleMoverCPU and ParticleMoverGPU, are defined to compare CPU and GPU performance.

  • Is there a reason why my attempt at memory coalescing is slowing my code down rather than speeding it up?
  • For such an "embarrassingly parallel" code, is there anything else obvious that I am not doing which would get me better than the ~60x speedup I currently see?
  • Thank you!

    CPU - Intel Xeon E5620 @ 2.40GHz

    GPU - NVIDIA Tesla C2070

    // CASE 0: Regular struct with 3 floats
    // CASE 1: Aligned struct using __align__(16) with 3 floats
    // CASE 2: float3
    #define CASE        0   // define to either 0, 1 or 2 as described above
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <Windows.h>
    
    #include <stdio.h>
    #include <math.h>
    #include <time.h>
    #include <malloc.h>
    #include <sys/stat.h>
    
    #define CEX         10  // x-value of electric field (dimensionless and arbitrary)
    #define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
    #define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
    #define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
    #define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
    #define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)
    
    #define FACTOR      15  // I played around with these numbers until I got the best speedup
    #define THREADS     256 // I played around with these numbers until I got the best speedup
    
    typedef struct{
        float x;
        float y;
        float z;
    } VecCPU;           //Struct for vectors for CPU calculation
    
    // Fastest method seems to be a regular unaligned struct with 3 floats
    #if CASE==0
    typedef struct {
        float x;
        float y;
        float z;
    } VecGPU;
    #endif
    
    #if CASE==1
    // This method seems to be less fast.  It is an attempt to align for memory coalescence
    typedef struct __align__(16){
        float x;
        float y;
        float z;
    } VecGPU;
    #endif
    
    // Using float3 seems to be about the same as defining our own vector3 structure
    #if CASE==2
    typedef float3 VecGPU;
    #endif
    
    VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation
    __constant__ VecGPU *pos_d, *vel_d;     // pointers in constant memory that will point to data in global memory
    
    void ParticleMoverCPU(int np, int ts, float dt){
    
        int n = 0;
        while (n < np){
    
            VecCPU vminus, tvec, vprime, vplus;
            float tvec_fact;
            int it = 0;
            while (it < ts){
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_c[n].x + CEX*0.5*dt;
                vminus.y = vel_c[n].y + CEY*0.5*dt;
                vminus.z = vel_c[n].z + CEZ*0.5*dt;
                tvec.x = CBX*0.5*dt;
                tvec.y = CBY*0.5*dt;
                tvec.z = CBZ*0.5*dt;
                tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_c[n].x = vplus.x + CEX*0.5*dt;
                vel_c[n].y = vplus.y + CEY*0.5*dt;
                vel_c[n].z = vplus.z + CEZ*0.5*dt;
    
                // ------ Update Particle positions -------------- //
                pos_c[n].x += vel_c[n].x*dt;
                pos_c[n].y += vel_c[n].y*dt;
                pos_c[n].z += vel_c[n].z*dt;
                it++;
            }
            n++;
        }
    }
    
    __global__ void ParticleMoverGPU(register int np,register int ts, register float dt){
    
        register int n = threadIdx.x + blockDim.x * blockIdx.x;
        while (n < np){
    
            register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
            register float tvec_fact;
            register int it = 0;
            while (it < ts){
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_d[n].x + CEX*0.5*dt;
                vminus.y = vel_d[n].y + CEY*0.5*dt;
                vminus.z = vel_d[n].z + CEZ*0.5*dt;
                tvec.x = CBX*0.5*dt;
                tvec.y = CBY*0.5*dt;
                tvec.z = CBZ*0.5*dt;
                tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_d[n].x = vplus.x + CEX*0.5*dt;
                vel_d[n].y = vplus.y + CEY*0.5*dt;
                vel_d[n].z = vplus.z + CEZ*0.5*dt;
                // ------ Update Particle positions -------------- //
                pos_d[n].x += vel_d[n].x*dt;
                pos_d[n].y += vel_d[n].y*dt;
                pos_d[n].z += vel_d[n].z*dt;
                it++;
            }
            n += blockDim.x*gridDim.x;
        }
    }
    
    int main(void){
    
        int np = 50000;                                         // Number of Particles
        const int ts = 1000;                                    // Number of Time-steps
        const float dt = 1E-3;                                  // Time-step value
    
    
        // ----------- CPU ----------- //
    
        pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
        vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity
    
        for (int n = 0; n < np; n++){
            pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
            vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
        }
    
        printf("Starting CPU kernel\n");
        clock_t startCPU;
        float CPUtime;
        startCPU = clock();
        ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
        CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
        printf("CPU kernel finished\n");
    // Output final CPU computation time
        printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
    
        // ------------ GPU ----------- //
    
        cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, 0);
        int blocks = deviceProp.multiProcessorCount;
    
        VecGPU *pos_g, *vel_g, *pos_l, *vel_l;
    
        pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for positions on the CPU
        vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for velocities on the CPU
    
        cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np);      // allocate memory for positions on the GPU
        cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np);      // allocate memory for velocities on the GPU
    
        cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*));   // copy memory address of position to the constant memory pointer pos_d
        cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*));   // copy memory address of velocity to the constant memory pointer vel_d
    
        for (int n = 0; n < np; n++){
            pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
            vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
        }
    
        cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
        cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    
        printf("Starting GPU kernel\n");
        // start cuda timer
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
    
        ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt);             // Launch GPU kernel
    
        //stop cuda timer
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        printf("GPU kernel finished\n");
    
        cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy positions from GPU memory back to CPU
        cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy velocities from GPU memory back to CPU
    
    // Output GPU computation time
        printf("GPUtime = %6.1f ms\n", elapsedTime);
    
        // Output speedup factor
        printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
    
        // free allocated memory
        cudaFree(pos_l);
        cudaFree(vel_l);
        free(pos_g);
        free(vel_g);
        free(pos_c);
        free(vel_c);
    }
    
    For CASE 1 (the __align__(16) vector struct), I get:

    For CASE 2 (using float3), I get:

    If I use float4 instead of float3, I get results similar to the __align__(16) method.
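
    For reference, the float4 variant is not one of the three cases in the code above; a hypothetical CASE 3, analogous to CASE 2, would be:

        #if CASE==3
        typedef float4 VecGPU;  // built-in 16-byte type; the .w component is allocated but never used
        #endif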

    Thanks!

  • Pointers in __constant__ memory are a waste of time here. I'm not sure why you jumped through those hoops (the code below simply passes the pointers as kernel parameters instead).
  • Sprinkling register everywhere is likewise a waste of time. You are not smarter than the compiler about when to use registers.
  • You should be using proper cuda error checking, if you are not already (see the sketch below). That is just a boilerplate statement I make; I don't believe there are any API-level errors in this code.
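    A minimal sketch of such a checking macro (the name cudaCheckErrors is illustrative; it needs <stdio.h> and <stdlib.h>, both of which the code already pulls in):

        // Check the last CUDA error and abort with a message if one occurred.
        #define cudaCheckErrors(msg) \
            do { \
                cudaError_t __err = cudaGetLastError(); \
                if (__err != cudaSuccess) { \
                    fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
                    exit(1); \
                } \
            } while (0)

        // Typical usage, e.g. right after a kernel launch:
        //   ParticleMoverGPU<<<blocks*FACTOR, THREADS>>>(np, ts, dt);
        //   cudaCheckErrors("ParticleMoverGPU launch failed");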
  • It's not clear that you understand what "coalescing" is. Alignment of the data only tangentially affects the ability of memory transactions to coalesce. What matters more are the actual addresses generated by adjacent threads in a warp for a given memory transaction: do they refer to adjacent memory locations? If so, things are probably coalescing nicely; if not, probably not. So you have a data structure that "naturally" occupies 12 bytes, and in one case (the slower one) you are telling it to occupy 16 bytes. What does that actually do? To answer, we have to look at a given transaction:

        vminus.x = vel_d[n].x + CEX*0.5*dt;
    
    The above transaction is requesting the x-component of the vel_d vector. In the "non-aligned" case, that data is stored as follows, and the transaction will "ask for" the starred quantities (32 per warp):

    mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 ...
    vel_d:  x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 ...
             *        *        *        *       ...

    In the "aligned" case, the same pattern looks like this:

    mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17    ...
    vel_d:  x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
             *           *           *           *           *       ...
    
    So we can see that when you specify the align directive, the packing is less dense, and a given 128-byte cache line delivers fewer of the items needed for a given transaction. Therefore, in the aligned case, more cache lines must be retrieved from global memory to satisfy this one read request. That is probably the source of the ~10-20% difference you are seeing.
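
    As a quick back-of-the-envelope check, assuming one x-read per thread and that a warp's first vector starts on a 128-byte cache-line boundary:

        unaligned: 32 threads x 12 bytes/vector = 384 bytes -> 3 x 128-byte cache lines
        aligned:   32 threads x 16 bytes/vector = 512 bytes -> 4 x 128-byte cache lines

    That is about a third more lines for these reads; diluted across the kernel's other work, an overall difference on the order of 10-20% is plausible.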

  • But we can do better. You have a classic AoS (array of structures) data storage scheme, which is generally bad for GPU programming. The standard performance enhancement is to convert the AoS storage to SoA (structure of arrays). This means breaking the x, y, and z components of your pos and vel vectors out into separate arrays and then accessing those. (Alternatively, since you are handling all the components in a single thread, you could attempt to do vector loads; but that is a separate discussion.) The desired storage and load pattern then becomes:

    mem idx:  0  1  2  3  4  5  6  7  8  9  ...
    vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9  ...
              *  *  *  *  *  *  *  *  *  *  ...
    
    And the code might look like this:

        vminus.x = vel_d_x[n] + CEX*0.5*dt;
        vminus.y = vel_d_y[n] + CEY*0.5*dt;
        vminus.z = vel_d_z[n] + CEZ*0.5*dt;
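    If you already have data in AoS form, the host-side conversion is a simple component-wise copy before the cudaMemcpy; a minimal sketch (the helper name aosToSoA is illustrative, not part of the code below, and assumes the VecCPU struct defined above):

        // Scatter an array of 3-float structs into three separate component arrays.
        void aosToSoA(const VecCPU *in, float *x, float *y, float *z, int n){
            for (int i = 0; i < n; i++){
                x[i] = in[i].x;
                y[i] = in[i].y;
                z[i] = in[i].z;
            }
        }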
    
  • The code below implements some of the above, including an AoS -> SoA conversion for the GPU side of the comparison, and should be faster than any of your cases:

    $ cat t895.cu
    // CASE 0: Regular struct with 3 floats
    // CASE 1: Aligned struct using __align__(16) with 3 floats
    // CASE 2: float3
    #define CASE        0   // define to either 0, 1 or 2 as described above
    
    #include <stdio.h>
    #include <math.h>
    #include <time.h>
    #include <malloc.h>
    #include <sys/stat.h>
    
    #define CEX         10  // x-value of electric field (dimensionless and arbitrary)
    #define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
    #define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
    #define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
    #define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
    #define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)
    
    #define FACTOR      15  // I played around with these numbers until I got the best speedup
    #define THREADS     256 // I played around with these numbers until I got the best speedup
    
    typedef struct{
        float x;
        float y;
        float z;
    } VecCPU;           //Struct for vectors for CPU calculation
    
    // Fastest method seems to be a regular unaligned struct with 3 floats
    #if CASE==0
    typedef struct {
        float x;
        float y;
        float z;
    } VecGPU;
    #endif
    
    #if CASE==1
    // This method seems to be less fast.  It is an attempt to align for memory coalescence
    typedef struct __align__(16){
        float x;
        float y;
        float z;
    } VecGPU;
    #endif
    
    // Using float3 seems to be about the same as defining our own vector3 structure
    #if CASE==2
    typedef float3 VecGPU;
    #endif
    
    VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation
    
    void ParticleMoverCPU(int np, int ts, float dt){
    
        int n = 0;
        while (n < np){
    
            VecCPU vminus, tvec, vprime, vplus;
            float tvec_fact;
            int it = 0;
            while (it < ts){
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_c[n].x + CEX*0.5*dt;
                vminus.y = vel_c[n].y + CEY*0.5*dt;
                vminus.z = vel_c[n].z + CEZ*0.5*dt;
                tvec.x = CBX*0.5*dt;
                tvec.y = CBY*0.5*dt;
                tvec.z = CBZ*0.5*dt;
                tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_c[n].x = vplus.x + CEX*0.5*dt;
                vel_c[n].y = vplus.y + CEY*0.5*dt;
                vel_c[n].z = vplus.z + CEZ*0.5*dt;
    
                // ------ Update Particle positions -------------- //
                pos_c[n].x += vel_c[n].x*dt;
                pos_c[n].y += vel_c[n].y*dt;
                pos_c[n].z += vel_c[n].z*dt;
                it++;
            }
            n++;
        }
    }
    
    __global__ void ParticleMoverGPU(float *vel_d_x, float *vel_d_y, float *vel_d_z, float *pos_d_x, float *pos_d_y, float *pos_d_z, int np,int ts, float dt){
    
        int n = threadIdx.x + blockDim.x * blockIdx.x;
        while (n < np){
    
            VecGPU vminus, tvec, vprime, vplus;// , vtemp;
            register float tvec_fact;
            register int it = 0;
            while (it < ts){
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_d_x[n] + CEX*0.5*dt;
                vminus.y = vel_d_y[n] + CEY*0.5*dt;
                vminus.z = vel_d_z[n] + CEZ*0.5*dt;
                tvec.x = CBX*0.5*dt;
                tvec.y = CBY*0.5*dt;
                tvec.z = CBZ*0.5*dt;
                tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_d_x[n] = vplus.x + CEX*0.5*dt;
                vel_d_y[n] = vplus.y + CEY*0.5*dt;
                vel_d_z[n] = vplus.z + CEZ*0.5*dt;
                // ------ Update Particle positions -------------- //
                pos_d_x[n] += vel_d_x[n]*dt;
                pos_d_y[n] += vel_d_y[n]*dt;
                pos_d_z[n] += vel_d_z[n]*dt;
                it++;
            }
            n += blockDim.x*gridDim.x;
        }
    }
    
    int main(void){
    
        int np = 50000;                                         // Number of Particles
        const int ts = 1000;                                    // Number of Time-steps
        const float dt = 1E-3;                                  // Time-step value
    
    
        // ----------- CPU ----------- //
    
        pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
        vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity
    
        for (int n = 0; n < np; n++){
            pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
            vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
        }
    
        printf("Starting CPU kernel\n");
        clock_t startCPU;
        float CPUtime;
        startCPU = clock();
        ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
        CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
        printf("CPU kernel finished\n");
    // Output final CPU computation time
        printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
    
        // ------------ GPU ----------- //
    
        cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, 0);
        int blocks = deviceProp.multiProcessorCount;
    
        float *pos_g_x, *pos_g_y, *pos_g_z, *vel_g_x, *vel_g_y, *vel_g_z, *pos_l_x, *pos_l_y, *pos_l_z, *vel_l_x, *vel_l_y, *vel_l_z;
    
        pos_g_x = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
        vel_g_x = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
        pos_g_y = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
        vel_g_y = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
        pos_g_z = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
        vel_g_z = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
    
        cudaMalloc((void**)&pos_l_x, sizeof(float)*np);      // allocate memory for positions on the GPU
        cudaMalloc((void**)&vel_l_x, sizeof(float)*np);      // allocate memory for velocities on the GPU
        cudaMalloc((void**)&pos_l_y, sizeof(float)*np);      // allocate memory for positions on the GPU
        cudaMalloc((void**)&vel_l_y, sizeof(float)*np);      // allocate memory for velocities on the GPU
        cudaMalloc((void**)&pos_l_z, sizeof(float)*np);      // allocate memory for positions on the GPU
        cudaMalloc((void**)&vel_l_z, sizeof(float)*np);      // allocate memory for velocities on the GPU
    
        for (int n = 0; n < np; n++){
            pos_g_x[n] = 0; pos_g_y[n] = 0; pos_g_z[n] = 0; // zero out position for GPU variables (before copying to GPU)
            vel_g_x[n] = 0; vel_g_y[n] = 0; vel_g_z[n] = 0; // zero out velocity for GPU variables (before copying to GPU)
        }
    
        cudaMemcpy(pos_l_x, pos_g_x, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
        cudaMemcpy(vel_l_x, vel_g_x, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
        cudaMemcpy(pos_l_y, pos_g_y, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
        cudaMemcpy(vel_l_y, vel_g_y, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
        cudaMemcpy(pos_l_z, pos_g_z, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
        cudaMemcpy(vel_l_z, vel_g_z, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    
        printf("Starting GPU kernel\n");
        // start cuda timer
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
    
        ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(vel_l_x, vel_l_y, vel_l_z, pos_l_x, pos_l_y, pos_l_z, np, ts, dt);             // Launch GPU kernel
    
        //stop cuda timer
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        printf("GPU kernel finished\n");
    
    
    // Output GPU computation time
        printf("GPUtime = %6.1f ms\n", elapsedTime);
    
        // Output speedup factor
        printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
    
    }
    
    $ nvcc -O3 -o t895 t895.cu
    $ ./t895
    Starting CPU kernel
    CPU kernel finished
    CPUtime =  923.6 ms
    Starting GPU kernel
    GPU kernel finished
    GPUtime =   12.3 ms
    CASE=0, Speedup = 74.95
    $
    