CUDA结构对齐正在减慢我的代码速度(可编译示例)
我有一个模拟,计算带电粒子在电场和磁场中运动的三维矢量我试图在CUDA中使用CUDA结构对齐正在减慢我的代码速度(可编译示例),cuda,memory-alignment,Cuda,Memory Alignment,我有一个模拟,计算带电粒子在电场和磁场中运动的三维矢量我试图在CUDA中使用\uuuuu align\uuuuu说明符来加速这个过程,认为可能是全局内存读取和写入的限制因素,但是使用\uuuuu align\uuuuu最终会减慢速度(可能是因为它增加了总内存需求)。我还尝试使用float3和float4,但它们的性能相似 我已经创建了此代码的简化版本,并将其粘贴到下面以显示我的问题下面的代码应该是可编译的,通过将第四行的CASE的定义更改为0、1或2,可以尝试上面描述的不同选项。两个功能,par
\uuuuu align\uuuuu
说明符来加速这个过程,认为可能是全局内存读取和写入的限制因素,但是使用\uuuuu align\uuuuu
最终会减慢速度(可能是因为它增加了总内存需求)。我还尝试使用float3
和float4
,但它们的性能相似
我已经创建了此代码的简化版本,并将其粘贴到下面以显示我的问题下面的代码应该是可编译的,通过将第四行的CASE
的定义更改为0
、1
或2
,可以尝试上面描述的不同选项。两个功能,particleOverCPU
和particleOverGPU
被定义为比较CPU和GPU的性能
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE 0 // define to either 0, 1 or 2 as described above
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>
#define CEX 10 // x-value of electric field (dimensionless and arbitrary)
#define CEY 0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ 0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBZ 10 // x-value of magnetic field (dimensionless and arbitrary)
#define FACTOR 15 // I played around with these numbers until I got the best speedup
#define THREADS 256 // I played around with these numbers until I got the best speedup
typedef struct{
float x;
float y;
float z;
} VecCPU; //Struct for vectors for CPU calculation
// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
float x;
float y;
float z;
} VecGPU;
#endif
#if CASE==1
// This method seems to be less fast. It is an attempt to align for memory coalescence
typedef struct __align__(16){
float x;
float y;
float z;
} VecGPU;
#endif
// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif
VecCPU *pos_c, *vel_c; // global position and velocity vectors for CPU calculation
__constant__ VecGPU *pos_d, *vel_d; // pointers in constant memory which we will point to data in global memory
void ParticleMoverCPU(int np, int ts, float dt){
int n = 0;
while (n < np){
VecCPU vminus, tvec, vprime, vplus;
float tvec_fact;
int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_c[n].x + CEX*0.5*dt;
vminus.y = vel_c[n].y + CEY*0.5*dt;
vminus.z = vel_c[n].z + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_c[n].x = vplus.x + CEX*0.5*dt;
vel_c[n].y = vplus.y + CEY*0.5*dt;
vel_c[n].z = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_c[n].x += vel_c[n].x*dt;
pos_c[n].y += vel_c[n].y*dt;
pos_c[n].z += vel_c[n].z*dt;
it++;
}
n++;
}
}
__global__ void ParticleMoverGPU(register int np,register int ts, register float dt){
register int n = threadIdx.x + blockDim.x * blockIdx.x;
while (n < np){
register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
register float tvec_fact;
register int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_d[n].x + CEX*0.5*dt;
vminus.y = vel_d[n].y + CEY*0.5*dt;
vminus.z = vel_d[n].z + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_d[n].x = vplus.x + CEX*0.5*dt;
vel_d[n].y = vplus.y + CEY*0.5*dt;
vel_d[n].z = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_d[n].x += vel_d[n].x*dt;
pos_d[n].y += vel_d[n].y*dt;
pos_d[n].z += vel_d[n].z*dt;
it++;
}
n += blockDim.x*gridDim.x;
}
}
int main(void){
int np = 50000; // Number of Particles
const int ts = 1000; // Number of Time-steps
const float dt = 1E-3; // Time-step value
// ----------- CPU ----------- //
pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for position
vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for velocity
for (int n = 0; n < np; n++){
pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0; // zero out position for CPU variables
vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0; // zero out velocity for CPU variables
}
printf("Starting CPU kernel\n");
clock_t startCPU;
float CPUtime;
startCPU = clock();
ParticleMoverCPU(np, ts, dt); // Launch CPU kernel
CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
printf("CPU kernel finished\n");
// Ouput final CPU computation time
printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
// ------------ GPU ----------- //
cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1); //Set memory preference to L1 (doesn't have much effect)
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
int blocks = deviceProp.multiProcessorCount;
VecGPU *pos_g, *vel_g, *pos_l, *vel_l;
pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np); // allocate memory for positions on the CPU
vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np); // allocate memory for velocities on the CPU
cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np); // allocate memory for velocities on the GPU
cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*)); // copy memory address of position to the constant memory pointer pos_d
cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*)); // copy memory address of velocity to the constant memory pointer vel_d
for (int n = 0; n < np; n++){
pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
}
cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
printf("Starting GPU kernel\n");
// start cuda timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt); // Launch GPU kernel
//stop cuda timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("GPU kernel finished\n");
cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost); // Copy positions from GPU memory back to CPU
cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost); // Copy velocities from GPU memory back to CPU
// Ouput GPU computation time
printf("GPUtime = %6.1f ms\n", elapsedTime);
// Output speedup factor
printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
// free allocated memory
cudaFree(pos_l);
cudaFree(vel_l);
free(pos_g);
free(vel_g);
free(pos_c);
free(vel_c);
}
对于情况1
(对齐(16)
向量结构),我得到:
对于案例2
(使用float3
),我得到:
如果我使用float4
而不是float3
,我会得到类似于\uuuu align\uuu16)
方法的结果
谢谢
\uuuu常量\uuuu
内存中的指针是在浪费时间。我不知道你为什么要跳过那些铁环李>
register
都是浪费时间。在告诉编译器尽可能使用寄存器方面,您并不比编译器聪明 vminus.x = vel_d[n].x + CEX*0.5*dt;
上述事务正在请求vel_d
向量的x分量。在“非对齐”情况下,该数据将按如下方式存储,上述事务将“询问”带星号的数量(每个扭曲32个):
在“对齐”的情况下,上述模式如下所示:
mem idx: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
vel_d: x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
* * * * * ...
因此,我们可以看到,当您指定align指令时,打包密度较低,并且给定的128字节缓存线为给定事务提供的必要项较少。因此,在align情况下,必须从全局内存中检索更多缓存线以满足此一次读取请求。这可能是您看到的约10-20%差异的原因pos
和vel
向量的x
、y
、z
组件分解成单独的数组,然后访问这些数组。(或者,由于您在一个线程中处理所有组件,您可以尝试执行向量加载。但这是一个。)然后,所需的存储和加载模式变为:
mem idx: 0 1 2 3 4 5 6 7 8 9 ...
vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ...
* * * * * * * * * * ...
代码可能如下所示:
vminus.x = vel_d_x[n] + CEX*0.5*dt;
vminus.y = vel_d_y[n] + CEY*0.5*dt;
vminus.z = vel_d_z[n] + CEZ*0.5*dt;
$ cat t895.cu
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE 0 // define to either 0, 1 or 2 as described above
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>
#define CEX 10 // x-value of electric field (dimensionless and arbitrary)
#define CEY 0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ 0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBZ 10 // x-value of magnetic field (dimensionless and arbitrary)
#define FACTOR 15 // I played around with these numbers until I got the best speedup
#define THREADS 256 // I played around with these numbers until I got the best speedup
typedef struct{
float x;
float y;
float z;
} VecCPU; //Struct for vectors for CPU calculation
// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
float x;
float y;
float z;
} VecGPU;
#endif
#if CASE==1
// This method seems to be less fast. It is an attempt to align for memory coalescence
typedef struct __align__(16){
float x;
float y;
float z;
} VecGPU;
#endif
// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif
VecCPU *pos_c, *vel_c; // global position and velocity vectors for CPU calculation
void ParticleMoverCPU(int np, int ts, float dt){
int n = 0;
while (n < np){
VecCPU vminus, tvec, vprime, vplus;
float tvec_fact;
int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_c[n].x + CEX*0.5*dt;
vminus.y = vel_c[n].y + CEY*0.5*dt;
vminus.z = vel_c[n].z + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_c[n].x = vplus.x + CEX*0.5*dt;
vel_c[n].y = vplus.y + CEY*0.5*dt;
vel_c[n].z = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_c[n].x += vel_c[n].x*dt;
pos_c[n].y += vel_c[n].y*dt;
pos_c[n].z += vel_c[n].z*dt;
it++;
}
n++;
}
}
__global__ void ParticleMoverGPU(float *vel_d_x, float *vel_d_y, float *vel_d_z, float *pos_d_x, float *pos_d_y, float *pos_d_z, int np,int ts, float dt){
int n = threadIdx.x + blockDim.x * blockIdx.x;
while (n < np){
VecGPU vminus, tvec, vprime, vplus;// , vtemp;
register float tvec_fact;
register int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_d_x[n] + CEX*0.5*dt;
vminus.y = vel_d_y[n] + CEY*0.5*dt;
vminus.z = vel_d_z[n] + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_d_x[n] = vplus.x + CEX*0.5*dt;
vel_d_y[n] = vplus.y + CEY*0.5*dt;
vel_d_z[n] = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_d_x[n] += vel_d_x[n]*dt;
pos_d_y[n] += vel_d_y[n]*dt;
pos_d_z[n] += vel_d_z[n]*dt;
it++;
}
n += blockDim.x*gridDim.x;
}
}
int main(void){
int np = 50000; // Number of Particles
const int ts = 1000; // Number of Time-steps
const float dt = 1E-3; // Time-step value
// ----------- CPU ----------- //
pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for position
vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for velocity
for (int n = 0; n < np; n++){
pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0; // zero out position for CPU variables
vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0; // zero out velocity for CPU variables
}
printf("Starting CPU kernel\n");
clock_t startCPU;
float CPUtime;
startCPU = clock();
ParticleMoverCPU(np, ts, dt); // Launch CPU kernel
CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
printf("CPU kernel finished\n");
// Ouput final CPU computation time
printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
// ------------ GPU ----------- //
cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1); //Set memory preference to L1 (doesn't have much effect)
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
int blocks = deviceProp.multiProcessorCount;
float *pos_g_x, *pos_g_y, *pos_g_z, *vel_g_x, *vel_g_y, *vel_g_z, *pos_l_x, *pos_l_y, *pos_l_z, *vel_l_x, *vel_l_y, *vel_l_z;
pos_g_x = (float*)malloc(sizeof(float)*np); // allocate memory for positions on the CPU
vel_g_x = (float*)malloc(sizeof(float)*np); // allocate memory for velocities on the CPU
pos_g_y = (float*)malloc(sizeof(float)*np); // allocate memory for positions on the CPU
vel_g_y = (float*)malloc(sizeof(float)*np); // allocate memory for velocities on the CPU
pos_g_z = (float*)malloc(sizeof(float)*np); // allocate memory for positions on the CPU
vel_g_z = (float*)malloc(sizeof(float)*np); // allocate memory for velocities on the CPU
cudaMalloc((void**)&pos_l_x, sizeof(float)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l_x, sizeof(float)*np); // allocate memory for velocities on the GPU
cudaMalloc((void**)&pos_l_y, sizeof(float)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l_y, sizeof(float)*np); // allocate memory for velocities on the GPU
cudaMalloc((void**)&pos_l_z, sizeof(float)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l_z, sizeof(float)*np); // allocate memory for velocities on the GPU
for (int n = 0; n < np; n++){
pos_g_x[n] = 0; pos_g_y[n] = 0; pos_g_z[n] = 0; // zero out position for GPU variables (before copying to GPU)
vel_g_x[n] = 0; vel_g_y[n] = 0; vel_g_z[n] = 0; // zero out velocity for GPU variables (before copying to GPU)
}
cudaMemcpy(pos_l_x, pos_g_x, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l_x, vel_g_x, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
cudaMemcpy(pos_l_y, pos_g_y, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l_y, vel_g_y, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
cudaMemcpy(pos_l_z, pos_g_z, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l_z, vel_g_z, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
printf("Starting GPU kernel\n");
// start cuda timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(vel_l_x, vel_l_y, vel_l_z, pos_l_x, pos_l_y, pos_l_z, np, ts, dt); // Launch GPU kernel
//stop cuda timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("GPU kernel finished\n");
// Ouput GPU computation time
printf("GPUtime = %6.1f ms\n", elapsedTime);
// Output speedup factor
printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
}
$ nvcc -O3 -o t895 t895.cu
$ ./t895
Starting CPU kernel
CPU kernel finished
CPUtime = 923.6 ms
Starting GPU kernel
GPU kernel finished
GPUtime = 12.3 ms
CASE=0, Speedup = 74.95
$
$cat t895.cu
//案例0:具有3个浮点数的常规结构
//案例1:使用带有3个浮点数的_align__16)对齐结构
//案例2:3
#定义案例0//如上所述定义为0、1或2
#包括
#包括
#包括
#包括
#包括
#定义CEX 10//x-电场值(无量纲和任意)
#定义电场的CEY 0.1//y值(无量纲和任意)
#定义电场的CEZ 0.1//z值(无量纲和任意)
#定义磁场的CBX 0.1//x值(无量纲和任意)
#定义磁场的CBY 0.1//x值(无量纲和任意)
#定义CBZ 10//x-磁场值(无量纲和任意)
#定义因子15//我一直在使用这些数字,直到获得最佳加速比
#define THREADS 256//我一直在使用这些数字,直到获得最佳加速
类型定义结构{
浮动x;
浮动y;
浮动z;
}向量CPU//用于CPU计算的向量的结构
//最快的方法似乎是一个有3个浮点数的常规未对齐结构
#如果CASE==0
类型定义结构{
浮动x;
浮动y;
浮动z;
}矢量图形处理器;
#恩迪夫
#如果CASE==1
//这种方法似乎不那么快。这是一种为内存合并而对齐的尝试
类型定义结构对齐(16){
浮动x;
浮动y;
浮动z;
}矢量图形处理器;
#恩迪夫
//使用float3似乎与定义我们自己的vector3结构差不多
#如果案例==2
typedef float3矢量图形处理器;
#恩迪夫
VecCPU*pos_c,*vel_c;//用于CPU计算的全局位置和速度矢量
空隙颗粒
mem idx: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
vel_d: x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
* * * * * ...
mem idx: 0 1 2 3 4 5 6 7 8 9 ...
vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ...
* * * * * * * * * * ...
vminus.x = vel_d_x[n] + CEX*0.5*dt;
vminus.y = vel_d_y[n] + CEY*0.5*dt;
vminus.z = vel_d_z[n] + CEZ*0.5*dt;
$ cat t895.cu
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE 0 // define to either 0, 1 or 2 as described above
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>
#define CEX 10 // x-value of electric field (dimensionless and arbitrary)
#define CEY 0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ 0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBZ 10 // x-value of magnetic field (dimensionless and arbitrary)
#define FACTOR 15 // I played around with these numbers until I got the best speedup
#define THREADS 256 // I played around with these numbers until I got the best speedup
typedef struct{
float x;
float y;
float z;
} VecCPU; //Struct for vectors for CPU calculation
// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
float x;
float y;
float z;
} VecGPU;
#endif
#if CASE==1
// This method seems to be less fast. It is an attempt to align for memory coalescence
typedef struct __align__(16){
float x;
float y;
float z;
} VecGPU;
#endif
// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif
VecCPU *pos_c, *vel_c; // global position and velocity vectors for CPU calculation
void ParticleMoverCPU(int np, int ts, float dt){
int n = 0;
while (n < np){
VecCPU vminus, tvec, vprime, vplus;
float tvec_fact;
int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_c[n].x + CEX*0.5*dt;
vminus.y = vel_c[n].y + CEY*0.5*dt;
vminus.z = vel_c[n].z + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_c[n].x = vplus.x + CEX*0.5*dt;
vel_c[n].y = vplus.y + CEY*0.5*dt;
vel_c[n].z = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_c[n].x += vel_c[n].x*dt;
pos_c[n].y += vel_c[n].y*dt;
pos_c[n].z += vel_c[n].z*dt;
it++;
}
n++;
}
}
__global__ void ParticleMoverGPU(float *vel_d_x, float *vel_d_y, float *vel_d_z, float *pos_d_x, float *pos_d_y, float *pos_d_z, int np,int ts, float dt){
int n = threadIdx.x + blockDim.x * blockIdx.x;
while (n < np){
VecGPU vminus, tvec, vprime, vplus;// , vtemp;
register float tvec_fact;
register int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_d_x[n] + CEX*0.5*dt;
vminus.y = vel_d_y[n] + CEY*0.5*dt;
vminus.z = vel_d_z[n] + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_d_x[n] = vplus.x + CEX*0.5*dt;
vel_d_y[n] = vplus.y + CEY*0.5*dt;
vel_d_z[n] = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_d_x[n] += vel_d_x[n]*dt;
pos_d_y[n] += vel_d_y[n]*dt;
pos_d_z[n] += vel_d_z[n]*dt;
it++;
}
n += blockDim.x*gridDim.x;
}
}
int main(void){
int np = 50000; // Number of Particles
const int ts = 1000; // Number of Time-steps
const float dt = 1E-3; // Time-step value
// ----------- CPU ----------- //
pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for position
vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for velocity
for (int n = 0; n < np; n++){
pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0; // zero out position for CPU variables
vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0; // zero out velocity for CPU variables
}
printf("Starting CPU kernel\n");
clock_t startCPU;
float CPUtime;
startCPU = clock();
ParticleMoverCPU(np, ts, dt); // Launch CPU kernel
CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
printf("CPU kernel finished\n");
// Ouput final CPU computation time
printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
// ------------ GPU ----------- //
cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1); //Set memory preference to L1 (doesn't have much effect)
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
int blocks = deviceProp.multiProcessorCount;
float *pos_g_x, *pos_g_y, *pos_g_z, *vel_g_x, *vel_g_y, *vel_g_z, *pos_l_x, *pos_l_y, *pos_l_z, *vel_l_x, *vel_l_y, *vel_l_z;
pos_g_x = (float*)malloc(sizeof(float)*np); // allocate memory for positions on the CPU
vel_g_x = (float*)malloc(sizeof(float)*np); // allocate memory for velocities on the CPU
pos_g_y = (float*)malloc(sizeof(float)*np); // allocate memory for positions on the CPU
vel_g_y = (float*)malloc(sizeof(float)*np); // allocate memory for velocities on the CPU
pos_g_z = (float*)malloc(sizeof(float)*np); // allocate memory for positions on the CPU
vel_g_z = (float*)malloc(sizeof(float)*np); // allocate memory for velocities on the CPU
cudaMalloc((void**)&pos_l_x, sizeof(float)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l_x, sizeof(float)*np); // allocate memory for velocities on the GPU
cudaMalloc((void**)&pos_l_y, sizeof(float)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l_y, sizeof(float)*np); // allocate memory for velocities on the GPU
cudaMalloc((void**)&pos_l_z, sizeof(float)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l_z, sizeof(float)*np); // allocate memory for velocities on the GPU
for (int n = 0; n < np; n++){
pos_g_x[n] = 0; pos_g_y[n] = 0; pos_g_z[n] = 0; // zero out position for GPU variables (before copying to GPU)
vel_g_x[n] = 0; vel_g_y[n] = 0; vel_g_z[n] = 0; // zero out velocity for GPU variables (before copying to GPU)
}
cudaMemcpy(pos_l_x, pos_g_x, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l_x, vel_g_x, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
cudaMemcpy(pos_l_y, pos_g_y, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l_y, vel_g_y, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
cudaMemcpy(pos_l_z, pos_g_z, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l_z, vel_g_z, sizeof(float)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
printf("Starting GPU kernel\n");
// start cuda timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(vel_l_x, vel_l_y, vel_l_z, pos_l_x, pos_l_y, pos_l_z, np, ts, dt); // Launch GPU kernel
//stop cuda timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("GPU kernel finished\n");
// Ouput GPU computation time
printf("GPUtime = %6.1f ms\n", elapsedTime);
// Output speedup factor
printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
}
$ nvcc -O3 -o t895 t895.cu
$ ./t895
Starting CPU kernel
CPU kernel finished
CPUtime = 923.6 ms
Starting GPU kernel
GPU kernel finished
GPUtime = 12.3 ms
CASE=0, Speedup = 74.95
$