N体CUDA优化_Cuda_Simulation - Fatal编程技术网

N体CUDA优化

cuda

N体CUDA优化,cuda,simulation,Cuda,Simulation,我正在CUDA中开发一个N体算法，我想学习一些优化技巧我已经设法让16384实体在一台拥有27流式多处理器的NVIDIA Geforce GTX 260上以20Flops的速度运行 KernelcomputeForces函数在95%的时间里是一个缓慢的过程，我想知道是否还有其他方法可以优化我的代码据我所见，我已经针对内存空间局部性和内存写入进行了优化。在CUDA文档的某个地方，它说共享内存更快，但我不知道如何利用它。我将工作划分为16块，每个块上有512个线程，但这对我来说有点模糊请帮忙，

我正在CUDA中开发一个N体算法，我想学习一些优化技巧

我已经设法让

实体在一台拥有

流式多处理器的NVIDIA Geforce GTX 260上以

20Flops

的速度运行

KernelcomputeForces

函数在

95%

的时间里是一个缓慢的过程，我想知道是否还有其他方法可以优化我的代码

据我所见，我已经针对内存空间局部性和内存写入进行了优化。在CUDA文档的某个地方，它说共享内存更快，但我不知道如何利用它。我将工作划分为

块，每个块上有

个线程，但这对我来说有点模糊

请帮忙，谢谢你读到这篇文章

n   is number of bodies

gm  is the gpu mass pointer

gpx is the gpu position x pointer

gpy is the gpu position y pointer

gpz is the gpu position z pointer

gfx is the gpu force x pointer

gfy is the gpu force y pointer

gfz is the gpu force z pointer

相关的核函数

__global__ void KernelcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int numThreads = blockDim.x * gridDim.x;

    float GRAVITY = 0.00001f;

    //compare all with all
    for( unsigned int ia=tid; ia<n; ia+=numThreads ){
        float lfx = 0.0f;
        float lfy = 0.0f;
        float lfz = 0.0f;

        for( unsigned int ib=0; ib<n; ib++ ){
            //compute distance
            float dx = ( gpx[ib] - gpx[ia]);
            float dy = ( gpy[ib] - gpy[ia] );
            float dz = ( gpz[ib] - gpz[ia] );
            //float distance = sqrt( dx*dx + dy*dy + dz*dz );
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            //prevent slingshots and division by zero
            //distance += 0.1f;
            distanceSquared += 0.01f;

            //calculate gravitational magnitude between the bodies
            //float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distance * distance * distance * distance );
            float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distanceSquared );

            //calculate forces for the bodies
            //magnitude times direction
            lfx += magnitude * ( dx );
            lfy += magnitude * ( dy );
            lfz += magnitude * ( dz );
        }

        //stores local memory to global memory
        gfx[ia] = lfx;
        gfy[ia] = lfy;
        gfz[ia] = lfz;
    }
}

extern void GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){  
    dim3 gridDim( 16, 1, 1 ); //specifys how many blocks in three possible dimensions
    dim3 blockDim( 512, 1, 1 ); //threads per block
    KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
}

\uuuu全局\uuuuu无效内核计算力（无符号整数n、浮点*gm、浮点*gpx、浮点*gpy、浮点*gpz、浮点*gfx、浮点*gfy、浮点*gfz）{
int tid=blockDim.x*blockIdx.x+threadIdx.x；
int numThreads=blockDim.x*gridDim.x；
浮子重力=0.00001f；
//比较所有与所有
对于（unsigned int ia=tid；ia在执行任何其他操作之前，请尝试运行更多的块。给定的块仅在单个SM上运行-通过仅使用16个块，您可以保证大约40%的GPU容量处于空闲状态。27的倍数应该是GTX260-216上的最佳块数。您还可能会发现减少t的数量每个块的线程数不会影响性能，因此每个线程的工作量大致相同，但只需使用足够的块来覆盖GPU中的所有SM即可
编辑：
为了说明这一点，考虑一下内核的这个小测试工具：
template<int blocksize, int gridsize>
extern float GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){  
    float time;

    dim3 gridDim( gridsize, 1, 1 ); //specifys how many blocks in three possible dimensions
    dim3 blockDim( blocksize, 1, 1 ); //threads per block

    cudaEvent_t start, stop;
    errchk( cudaEventCreate(&start) );
    errchk( cudaEventCreate(&stop) );

    errchk( cudaEventRecord(start, 0) );
    KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
    rterrchk;

    errchk( cudaEventRecord(stop, 0) );
    errchk( cudaEventSynchronize(stop) );
    errchk( cudaEventElapsedTime(&time, start, stop) );

    return time;
}

int main(void)
{
    const int n = 16384;
    size_t gsize = sizeof(float) * size_t(n);

    float * g[4], * _g[7];

    errchk( cudaSetDevice(1) );  // GTX 275

    for(int i=0; i<7; i++) 
        errchk( cudaMalloc((void **)&_g[i], gsize) ); 

    for(int i=0; i<4; i++)
        g[i] = (float *)malloc(gsize);

    for(int i=0; i<n; i++)
    for(int j=0; j<4; j++)
    *(g[j]+i) = (float)drand48();

    for(int i=0; i<4; i++) 
        errchk( cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice) ); 

    // Warm up to take context establishment time out.
    GPUcomputeForces<16,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]);

    // Bench runs
    printf("(1,1)@(512,1,1): %f\n", GPUcomputeForces<1,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(8,1)@(512,1,1): %f\n", GPUcomputeForces<8,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(16,1)@(512,1,1): %f\n", GPUcomputeForces<16,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(30,1)@(256,1,1): %f\n", GPUcomputeForces<30,256>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(60,1)@(128,1,1): %f\n", GPUcomputeForces<60,128>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(120,1)@(64,1,1): %f\n", GPUcomputeForces<120,64>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
    printf("(240,1)@(32,1,1): %f\n", GPUcomputeForces<240,32>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );

    cudaThreadExit();

    return 0;
}

Ie：即使使用非常小的数据块大小，运行至少与卡上MP相同数量的数据块对于提高性能也是至关重要的。
cuda探查器将告诉您有关占用率、条件分支、全局内存使用（缓存未命中或未恢复读取，取决于cuda版本）的信息，等等。这是优化内核的一个重要部分。
共享内存将是此类内核中一个有用的优化-它允许合并粒子位置和质量的读取，这在GT200上非常重要。我发现这大约是您版本的两倍（使用128块128线程的16384粒子启动）：
模板
__全球的__
void KernelcomputeForces1（无符号整数n1，浮点*gm，浮点*gpx，
浮点*gpy，浮点*gpz，浮点*gfx，浮点*gfy，浮点*gfz）
{
__共享浮点数lgpx[blocksize]，lgpy[blocksize]，
lgpz[blocksize]，lgm[blocksize]；
恒浮重力=0.00001f；
//比较所有与所有
浮球lfx=0.0f，lfy=0.0f，lfz=0.0f；
int ia=blockDim.x*blockIdx.x+threadIdx.x；
浮点lgpx0=gpx[ia]，lgpy0=gpy[ia]，
lgpz0=gpz[ia]，lgm0=gm[ia]；
对于（unsigned int ib=0；ib@Talonmes已经回答了这个问题，说明了共享内存如何有助于提高性能。因此，没有更多需要添加的内容了。我只想提供一个完整的代码，包含不同的优化步骤，包括平铺、共享内存和洗牌操作，一个nd显示在开普勒K20c上进行的一些测试结果
下面的代码是围绕@talonmies所展示的内容设计的，它具有5
内核函数，即
内核计算力
：无优化
KernelcomputeForces\u Shared
：利用源块和共享内存中的平铺
KernelcomputeForces\u Tiling
：利用目标块中的Tiling
KernelcomputeForces\u Tiling\u Shared
：利用源和目标块中的平铺，并利用共享内存
KernelcomputeForces\u Tiling\u Shuffle
：与上面相同，但使用Shuffle操作而不是共享内存
也许，与已经发表在文献（）中的内容和已经作为代码提供的内容相比（参见上面的答案，最后一个内核是唯一的新东西。但是我对N-body做了一些研究，发现发布这个答案很有用，可能对下一个用户有用
这是密码
#include <stdio.h>

#define GRAVITATIONAL_CONST 6.67*1e−11
#define SOFTENING 1e-9f

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < N) {

        float invDist, invDist3;

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/**********************************/
/* KERNEL FUNCTION: SHARED MEMORY */
/**********************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < N) {

        float invDist, invDist3;

        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/***************************/
/* KERNEL FUNCTION: TILING */
/***************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }
        __syncthreads();

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*******************************************/
/* KERNEL FUNCTION: TILING + SHARED MEMORY */
/*******************************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/************************************************/
/* KERNEL FUNCTION: TILING + SHUFFLE OPERATIONS */
/************************************************/
__global__
oid KernelcomputeForces_Tiling_Shuffle(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    const int laneid = threadIdx.x & 31;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=32) {

            // --- Loading data to shared memory
            float x_src = x_d[ib + laneid];
            float y_src = y_d[ib + laneid];
            float z_src = z_d[ib + laneid];
            float m_src = m_d[ib + laneid];

#pragma unroll 32
            for(unsigned int ic=0; ic<32; ic++) {

                // --- Compute relative distances
                float dx = (__shfl(x_src, ic) - x_reg);
                float dy = (__shfl(y_src, ic) - y_reg);
                float dz = (__shfl(z_src, ic) - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * __shfl(m_src, ic)) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int BLOCKSIZE>
float GPUcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;

    dim3 grid(iDivUp(N,BLOCKSIZE), 1, 1);       // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);                // --- Threads per block

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start, 0));
    KernelcomputeForces_Shared<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));

    return time;
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int GRIDSIZE, int BLOCKSIZE>
float GPUcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;

    dim3 grid(GRIDSIZE, 1, 1);      // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);    // --- Threads per block

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start, 0));
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    KernelcomputeForces_Tiling_Shuffle<<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));

    return time;
}

/********************/
/* CPU CALCULATIONS */
/********************/
void CPUcomputeForces(float* m_h, float* x_h, float* y_h, float* z_h, float* fx_h, float* fy_h, float* fz_h, unsigned int N) {  

    for (unsigned int i=0; i<N; i++) {

        float invDist, invDist3;

        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_h[ib] - x_h[i]);
            float dy = (y_h[ib] - y_h[i]);
            float dz = (z_h[ib] - z_h[i]);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = 1.f / sqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_h[i] * m_h[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;

        }

        // --- Stores local memory to global memory
        fx_h[i] = fx_temp;
        fy_h[i] = fy_temp;
        fz_h[i] = fz_temp;
    }
}

/********/
/* MAIN */
/********/
int main(void)
{
    const int N = 16384;

    size_t gsize = sizeof(float) * size_t(N);

    float * g[10], * _g[7];

    for(int i=0; i<7; i++) gpuErrchk( cudaMalloc((void **)&_g[i], gsize)); 

    for(int i=0; i<10; i++) g[i] = (float *)malloc(gsize);

    for(int i=0; i<N; i++)
        for(int j=0; j<4; j++)
            *(g[j]+i) = (float)rand();

    for(int i=0; i<4; i++) gpuErrchk(cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice)); 

    // --- Warm up to take context establishment time out.
    GPUcomputeForces<512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
    //GPUcomputeForces_Tiling<32,512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);

    // --- Bench runs
    printf("1024: %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512:  %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256:  %f\n",    GPUcomputeForces<256>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128:  %f\n",    GPUcomputeForces<128>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64:   %f\n",    GPUcomputeForces<64>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32:   %f\n",    GPUcomputeForces<32>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

    printf("16, 1024: %f\n",    GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  1024: %f\n",    GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("4,  1024: %f\n",    GPUcomputeForces_Tiling<4,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 512: %f\n",     GPUcomputeForces_Tiling<32,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 512: %f\n",     GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  512: %f\n",     GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 256:  %f\n",    GPUcomputeForces_Tiling<64,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 256:  %f\n",    GPUcomputeForces_Tiling<32,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 256:  %f\n",    GPUcomputeForces_Tiling<16,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,128:  %f\n",    GPUcomputeForces_Tiling<128,128>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 128:  %f\n",    GPUcomputeForces_Tiling<64,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 128:  %f\n",    GPUcomputeForces_Tiling<32,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,64:   %f\n",    GPUcomputeForces_Tiling<256,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,64:   %f\n",    GPUcomputeForces_Tiling<128,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 64:   %f\n",    GPUcomputeForces_Tiling<64,64>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

    for(int i=8; i<10; i++) gpuErrchk(cudaMemcpy(g[i], _g[i-3], gsize, cudaMemcpyDeviceToHost)); 

    CPUcomputeForces(g[0],g[1],g[2],g[3],g[4],g[5],g[6],N);

    for(int i=0; i<N; i++)
        for(int j=8; j<10; j++) {

            if (abs(*(g[j]+i) - *(g[j-3]+i)) > 0.001f) {
                printf("Error at %i, %i; GPU = %f; CPU = %f\n",i, (j-8), *(g[j-3]+i), *(g[j]+i));
                return;
            }
        }

    printf("Test passed!\n");

    cudaDeviceReset();

    return 0;
}

#包括
#定义重力常数6.67*1e−11
#定义1e-9f
/********************/
/*CUDA错误检查*/
/********************/
#定义gpuerchk（ans）{gpuAssert（（ans），_文件_，_行__）}
内联void gpuAssert（cudaError\u t代码，char*文件，int行，bool abort=true）
{
如果（代码！=cudaSuccess）
{
fprintf（标准，“GPUassert:%s%s%d\n”，cudaGetErrorString（代码）、文件、行）；
如果（中止）退出（代码）；
}
}
/**********/
/*iDivUp*/
/**********/
intidivup（inta，intb）{返回（（a%b）！=0）？（a/b+1）：（a/b）；}
/*******************/
/*核函数*/
/*******************/
模板
__全球的__
无效内核计算力（浮点*m_d，浮点*x_d，浮点*y_d，浮点*z_d，浮点*fx_d，浮点*fy_d，浮点*fz_d，无符号整数N）
{
int tid=blockDim.x*blockIdx.x+threadIdx.x；
如果（tid对于（unsigned int ib=0；ib我想为上面提供的答案提供一个单独的、额外的答案，我认为这将对下一个用户有用
进一步优化N体问题的一种方法是采用基于树的方法，如Barnes-Hut方法，该方法在年被并行化
M. Burtscher, K. Pingali, "An Efficient CUDA Implementation of the Tree-Based Barnes Hut n-Body

算法”，GPU计算宝石，翡翠版
谁的CUDA
#include <stdio.h>

#define GRAVITATIONAL_CONST 6.67*1e−11
#define SOFTENING 1e-9f

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < N) {

        float invDist, invDist3;

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/**********************************/
/* KERNEL FUNCTION: SHARED MEMORY */
/**********************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    if (tid < N) {

        float invDist, invDist3;

        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/***************************/
/* KERNEL FUNCTION: TILING */
/***************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_d[ib] - x_reg);
            float dy = (y_d[ib] - y_reg);
            float dz = (z_d[ib] - z_reg);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = rsqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;
        }
        __syncthreads();

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*******************************************/
/* KERNEL FUNCTION: TILING + SHARED MEMORY */
/*******************************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        __shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {

            // --- Loading data to shared memory
            x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
            y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
            z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
            m_sh[threadIdx.x] = m_d[ib + threadIdx.x];

            __syncthreads();

#pragma unroll
            for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {

                // --- Compute relative distances
                float dx = (x_sh[ic] - x_reg);
                float dy = (y_sh[ic] - y_reg);
                float dz = (z_sh[ic] - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/************************************************/
/* KERNEL FUNCTION: TILING + SHUFFLE OPERATIONS */
/************************************************/
__global__
oid KernelcomputeForces_Tiling_Shuffle(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
    float invDist, invDist3;

    const int laneid = threadIdx.x & 31;

    for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
                  tid < N;
                  tid += blockDim.x*gridDim.x) {

        // --- Initialize register accumulators for forces
        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Move target particle data to registers
        float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib+=32) {

            // --- Loading data to shared memory
            float x_src = x_d[ib + laneid];
            float y_src = y_d[ib + laneid];
            float z_src = z_d[ib + laneid];
            float m_src = m_d[ib + laneid];

#pragma unroll 32
            for(unsigned int ic=0; ic<32; ic++) {

                // --- Compute relative distances
                float dx = (__shfl(x_src, ic) - x_reg);
                float dy = (__shfl(y_src, ic) - y_reg);
                float dz = (__shfl(z_src, ic) - z_reg);
                float distanceSquared = dx*dx + dy*dy + dz*dz;

                // --- Prevent slingshots and division by zero
                distanceSquared += SOFTENING;

                float invDist = rsqrtf(distanceSquared);
                float invDist3 = invDist * invDist * invDist;

                // --- Calculate gravitational magnitude between the bodies
                float magnitude = GRAVITATIONAL_CONST * (m_reg * __shfl(m_src, ic)) * invDist3;

                // --- Calculate forces for the bodies: magnitude times direction
                fx_temp += magnitude*dx;
                fy_temp += magnitude*dy;
                fz_temp += magnitude*dz;
            }
            __syncthreads();
        }

        // --- Stores local memory to global memory
        fx_d[tid] = fx_temp;
        fy_d[tid] = fy_temp;
        fz_d[tid] = fz_temp;
    }
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int BLOCKSIZE>
float GPUcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;

    dim3 grid(iDivUp(N,BLOCKSIZE), 1, 1);       // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);                // --- Threads per block

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start, 0));
    KernelcomputeForces_Shared<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    //KernelcomputeForces<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));

    return time;
}

/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int GRIDSIZE, int BLOCKSIZE>
float GPUcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {  
    float time;

    dim3 grid(GRIDSIZE, 1, 1);      // --- Specifys how many blocks in three possible dimensions
    dim3 block(BLOCKSIZE, 1, 1);    // --- Threads per block

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start, 0));
    //KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    KernelcomputeForces_Tiling_Shuffle<<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));

    return time;
}

/********************/
/* CPU CALCULATIONS */
/********************/
void CPUcomputeForces(float* m_h, float* x_h, float* y_h, float* z_h, float* fx_h, float* fy_h, float* fz_h, unsigned int N) {  

    for (unsigned int i=0; i<N; i++) {

        float invDist, invDist3;

        float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;

        // --- Interact all with all    
        for(unsigned int ib=0; ib<N; ib++) {

            // --- Compute relative distances
            float dx = (x_h[ib] - x_h[i]);
            float dy = (y_h[ib] - y_h[i]);
            float dz = (z_h[ib] - z_h[i]);
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            // --- Prevent slingshots and division by zero
            distanceSquared += SOFTENING;

            float invDist = 1.f / sqrtf(distanceSquared);
            float invDist3 = invDist * invDist * invDist;

            // --- Calculate gravitational magnitude between the bodies
            float magnitude = GRAVITATIONAL_CONST * (m_h[i] * m_h[ib]) * invDist3;

            // --- Calculate forces for the bodies: magnitude times direction
            fx_temp += magnitude*dx;
            fy_temp += magnitude*dy;
            fz_temp += magnitude*dz;

        }

        // --- Stores local memory to global memory
        fx_h[i] = fx_temp;
        fy_h[i] = fy_temp;
        fz_h[i] = fz_temp;
    }
}

/********/
/* MAIN */
/********/
int main(void)
{
    const int N = 16384;

    size_t gsize = sizeof(float) * size_t(N);

    float * g[10], * _g[7];

    for(int i=0; i<7; i++) gpuErrchk( cudaMalloc((void **)&_g[i], gsize)); 

    for(int i=0; i<10; i++) g[i] = (float *)malloc(gsize);

    for(int i=0; i<N; i++)
        for(int j=0; j<4; j++)
            *(g[j]+i) = (float)rand();

    for(int i=0; i<4; i++) gpuErrchk(cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice)); 

    // --- Warm up to take context establishment time out.
    GPUcomputeForces<512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
    //GPUcomputeForces_Tiling<32,512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);

    // --- Bench runs
    printf("1024: %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512:  %f\n",    GPUcomputeForces<512>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256:  %f\n",    GPUcomputeForces<256>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128:  %f\n",    GPUcomputeForces<128>   (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64:   %f\n",    GPUcomputeForces<64>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32:   %f\n",    GPUcomputeForces<32>    (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

    printf("16, 1024: %f\n",    GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  1024: %f\n",    GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("4,  1024: %f\n",    GPUcomputeForces_Tiling<4,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 512: %f\n",     GPUcomputeForces_Tiling<32,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 512: %f\n",     GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("8,  512: %f\n",     GPUcomputeForces_Tiling<8,512>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 256:  %f\n",    GPUcomputeForces_Tiling<64,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 256:  %f\n",    GPUcomputeForces_Tiling<32,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("16, 256:  %f\n",    GPUcomputeForces_Tiling<16,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,128:  %f\n",    GPUcomputeForces_Tiling<128,128>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 128:  %f\n",    GPUcomputeForces_Tiling<64,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("32, 128:  %f\n",    GPUcomputeForces_Tiling<32,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,64:   %f\n",    GPUcomputeForces_Tiling<256,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,64:   %f\n",    GPUcomputeForces_Tiling<128,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("64, 64:   %f\n",    GPUcomputeForces_Tiling<64,64>  (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("512,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("256,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
    printf("128,32:   %f\n",    GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));

    for(int i=8; i<10; i++) gpuErrchk(cudaMemcpy(g[i], _g[i-3], gsize, cudaMemcpyDeviceToHost)); 

    CPUcomputeForces(g[0],g[1],g[2],g[3],g[4],g[5],g[6],N);

    for(int i=0; i<N; i++)
        for(int j=8; j<10; j++) {

            if (abs(*(g[j]+i) - *(g[j-3]+i)) > 0.001f) {
                printf("Error at %i, %i; GPU = %f; CPU = %f\n",i, (j-8), *(g[j-3]+i), *(g[j]+i));
                return;
            }
        }

    printf("Test passed!\n");

    cudaDeviceReset();

    return 0;
}

M. Burtscher, K. Pingali, "An Efficient CUDA Implementation of the Tree-Based Barnes Hut n-Body

N           Barnes-Hut          Shared          Shuffle
16384       19.1                5.7             7.2
32768       46.9                25.5            21.7
65536       107.7               102.6           74.6
131072      255.1               355             296.2
262144      548.3               1408.8          1108.9
524288      1246                5434            4688
1048576     2674                21544           18632
2097152     5664                85980           74454