使用共享内存的CUDA中的3D boxcar过滤器

使用共享内存的CUDA中的3D boxcar过滤器,cuda,shared-memory,Cuda,Shared Memory,我正在尝试查看对对象中的问题使用共享内存是否可以提高执行时间并导致某些加速: 不使用共享内存的内核函数 __global__ void 3dc(const int nx, const int ny, const int nz, const float* in1, const float* in2, const float* in3, const float* in4, float* out) { int i, j, k; int tidx = threadIdx.x

我想了解,对于标题中所述的问题,使用共享内存能否缩短执行时间并带来一定的加速:

不使用共享内存的内核函数

// Global-memory-only kernel, indexed with a 1D launch: one thread per output element.
// Each thread decodes its flat index tidx into (i, j, k) using nx and ny as pitches and
// writes to out[i, j, k] the sum of the 2x2x2 neighbourhood {i,i+1} x {j,j+1} x {k,k+1}
// read from each of in1, in2, in3 and in4 (32 global loads per thread).
//
// NOTE(review): `3dc` begins with a digit, so it is not a valid C++ identifier; the
// kernel needs another name before it can compile.
// NOTE(review): the +1 neighbours are read without any bounds guard. Inputs are indexed
// with row pitch nx and slice pitch nx*ny, so for i == nx-1 the (i+1) read lands in the
// next row, and for k == nz-1 the (k+1) reads use indices >= nx*ny*nz. Confirm that the
// caller allocates the inputs large enough and that the wrap-around is intended.
__global__ void  3dc(const int nx, const int ny, const int nz, const float* in1, 
    const float* in2, const float* in3, const float* in4, float* out)
{
    int i, j, k;

    // Flat thread index over the whole nx*ny*nz volume.
    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    if(tidx < (nx)*(ny)*(nz)){
        // Decode tidx into slice (k), row (j) and column (i).
        k = tidx/((nx)*(ny));
        j = (tidx - k*(nx)*(ny))/(nx);
        i = tidx - k*(nx)*(ny) - j*(nx);

        // Sum of the eight corner samples of the cell at (i, j, k), for each of the four inputs.
        out[i + nx*j + nx*ny*k] = 
            in1[i     + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in1[i     + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*(j+1) + nx*ny*(k+1)];
    } 
} // 3dc
// Shared-memory attempt at the same 2x2x2 sum, indexed with a 3D launch.
// Each thread copies one element of in1..in4 into four 16x16x4 shared tiles that are
// indexed by threadIdx, so the block must be at most 16x16x4 threads.
//
// NOTE(review): `3d_shared_memory` begins with a digit and is not a valid C++ identifier.
// NOTE(review): __syncthreads() sits inside the bounds check; if only some threads of a
// block pass the check, the remaining threads never reach the barrier.
// NOTE(review): the k/j/i loops walk the whole tile in every thread and every iteration
// assigns the same out element, so only the value of the last iteration
// (i = 14, j = 14, k = 2) survives. That is the same value for every thread of the block
// and costs 3*15*15 global stores per thread; it does not look like the per-element sum
// that the global-memory kernel computes. Confirm the intent.
// NOTE(review): the tiles are indexed [threadIdx.x][threadIdx.y][threadIdx.z], so
// consecutive threadIdx.x values are 16*4 = 64 floats apart in shared memory, which
// likely causes bank conflicts on the tile stores.
__global__ void 3d_shared_memory(const int nx, const int ny, const int nz, const float* in1, const float* in2, const float* in3, const float* in4, float* out){
    // Global coordinates of this thread.
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    // One tile per input array; no halo cells are allocated.
    __shared__ float smem1[16][16][4];
    __shared__ float smem2[16][16][4];
    __shared__ float smem3[16][16][4];
    __shared__ float smem4[16][16][4];

    if ((idx < nx) && (idy < ny) && (idz < nz)){
        // Stage this thread's element of each input into shared memory.
        smem1[threadIdx.x][threadIdx.y][threadIdx.z] = in1[idz * nx * ny + idy * nx + idx];
        smem2[threadIdx.x][threadIdx.y][threadIdx.z] = in2[idz * nx * ny + idy * nx + idx];
        smem3[threadIdx.x][threadIdx.y][threadIdx.z] = in3[idz * nx * ny + idy * nx + idx];
        smem4[threadIdx.x][threadIdx.y][threadIdx.z] = in4[idz * nx * ny + idy * nx + idx];                        
        __syncthreads();

        // Loop bounds stop one short of the tile extents so that the +1 reads stay inside the tile.
        for(int k = 0; k < 3; k++){
            for(int j = 0; j < 15; j++){
                for(int i = 0; i < 15; i++){
                    out[idz * nx * ny + idy * nx + idx] = smem1[i][j][k] + smem1[i+1][j][k] + smem1[i+1][j+1][k] + smem1[i][j+1][k] + smem1[i][j][k+1] + smem1[i+1][j][k+1] + smem1[i+1][j+1][k+1] + smem1[i][j+1][k+1] +
                        smem2[i][j][k] + smem2[i+1][j][k] + smem2[i+1][j+1][k] + smem2[i][j+1][k] + smem2[i][j][k+1] + smem2[i+1][j][k+1] + smem2[i+1][j+1][k+1] + smem2[i][j+1][k+1] +
                        smem3[i][j][k] + smem3[i+1][j][k] + smem3[i+1][j+1][k] + smem3[i][j+1][k] + smem3[i][j][k+1] + smem3[i+1][j][k+1] + smem3[i+1][j+1][k+1] + smem3[i][j+1][k+1] +
                        smem4[i][j][k] + smem4[i+1][j][k] + smem4[i+1][j+1][k] + smem4[i][j+1][k] + smem4[i][j][k+1] + smem4[i+1][j][k+1] + smem4[i+1][j+1][k+1] + smem4[i][j+1][k+1];
                }
            }
        }

    }

} //3d_shared_memory example
\uuuu全局\uuuuu无效3dc(常数int nx、常数int ny、常数int nz、常数float*in1、,
常量浮点*in2,常量浮点*in3,常量浮点*in4,浮点*out)
{
int i,j,k;
int tidx=threadIdx.x+blockIdx.x*blockDim.x;
如果(tidx<(nx)*(ny)*(nz)){
k=tidx/((nx)*(ny));
j=(tidx-k*(nx)*(ny))/(nx);
i=tidx-k*(nx)*(ny)-j*(nx);
输出[i+nx*j+nx*ny*k]=
in1[i+nx*j+nx*ny*k]+
in1[(i+1)+nx*j+nx*ny*k]+
in1[(i+1)+nx*(j+1)+nx*ny*k]+
in1[i+nx*(j+1)+nx*ny*k]+
in1[i+nx*j+nx*ny*(k+1)]+
in1[(i+1)+nx*j+nx*ny*(k+1)]+
in1[(i+1)+nx*(j+1)+nx*ny*(k+1)]+
in1[i+nx*(j+1)+nx*ny*(k+1)]+
in2[i+nx*j+nx*ny*k]+
in2[(i+1)+nx*j+nx*ny*k]+
in2[(i+1)+nx*(j+1)+nx*ny*k]+
in2[i+nx*(j+1)+nx*ny*k]+
in2[i+nx*j+nx*ny*(k+1)]+
in2[(i+1)+nx*j+nx*ny*(k+1)]+
in2[(i+1)+nx*(j+1)+nx*ny*(k+1)]+
in2[i+nx*(j+1)+nx*ny*(k+1)]+
in3[i+nx*j+nx*ny*k]+
in3[(i+1)+nx*j+nx*ny*k]+
in3[(i+1)+nx*(j+1)+nx*ny*k]+
in3[i+nx*(j+1)+nx*ny*k]+
in3[i+nx*j+nx*ny*(k+1)]+
in3[(i+1)+nx*j+nx*ny*(k+1)]+
in3[(i+1)+nx*(j+1)+nx*ny*(k+1)]+
in3[i+nx*(j+1)+nx*ny*(k+1)]+
in4[i+nx*j+nx*ny*k]+
in4[(i+1)+nx*j+nx*ny*k]+
in4[(i+1)+nx*(j+1)+nx*ny*k]+
in4[i+nx*(j+1)+nx*ny*k]+
in4[i+nx*j+nx*ny*(k+1)]+
in4[(i+1)+nx*j+nx*ny*(k+1)]+
in4[(i+1)+nx*(j+1)+nx*ny*(k+1)]+
in4[i+nx*(j+1)+nx*ny*(k+1)];
} 
}//3dc
使用共享内存的内核函数

// Global-memory-only kernel, indexed with a 1D launch: one thread per output element.
// Each thread decodes its flat index tidx into (i, j, k) using nx and ny as pitches and
// writes to out[i, j, k] the sum of the 2x2x2 neighbourhood {i,i+1} x {j,j+1} x {k,k+1}
// read from each of in1, in2, in3 and in4 (32 global loads per thread).
//
// NOTE(review): `3dc` begins with a digit, so it is not a valid C++ identifier; the
// kernel needs another name before it can compile.
// NOTE(review): the +1 neighbours are read without any bounds guard. Inputs are indexed
// with row pitch nx and slice pitch nx*ny, so for i == nx-1 the (i+1) read lands in the
// next row, and for k == nz-1 the (k+1) reads use indices >= nx*ny*nz. Confirm that the
// caller allocates the inputs large enough and that the wrap-around is intended.
__global__ void  3dc(const int nx, const int ny, const int nz, const float* in1, 
    const float* in2, const float* in3, const float* in4, float* out)
{
    int i, j, k;

    // Flat thread index over the whole nx*ny*nz volume.
    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    if(tidx < (nx)*(ny)*(nz)){
        // Decode tidx into slice (k), row (j) and column (i).
        k = tidx/((nx)*(ny));
        j = (tidx - k*(nx)*(ny))/(nx);
        i = tidx - k*(nx)*(ny) - j*(nx);

        // Sum of the eight corner samples of the cell at (i, j, k), for each of the four inputs.
        out[i + nx*j + nx*ny*k] = 
            in1[i     + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in1[i     + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*(j+1) + nx*ny*(k+1)];
    } 
} // 3dc
// Shared-memory attempt at the same 2x2x2 sum, indexed with a 3D launch.
// Each thread copies one element of in1..in4 into four 16x16x4 shared tiles that are
// indexed by threadIdx, so the block must be at most 16x16x4 threads.
//
// NOTE(review): `3d_shared_memory` begins with a digit and is not a valid C++ identifier.
// NOTE(review): __syncthreads() sits inside the bounds check; if only some threads of a
// block pass the check, the remaining threads never reach the barrier.
// NOTE(review): the k/j/i loops walk the whole tile in every thread and every iteration
// assigns the same out element, so only the value of the last iteration
// (i = 14, j = 14, k = 2) survives. That is the same value for every thread of the block
// and costs 3*15*15 global stores per thread; it does not look like the per-element sum
// that the global-memory kernel computes. Confirm the intent.
// NOTE(review): the tiles are indexed [threadIdx.x][threadIdx.y][threadIdx.z], so
// consecutive threadIdx.x values are 16*4 = 64 floats apart in shared memory, which
// likely causes bank conflicts on the tile stores.
__global__ void 3d_shared_memory(const int nx, const int ny, const int nz, const float* in1, const float* in2, const float* in3, const float* in4, float* out){
    // Global coordinates of this thread.
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    // One tile per input array; no halo cells are allocated.
    __shared__ float smem1[16][16][4];
    __shared__ float smem2[16][16][4];
    __shared__ float smem3[16][16][4];
    __shared__ float smem4[16][16][4];

    if ((idx < nx) && (idy < ny) && (idz < nz)){
        // Stage this thread's element of each input into shared memory.
        smem1[threadIdx.x][threadIdx.y][threadIdx.z] = in1[idz * nx * ny + idy * nx + idx];
        smem2[threadIdx.x][threadIdx.y][threadIdx.z] = in2[idz * nx * ny + idy * nx + idx];
        smem3[threadIdx.x][threadIdx.y][threadIdx.z] = in3[idz * nx * ny + idy * nx + idx];
        smem4[threadIdx.x][threadIdx.y][threadIdx.z] = in4[idz * nx * ny + idy * nx + idx];                        
        __syncthreads();

        // Loop bounds stop one short of the tile extents so that the +1 reads stay inside the tile.
        for(int k = 0; k < 3; k++){
            for(int j = 0; j < 15; j++){
                for(int i = 0; i < 15; i++){
                    out[idz * nx * ny + idy * nx + idx] = smem1[i][j][k] + smem1[i+1][j][k] + smem1[i+1][j+1][k] + smem1[i][j+1][k] + smem1[i][j][k+1] + smem1[i+1][j][k+1] + smem1[i+1][j+1][k+1] + smem1[i][j+1][k+1] +
                        smem2[i][j][k] + smem2[i+1][j][k] + smem2[i+1][j+1][k] + smem2[i][j+1][k] + smem2[i][j][k+1] + smem2[i+1][j][k+1] + smem2[i+1][j+1][k+1] + smem2[i][j+1][k+1] +
                        smem3[i][j][k] + smem3[i+1][j][k] + smem3[i+1][j+1][k] + smem3[i][j+1][k] + smem3[i][j][k+1] + smem3[i+1][j][k+1] + smem3[i+1][j+1][k+1] + smem3[i][j+1][k+1] +
                        smem4[i][j][k] + smem4[i+1][j][k] + smem4[i+1][j+1][k] + smem4[i][j+1][k] + smem4[i][j][k+1] + smem4[i+1][j][k+1] + smem4[i+1][j+1][k+1] + smem4[i][j+1][k+1];
                }
            }
        }

    }

} //3d_shared_memory example
\uuuuu全局\uuuuuuu无效3d\u共享内存(常数整型nx、常数整型ny、常数整型nz、常数浮点*in1、常数浮点*in2、常数浮点*in3、常数浮点*in4、浮点*out){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
intidy=blockIdx.y*blockDim.y+threadIdx.y;
int idz=blockIdx.z*blockDim.z+threadIdx.z;
__共享浮点数smem1[16][16][4];
__共享浮点数smem2[16][16][4];
__共享浮点数smem3[16][16][4];
__共享浮点数smem4[16][16][4];
如果((idx

共享内存代码总是较慢。有没有更好的方法利用共享内存解决此问题?提前感谢您的建议。

我为这个帖子补上一个迟到的回答,以便将其从未回答列表中移除。

您基本上是使用共享内存在3D中实现boxcar过滤器。除了上述评论中已经提到的原因外,我还发现了两个可能的原因,即为什么您在使用共享内存时没有遇到加速:

  • 共享内存加载和存储未合并
  • 您没有考虑需要大量线程协作的情形,因为您的boxcar大小只有2

    下面,我将提供一段代码,用来比较仅使用全局内存与使用共享内存这两种情况。该代码是在Robert Crovella此前发布的代码基础上修改而来的

    此代码在以下数据规模下的结果:
    DATASIZE_X x DATASIZE_Y x DATASIZE_Z = 1024 x 1024 x 64

    GT540M案例

    Kepler K20c案例

    代码:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    #define BOXCAR_SIZE 6
    
    #define DATASIZE_X 1024
    #define DATASIZE_Y 1024
    #define DATASIZE_Z 64
    
    #define BLOCKSIZE_X 8
    #define BLOCKSIZE_Y 8
    #define BLOCKSIZE_Z 8
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    // Wraps a CUDA runtime call and reports a failure together with the call site.
    // The do/while(0) wrapper makes the macro expand to exactly one statement, so it is
    // safe inside unbraced if/else bodies.
    #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

    // Prints the CUDA error string with file and line to stderr when code is not
    // cudaSuccess, then terminates the process with that code unless abort is false.
    //   code  - status returned by a CUDA runtime call
    //   file  - source file of the call site (const: __FILE__ is a string literal)
    //   line  - source line of the call site
    //   abort - when true (default), exit(code) after reporting
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    /*****************************/
    /* BOXCAR WITH SHARED MEMORY */
    /*****************************/
    // Box filter over a BOXCAR_SIZE^3 window, staged through shared memory.
    // The input is a padded volume of (DATASIZE_X+BOXCAR_SIZE-1) x (DATASIZE_Y+BOXCAR_SIZE-1)
    // x (DATASIZE_Z+BOXCAR_SIZE-1) ints; the output is DATASIZE_X x DATASIZE_Y x DATASIZE_Z.
    // The shared tile is sized from BLOCKSIZE_X/Y/Z and indexed by threadIdx, so the launch
    // block dimensions are expected to equal those macros.
    __global__ void boxcar_shared(int* __restrict__ output, const int* __restrict__ input)
    {
        // Block footprint plus (BOXCAR_SIZE-1) halo cells on the high side of every axis.
        __shared__ int smem[(BLOCKSIZE_Z + (BOXCAR_SIZE-1))][(BLOCKSIZE_Y + (BOXCAR_SIZE-1))][(BLOCKSIZE_X + (BOXCAR_SIZE-1))];

        const int halo   = BOXCAR_SIZE - 1;
        const int pitchX = DATASIZE_X + halo;   // row length of the padded input
        const int pitchY = DATASIZE_Y + halo;   // rows per slice of the padded input

        int gx = blockIdx.x*blockDim.x + threadIdx.x;
        int gy = blockIdx.y*blockDim.y + threadIdx.y;
        int gz = blockIdx.z*blockDim.z + threadIdx.z;

        if ((gx < pitchX) && (gy < pitchY) && (gz < (DATASIZE_Z + halo)))
        {
            // Offset of this thread's own cell and the jumps to its halo cells.
            const int center = gz*pitchX*pitchY + gy*pitchX + gx;
            const int jumpZ  = halo*pitchX*pitchY;
            const int jumpY  = halo*pitchX;
            const int jumpX  = halo;

            // The last (BOXCAR_SIZE-1) threads along an axis also fetch that axis' halo.
            const bool haloZ = (threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (gz < DATASIZE_Z);
            const bool haloY = (threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (gy < DATASIZE_Y);
            const bool haloX = (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (gx < DATASIZE_X);

            // Interior cell.
            smem[threadIdx.z][threadIdx.y][threadIdx.x] = input[center];

            // Face halos.
            if (haloZ)
                smem[threadIdx.z + halo][threadIdx.y][threadIdx.x] = input[center + jumpZ];
            if (haloY)
                smem[threadIdx.z][threadIdx.y + halo][threadIdx.x] = input[center + jumpY];
            if (haloX)
                smem[threadIdx.z][threadIdx.y][threadIdx.x + halo] = input[center + jumpX];

            // Edge halos.
            if (haloZ && haloY)
                smem[threadIdx.z + halo][threadIdx.y + halo][threadIdx.x] = input[center + jumpZ + jumpY];
            if (haloZ && haloX)
                smem[threadIdx.z + halo][threadIdx.y][threadIdx.x + halo] = input[center + jumpZ + jumpX];
            if (haloY && haloX)
                smem[threadIdx.z][threadIdx.y + halo][threadIdx.x + halo] = input[center + jumpY + jumpX];

            // Corner halo.
            if (haloZ && haloY && haloX)
                smem[threadIdx.z + halo][threadIdx.y + halo][threadIdx.x + halo] = input[center + jumpZ + jumpY + jumpX];
        }

        // Barrier is outside the conditional so every thread of the block reaches it.
        __syncthreads();

        if ((gx < DATASIZE_X) && (gy < DATASIZE_Y) && (gz < DATASIZE_Z))
        {
            int acc = 0;

            for (int dz = 0; dz < BOXCAR_SIZE; dz++)
                for (int dy = 0; dy < BOXCAR_SIZE; dy++)
                    for (int dx = 0; dx < BOXCAR_SIZE; dx++)
                        acc += smem[threadIdx.z + dz][threadIdx.y + dy][threadIdx.x + dx];

            output[gz*DATASIZE_X*DATASIZE_Y + gy*DATASIZE_X + gx] = acc;
        }
    }
    
    /********************************/
    /* BOXCAR WITHOUT SHARED MEMORY */
    /********************************/
    // Box filter over a BOXCAR_SIZE^3 window reading directly from global memory.
    // Each thread with coordinates inside DATASIZE_X x DATASIZE_Y x DATASIZE_Z sums the
    // window that starts at its own position in the padded input and stores one output.
    __global__ void boxcar(int* __restrict__ output, const int* __restrict__ input)
    {
        const int gx = blockIdx.x*blockDim.x + threadIdx.x;
        const int gy = blockIdx.y*blockDim.y + threadIdx.y;
        const int gz = blockIdx.z*blockDim.z + threadIdx.z;

        // Threads outside the output volume have nothing to do.
        if ((gx >= DATASIZE_X) || (gy >= DATASIZE_Y) || (gz >= DATASIZE_Z))
            return;

        const int pitchX = DATASIZE_X + BOXCAR_SIZE - 1;   // row length of the padded input
        const int pitchY = DATASIZE_Y + BOXCAR_SIZE - 1;   // rows per slice of the padded input

        int acc = 0;
        for (int dx = 0; dx < BOXCAR_SIZE; dx++)
            for (int dy = 0; dy < BOXCAR_SIZE; dy++)
                for (int dz = 0; dz < BOXCAR_SIZE; dz++)
                    acc += input[(dz + gz)*pitchX*pitchY + (dy + gy)*pitchX + (dx + gx)];

        output[gz*DATASIZE_X*DATASIZE_Y + gy*DATASIZE_X + gx] = acc;
    }
    
    /********/
    /* MAIN */
    /********/
    // Test driver: fills a padded input volume with random data, runs boxcar_shared on
    // the GPU, recomputes the filter on the host and compares the two results.
    // Returns 0 when host and device agree, 1 on allocation failure or mismatch.
    int main(void)
    {
        // --- these are just for timing
        clock_t t1, t2, t3;
        double t2sum = 0.0;
        double t3sum = 0.0;

        const int nx = DATASIZE_X;
        const int ny = DATASIZE_Y;
        const int nz = DATASIZE_Z;

        const int wx = BOXCAR_SIZE;
        const int wy = BOXCAR_SIZE;
        const int wz = BOXCAR_SIZE;

        // --- Extents of the padded input volume
        const int px = nx + (wx-1);
        const int py = ny + (wy-1);
        const int pz = nz + (wz-1);

        const size_t inBytes  = (size_t)px*py*pz*sizeof(int);
        const size_t outBytes = (size_t)nx*ny*nz*sizeof(int);

        // --- CPU memory allocations
        int *input      = (int*)malloc(inBytes);
        int *output     = (int*)malloc(outBytes);
        int *ref_output = (int*)malloc(outBytes);
        if (input == 0 || output == 0 || ref_output == 0) {
            fprintf(stderr, "malloc Fail \n");
            // free(NULL) is a no-op, so this releases whichever allocations succeeded
            free(input);
            free(output);
            free(ref_output);
            return 1;
        }

        // --- Data generation
        // Values are kept small so that a sum of BOXCAR_SIZE^3 of them cannot overflow int
        srand((unsigned int)time(NULL));
        for (int z=0; z<pz; z++)
            for (int y=0; y<py; y++)
                for (int x=0; x<px; x++)
                    input[z*py*px + y*px + x] = rand() % 1024;

        t1 = clock();

        // --- Allocate GPU space for data and results
        int *d_output, *d_input;
        gpuErrchk(cudaMalloc((void**)&d_input, inBytes));
        gpuErrchk(cudaMalloc((void**)&d_output, outBytes));

        // --- Copy data from CPU to GPU
        gpuErrchk(cudaMemcpy(d_input, input, inBytes, cudaMemcpyHostToDevice));

        const dim3 blockSize(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
        const dim3 gridSize(((DATASIZE_X+BLOCKSIZE_X-1)/BLOCKSIZE_X), ((DATASIZE_Y+BLOCKSIZE_Y-1)/BLOCKSIZE_Y), ((DATASIZE_Z+BLOCKSIZE_Z-1)/BLOCKSIZE_Z));

        // --- Kernel timing with CUDA events
        float elapsedMs;
        cudaEvent_t start, stop;
        gpuErrchk(cudaEventCreate(&start));
        gpuErrchk(cudaEventCreate(&stop));
        gpuErrchk(cudaEventRecord(start, 0));

        boxcar_shared<<<gridSize,blockSize>>>(d_output, d_input);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&elapsedMs, start, stop));
        printf("Elapsed time:  %3.4f ms \n", elapsedMs);

        gpuErrchk(cudaEventDestroy(start));
        gpuErrchk(cudaEventDestroy(stop));

        // --- Copy result from GPU to CPU
        gpuErrchk(cudaMemcpy(output, d_output, outBytes, cudaMemcpyDeviceToHost));

        t2 = clock();
        t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
        printf(" Device compute took %3.2f seconds.  Beginning host compute.\n", t2sum);

        // --- Host-side computations
        for (int u=0; u<nz; u++)
            for (int v=0; v<ny; v++)
                for (int w=0; w<nx; w++) {
                    int temp = 0;
                    for (int i=0; i<wz; i++)
                        for (int j=0; j<wy; j++)
                            for (int k=0; k<wx; k++)
                                temp = temp + input[(i+u)*py*px + (j+v)*px + (k+w)];
                    ref_output[u*ny*nx + v*nx + w] = temp;
                }

        t3 = clock();
        t3sum = ((double)(t3-t2))/CLOCKS_PER_SEC;
        printf(" Host compute took %3.2f seconds.  Comparing results.\n", t3sum);

        // --- Check CPU and GPU results (stop at the first mismatch)
        int mismatch = 0;
        for (int z=0; z<nz && !mismatch; z++)
            for (int y=0; y<ny && !mismatch; y++)
                for (int x=0; x<nx; x++) {
                    const int e = z*ny*nx + y*nx + x;
                    if (ref_output[e] != output[e]) {
                        printf("Mismatch at x= %d, y= %d, z= %d  Host= %d, Device = %d\n", x, y, z, ref_output[e], output[e]);
                        mismatch = 1;
                        break;
                    }
                }
        if (!mismatch) printf("Results match!\n");

        // --- Freeing memory (also on the mismatch path)
        free(input);
        free(output);
        free(ref_output);
        gpuErrchk(cudaFree(d_input));
        gpuErrchk(cudaFree(d_output));

        gpuErrchk(cudaDeviceReset());

        return mismatch;
    }
    
    #包括
    #包括
    #包括
    #定义BOXCAR_尺寸6
    #定义数据大小\u X 1024
    #定义DATASIZE_Y 1024
    #定义DATASIZE_Z 64
    #定义块大小×8
    #定义块大小_Y 8
    #定义块大小_Z 8
    /********************/
    /*CUDA错误检查*/
    /********************/
    #定义gpuerchk(ans){gpuAssert((ans),_文件_,_行__)}
    内联void gpuAssert(cudaError\u t代码,char*文件,int行,bool abort=true)
    {
    如果(代码!=cudaSuccess)
    {
    fprintf(标准,“GPUassert:%s%s%d\n”,cudaGetErrorString(代码)、文件、行);
    如果(中止)退出(代码);
    }
    }
    /*****************************/
    /*共享内存的BOXCAR*/
    /*****************************/
    __全局无效boxcar共享(int*\uuu限制\uuuu输出,const int*\uu限制\uuu输入)
    {
    __共享整型