Optimizing a 2D diffusion (heat) equation solver in CUDA

Tags: cuda, nvidia, cuda-gdb

I have checked the previous questions on this topic, but I can't see how they are relevant here.

I am solving the 2D diffusion equation with CUDA, and it turns out that my GPU code is slower than my CPU code.

Here is my code:

//kernel definition
__global__ void diffusionSolver(double* A, int n_x, int n_y)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // the product is non-zero only at interior points, i.e. 0 < i < n_x-1 and 0 < j < n_y-1
    if (i < n_x && j < n_y && i*(n_x-i-1)*j*(n_y-j-1) != 0)
        A[i+n_y*j] = A[i+n_y*j] + (A[i-1+n_y*j] + A[i+1+n_y*j] + A[i+(j-1)*n_y] + A[i+(j+1)*n_y] - 4.0*A[i+n_y*j])/40.0;
}
You are doing a lot of integer multiplication, and you have a lot of global memory reads; both of these are slow in CUDA. I also believe that there are not many coalesced global memory reads.

The only ways to speed up your kernel are to arrange coalesced memory reads through shared memory and/or to rearrange your data so that it can be indexed without a lot of integer multiplication.

I don't have a good understanding of diffusion equations, but I don't think there is much naive parallelism to exploit here. Have a look at the related material; maybe you will get some ideas on how to improve the algorithm.
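
To illustrate the indexing point, here is a minimal sketch (my untested rewrite of the OP's kernel; the name diffusionSolverHoisted is just for illustration) that computes the linear node index once per thread instead of repeating the n_y*j multiplication in every operand:

// kernel sketch: same stencil as the OP's, base index computed once per thread
__global__ void diffusionSolverHoisted(double* A, int n_x, int n_y)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // interior points only: 0 < i < n_x-1 and 0 < j < n_y-1
    if (i > 0 && i < n_x - 1 && j > 0 && j < n_y - 1) {
        int P = i + n_y * j;   // single integer multiply, reused in every operand below
        // NOTE: like the original, this updates A in place, so neighbouring threads
        // race with each other; the full code further down avoids this by
        // ping-ponging between separate input and output buffers.
        A[P] = A[P] + (A[P - 1] + A[P + 1] + A[P - n_y] + A[P + n_y] - 4.0*A[P]) / 40.0;
    }
}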


The worst problem I can see is that you are launching far too many blocks for the size of the input array. At the moment you are computing the grid size as:

dim3 numBlocks(n_x*n_y / threadsPerBlock.x, n_x*n_y / threadsPerBlock.y);
For an input array of only 200x200, that gives a grid of (400, 4000) blocks, which is obviously wrong. The calculation should look something like this:

int nbx = (n_x / threadsPerBlock.x) + (((n_x % threadsPerBlock.x) == 0) ? 0 : 1);
int nby = (n_y / threadsPerBlock.y) + (((n_y % threadsPerBlock.y) == 0) ? 0 : 1);
dim3 numBlocks(nbx,nby);
This gives a grid of (2, 20) blocks, 40000 times fewer than you are currently launching.


There are other optimizations you could consider for the kernel, but they pale into insignificance compared with an error of this size.
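
As an aside, the nbx/nby lines above are just a ceiling division. A common way to package it, and presumably what the iDivUp helper used in the full code further down computes, is sketched here (the threadsPerBlock values are only an assumption consistent with the figures above):

int iDivUp(int a, int b) { return (a + b - 1) / b; }   // smallest k with k*b >= a, for positive a, b

dim3 threadsPerBlock(100, 10);                          // assumed block shape implied by (2, 20) for 200x200
dim3 numBlocks(iDivUp(n_x, threadsPerBlock.x), iDivUp(n_y, threadsPerBlock.y));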


In case anyone is interested, I'm posting below a full code concerning the optimization of approaches for solving the 2D heat equation. Five approaches are considered, using:

  • global memory, essentially the OP's approach;
  • shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y, not loading the halo regions;
  • shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y, loading the halo regions;
  • shared memory of size (BLOCK_SIZE_X + 2) x (BLOCK_SIZE_Y + 2), loading the halo regions;
  • texture memory.

Everybody can run the code and check which approach is faster for their own GPU architecture.

    #include <iostream>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include "Utilities.cuh"
    #include "InputOutput.cuh"
    #include "TimingGPU.cuh"
    
    #define BLOCK_SIZE_X 16
    #define BLOCK_SIZE_Y 16
    
    #define DEBUG
    
    texture<float, 2, cudaReadModeElementType>  tex_T;
    texture<float, 2, cudaReadModeElementType>  tex_T_old;
    
    /***********************************/
    /* JACOBI ITERATION FUNCTION - GPU */
    /***********************************/
    __global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x ;
        const int j = blockIdx.y * blockDim.y + threadIdx.y ;
    
                                    //                         N 
        int P = i + j*NX;           // node (i,j)              |
        int N = i + (j+1)*NX;       // node (i,j+1)            |
        int S = i + (j-1)*NX;       // node (i,j-1)     W ---- P ---- E
        int E = (i+1) + j*NX;       // node (i+1,j)            |
        int W = (i-1) + j*NX;       // node (i-1,j)            |
                                    //                         S 
    
        // --- Only update "interior" (not boundary) node points
        if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]); 
    }
    
    /******************************************************/
    /* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V1 */
    /******************************************************/
    __global__ void Jacobi_Iterator_GPU_shared_v1(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x ;
        const int j = blockIdx.y * blockDim.y + threadIdx.y ;
    
                                    //                         N 
        int P = i + j*NX;           // node (i,j)              |
        int N = i + (j+1)*NX;       // node (i,j+1)            |
        int S = i + (j-1)*NX;       // node (i,j-1)     W ---- P ---- E
        int E = (i+1) + j*NX;       // node (i+1,j)            |
        int W = (i-1) + j*NX;       // node (i-1,j)            |
                                    //                         S 
        __shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
    
        // --- Load data to shared memory. Halo regions are NOT loaded.
        T_sh[threadIdx.x][threadIdx.y] = T_old[P];
        __syncthreads();
    
        if ((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y - 1))) 
            // --- If we do not need halo region elements, then use shared memory.
            T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]);
        else if (i>0 && i<NX-1 && j>0 && j<NY-1)  // --- Only update "interior" (not boundary) node points
            // --- If we need halo region elements, then use global memory.
            T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]); 
    
    }
    
    /******************************************************/
    /* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V2 */
    /******************************************************/
    __global__ void Jacobi_Iterator_GPU_shared_v2(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
    {
        const int i = blockIdx.x * (BLOCK_SIZE_X - 2) + threadIdx.x ;
        const int j = blockIdx.y * (BLOCK_SIZE_Y - 2) + threadIdx.y ;
    
        int P = i + j*NX;           
    
        if ((i >= NX) || (j >= NY)) return;
    
        __shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
    
        // --- Load data to shared memory. Halo regions ARE loaded.
        T_sh[threadIdx.x][threadIdx.y] = T_old[P];
        __syncthreads();
    
        if (((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y - 1))) &&
           (i>0 && i<NX-1 && j>0 && j<NY-1))
            T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]);
    
    }
    
    /******************************************************/
    /* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V3 */
    /******************************************************/
    __global__ void Jacobi_Iterator_GPU_shared_v3(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x ;
        const int j = blockIdx.y * blockDim.y + threadIdx.y ;
    
        const int tid_block = threadIdx.y * BLOCK_SIZE_X + threadIdx.x;     // --- Flattened thread index within a block
    
        const int i1      = tid_block % (BLOCK_SIZE_X + 2);
        const int j1      = tid_block / (BLOCK_SIZE_X + 2);    // --- The shared tile is (BLOCK_SIZE_X + 2) wide
    
        const int i2      = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) % (BLOCK_SIZE_X + 2);
        const int j2      = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) / (BLOCK_SIZE_X + 2);
    
        int P = i + j * NX;           
    
        if ((i >= NX) || (j >= NY)) return;
    
        __shared__ float T_sh[BLOCK_SIZE_X + 2][BLOCK_SIZE_Y + 2];
    
        if (((blockIdx.x * BLOCK_SIZE_X - 1 + i1) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j1) < NY))
            T_sh[i1][j1] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i1) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j1) * NX];
    
        if (((i2 < (BLOCK_SIZE_X + 2)) && (j2 < (BLOCK_SIZE_Y + 2))) && (((blockIdx.x * BLOCK_SIZE_X - 1 + i2) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j2) < NY)))
            T_sh[i2][j2] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i2) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j2) * NX];
    
        __syncthreads();
    
        if ((threadIdx.x <= (BLOCK_SIZE_X - 1) && (threadIdx.y <= (BLOCK_SIZE_Y - 1))) && (i>0 && i<NX-1 && j>0 && j<NY-1))
            T_new[P] = 0.25 * (T_sh[threadIdx.x + 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y + 2] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x + 2][threadIdx.y + 1]);
    
    }
    
    /*********************************************/
    /* JACOBI ITERATION FUNCTION - GPU - TEXTURE */
    /*********************************************/
    __global__ void Jacobi_Iterator_GPU_texture(float * __restrict__ T_new, const bool flag, const int NX, const int NY) {
    
        const int i = blockIdx.x * blockDim.x + threadIdx.x ;
        const int j = blockIdx.y * blockDim.y + threadIdx.y ;
    
        float P, N, S, E, W;    
        if (flag) {
                                                //                         N 
            P = tex2D(tex_T_old, i,     j);     // node (i,j)              |
            N = tex2D(tex_T_old, i,     j + 1); // node (i,j+1)            |
            S = tex2D(tex_T_old, i,     j - 1); // node (i,j-1)     W ---- P ---- E
            E = tex2D(tex_T_old, i + 1, j);     // node (i+1,j)            |
            W = tex2D(tex_T_old, i - 1, j);     // node (i-1,j)            |
                                                //                         S 
        } else {
                                                //                         N 
            P = tex2D(tex_T,     i,     j);     // node (i,j)              |
            N = tex2D(tex_T,     i,     j + 1); // node (i,j+1)            |
            S = tex2D(tex_T,     i,     j - 1); // node (i,j-1)     W ---- P ---- E
            E = tex2D(tex_T,     i + 1, j);     // node (i+1,j)            |
            W = tex2D(tex_T,     i - 1, j);     // node (i-1,j)            |
                                                //                         S 
        }
    
        // --- Only update "interior" (not boundary) node points
        if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[i + j*NX] = 0.25 * (E + W + N + S);
    }
    
    /***********************************/
    /* JACOBI ITERATION FUNCTION - CPU */
    /***********************************/
    void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER)
    {
        for(int iter=0; iter<MAX_ITER; iter=iter+2)
        {
            // --- Only update "interior" (not boundary) node points
            for(int j=1; j<NY-1; j++) 
                for(int i=1; i<NX-1; i++) {
                    float T_E = T[(i+1) + NX*j];
                    float T_W = T[(i-1) + NX*j];
                    float T_N = T[i + NX*(j+1)];
                    float T_S = T[i + NX*(j-1)];
                    T_new[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
                }
    
            for(int j=1; j<NY-1; j++) 
                for(int i=1; i<NX-1; i++) {
                    float T_E = T_new[(i+1) + NX*j];
                    float T_W = T_new[(i-1) + NX*j];
                    float T_N = T_new[i + NX*(j+1)];
                    float T_S = T_new[i + NX*(j-1)];
                    T[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
                }
        }
    }
    
    /******************************/
    /* TEMPERATURE INITIALIZATION */
    /******************************/
    void Initialize(float * __restrict h_T, const int NX, const int NY)
    {
        // --- Set left wall to 1
        for(int j=0; j<NY; j++) h_T[j * NX] = 1.0;
    }
    
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int NX = 256;         // --- Number of discretization points along the x axis
        const int NY = 256;         // --- Number of discretization points along the y axis
    
        const int MAX_ITER = 100;   // --- Number of Jacobi iterations
    
        // --- CPU temperature distributions
        float *h_T              = (float *)calloc(NX * NY, sizeof(float));
        float *h_T_old          = (float *)calloc(NX * NY, sizeof(float));
        Initialize(h_T,     NX, NY);
        Initialize(h_T_old, NX, NY);
        float *h_T_GPU_result       = (float *)malloc(NX * NY * sizeof(float));
        float *h_T_GPU_tex_result   = (float *)malloc(NX * NY * sizeof(float));
        float *h_T_GPU_sh1_result   = (float *)malloc(NX * NY * sizeof(float));
        float *h_T_GPU_sh2_result   = (float *)malloc(NX * NY * sizeof(float));
        float *h_T_GPU_sh3_result   = (float *)malloc(NX * NY * sizeof(float));
    
        // --- GPU temperature distribution
        float *d_T;         gpuErrchk(cudaMalloc((void**)&d_T,          NX * NY * sizeof(float)));
        float *d_T_old;     gpuErrchk(cudaMalloc((void**)&d_T_old,      NX * NY * sizeof(float)));
        float *d_T_tex;     gpuErrchk(cudaMalloc((void**)&d_T_tex,      NX * NY * sizeof(float)));
        float *d_T_old_tex; gpuErrchk(cudaMalloc((void**)&d_T_old_tex,  NX * NY * sizeof(float)));
        float *d_T_sh1;     gpuErrchk(cudaMalloc((void**)&d_T_sh1,      NX * NY * sizeof(float)));
        float *d_T_old_sh1; gpuErrchk(cudaMalloc((void**)&d_T_old_sh1,  NX * NY * sizeof(float)));
        float *d_T_sh2;     gpuErrchk(cudaMalloc((void**)&d_T_sh2,      NX * NY * sizeof(float)));
        float *d_T_old_sh2; gpuErrchk(cudaMalloc((void**)&d_T_old_sh2,  NX * NY * sizeof(float)));
        float *d_T_sh3;     gpuErrchk(cudaMalloc((void**)&d_T_sh3,      NX * NY * sizeof(float)));
        float *d_T_old_sh3; gpuErrchk(cudaMalloc((void**)&d_T_old_sh3,  NX * NY * sizeof(float)));
    
        gpuErrchk(cudaMemcpy(d_T,           h_T,     NX * NY * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_T_tex,       h_T,     NX * NY * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_T_sh1,       h_T,     NX * NY * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_T_sh2,       h_T,     NX * NY * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_T_sh3,       h_T,     NX * NY * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_T_old,       d_T,     NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(d_T_old_tex,   d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(d_T_old_sh1,   d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(d_T_old_sh2,   d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(d_T_old_sh3,   d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
    
        //cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
        cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    
        gpuErrchk(cudaBindTexture2D(NULL, &tex_T,     d_T_tex,     &desc, NX, NY, sizeof(float) * NX));
        gpuErrchk(cudaBindTexture2D(NULL, &tex_T_old, d_T_old_tex, &desc, NX, NY, sizeof(float) * NX));
    
        tex_T.addressMode[0] = cudaAddressModeWrap;
        tex_T.addressMode[1] = cudaAddressModeWrap;
        tex_T.filterMode = cudaFilterModePoint;
        tex_T.normalized = false;
    
        tex_T_old.addressMode[0] = cudaAddressModeWrap;
        tex_T_old.addressMode[1] = cudaAddressModeWrap;
        tex_T_old.filterMode = cudaFilterModePoint;
        tex_T_old.normalized = false;
    
        // --- Grid size
        dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
        dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
    
        // --- Jacobi iterations on the host
        Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER);
    
        // --- Jacobi iterations on the device
        TimingGPU timerGPU;
        timerGPU.StartCounter();
        for (int k=0; k<MAX_ITER; k=k+2) {
            Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T,     d_T_old, NX, NY);   // --- Update d_T_old     starting from data stored in d_T
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
            Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T    , NX, NY);   // --- Update d_T         starting from data stored in d_T_old
     #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif
        }
        printf("Timing = %f ms\n", timerGPU.GetCounter());
    
        // --- Jacobi iterations on the device - shared memory v1
        timerGPU.StartCounter();
        for (int k=0; k<MAX_ITER; k=k+2) {
            Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_sh1,     d_T_old_sh1, NX, NY);   // --- Update d_T_old     starting from data stored in d_T
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
            Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_old_sh1, d_T_sh1    , NX, NY);   // --- Update d_T         starting from data stored in d_T_old
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
        }
        printf("Timing with shared memory v1 = %f ms\n", timerGPU.GetCounter());
    
        // --- Jacobi iterations on the device - shared memory v2
        dim3 dimBlock2(BLOCK_SIZE_X, BLOCK_SIZE_Y);
        dim3 dimGrid2 (iDivUp(NX, BLOCK_SIZE_X - 2), iDivUp(NY, BLOCK_SIZE_Y - 2));
        timerGPU.StartCounter();
        for (int k=0; k<MAX_ITER; k=k+2) {
            Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_sh2,     d_T_old_sh2, NX, NY);   // --- Update d_T_old     starting from data stored in d_T
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
            Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_old_sh2, d_T_sh2    , NX, NY);   // --- Update d_T         starting from data stored in d_T_old
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
        }
        printf("Timing with shared memory v2 = %f ms\n", timerGPU.GetCounter());
    
        // --- Jacobi iterations on the device - shared memory v3
        timerGPU.StartCounter();
        for (int k=0; k<MAX_ITER; k=k+2) {
            Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_sh3,     d_T_old_sh3, NX, NY);   // --- Update d_T_old     starting from data stored in d_T
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
            Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_old_sh3, d_T_sh3    , NX, NY);   // --- Update d_T         starting from data stored in d_T_old
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
        }
        printf("Timing with shared memory v3 = %f ms\n", timerGPU.GetCounter());
    
        // --- Jacobi iterations on the device - texture case
        timerGPU.StartCounter();
        for (int k=0; k<MAX_ITER; k=k+2) {
        Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_old_tex, 0, NX, NY);   // --- Update d_T_old_tex starting from data stored in d_T_tex
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
        Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_tex,     1, NX, NY);   // --- Update d_T_tex     starting from data stored in d_T_old_tex
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif        
        }
        printf("Timing with texture = %f ms\n", timerGPU.GetCounter());
    
        saveCPUrealtxt(h_T,     "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\CPU_result.txt",     NX * NY);
        saveGPUrealtxt(d_T_tex, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_tex.txt", NX * NY);
        saveGPUrealtxt(d_T,     "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result.txt",     NX * NY);
        saveGPUrealtxt(d_T_sh1, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh1.txt",     NX * NY);
        saveGPUrealtxt(d_T_sh2, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh2.txt",     NX * NY);
        saveGPUrealtxt(d_T_sh3, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh3.txt",     NX * NY);
    
        // --- Copy results from device to host
        gpuErrchk(cudaMemcpy(h_T_GPU_result,     d_T,     NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_T_GPU_tex_result, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_T_GPU_sh1_result, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_T_GPU_sh2_result, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_T_GPU_sh3_result, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
    
        // --- Calculate percentage root mean square error between host and device results
        float sum = 0.f, sum_tex = 0.f, sum_ref = 0.f, sum_sh1 = 0.f, sum_sh2 = 0.f, sum_sh3 = 0.f;
        for (int j=0; j<NY; j++)
            for (int i=0; i<NX; i++) {
                sum     = sum     + (h_T_GPU_result    [j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_result    [j * NX + i] - h_T[j * NX + i]);
                sum_tex = sum_tex + (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]);
                sum_sh1 = sum_sh1 + (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]);
                sum_sh2 = sum_sh2 + (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]);
                sum_sh3 = sum_sh3 + (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]);
                sum_ref = sum_ref + h_T[j * NX + i]                                * h_T[j * NX + i];
            }
        printf("Percentage root mean square error           = %f\n", 100.*sqrt(sum     / sum_ref));
        printf("Percentage root mean square error texture   = %f\n", 100.*sqrt(sum_tex / sum_ref));
        printf("Percentage root mean square error shared v1 = %f\n", 100.*sqrt(sum_sh1 / sum_ref));
        printf("Percentage root mean square error shared v2 = %f\n", 100.*sqrt(sum_sh2 / sum_ref));
        printf("Percentage root mean square error shared v3 = %f\n", 100.*sqrt(sum_sh3 / sum_ref));
    
        return 0;
    }
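
The listing above relies on the author's helper headers, Utilities.cuh (gpuErrchk, iDivUp), TimingGPU.cuh (the TimingGPU timer) and InputOutput.cuh (saveCPUrealtxt / saveGPUrealtxt), which are not reproduced in the answer. For readers who want to compile it, here is a minimal sketch of plausible stand-ins for the first two, assuming iDivUp is the usual ceiling division and TimingGPU is a CUDA-event timer; the author's actual headers may differ, and the save* routines are only needed for writing results to disk. Note also that the texture-reference API used above (texture<...>, cudaBindTexture2D) is deprecated in recent CUDA toolkits, so the texture variant may need porting to texture objects on newer versions.

    #include <cstdio>
    #include <cstdlib>
    
    #include "cuda_runtime.h"
    
    // --- Ceiling division: number of blocks of size b needed to cover a elements
    inline int iDivUp(int a, int b) { return (a + b - 1) / b; }
    
    // --- Standard CUDA error-checking macro
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line)
    {
        if (code != cudaSuccess) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            exit(code);
        }
    }
    
    // --- CUDA-event based timer exposing the interface used in main()
    struct TimingGPU
    {
        cudaEvent_t t_start, t_stop;
        void StartCounter() {
            cudaEventCreate(&t_start);
            cudaEventCreate(&t_stop);
            cudaEventRecord(t_start, 0);
        }
        float GetCounter() {    // --- Elapsed milliseconds since StartCounter()
            cudaEventRecord(t_stop, 0);
            cudaEventSynchronize(t_stop);
            float ms = 0.f;
            cudaEventElapsedTime(&ms, t_start, t_stop);
            return ms;
        }
    };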
    