CUDA 1D finite-difference time-domain (FDTD) for the Helmholtz equation


My 1D wave equation is slower when implemented in CUDA than in C/C++. Can anyone tell me what I am doing wrong? Here is my code:

__global__ void Solver1d(float* up, float* u, float* um)
{
    int id;
    float dx,dt;
    dx = (float)L/n;
    dt = (float)dx/c;
    float r= c*((float)dt/dx);
    float R = r*r;

    // index mapping between data and threads
    id = threadIdx.x + blockIdx.x*blockDim.x;

    // Allow all threads in the range of valid data to execute
    if (id<n)
    {
        if(id==0)
        {
            up[id]=0;
        }
        else if(id==n-1)
        {
            up[n-1]=0;
        }
        else
        {
            up[id] = 2*u[id]-um[id]+R*(u[id+1]-2*u[id]+u[id-1]);    
        }   
    }
}

// main program
int main(int argc, char *argv[])
{        

    // declare all variables 
    int i;
    float inner=0.0f,L2_exact,ue[n],dx,dt;
    dx = (float)L/n;
    dt = (float)(0.05*dx/c); // Max time step
    float r= c*((float)dt/dx);
    float R = r*r;

    // Allocate memory on host
    //float u=(float *)malloc((n)*sizeof(float));
    //float um=(float *)malloc((n)*sizeof(float));
    float up[n],um[n],u[n];


    //Pointers for device memory allocation
    float *dev_up, *dev_u, *dev_um;


    // Allocating memory to device (GPU)
    HANDLE_ERROR(cudaMalloc((void**)&dev_up, n*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_u, n*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_um, n*sizeof(float)));


    cudaEvent_t start, stop;
    float elapsedTime;

    // Start timer
    HANDLE_ERROR(cudaEventCreate( &start ));
    HANDLE_ERROR(cudaEventCreate( &stop ));
    HANDLE_ERROR(cudaEventRecord( start,0 ));

    //Initialize the stream
    cudaStream_t stream;
    HANDLE_ERROR(cudaStreamCreate( &stream ));
    //Initial condition
    for(i=0;i<n;i++)
    {
        u[i]=sin(2*PI*i*dx);
        //printf("Initialization ok\n");
    }

    // Enforcing special formula for t = -1
    for(i=1;i<n-1 ;i++)
    {
        um[0] = 0;
        um[n-1] = 0;
        um[i] = u[i]  + 0.5*R*(u[i-1] - 2*u[i] + u[i+1]); //+ 0.5*dt*dt*f(i*dx,t)
        //printf("um is runing fine\n");
    }

    // setting blocks and threads numbers
    int noThreads=128;
    dim3 dimBlock(noThreads,1,1);
    dim3 dimGrid(1+n/(noThreads-1),1,1);

    // move u and um to GPU
    HANDLE_ERROR(cudaMemcpy(dev_u, u, n*sizeof(float), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_um, um, n*sizeof(float), cudaMemcpyHostToDevice));

    float t=0;

    //int counter=0;
    while(t<=T)
    {
        //counter++;
        t += dt;

        Solver1d<<<dimGrid,dimBlock>>>(dev_up,dev_u,dev_um);
        // cudaDeviceSynchronize();
        for(i=0;i<n;i++)
        {
            um[i] = u[i];
            u[i]  = up[i];
        }

    }

    HANDLE_ERROR(cudaEventRecord( stop,0 ));
    HANDLE_ERROR(cudaEventSynchronize( stop ));
    HANDLE_ERROR(cudaEventElapsedTime( &elapsedTime,start,stop ));
    HANDLE_ERROR(cudaEventDestroy( start ));
    HANDLE_ERROR(cudaEventDestroy( stop ));     

    printf("elapsed time: %lf sec\n",elapsedTime/1000);
    // move the solution up from GPU to CPU
    HANDLE_ERROR(cudaMemcpy(up, dev_up, n*sizeof(float), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(u, dev_u, n*sizeof(float), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(um, dev_um, n*sizeof(float), cudaMemcpyDeviceToHost));

    int j;
    float L2cpuSolution=0.0;
    float L2gpuSolution=0.0;
    float ERROR_PERCENTAGE=0.0;


    // Verification with exact solution
    for(j=0;j<(n);j++)
    {
        //printf("up[%d]=%.12g\n",j,up[j]);
        ue[j]=0.5*(sin(2*PI*(j*dx+c*T))+sin(2*PI*(j*dx-c*T)));
        //printf("um[%d]=%.12g\n",j,um[j]);
        inner += (ue[j]-up[j])*(ue[j]-up[j]);
        L2cpuSolution += ue[j]*ue[j];
        L2gpuSolution += up[j]*up[j];

    }
    L2cpuSolution = sqrt(L2cpuSolution)/n;
    L2gpuSolution = sqrt(L2gpuSolution)/n;
    L2_exact = sqrt(inner/(n));
    ERROR_PERCENTAGE = 100*(L2_exact/L2cpuSolution);
    printf("L2_exact=%lf\n",L2_exact);
    printf("gpul2=%lf, and cpuL2=%lf \n",L2gpuSolution,L2cpuSolution);
    printf("ERROR_PERCENTAGE= %lf\n", ERROR_PERCENTAGE);

    // Free device memory
    cudaFree(dev_up);
    cudaFree(dev_u);
    cudaFree(dev_um);

    return 0;

}
Essentially, I don't think you are doing anything wrong here. But you should not expect CUDA to work magic for you and simply run faster than a CPU implementation. Something relatively trivial like a 1D wave equation, which in a CPU implementation is really just a for loop, is so cheap for a modern computer that there is little reason to parallelize it. And keep in mind that transferring the data from host to device and back can be the bottleneck of a GPU implementation's performance. So unless your data is very large (say n > 10^6 or so), I don't think it is worth it.
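One further remark on the code in the question: inside the while loop the kernel only ever writes the device array dev_up, while the host loop shuffles the host arrays u and um, which the kernel never sees; up is only copied back after timing stops. A minimal sketch (reusing the question's own allocations and launch configuration, not the original answer's code) of keeping the time-stepping entirely on the device by rotating the three buffers instead of copying arrays every step:

float t = 0;
while (t <= T)
{
    t += dt;
    Solver1d<<<dimGrid,dimBlock>>>(dev_up, dev_u, dev_um);

    // Rotate the buffers: u(t-dt) <- u(t), u(t) <- u(t+dt).
    // Only pointers move; there is no per-step host<->device traffic.
    float *tmp = dev_um;
    dev_um = dev_u;
    dev_u  = dev_up;
    dev_up = tmp;
}
// Copy the final solution back to the host once, after the loop
HANDLE_ERROR(cudaMemcpy(u, dev_u, n*sizeof(float), cudaMemcpyDeviceToHost));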


However, one way to improve the code in your kernel is to precompute some of the variables. dx, dt, r and R appear to be constant throughout the simulation, yet every single thread recomputes them at every time step; that adds up to potentially millions of redundant calculations. Also, using texture memory for the array data could improve speed, since most memory accesses of each block happen in the same neighborhood.
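A minimal sketch of the precomputation suggestion (hypothetical; it keeps the question's global n and moves the constant to a kernel argument):

// R = (c*dt/dx)^2 is computed once on the host, exactly as main() already does,
// and passed in, so no thread recomputes dx, dt, r and R at every time step.
__global__ void Solver1d(float* up, float* u, float* um, float R)
{
    int id = threadIdx.x + blockIdx.x*blockDim.x;

    if (id < n)
    {
        if (id == 0 || id == n-1)
            up[id] = 0.0f;  // fixed boundary values
        else
            up[id] = 2.0f*u[id] - um[id] + R*(u[id+1] - 2.0f*u[id] + u[id-1]);
    }
}

The launch then becomes Solver1d<<<dimGrid,dimBlock>>>(dev_up, dev_u, dev_um, R);.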

The main question, also discussed in the comments above, is whether the 1D finite-difference time-domain (FDTD) method is faster when implemented in C/C++ and run on a sequential machine than when implemented in CUDA and run on a parallel GPU.

I have tried to answer this question with the code below. It contains implementations of the 1D FDTD method for electromagnetic applications in both C/C++ and CUDA. The theory and the C/C++ implementation are taken from a referenced text (see its Program 3.1). The CUDA version contains two approaches, one using global memory only and one using shared memory. In the latter case, I enforce synchronization between the magnetic-field and electric-field updates by launching two different kernels.
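For reference, the update scheme the codes below implement is the standard 1D leapfrog, with the fields normalized so that the free-space impedance $\eta_0 = 377\,\Omega$ (imp0 in the code) appears explicitly:

$$H_y^{q+1/2}[m] = H_y^{q-1/2}[m] + \frac{E_z^{q}[m+1] - E_z^{q}[m]}{\eta_0}, \qquad E_z^{q+1}[m] = E_z^{q}[m] + \eta_0 \left( H_y^{q+1/2}[m] - H_y^{q+1/2}[m-1] \right)$$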

For a problem large enough (SIZE = 10000000), the GPU version is indeed faster than the CPU one. I have tested the code on a Kepler K20c card, with the following result:

Shared Memory version
CPU elapsed time = 3980.763 ms
GPU elapsed time = 356.828 ms

Global Memory version
GPU elapsed time = 359.768 ms
The version exploiting shared memory does not improve the situation.

Here is the code:

kernel.cu

/* 1D FDTD simulation with an additive source. */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include "TimingCPU.h"
#include "TimingGPU.cuh"

#define BLOCKSIZE   512
//#define DEBUG

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
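// e.g. iDivUp(10000000, 512) == 19532: the grid is rounded up so every cell gets a thread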

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/***********************************/
/* HOST-SIDE FIELD UPDATE FUNCTION */
/***********************************/
void updateHost(double *h_ez, double* h_hy, double imp0, double qTime, const int source, const int N) {

    /* update magnetic field */
    for (int mm = 0; mm < N - 1; mm++)
        h_hy[mm] = h_hy[mm] + (h_ez[mm + 1] - h_ez[mm]) / imp0;

    /* update electric field */
    for (int mm = 1; mm < N; mm++)
        h_ez[mm] = h_ez[mm] + (h_hy[mm] - h_hy[mm - 1]) * imp0;

    /* inject the additive source at node `source` (here SIZE/2) */
    h_ez[source] += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

}

/********************************************************/
/* DEVICE-SIDE FIELD UPDATE FUNCTION - NO SHARED MEMORY */
/********************************************************/
__global__ void updateDevice_v0(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    /* update magnetic field */
    if (tid < N-1) d_hy[tid] = d_hy[tid] + (d_ez[tid + 1] - d_ez[tid]) / imp0;

    __threadfence();

    /* update electric field */
    if ((tid < N)&&(tid > 0)) d_ez[tid] = d_ez[tid] + (d_hy[tid] - d_hy[tid - 1]) * imp0;

    /* inject the additive source at node `source` */
    if (tid == source) d_ez[tid] += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

}

/**************************************************************/
/* DEVICE-SIDE MAGNETIC FIELD UPDATE FUNCTION - SHARED MEMORY */
/**************************************************************/
__global__ void updateDevice_hy(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ double hy_temp[BLOCKSIZE+1], ez_temp[BLOCKSIZE+1];

    if (tid < N) {
        hy_temp[threadIdx.x] = d_hy[tid];
        ez_temp[threadIdx.x] = d_ez[tid];
    }

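    // --- Right halo: thread 0 additionally loads the first cell of the next block,
    //     so that ez_temp[threadIdx.x + 1] below is valid for the last thread too.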
    if ((threadIdx.x == 0)&&((tid + BLOCKSIZE) < N)) {
        ez_temp[BLOCKSIZE] = d_ez[tid + BLOCKSIZE];
        hy_temp[BLOCKSIZE] = d_hy[tid + BLOCKSIZE];
    }

    __syncthreads();

    /* update magnetic field */
    if (tid < N-1) d_hy[tid] = hy_temp[threadIdx.x] + (ez_temp[threadIdx.x + 1] - ez_temp[threadIdx.x]) / imp0;

}

/**************************************************************/
/* DEVICE-SIDE ELECTRIC FIELD UPDATE FUNCTION - SHARED MEMORY */
/**************************************************************/
__global__ void updateDevice_ez(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ double hy_temp[BLOCKSIZE+1], ez_temp[BLOCKSIZE+1];

    if (tid < N) {
        hy_temp[threadIdx.x + 1] = d_hy[tid];
        ez_temp[threadIdx.x + 1] = d_ez[tid];
    }

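    // --- Left halo: thread 0 additionally loads the last cell of the previous block,
    //     so that hy_temp[threadIdx.x] below is valid for the first thread too.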
    if ((threadIdx.x == 0)&&(tid >= 1)) {
        ez_temp[0] = d_ez[tid - 1];
        hy_temp[0] = d_hy[tid - 1];
    }

    __syncthreads();

    /* update electric field */
    ez_temp[threadIdx.x] = ez_temp[threadIdx.x + 1] + (hy_temp[threadIdx.x + 1] - hy_temp[threadIdx.x]) * imp0;

    /* inject the additive source at node `source` */
    if (tid == source) ez_temp[threadIdx.x] += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

    if ((tid < N)&&(tid > 0)) d_ez[tid] = ez_temp[threadIdx.x];

}

/********/
/* MAIN */
/********/
int main() {

    // --- Problem size
    const int SIZE = 10000000;

    // --- Free-space wave impedance
    double imp0 = 377.0;

    // --- Maximum number of iterations (must be less than the problem size)
    int maxTime = 100;

    // --- Source location
    int source = SIZE / 2;

    // --- Host side memory allocations and initializations
    double *h_ez = (double*)calloc(SIZE, sizeof(double));
    double *h_hy = (double*)calloc(SIZE, sizeof(double));

    // --- Device side memory allocations and initializations
    double *d_ez; gpuErrchk(cudaMalloc((void**)&d_ez, SIZE * sizeof(double)));
    double *d_hy; gpuErrchk(cudaMalloc((void**)&d_hy, SIZE * sizeof(double)));
    gpuErrchk(cudaMemset(d_ez, 0, SIZE * sizeof(double)));
    gpuErrchk(cudaMemset(d_hy, 0, SIZE * sizeof(double)));

    // --- Host side memory allocations for debugging purposes
#ifdef DEBUG
    double *h_ez_temp = (double*)calloc(SIZE, sizeof(double));
    double *h_hy_temp = (double*)calloc(SIZE, sizeof(double));
#endif

    // --- Host-side time-steppings
#ifndef DEBUG
    TimingCPU timerCPU;
    timerCPU.StartCounter();
    for (int qTime = 0; qTime < maxTime; qTime++) {
        updateHost(h_ez, h_hy, imp0, qTime, source, SIZE);
    }
    printf("CPU elapsed time = %3.3f ms\n", timerCPU.GetCounter());
#endif

    TimingGPU timerGPU;
    timerGPU.StartCounter();
    // --- Device-side time-steppings
    for (int qTime = 0; qTime < maxTime; qTime++) {

        updateDevice_v0<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
//      updateDevice_hy<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
//      updateDevice_ez<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        gpuErrchk(cudaMemcpy(h_ez_temp, d_ez, SIZE * sizeof(double), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_hy_temp, d_hy, SIZE * sizeof(double), cudaMemcpyDeviceToHost));

        updateHost(h_ez, h_hy, imp0, qTime, source, SIZE);
        for (int i=0; i<SIZE; i++) {
            printf("%f %f %f %f\n",h_ez_temp[i], h_ez[i], h_hy_temp[i], h_hy[i]);
        }
        printf("\n");
#endif
    }
    printf("GPU elapsed time = %3.3f ms\n", timerGPU.GetCounter());

    return 0;
}
TimingGPU.cu

/**************/
/* TIMING GPU */
/**************/

#include "TimingGPU.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

struct PrivateTimingGPU {
    cudaEvent_t     start;
    cudaEvent_t     stop;
};

// default constructor
TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; }

// default destructor
TimingGPU::~TimingGPU() { }

void TimingGPU::StartCounter()
{
    cudaEventCreate(&((*privateTimingGPU).start));
    cudaEventCreate(&((*privateTimingGPU).stop));
    cudaEventRecord((*privateTimingGPU).start,0);
}

void TimingGPU::StartCounterFlags()
{
    int eventflags = cudaEventBlockingSync;

    cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags);
    cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags);
    cudaEventRecord((*privateTimingGPU).start,0);
}

// Gets the counter in ms
float TimingGPU::GetCounter()
{
    float   time;
    cudaEventRecord((*privateTimingGPU).stop, 0);
    cudaEventSynchronize((*privateTimingGPU).stop);
    cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop);
    return time;
}
TimingCPU.cpp

/**************/
/* TIMING CPU */
/**************/

#include "TimingCPU.h"

#ifdef __linux__

    #include <sys/time.h>
    #include <stdio.h>

    TimingCPU::TimingCPU(): cur_time_(0) { StartCounter(); }

    TimingCPU::~TimingCPU() { }

    void TimingCPU::StartCounter()
    {
        struct timeval time;
        if(gettimeofday( &time, 0 )) return;
        cur_time_ = 1000000 * time.tv_sec + time.tv_usec;
    }

    double TimingCPU::GetCounter()
    {
        struct timeval time;
        if(gettimeofday( &time, 0 )) return -1;

        long cur_time = 1000000 * time.tv_sec + time.tv_usec;
        double sec = (cur_time - cur_time_) / 1000000.0;
        if(sec < 0) sec += 86400;
        cur_time_ = cur_time;

        return 1000.*sec;
    }

#elif _WIN32 || _WIN64
    #include <windows.h>
    #include <iostream>

    struct PrivateTimingCPU {
        double  PCFreq;
        __int64 CounterStart;
    };

    // --- Default constructor
    TimingCPU::TimingCPU() { privateTimingCPU = new PrivateTimingCPU; (*privateTimingCPU).PCFreq = 0.0; (*privateTimingCPU).CounterStart = 0; }

    // --- Default destructor
    TimingCPU::~TimingCPU() { }

    // --- Starts the timing
    void TimingCPU::StartCounter()
    {
        LARGE_INTEGER li;
        if(!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";

        (*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;

        QueryPerformanceCounter(&li);
        (*privateTimingCPU).CounterStart = li.QuadPart;
    }

    // --- Gets the timing counter in ms
    double TimingCPU::GetCounter()
    {
        LARGE_INTEGER li;
        QueryPerformanceCounter(&li);
        return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
    }
#endif
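The headers TimingGPU.cuh and TimingCPU.h are not reproduced in the post. A minimal sketch of what they could contain, inferred from the way the two source files above use them (the originals may differ; only the Linux branch of TimingCPU is covered here):

TimingGPU.cuh

#ifndef TIMINGGPU_CUH
#define TIMINGGPU_CUH

struct PrivateTimingGPU;              // wraps the cudaEvent_t pair; defined in TimingGPU.cu

class TimingGPU {
    PrivateTimingGPU *privateTimingGPU;
public:
    TimingGPU();                      // allocates the private state
    ~TimingGPU();
    void StartCounter();              // creates and records the start event
    void StartCounterFlags();         // same, with cudaEventBlockingSync
    float GetCounter();               // records stop, synchronizes, returns ms
};

#endif

TimingCPU.h

#ifndef TIMINGCPU_H
#define TIMINGCPU_H

class TimingCPU {
    long cur_time_;                   // start time in microseconds (Linux branch)
public:
    TimingCPU();
    ~TimingCPU();
    void StartCounter();              // samples gettimeofday
    double GetCounter();              // elapsed milliseconds since StartCounter
};

#endif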

Comments:

Where is n defined? What is its value? How are you timing the CPU version? Note that you are also including the initial condition and the "enforcing special formula for t = -1" step within the GPU timing. Perhaps a good question is: why do you think a first-order 1D finite-difference method this simple should be faster on a GPU than on the host CPU?

What is the value of n?

@talonmies Because it can be faster. The data just needs to be large enough.

@user2114645 There may be two issues: 1) your data size is small; 2) you are using an older-generation card that may not support caching. In case (2) you may want to use shared memory instead of reading from global memory multiple times.

Your answer contains many assumptions: that the solution domain is relatively small, that the application is not performing a large number of iterations (e.g. an FDTD-like simulation, which this appears to be), and so on. If the problem size or the run length is large, even a 1D simulation can demand excessive processing power.

That is true, but those assumptions are the main reasons why a finite-difference simulation might not perform as well on a GPU as on a CPU, second to the other suggestions above and to the type of GPU.

I see your point. FDTD, however (specifically; I have little experience with finite-element or modified FDTD algorithms, so my point may not apply to those), is "embarrassingly parallel". My first OpenCL attempt ran 50x faster on a GTX 580 than on a 2.6 GHz i7. A GPU-based simulation certainly has overhead, but if you run enough frames to compensate for the setup time, and you do not have to move data back and forth between the host and the compute device too often, using a GPU works in your favor. (By the way, +1, good answer; I just think it is incomplete.)

Very true, and I would definitely not dismiss the GPU approach! I have done a lot of FDTD work myself, and my experience is that for the 2D and 3D cases a CUDA implementation definitely provides...