基于CUDA的FFT求解泊松方程_Cuda_Fftw_Poisson_Cufft

基于CUDA的FFT求解泊松方程

cuda

基于CUDA的FFT求解泊松方程,cuda,fftw,poisson,cufft,Cuda,Fftw,Poisson,Cufft,下面是关于使用cuFFT库的教程：在一行一行地跟踪它的代码之后，我得到了非常奇怪的结果我的输入数据是一个NxN浮点数组。程序执行FFT正变换，求解泊松方程，然后执行逆FFT。输入数据（和输出数据）称为边长为N的方形图像。当我注释掉求解泊松（r_复数d，kx_d，ky_d，N），它将正确地对数据进行前向变换，然后执行逆变换，从而使输出数据与输入数据相同。这是应该发生的这是不调用solve\u poisson方法的输出 0 r_initial: 0.00125126 r: 0.

下面是关于使用

cuFFT

库的教程：

在一行一行地跟踪它的代码之后，我得到了非常奇怪的结果

我的输入数据是一个

NxN

浮点数组。程序执行

FFT

正变换，求解泊松方程，然后执行逆

FFT

。输入数据（和输出数据）称为边长为

的方形图像。当我注释掉

求解泊松（r_复数d，kx_d，ky_d，N），它将正确地对数据进行前向变换，然后执行逆变换，从而使输出数据与输入数据相同。这是应该发生的
这是不调用solve\u poisson
方法的输出
0       r_initial: 0.00125126   r: 0.00125132
1       r_initial: 0.563585     r: 0.563585
2       r_initial: 0.193304     r: 0.193304
3       r_initial: 0.80874      r: 0.80874
4       r_initial: 0.585009     r: 0.585009
5       r_initial: 0.479873     r: 0.479873
6       r_initial: 0.350291     r: 0.350291
7       r_initial: 0.895962     r: 0.895962
8       r_initial: 0.82284      r: 0.82284
9       r_initial: 0.746605     r: 0.746605
10      r_initial: 0.174108     r: 0.174108
11      r_initial: 0.858943     r: 0.858943
12      r_initial: 0.710501     r: 0.710502
13      r_initial: 0.513535     r: 0.513535
14      r_initial: 0.303995     r: 0.303995
15      r_initial: 0.0149846    r: 0.0149846
Press any key to continue . . .

然而，当我取消注释solve\u poisson
方法时，输出数据是inf
或nan
，这使我相信solve\u poisson方法中的标度变量在某种程度上接近于零。
所以我改变了浮动比例=-（kx[idx]*kx[idx]+ky[idy]*ky[idy]）到浮动刻度=-（kx[idx]*kx[idx]+ky[idy]*ky[idy]）+0.00001f
。此更改不在原始教程中。此处计算的结果不应具有极端正值或负值
0       r_initial: 0.00125126   r: -11448.1
1       r_initial: 0.563585     r: 11449.3
2       r_initial: 0.193304     r: -11448.3
3       r_initial: 0.80874      r: 11449.2
4       r_initial: 0.585009     r: 11449.4
5       r_initial: 0.479873     r: -11448.4
6       r_initial: 0.350291     r: 11449.5
7       r_initial: 0.895962     r: -11448.6
8       r_initial: 0.82284      r: -11448.5
9       r_initial: 0.746605     r: 11449.4
10      r_initial: 0.174108     r: -11448.3
11      r_initial: 0.858943     r: 11449.3
12      r_initial: 0.710501     r: 11449.2
13      r_initial: 0.513535     r: -11448.4
14      r_initial: 0.303995     r: 11449.3
15      r_initial: 0.0149846    r: -11448.1
Press any key to continue . . .

在本教程中，第22页幻灯片43
上的示例计算是computed=0.975879 reference=0.975882
，但我的结果完全不同，而且非常大
下面的代码是我使用的
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>

#include <stdlib.h>
#include <iostream>

#define N 4 //4 X 4 // N is the sidelength of the image -> 16 pixels in entire image
#define block_size_x 2 
#define block_size_y 2

__global__ void real2complex(cufftComplex *c, float *a, int n);
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n);
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n);


int main()
{
    float *kx, *ky, *r;
    kx = (float *)malloc(sizeof(float) * N);
    ky = (float *)malloc(sizeof(float) * N);
    r = (float *)malloc(sizeof(float) * N * N);

    float *kx_d, *ky_d, *r_d;
    cufftComplex *r_complex_d;
    cudaMalloc((void **)&kx_d, sizeof(float) * N);
    cudaMalloc((void **)&ky_d, sizeof(float) * N);
    cudaMalloc((void **)&r_d, sizeof(float) * N * N);
    cudaMalloc((void **)&r_complex_d, sizeof(cufftComplex) * N * N);

    for (int y = 0; y < N; y++)
        for (int x = 0; x < N; x++)
            r[x + y * N] = rand() / (float)RAND_MAX;
            //r[x + y * N] = sin(exp(-((x - N / 2.0f) * (x - N / 2.0f) + (N / 2.0f - y) * (N / 2.0f - y)) / (20 * 20))) * 255 / sin(1); //Here is sample data that will high values at the center of the image and low values as you go farther and farther away from the center.

    float* r_inital = (float *)malloc(sizeof(float) * N * N);
    for (int i = 0; i < N * N; i++)
        r_inital[i] = r[i];

    for (int i = 0; i < N; i++)
    {
        kx[i] = i - N / 2.0f; //centers kx values to be at center of image
        ky[i] = N / 2.0f - i; //centers ky values to be at center of image
    }

    cudaMemcpy(kx_d, kx, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(ky_d, ky, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(r_d, r, sizeof(float) * N * N, cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan2d(&plan, N, N, CUFFT_C2C);

    /* Compute the execution configuration, block_size_x*block_size_y = number of threads */
    dim3 dimBlock(block_size_x, block_size_y);
    dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
    /* Handle N not multiple of block_size_x or block_size_y */
    if (N % block_size_x != 0) dimGrid.x += 1;
    if (N % block_size_y != 0) dimGrid.y += 1;

    real2complex << < dimGrid, dimBlock >> > (r_complex_d, r_d, N);

    cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
    solve_poisson << <dimGrid, dimBlock >> > (r_complex_d, kx_d, ky_d, N);
    cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);

    float scale = 1.0f / (N * N);
    complex2real_scaled << <dimGrid, dimBlock >> > (r_d, r_complex_d, scale, N);

    cudaMemcpy(r, r_d, sizeof(float) * N * N, cudaMemcpyDeviceToHost);

    for (int i = 0; i < N * N; i++)
        std::cout << i << "\tr_initial: " << r_inital[i] << "\tr: " << r[i] << std::endl;
    system("pause");

    /* Destroy plan and clean up memory on device*/
    free(kx);
    free(ky);
    free(r);
    free(r_inital);
    cufftDestroy(plan);
    cudaFree(r_complex_d);
    cudaFree(kx_d);
}

__global__ void real2complex(cufftComplex *c, float *a, int n)
{
    /* compute idx and idy, the location of the element in the original NxN array */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        c[index].x = a[index];
        c[index].y = 0.0f;
    }
}

__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n)
{
    /* compute idx and idy, the location of the element in the original NxN array */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        a[index] = scale * c[index].x;
    }
}


__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n)
{
    /* compute idx and idy, the location of the element in the original NxN array */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f;
        if (idx == 0 && idy == 0) scale = 1.0f;
        scale = 1.0f / scale;
        c[index].x *= scale;
        c[index].y *= scale;
    }
}

#包括
#包括
#包括
#包括
#包括
#定义n4//4 X 4//N是图像的边长->整个图像中的16个像素
#定义块大小2
#定义块大小2
__全局无效real2complex（袖口复杂*c，浮点*a，整数n）；
__全局无效复数实标度（浮点*a，袖口复数*c，浮点标度，整数n）；
__全局无效解泊松（袖口复数*c，浮点*kx，浮点*ky，整数n）；
int main（）
{
浮点数*kx，*ky，*r；
kx=（浮点数*）malloc（浮点数）*N；
ky=（浮动*）malloc（浮动的大小）*N）；
r=（浮点数*）malloc（浮点数）*N*N；
浮点数*kx_d，*ky_d，*r_d；
袖口复合体*r_复合体\u d；
Cudamaloc（（无效**）和kx_d，尺寸（浮动）*N）；
Cudamaloc（（无效**）和ky_d，sizeof（浮动）*N）；
cudamaloc（（无效**）和r_d，大小（浮动）*N*N；
Cudamaloc（（void**）和r_复合体，尺寸（袖套复合体）*N*N；
对于（int y=0；y>（r_complex_d，r_d，N）；
cufftExecC2C（平面图、r_复合体d、r_复合体d、CUFFT_前锋）；
求解泊松（r_复数d，kx_d，ky_d，N）；
CUFFT（平面、r_复数、r_复数、CUFFT逆）；
浮动刻度=1.0f/（N*N）；
complex2real\u scale>（r\u d，r\u complex\u d，scale，N）；
cudaMemcpy（r，r_d，sizeof（float）*N*N，cudamemcpydevicetoost）；
对于（int i=0；ix=（0：（N-1））*h；
和y=（0：（N-1））*h、 
，这就是为什么从这些x和y值创建的网格从0开始并增加，如第16页幻灯片31上图形的x和y轴所示。在这种情况下，图像长度为1（我将NxN浮点数组或网格网格的输入数据称为图像），图像的中心实际上是（0.5，0.5）。我想转换这些点，所以中心点将改为（0,0），并遵循该点的典型表示形式
所以在我的代码中，而不是
x = (0:(N-1))*h;
y = (0:(N-1))*h;

可以作为
for (int i = 0; i < N; i++)
    {
        kx[i] = i;
        ky[i] = i;
    }

for（int i=0；i

我把它换成了
for (int i = 0; i < N; i++)
    {
        kx[i] = i - N / 2.0f; //centers kx values to be at center of image
        ky[i] = N / 2.0f - i; //centers ky values to be at center of image
    }

for（int i=0；i

但是，我忘记了更改泊松计算，因此它将图像的中心识别为原点，而不是右上角作为原点
将此行更改为：if（idx==0&&idy==0）scale=1.0f；if
（idx==2&&idy==2）比例=1.0f；


对于图像长度或N为4的情况。
为了将这一点推广到任何图像长度，如果（idx==n/2&&idy==n/2）scale=1.0f；

如图所示，则这一行代码可以更改为


clear all
close all
clc

M       = 64;              % --- Number of Fourier harmonics along x (should be a multiple of 2)  
N       = 32;              % --- Number of Fourier harmonics along y (should be a multiple of 2)  
Lx      = 3;               % --- Domain size along x
Ly      = 1.5;               % --- Domain size along y
sigma   = 0.1;             % --- Characteristic width of f (make << 1)

% --- Wavenumbers
kx = (2 * pi / Lx) * [0 : (M / 2 - 1) (- M / 2) : (-1)]; % --- Wavenumbers along x
ky = (2 * pi / Ly) * [0 : (N / 2 - 1) (- N / 2) : (-1)]; % --- Wavenumbers along y
[Kx, Ky]  = meshgrid(kx, ky); 

% --- Right-hand side of differential equation
hx              = Lx / M;                   % --- Grid spacing along x
hy              = Ly / N;                   % --- Grid spacing along y
x               = (0 : (M - 1)) * hx;
y               = (0 : (N - 1)) * hy;
[X, Y]          = meshgrid(x, y);
rSquared        = (X - 0.5 * Lx).^2 + (Y - 0.5 * Ly).^2;
sigmaSquared    = sigma^2;
f               = exp(-rSquared / (2 * sigmaSquared)) .* (rSquared - 2 * sigmaSquared) / (sigmaSquared^2);
fHat            = fft2(f);

% --- Denominator of the unknown spectrum
den             = -(Kx.^2 + Ky.^2); 
den(1, 1)       = 1;            % --- Avoid division by zero at wavenumber (0, 0)

% --- Unknown determination
uHat            = ifft2(fHat ./ den);
% uHat(1, 1)      = 0;            % --- Force the unknown spectrum at (0, 0) to be zero
u               = real(uHat);
u               = u - u(1,1);   % --- Force arbitrary constant to be zero by forcing u(1, 1) = 0

% --- Plots
uRef    = exp(-rSquared / (2 * sigmaSquared));
err     = 100 * sqrt(sum(sum(abs(u - uRef).^2)) / sum(sum(abs(uRef).^2)));
errMax  = norm(u(:)-uRef(:),inf)
fprintf('Percentage root mean square error = %f\n', err);
fprintf('Maximum error = %f\n', errMax);
surf(X, Y, u)
xlabel('x')
ylabel('y')
zlabel('u')
title('Solution of 2D Poisson equation by spectral method')

#include <stdio.h>
#include <fstream>
#include <iomanip>

// --- Greek pi
#define _USE_MATH_DEFINES
#include <math.h>

#include <cufft.h>

#define BLOCKSIZEX      16
#define BLOCKSIZEY      16

#define prec_save 10

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************************************/
/* COMPUTE RIGHT HAND SIDE OF 2D POISSON EQUATION */
/**************************************************/
__global__ void computeRHS(const float * __restrict__ d_x, const float * __restrict__ d_y,
                           float2 * __restrict__ d_r, const float Lx, const float Ly, const float sigma, 
                           const int M, const int N) {

    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    const float sigmaSquared = sigma * sigma;

    const float rSquared = (d_x[tidx] - 0.5f * Lx) * (d_x[tidx] - 0.5f * Lx) +
                           (d_y[tidy] - 0.5f * Ly) * (d_y[tidy] - 0.5f * Ly);

    d_r[tidy * M + tidx].x = expf(-rSquared / (2.f * sigmaSquared)) * (rSquared - 2.f * sigmaSquared) / (sigmaSquared * sigmaSquared);
    d_r[tidy * M + tidx].y = 0.f;

}

/****************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN */
/****************************************************/
__global__ void solvePoisson(const float * __restrict__ d_kx, const float * __restrict__ d_ky, 
                              float2 * __restrict__ d_r, const int M, const int N)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    float scale = -(d_kx[tidx] * d_kx[tidx] + d_ky[tidy] * d_ky[tidy]);

    if (tidx == 0 && tidy == 0) scale = 1.f;

    scale = 1.f / scale;
    d_r[M * tidy + tidx].x *= scale;
    d_r[M * tidy + tidx].y *= scale;

}

/****************************************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN - SHARED MEMORY VERSION */
/****************************************************************************/
__global__ void solvePoissonShared(const float * __restrict__ d_kx, const float * __restrict__ d_ky,
    float2 * __restrict__ d_r, const int M, const int N)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    // --- Use shared memory to minimize multiple access to same spectral coordinate values
    __shared__ float kx_s[BLOCKSIZEX], ky_s[BLOCKSIZEY];

    kx_s[threadIdx.x] = d_kx[tidx];
    ky_s[threadIdx.y] = d_ky[tidy];
    __syncthreads();

    float scale = -(kx_s[threadIdx.x] * kx_s[threadIdx.x] + ky_s[threadIdx.y] * ky_s[threadIdx.y]);

    if (tidx == 0 && tidy == 0) scale = 1.f;

    scale = 1.f / scale;
    d_r[M * tidy + tidx].x *= scale;
    d_r[M * tidy + tidx].y *= scale;

}

/******************************/
/* COMPLEX2REAL SCALED KERNEL */
/******************************/
__global__ void complex2RealScaled(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    d_result[tidy * M + tidx] = scale * (d_r[tidy * M + tidx].x - d_r[0].x);
}

/******************************************/
/* COMPLEX2REAL SCALED KERNEL - OPTIMIZED */
/******************************************/
__global__ void complex2RealScaledOptimized(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    __shared__ float d_r_0[1];

    if (threadIdx.x == 0) d_r_0[0] = d_r[0].x;

    volatile float2 c;
    c.x = d_r[tidy * M + tidx].x;
    c.y = d_r[tidy * M + tidx].y;

    d_result[tidy * M + tidx] = scale * (c.x - d_r_0[0]);
}

/**************************************/
/* SAVE FLOAT2 ARRAY FROM GPU TO FILE */
/**************************************/
void saveGPUcomplextxt(const float2 * d_in, const char *filename, const int M) {

    float2 *h_in = (float2 *)malloc(M * sizeof(float2));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(float2), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) {
        //printf("%f %f\n", h_in[i].c.x, h_in[i].c.y);
        outfile << std::setprecision(prec_save) << h_in[i].x << "\n"; outfile << std::setprecision(prec_save) << h_in[i].y << "\n";
    }
    outfile.close();

}

/*************************************/
/* SAVE FLOAT ARRAY FROM GPU TO FILE */
/*************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/********/
/* MAIN */
/********/
int main()
{
    const int   M       = 64;              // --- Number of Fourier harmonics along x (should be a multiple of 2)
    const int   N       = 32;              // --- Number of Fourier harmonics along y(should be a multiple of 2)
    const float Lx      = 3.f;             // --- Domain size along x
    const float Ly      = 1.5f;            // --- Domain size along y
    const float sigma   = 0.1f;            // --- Characteristic width of f(make << 1)

    // --- Wavenumbers on the host
    float *h_kx = (float *)malloc(M * sizeof(float));
    float *h_ky = (float *)malloc(N * sizeof(float));
    for (int k = 0; k < M / 2; k++)  h_kx[k]        = (2.f * M_PI / Lx) * k;
    for (int k = -M / 2; k < 0; k++) h_kx[k + M]    = (2.f * M_PI / Lx) * k;
    for (int k = 0; k < N / 2; k++)  h_ky[k]        = (2.f * M_PI / Ly) * k;
    for (int k = -N / 2; k < 0; k++) h_ky[k + N]    = (2.f * M_PI / Ly) * k;

    // --- Wavenumbers on the device
    float *d_kx;    gpuErrchk(cudaMalloc(&d_kx, M * sizeof(float)));
    float *d_ky;    gpuErrchk(cudaMalloc(&d_ky, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_kx, h_kx, M * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ky, h_ky, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- Domain discretization on the host
    float *h_x = (float *)malloc(M * sizeof(float));
    float *h_y = (float *)malloc(N * sizeof(float));
    for (int k = 0; k < M; k++)  h_x[k] = Lx / (float)M * k;
    for (int k = 0; k < N; k++)  h_y[k] = Ly / (float)N * k;

    // --- Domain discretization on the device
    float *d_x;     gpuErrchk(cudaMalloc(&d_x, M * sizeof(float)));
    float *d_y;     gpuErrchk(cudaMalloc(&d_y, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_x, h_x, M * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- Compute right-hand side of the equation on the device
    float2 *d_r;    gpuErrchk(cudaMalloc(&d_r, M * N * sizeof(float2)));
    dim3 dimBlock(BLOCKSIZEX, BLOCKSIZEY);
    dim3 dimGrid(iDivUp(M, BLOCKSIZEX), iDivUp(N, BLOCKSIZEY));
    computeRHS << <dimGrid, dimBlock >> >(d_x, d_y, d_r, Lx, Ly, sigma, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Create plan for CUDA FFT
    cufftHandle plan;
    cufftPlan2d(&plan, N, M, CUFFT_C2C);

    // --- Compute in place forward FFT of right-hand side
    cufftExecC2C(plan, d_r, d_r, CUFFT_FORWARD);

    // --- Solve Poisson equation in Fourier space 
    //solvePoisson << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);
    solvePoissonShared << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);

    // --- Compute in place inverse FFT
    cufftExecC2C(plan, d_r, d_r, CUFFT_INVERSE);

    //saveGPUcomplextxt(d_r, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_r.txt", M * N);

    // --- With cuFFT, an FFT followed by an IFFT will return the same array times the size of the transform
    // --- Accordingly, we need to scale the result.
    const float scale = 1.f / ((float)M * (float)N);
    float *d_result;    gpuErrchk(cudaMalloc(&d_result, M * N * sizeof(float)));
    //complex2RealScaled << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);
    complex2RealScaledOptimized << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);

    //saveGPUrealtxt(d_result, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_result.txt", M * N);

    // --- Transfer data from device to host
    float *h_result = (float *)malloc(M * N * sizeof(float));
    gpuErrchk(cudaMemcpy(h_result, d_result, M * N * sizeof(float), cudaMemcpyDeviceToHost));

    return 0;

}