Matlab 利用cuFFT进行逆FFT的标度_Matlab_Cuda_Fft_Scaling_Cufft

Matlab 利用cuFFT进行逆FFT的标度

matlab cuda

Matlab 利用cuFFT进行逆FFT的标度,matlab,cuda,fft,scaling,cufft,Matlab,Cuda,Fft,Scaling,Cufft,每当我用cuFFT绘制程序获得的值，并将结果与Matlab的结果进行比较时，我得到的是相同形状的图，最大值和最小值在相同的点上。然而，由CUFT产生的值远大于由Matlab产生的值。Matlab代码是 fs = 1000; % sample freq D = [0:1:4]'; % pulse delay times t = 0 : 1/fs : 4000/fs;

每当我用cuFFT绘制程序获得的值，并将结果与Matlab的结果进行比较时，我得到的是相同形状的图，最大值和最小值在相同的点上。然而，由CUFT产生的值远大于由Matlab产生的值。Matlab代码是

fs = 1000;                              % sample freq
D = [0:1:4]';                           % pulse delay times
t = 0 : 1/fs : 4000/fs;                 % signal evaluation time
w = 0.5;                                % width of each pulse
yp = pulstran(t,D,'rectpuls',w);
filt = conj(fliplr(yp));
xx = fft(yp,1024).*fft(filt,1024);
xx = (abs(ifft(xx)));

具有相同输入的CUDA代码如下所示：

cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);
cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal,     CUFFT_FORWARD);
ComplexPointwiseMul<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX);
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);

cufftExecC2C（平面，（cufftComplex*）d_信号，（cufftComplex*）d_信号，CUFFT_向前）；
cufftExecC2C（平面图，（cufftComplex*）d_滤波器信号，（cufftComplex*）d_滤波器信号，CUFFT_正向）；
复点式MUL（d_信号，d_滤波器_信号，NX）；
cufftExecC2C（平面，（cufftComplex*）d_信号，（cufftComplex*）d_信号，CUFFT_逆）；

cuFFT还执行批量大小为

的

点FFT

比例因子为

NX=1024

，数值不正确。请告诉我该怎么做。

这是一个迟来的答案，可以将此问题从未回答列表中删除

您没有提供足够的信息来诊断您的问题，因为您没有指定设置袖口计划的方式。您甚至没有指定Matlab和cuFFT信号的形状是否完全相同（因此您只需进行缩放）或形状是否大致相同。不过，让我提出以下两点意见：

yp

向量具有

元素；与之相反的是，通过

fft（yp，1024）

，您通过将信号截断为

1024个

元素来执行fft

逆Cuft不按矢量元素的数量进行缩放

为了方便（可能对其他用户有用），我在下面报告一个简单的FFT-IFFT方案，其中还包括使用CUDA推力库执行的缩放

#include <cufft.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

/*********************/
/* SCALE BY CONSTANT */
/*********************/
class Scale_by_constant
{
    private:
        float c_;

    public:
        Scale_by_constant(float c) { c_ = c; };

        __host__ __device__ float2 operator()(float2 &a) const
        {
            float2 output;

            output.x = a.x / c_;
            output.y = a.y / c_;

            return output;
        }

};

int main(void){

    const int N=4;

    // --- Setting up input device vector    
    thrust::device_vector<float2> d_vec(N,make_cuComplex(1.f,2.f));

    cufftHandle plan;
    cufftPlan1d(&plan, N, CUFFT_C2C, 1);

    // --- Perform in-place direct Fourier transform
    cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_FORWARD);

    // --- Perform in-place inverse Fourier transform
    cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_INVERSE);

    thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), Scale_by_constant((float)(N)));

    // --- Setting up output host vector    
    thrust::host_vector<float2> h_vec(d_vec);

    for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_vec[i].x,h_vec[i].y);

    getchar();
}

#包括
#包括
#包括
/*********************/
/*按常数缩放*/
/*********************/
类标度乘以常数
{
私人：
浮点数；
公众：
按常数（浮点数c）{c_u=c；}缩放；
__主机\uuuuuuuuuu设备\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu
{
浮动2输出；
输出.x=a.x/c；
输出y=a.y/c；
返回输出；
}
};
内部主（空）{
常数int N=4；
//---设置输入设备向量
推力：设备向量d向量（N，使复杂（1.f，2.f））；
卡夫坦德尔计划；
袖口平面图（和平面图，N，袖口C2C，1）；
//---执行就地直接傅里叶变换
cufftExecC2C（平面，推力：：原始指针投射（d_vec.data（）），推力：：原始指针投射（d_vec.data（）），CUFFT_向前）；
//---执行就地傅里叶逆变换
cufftExecC2C（平面，推力：：原始指针转换（d_vec.data（）），推力：：原始指针转换（d_vec.data（）），CUFFT\u反转）；
推力：：变换（d_vec.begin（），d_vec.end（），d_vec.begin（），按常量（（float）（N））缩放）；
//---设置输出主机向量
推力：主机向量h向量（d向量）；
对于（int i=0；i，随着cuFFT回调功能的引入，通过将规范化操作定义为\uuu设备
函数，cuFFT执行的逆FFT所需的规范化可以直接嵌入cufftExecC2C
调用中
除了《cuFFT用户指南》，有关cuFFT回调功能，请参阅

下面是通过cuFFT回调实现IFFT规范化的示例
    #include <stdio.h>
    #include <assert.h>

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <cufft.h>
    #include <cufftXt.h>

    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    /*********************/
    /* CUFFT ERROR CHECK */
    /*********************/
    // See http://stackoverflow.com/questions/16267149/cufft-error-handling
    #ifdef _CUFFT_H_
    static const char *_cudaGetErrorEnum(cufftResult error)
    {
        switch (error)
        {
            case CUFFT_SUCCESS:
                return "CUFFT_SUCCESS";

            case CUFFT_INVALID_PLAN:
                return "CUFFT_INVALID_PLAN";

            case CUFFT_ALLOC_FAILED:
                return "CUFFT_ALLOC_FAILED";

            case CUFFT_INVALID_TYPE:
                 return "CUFFT_INVALID_TYPE";

            case CUFFT_INVALID_VALUE:
                return "CUFFT_INVALID_VALUE";

            case CUFFT_INTERNAL_ERROR:
                return "CUFFT_INTERNAL_ERROR";

            case CUFFT_EXEC_FAILED:
                return "CUFFT_EXEC_FAILED";

            case CUFFT_SETUP_FAILED:
                return "CUFFT_SETUP_FAILED";

            case CUFFT_INVALID_SIZE:
                return "CUFFT_INVALID_SIZE";

            case CUFFT_UNALIGNED_DATA:
                return "CUFFT_UNALIGNED_DATA";
        }

        return "<unknown>";
    }
    #endif

    #define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
    inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
    {
        if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
        }
    }

    __device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {

        float *scaling_factor = (float*)callerInfo;

        float2 output;
        output.x = cuCrealf(element);
        output.y = cuCimagf(element);

        output.x = output.x / scaling_factor[0];
        output.y = output.y / scaling_factor[0];

        ((float2*)dataOut)[offset] = output;
}

    __device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;

    /********/
    /* MAIN */
    /********/
    int main() {

        const int N = 16;

        cufftHandle plan;

        float2 *h_input             = (float2*)malloc(N*sizeof(float2));
        float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
        float2 *h_output2           = (float2*)malloc(N*sizeof(float2));

        float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
        float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
        float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));

        float *h_scaling_factor     = (float*)malloc(sizeof(float));
        h_scaling_factor[0] = 16.0f;
        float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
        gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));

        for (int i=0; i<N; i++) {
            h_input[i].x = 1.0f;
            h_input[i].y = 0.f;
        }

        gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));

        cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

        cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
        gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);

        cufftCallbackStoreC h_storeCallbackPtr;
        gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));

        cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));

        cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
        gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);

        cufftSafeCall(cufftDestroy(plan));

        gpuErrchk(cudaFree(d_input));
        gpuErrchk(cudaFree(d_output1));
        gpuErrchk(cudaFree(d_output2));

        return 0;
    }

#包括
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
/********************/
/*CUDA错误检查*/
/********************/
#定义gpuerchk（ans）{gpuAssert（（ans），_文件_，_行__）}
内联void gpuAssert（cudaError\u t代码，char*文件，int行，bool abort=true）
{
如果（代码！=cudaSuccess）
{
fprintf（标准，“GPUassert:%s%s%d\n”，cudaGetErrorString（代码）、文件、行）；
如果（中止）退出（代码）；
}
}
/*********************/
/*袖口错误检查*/
/*********************/
//看http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef_CUFFT_H_
静态常量字符*_cudaGetErrorEnum（袖套结果错误）
{
开关（错误）
{
成功案例：
返回“CUFFT_SUCCESS”；
案例卡夫图无效：
返回“袖口计划无效”；
案例CUFFT_ALLOC_失败：
返回“CUFFT_ALLOC_失败”；
箱子袖口类型无效：
返回“袖口类型无效”；
case CUFT_无效_值：
返回“CUFFT\u无效值”；
案例袖口内部错误：
返回“CUFFT_内部错误”；
案例CUFFT_EXEC_失败：
返回“CUFFT_EXEC_失败”；
案例袖口安装失败：
返回“袖口安装失败”；
箱子袖口尺寸无效：
返回“袖口尺寸无效”；
案例袖口未对齐数据：
返回“袖口未对齐数据”；
}
返回“”；
}
#恩迪夫
#定义cufftSafeCall（err）\ cufftSafeCall（err、\文件、\行）
内联void\uuu cufftSafeCall（cufftResult err、const char*文件、const int行）
{
如果（成功！=错误）{
fprintf（stderr，“文件“%s”中的袖口错误，第%d行\n%s\n错误%d:%s\n正在删除！\n“，\uuuuu文件，\uuuu行\uuuu，错误\
_cudaGetErrorEnum（err））\
cudaDeviceReset（）；断言（0）\
}
}
__设备\uuuuu无效IFFT\u缩放（无效*数据输出、大小\uu t偏移、袖口复杂元素、无效*调用者信息、无效*共享DPTR）{
float*比例因子=（float*）调用者信息；
浮动2输出；
输出x=cucreaf（元素）；
输出y=cuCimagf（元素）；
output.x=output.x/scaling\u
#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cufft.h>
#include <cufftXt.h>

#include <thrust/device_vector.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

//#define DISPLAY

/*******************************/
/* THRUST FUNCTOR IFFT SCALING */
/*******************************/
class Scale_by_constant
{
    private:
        float c_;

    public:
        Scale_by_constant(float c) { c_ = c; };

        __host__ __device__ float2 operator()(float2 &a) const
        {
            float2 output;

            output.x = a.x / c_;
            output.y = a.y / c_;

            return output;
        }

};

/**********************************/
/* IFFT SCALING CALLBACK FUNCTION */
/**********************************/
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {

    float *scaling_factor = (float*)callerInfo;

    float2 output;
    output.x = cuCrealf(element);
    output.y = cuCimagf(element);

    output.x = output.x / scaling_factor[0];
    output.y = output.y / scaling_factor[0];

    ((float2*)dataOut)[offset] = output;
}

__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;

/********/
/* MAIN */
/********/
int main() {

    const int N = 100000000;

    cufftHandle plan;           cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

    TimingGPU timerGPU;

    float2 *h_input             = (float2*)malloc(N*sizeof(float2));
    float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
    float2 *h_output2           = (float2*)malloc(N*sizeof(float2));

    float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
    float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
    float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));

    // --- Callback function parameters
    float *h_scaling_factor     = (float*)malloc(sizeof(float));
    h_scaling_factor[0] = 16.0f;
    float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
    gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));

    // --- Initializing the input on the host and moving it to the device
    for (int i = 0; i < N; i++) {
        h_input[i].x = 1.0f;
        h_input[i].y = 0.f;
    }
    gpuErrchk(cudaMemcpy(d_input, h_input, N * sizeof(float2), cudaMemcpyHostToDevice));

    // --- Execute direct FFT on the device and move the results to the host
    cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output1, d_output1, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
#endif

    // --- Execute inverse FFT with subsequent scaling on the device and move the results to the host
    timerGPU.StartCounter();
    cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
    thrust::transform(thrust::device_pointer_cast(d_output2), thrust::device_pointer_cast(d_output2) + N, thrust::device_pointer_cast(d_output2), Scale_by_constant((float)(N)));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
    printf("Timing NO callback %f\n", timerGPU.GetCounter());

    // --- Setup store callback
//    timerGPU.StartCounter();
    cufftCallbackStoreC h_storeCallbackPtr;
    gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
    cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));

    // --- Execute inverse callback FFT on the device and move the results to the host
    timerGPU.StartCounter();
    cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
    printf("Timing callback %f\n", timerGPU.GetCounter());

    cufftSafeCall(cufftDestroy(plan));

    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output1));
    gpuErrchk(cudaFree(d_output2));

    return 0;
}

Non-callback 69.029762 ms
Callback     65.868607 ms