Matlab 利用cuFFT进行逆FFT的标度
每当我用cuFFT绘制程序获得的值,并将结果与Matlab的结果进行比较时,我得到的是相同形状的图,最大值和最小值在相同的点上。然而,由CUFT产生的值远大于由Matlab产生的值。Matlab代码是Matlab 利用cuFFT进行逆FFT的标度,matlab,cuda,fft,scaling,cufft,Matlab,Cuda,Fft,Scaling,Cufft,每当我用cuFFT绘制程序获得的值,并将结果与Matlab的结果进行比较时,我得到的是相同形状的图,最大值和最小值在相同的点上。然而,由CUFT产生的值远大于由Matlab产生的值。Matlab代码是 fs = 1000; % sample freq D = [0:1:4]'; % pulse delay times t = 0 : 1/fs : 4000/fs;
fs = 1000; % sample freq
D = [0:1:4]'; % pulse delay times
t = 0 : 1/fs : 4000/fs; % signal evaluation time
w = 0.5; % width of each pulse
yp = pulstran(t,D,'rectpuls',w);
filt = conj(fliplr(yp));
xx = fft(yp,1024).*fft(filt,1024);
xx = (abs(ifft(xx)));
具有相同输入的CUDA代码如下所示:
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);
cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal, CUFFT_FORWARD);
ComplexPointwiseMul<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX);
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);
cufftExecC2C(平面,(cufftComplex*)d_信号,(cufftComplex*)d_信号,CUFFT_向前);
cufftExecC2C(平面图,(cufftComplex*)d_滤波器信号,(cufftComplex*)d_滤波器信号,CUFFT_正向);
复点式MUL(d_信号,d_滤波器_信号,NX);
cufftExecC2C(平面,(cufftComplex*)d_信号,(cufftComplex*)d_信号,CUFFT_逆);
cuFFT还执行批量大小为2
的1024
点FFT
比例因子为
NX=1024
,数值不正确。请告诉我该怎么做。这是一个迟来的答案,可以将此问题从未回答列表中删除
您没有提供足够的信息来诊断您的问题,因为您没有指定设置袖口计划的方式。您甚至没有指定Matlab和cuFFT信号的形状是否完全相同(因此您只需进行缩放)或形状是否大致相同。不过,让我提出以下两点意见:
yp
向量具有4000
元素;与之相反的是,通过fft(yp,1024)
,您通过将信号截断为1024个
元素来执行fft李>
#include <cufft.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
/*********************/
/* SCALE BY CONSTANT */
/*********************/
class Scale_by_constant
{
private:
float c_;
public:
Scale_by_constant(float c) { c_ = c; };
__host__ __device__ float2 operator()(float2 &a) const
{
float2 output;
output.x = a.x / c_;
output.y = a.y / c_;
return output;
}
};
int main(void){
const int N=4;
// --- Setting up input device vector
thrust::device_vector<float2> d_vec(N,make_cuComplex(1.f,2.f));
cufftHandle plan;
cufftPlan1d(&plan, N, CUFFT_C2C, 1);
// --- Perform in-place direct Fourier transform
cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_FORWARD);
// --- Perform in-place inverse Fourier transform
cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_INVERSE);
thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), Scale_by_constant((float)(N)));
// --- Setting up output host vector
thrust::host_vector<float2> h_vec(d_vec);
for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_vec[i].x,h_vec[i].y);
getchar();
}
#包括
#包括
#包括
/*********************/
/*按常数缩放*/
/*********************/
类标度乘以常数
{
私人:
浮点数;
公众:
按常数(浮点数c){c_u=c;}缩放;
__主机\uuuuuuuuuu设备\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu
{
浮动2输出;
输出.x=a.x/c;
输出y=a.y/c;
返回输出;
}
};
内部主(空){
常数int N=4;
//---设置输入设备向量
推力:设备向量d向量(N,使复杂(1.f,2.f));
卡夫坦德尔计划;
袖口平面图(和平面图,N,袖口C2C,1);
//---执行就地直接傅里叶变换
cufftExecC2C(平面,推力::原始指针投射(d_vec.data()),推力::原始指针投射(d_vec.data()),CUFFT_向前);
//---执行就地傅里叶逆变换
cufftExecC2C(平面,推力::原始指针转换(d_vec.data()),推力::原始指针转换(d_vec.data()),CUFFT\u反转);
推力::变换(d_vec.begin(),d_vec.end(),d_vec.begin(),按常量((float)(N))缩放);
//---设置输出主机向量
推力:主机向量h向量(d向量);
对于(int i=0;i,随着cuFFT回调功能的引入,通过将规范化操作定义为\uuu设备
函数,cuFFT执行的逆FFT所需的规范化可以直接嵌入cufftExecC2C
调用中
除了《cuFFT用户指南》,有关cuFFT回调功能,请参阅
下面是通过cuFFT回调实现IFFT规范化的示例
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 16;
cufftHandle plan;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
for (int i=0; i<N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));
cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}
#包括
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
/********************/
/*CUDA错误检查*/
/********************/
#定义gpuerchk(ans){gpuAssert((ans),_文件_,_行__)}
内联void gpuAssert(cudaError\u t代码,char*文件,int行,bool abort=true)
{
如果(代码!=cudaSuccess)
{
fprintf(标准,“GPUassert:%s%s%d\n”,cudaGetErrorString(代码)、文件、行);
如果(中止)退出(代码);
}
}
/*********************/
/*袖口错误检查*/
/*********************/
//看http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef_CUFFT_H_
静态常量字符*_cudaGetErrorEnum(袖套结果错误)
{
开关(错误)
{
成功案例:
返回“CUFFT_SUCCESS”;
案例卡夫图无效:
返回“袖口计划无效”;
案例CUFFT_ALLOC_失败:
返回“CUFFT_ALLOC_失败”;
箱子袖口类型无效:
返回“袖口类型无效”;
case CUFT_无效_值:
返回“CUFFT\u无效值”;
案例袖口内部错误:
返回“CUFFT_内部错误”;
案例CUFFT_EXEC_失败:
返回“CUFFT_EXEC_失败”;
案例袖口安装失败:
返回“袖口安装失败”;
箱子袖口尺寸无效:
返回“袖口尺寸无效”;
案例袖口未对齐数据:
返回“袖口未对齐数据”;
}
返回“”;
}
#恩迪夫
#定义cufftSafeCall(err)\ cufftSafeCall(err、\文件、\行)
内联void\uuu cufftSafeCall(cufftResult err、const char*文件、const int行)
{
如果(成功!=错误){
fprintf(stderr,“文件“%s”中的袖口错误,第%d行\n%s\n错误%d:%s\n正在删除!\n“,\uuuuu文件,\uuuu行\uuuu,错误\
_cudaGetErrorEnum(err))\
cudaDeviceReset();断言(0)\
}
}
__设备\uuuuu无效IFFT\u缩放(无效*数据输出、大小\uu t偏移、袖口复杂元素、无效*调用者信息、无效*共享DPTR){
float*比例因子=(float*)调用者信息;
浮动2输出;
输出x=cucreaf(元素);
输出y=cuCimagf(元素);
output.x=output.x/scaling\u
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
#include <thrust/device_vector.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define DISPLAY
/*******************************/
/* THRUST FUNCTOR IFFT SCALING */
/*******************************/
class Scale_by_constant
{
private:
float c_;
public:
Scale_by_constant(float c) { c_ = c; };
__host__ __device__ float2 operator()(float2 &a) const
{
float2 output;
output.x = a.x / c_;
output.y = a.y / c_;
return output;
}
};
/**********************************/
/* IFFT SCALING CALLBACK FUNCTION */
/**********************************/
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 100000000;
cufftHandle plan; cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
TimingGPU timerGPU;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
// --- Callback function parameters
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
// --- Initializing the input on the host and moving it to the device
for (int i = 0; i < N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N * sizeof(float2), cudaMemcpyHostToDevice));
// --- Execute direct FFT on the device and move the results to the host
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output1, d_output1, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
#endif
// --- Execute inverse FFT with subsequent scaling on the device and move the results to the host
timerGPU.StartCounter();
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
thrust::transform(thrust::device_pointer_cast(d_output2), thrust::device_pointer_cast(d_output2) + N, thrust::device_pointer_cast(d_output2), Scale_by_constant((float)(N)));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
printf("Timing NO callback %f\n", timerGPU.GetCounter());
// --- Setup store callback
// timerGPU.StartCounter();
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
// --- Execute inverse callback FFT on the device and move the results to the host
timerGPU.StartCounter();
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
printf("Timing callback %f\n", timerGPU.GetCounter());
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}
Non-callback 69.029762 ms
Callback 65.868607 ms