如何使用cuFFT执行实数到复数（R2C）的变换_C_Cuda_Cufft

如何使用cuFFT执行实数到复数（R2C）的变换

c cuda

如何使用cuFFT执行从实到复杂的转换,c,cuda,cufft,C,Cuda,Cufft,以下代码已从改编为适用于使用cufftPlan1d的单个1D转换。最终，我希望执行批处理就地R2C转换，但下面的代码使用单独的输入和输出数组执行单个转换如何调整此代码以执行就地转换，从而减少设备上分配的内存量谢谢 CUDA6.5-注意：我正在运行Matlab2015a中的mexFunction代码代码： #包括 #包括 #包括 #包括 #定义数据大小8 #定义批次1 #定义gpuerchk（ans）{gpuAssert（（ans），_文件_，_行__）} 内联void gpuAssert（

以下代码改编自另一个示例，适用于使用cufftPlan1d执行单个一维变换。最终，我希望执行批处理的就地（in-place）R2C变换，但下面的代码使用单独的输入和输出数组执行单个变换。

如何调整此代码以执行就地变换，从而减少设备上分配的内存量？

谢谢
CUDA6.5-注意：我正在运行Matlab2015a中的mexFunction代码

代码：

#包括
#包括
#包括
#包括
#define DATASIZE 8
#define BATCH 1
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
void main（整型argc，字符**argv）
{   
//---主机端输入数据分配和初始化
cufftReal*hostInputData=（cufftReal*）malloc（DATASIZE*sizeof（cufftReal））；
对于（int j=0；j结果的主副本）
gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, (DATASIZE / 2 + 1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
对于（int j=0；j，解决方案已在另一个答案中给出：
例如，这意味着：
将输入分配为cufftComplex：
// Single device buffer shared by the real input and the complex output of the
// in-place transform. It is declared as cufftComplex and sized for DATASIZE
// complex values, which is more than the DATASIZE/2+1 complex bins that are
// later copied back from it.
cufftComplex *deviceInputData;
gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * sizeof(cufftComplex)));
// Only DATASIZE real samples are uploaded; they occupy the start of the buffer.
// The copy is error-checked like the allocation above so that a failed upload
// is not silently ignored.
gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, DATASIZE * sizeof(cufftReal), cudaMemcpyHostToDevice));

就地变换：
// In-place real-to-complex transform: the same device buffer is passed as the
// real input (through a cufftReal* cast) and as the complex output.
// NOTE(review): cufftStatus is assigned here but not inspected in this snippet;
// callers should compare it against CUFFT_SUCCESS before using the result.
cufftStatus = cufftExecR2C(handle,  (cufftReal *)deviceInputData, deviceInputData);
// Copy the DATASIZE/2+1 complex output bins back to the host.
gpuErrchk(cudaMemcpy(hostOutputData, deviceInputData, (DATASIZE / 2 + 1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

顺便说一句：MATLAB还包含一个GPU加速版本的fft（），也许这对您也很有用：
这是我自己的完整解决方案，从cufftReal开始
/*
 * Runs one in-place real-to-complex 1D FFT of length DATASIZE with cuFFT and
 * prints the DATASIZE/2+1 complex output bins through mexPrintf.
 *
 * x : host input; the first DATASIZE values are read and narrowed to cufftReal.
 * y : not used by this routine (kept so the signature stays unchanged).
 * n : not used by this routine; the transform length is fixed by DATASIZE.
 *     NOTE(review): presumably n is the length of x - confirm that callers
 *     always pass at least DATASIZE elements.
 *
 * CUDA runtime failures are handled by gpuErrchk. cuFFT failures are reported
 * with mexPrintf and the results are then neither copied back nor printed.
 * All host and device buffers are released before returning.
 */
void process(double *x, double *y, size_t n){
    (void)y;
    (void)n;

    // An R2C transform of N real samples yields N/2+1 complex bins. Sizing the
    // device buffer for the complex output lets the transform run in place.
    const size_t outputCount = (size_t)(DATASIZE / 2 + 1);
    const size_t bufferBytes = outputCount * BATCH * sizeof(cufftComplex);

    // --- Host side input/output allocation
    cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE * sizeof(cufftReal));
    cufftComplex *hostOutputData = (cufftComplex*)malloc(bufferBytes);
    if (hostInputData == NULL || hostOutputData == NULL) {
        mexPrintf("host allocation failed!");
        free(hostInputData);   // free(NULL) is a no-op
        free(hostOutputData);
        return;
    }
    for (int j = 0; j < DATASIZE; j++) hostInputData[j] = (cufftReal)x[j];

    // --- Device side buffer shared by the real input and the complex output
    cufftReal *deviceData = NULL;
    gpuErrchk(cudaMalloc((void**)&deviceData, bufferBytes));
    gpuErrchk(cudaMemcpy(deviceData, hostInputData, DATASIZE * sizeof(cufftReal), cudaMemcpyHostToDevice));

    // cuFFT calls return cufftResult, so they are compared against
    // CUFFT_SUCCESS rather than the CUDA runtime's cudaSuccess.
    cufftHandle handle;
    cufftResult cufftStatus = cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
    if (cufftStatus != CUFFT_SUCCESS) {
        mexPrintf("cufftPlan1d failed!");
    } else {
        cufftStatus = cufftExecR2C(handle, deviceData, (cufftComplex*)deviceData);
        if (cufftStatus != CUFFT_SUCCESS) {
            mexPrintf("cufftExecR2C failed!");
        } else {
            // --- Device->Host copy of the results
            gpuErrchk(cudaMemcpy(hostOutputData, deviceData, outputCount * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

            for (int j = 0; j < (DATASIZE / 2 + 1); j++)
                mexPrintf("%i %f %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
        }
        // The plan exists on this path, so it is destroyed even if exec failed.
        cufftDestroy(handle);
    }

    gpuErrchk(cudaFree(deviceData));
    free(hostOutputData);
    free(hostInputData);
}

void过程（双*x，双*y，大小）{
//---主机端输入数据分配和初始化
cufftReal*hostInputData=（cufftReal*）malloc（DATASIZE*sizeof（cufftReal））；
对于（int j=0；j结果的主副本）
gpuErrchk(cudaMemcpy(hostOutputData, deviceData, (DATASIZE / 2 + 1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
对于（int j=0；jj）您会遇到什么错误？是编译时还是运行时？您使用的是哪一个CUDA版本？——CUDA 6.5。我已更新帖子以反映这一点。我还没有收到任何错误，只是还不知道原则上该如何做到这一点。应该如何填充cufftComplex数组，才能使（cufftReal*）数据的强制转换起作用？——@m.s. 我已经更新了这个问题，以便更严谨地回应您的评论。——您能否提供一个可编译、自包含且没有任何MATLAB依赖项的示例？请添加一个包含示例数据和内核启动的主函数。——@m.s. 不需要内核启动函数，变换由内置函数cufftExecR2C执行；唯一的MATLAB依赖项是mexPrintf函数，它可以随时替换为printf函数。我已将函数更改为main()。——谢谢，我最终自己也解决了这个问题，但做法略有不同：首先从一个cufftReal输入数据数组开始，然后分配一个大小为（DATASIZE/2+1）*sizeof（cufftComplex）的数组，以容纳变换产生的两个额外的float。
/*
 * Runs one in-place real-to-complex 1D FFT of length DATASIZE with cuFFT and
 * prints the DATASIZE/2+1 complex output bins through mexPrintf.
 *
 * x : host input; the first DATASIZE values are read and narrowed to cufftReal.
 * y : not used by this routine (kept so the signature stays unchanged).
 * n : not used by this routine; the transform length is fixed by DATASIZE.
 *     NOTE(review): presumably n is the length of x - confirm that callers
 *     always pass at least DATASIZE elements.
 *
 * CUDA runtime failures are handled by gpuErrchk. cuFFT failures are reported
 * with mexPrintf and the results are then neither copied back nor printed.
 * All host and device buffers are released before returning.
 */
void process(double *x, double *y, size_t n){
    (void)y;
    (void)n;

    // An R2C transform of N real samples yields N/2+1 complex bins. Sizing the
    // device buffer for the complex output lets the transform run in place.
    const size_t outputCount = (size_t)(DATASIZE / 2 + 1);
    const size_t bufferBytes = outputCount * BATCH * sizeof(cufftComplex);

    // --- Host side input/output allocation
    cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE * sizeof(cufftReal));
    cufftComplex *hostOutputData = (cufftComplex*)malloc(bufferBytes);
    if (hostInputData == NULL || hostOutputData == NULL) {
        mexPrintf("host allocation failed!");
        free(hostInputData);   // free(NULL) is a no-op
        free(hostOutputData);
        return;
    }
    for (int j = 0; j < DATASIZE; j++) hostInputData[j] = (cufftReal)x[j];

    // --- Device side buffer shared by the real input and the complex output
    cufftReal *deviceData = NULL;
    gpuErrchk(cudaMalloc((void**)&deviceData, bufferBytes));
    gpuErrchk(cudaMemcpy(deviceData, hostInputData, DATASIZE * sizeof(cufftReal), cudaMemcpyHostToDevice));

    // cuFFT calls return cufftResult, so they are compared against
    // CUFFT_SUCCESS rather than the CUDA runtime's cudaSuccess.
    cufftHandle handle;
    cufftResult cufftStatus = cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
    if (cufftStatus != CUFFT_SUCCESS) {
        mexPrintf("cufftPlan1d failed!");
    } else {
        cufftStatus = cufftExecR2C(handle, deviceData, (cufftComplex*)deviceData);
        if (cufftStatus != CUFFT_SUCCESS) {
            mexPrintf("cufftExecR2C failed!");
        } else {
            // --- Device->Host copy of the results
            gpuErrchk(cudaMemcpy(hostOutputData, deviceData, outputCount * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

            for (int j = 0; j < (DATASIZE / 2 + 1); j++)
                mexPrintf("%i %f %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
        }
        // The plan exists on this path, so it is destroyed even if exec failed.
        cufftDestroy(handle);
    }

    gpuErrchk(cudaFree(deviceData));
    free(hostOutputData);
    free(hostInputData);
}