CUDA中3D矩阵的列和行的1D FFT_Cuda_Cufft

CUDA中3D矩阵的列和行的1D FFT

cuda

CUDA中3D矩阵的列和行的1D FFT,cuda,cufft,Cuda,Cufft,我正在尝试使用 cuFFT 计算批处理的1D FFT。数据集来自一个3D场，存储在一个1D数组中，我想在其中计算x和y方向上的1D FFT。数据存储如下图所示：先沿x、然后y、然后z连续存放。在x方向进行批量FFT（我相信）是非常直接的；通过输入stride=1，distance=nx和batch=ny*nz，它计算元素{0,1,2,3}，{4,5,6,7}，..，{28,29,30,31}上的FFT。但是，我想不出一种方法可以在y方向上为FFT实现同样的效果。每个xy平面的批处理也很简单（输入stride=nx，d

我正在尝试使用 cuFFT（cufftPlanMany）计算批处理的1D FFT。数据集来自一个3D场，存储在一个1D数组中，我想在其中计算 x 和 y 方向上的1D FFT。数据存储如下图所示：先沿 x，然后 y，然后 z 连续存放。

在 x 方向进行批量FFT（我相信）是非常直接的；通过输入

stride=1

，

distance=nx

和

batch=ny*nz

，它计算元素

{0,1,2,3}

，

{4,5,6,7}

，

..

，

{28,29,30,31}

上的FFT。但是，我想不出一种方法可以在 y 方向上为FFT实现同样的效果。每个

xy

平面的批处理也很简单（输入

stride=nx

，

dist=1

，

batch=nx

结果是

{0,4,8,12}

，

{1,5,9,13}

等）。但是当

batch=nx*nz

从

{3,7,11,15}

到

{16,20,24,28}

时，距离大于 1。用 cufftPlanMany 能做到这一点吗？

我认为对你的问题的简短回答（使用单个

cufftPlanMany

对3D矩阵的列执行1D FFT的可能性）是否定的

事实上，按如下方式调用 cufftPlanMany 时，

cufftPlanMany(&handle, rank, n, 
              inembed, istride, idist,
              onembed, ostride, odist, CUFFT_C2C, batch);

所执行的变换必须遵守高级数据布局（Advanced Data Layout）。特别是，1D FFT根据以下布局进行计算

input[b * idist + x * istride]

其中，b 表示第 b 个信号，

istride

表示同一信号中两个连续项目之间的距离。如果3D矩阵具有尺寸

M*N*Q

，并且如果要沿列执行1D变换，则两个连续元素之间的距离将为 M，而两个连续信号之间的距离将为 1。此外，批处理执行的数量必须设置为 M。使用这些参数，您只能覆盖3D矩阵的一个切片。事实上，如果您尝试把批处理数量增加到超过 M，那么 cuFFT 将开始尝试从第二行开始计算新的按列FFT。这个问题的唯一解决方案是迭代调用 cufftExecC2C，以覆盖所有 Q 个切片。

下面的代码提供了一个完整的示例，说明如何对3D矩阵的列执行1D FFT

#include <cstdio>
#include <cstdlib>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the call site.
// The do { ... } while (0) form makes the macro expand to exactly one statement,
// so `if (c) gpuErrchk(x); else ...` parses correctly (a bare { ... } block
// followed by ';' would terminate the `if` and orphan the `else`).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (default) the process exits with `code` as its status;
//           when false the error is only reported and execution continues
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Aborts with a diagnostic when a cuFFT call does not return CUFFT_SUCCESS.
//   status - value returned by the cuFFT call
//   what   - name of the call, used only in the message
//   line   - source line of the call site
static void cufftAssert(cufftResult status, const char *what, int line)
{
    if (status != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFT error: %s returned status %d at line %d\n", what, (int)status, line);
        exit(EXIT_FAILURE);
    }
}

// Example: 1D FFTs along the columns (length N, stride M) of an M x N x Q array
// stored with the M index varying fastest. One plan covers a single M x N slice
// (M transforms); the plan is executed once per slice, Q times in total.
int main() {

    const int M = 3;    // fastest-varying dimension; also the number of columns per slice
    const int N = 4;    // column length = size of each transform
    const int Q = 2;    // number of M x N slices

    thrust::host_vector<float2> h_matrix(M * N * Q);

    // Fill the input; the real part depends only on (j, k), so every column of a
    // slice holds the same ramp.
    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) {
                float2 temp;
                temp.x = (float)(j + k * M); 
                //temp.x = 1.f; 
                temp.y = 0.f;
                h_matrix[k*M*N+j*M+i] = temp;
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }
    printf("\n");

    thrust::device_vector<float2> d_matrix(h_matrix);

    thrust::device_vector<float2> d_matrix_out(M * N * Q);

    // --- Advanced data layout
    //     input[b * idist + x * istride]
    //     output[b * odist + x * ostride]
    //     b = signal number
    //     x = element of the b-th signal

    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { N };                        // --- Size of the Fourier transform
    int istride = M, ostride = M;           // --- Distance between two successive input/output elements
    int idist = 1, odist = 1;               // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = M;                          // --- Number of batched executions
    cufftAssert(cufftPlanMany(&handle, rank, n, 
                              inembed, istride, idist,
                              onembed, ostride, odist, CUFFT_C2C, batch),
                "cufftPlanMany", __LINE__);

    cufftComplex *d_in  = (cufftComplex*)thrust::raw_pointer_cast(d_matrix.data());
    cufftComplex *d_out = (cufftComplex*)thrust::raw_pointer_cast(d_matrix_out.data());

    // The plan only spans one slice, so advance both pointers by M * N per slice.
    for (int k=0; k<Q; k++)
        cufftAssert(cufftExecC2C(handle, d_in + k * M * N, d_out + k * M * N, CUFFT_FORWARD),
                    "cufftExecC2C", __LINE__);

    // Surface any execution error here rather than at the later copy-back.
    gpuErrchk(cudaDeviceSynchronize());
    cufftAssert(cufftDestroy(handle), "cufftDestroy", __LINE__);

    // One bulk device-to-host copy; indexing the device_vector directly would
    // issue a separate transfer for every element read.
    thrust::host_vector<float2> h_matrix_out(d_matrix_out);

    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) { 
                float2 temp = h_matrix_out[k*M*N+j*M+i];
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }

    return 0;
}

#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the call site.
// The do { ... } while (0) form makes the macro expand to exactly one statement,
// so `if (c) gpuErrchk(x); else ...` parses correctly (a bare { ... } block
// followed by ';' would terminate the `if` and orphan the `else`).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (default) the process exits with `code` as its status;
//           when false the error is only reported and execution continues
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Aborts with a diagnostic when a cuFFT call does not return CUFFT_SUCCESS.
//   status - value returned by the cuFFT call
//   what   - name of the call, used only in the message
//   line   - source line of the call site
static void cufftAssert(cufftResult status, const char *what, int line)
{
    if (status != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFT error: %s returned status %d at line %d\n", what, (int)status, line);
        exit(EXIT_FAILURE);
    }
}

// Example: 1D FFTs along the rows (length M, contiguous) of an M x N x Q array
// stored with the M index varying fastest. Rows are M elements apart throughout
// the whole array, so a single plan with batch = N * Q covers every row and one
// execution is enough.
int main() {

    const int M = 3;    // fastest-varying dimension = row length = size of each transform
    const int N = 4;    // rows per slice
    const int Q = 2;    // number of M x N slices

    thrust::host_vector<float2> h_matrix(M * N * Q);

    // Fill the input; the real part depends only on (j, k), so each row is constant.
    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) {
                float2 temp;
                temp.x = (float)(j + k * M); 
                //temp.x = 1.f; 
                temp.y = 0.f;
                h_matrix[k*M*N+j*M+i] = temp;
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }
    printf("\n");

    thrust::device_vector<float2> d_matrix(h_matrix);

    thrust::device_vector<float2> d_matrix_out(M * N * Q);

    // --- Advanced data layout
    //     input[b * idist + x * istride]
    //     output[b * odist + x * ostride]
    //     b = signal number
    //     x = element of the b-th signal

    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { M };                        // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = M, odist = M;               // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = N * Q;                      // --- Number of batched executions
    cufftAssert(cufftPlanMany(&handle, rank, n, 
                              inembed, istride, idist,
                              onembed, ostride, odist, CUFFT_C2C, batch),
                "cufftPlanMany", __LINE__);

    cufftComplex *d_in  = (cufftComplex*)thrust::raw_pointer_cast(d_matrix.data());
    cufftComplex *d_out = (cufftComplex*)thrust::raw_pointer_cast(d_matrix_out.data());

    cufftAssert(cufftExecC2C(handle, d_in, d_out, CUFFT_FORWARD), "cufftExecC2C", __LINE__);

    // Surface any execution error here rather than at the later copy-back.
    gpuErrchk(cudaDeviceSynchronize());
    cufftAssert(cufftDestroy(handle), "cufftDestroy", __LINE__);

    // One bulk device-to-host copy; indexing the device_vector directly would
    // issue a separate transfer for every element read.
    thrust::host_vector<float2> h_matrix_out(d_matrix_out);

    for (int k=0; k<Q; k++) 
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) { 
                float2 temp = h_matrix_out[k*M*N+j*M+i];
                printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); 
            }

    return 0;
}

我猜idist=nx*nz也可以跳过整个平面，batch=nz将覆盖一个yx平面。应根据nx或nz是否较大作出决定

谢谢，您的解决方案或多或少与我们目前的做法一致。有趣的是，对于相对较小的问题（例如64^3，但似乎高达~256^3），在水平方向上对域进行转置，以便我们也可以在y方向上对整个场进行分批FFT，与每片分批FFT（定时包括转置）相比，似乎提供了巨大的加速。我将进一步测试它，并尝试制作一个最小的示例，并将其发布在这里。