Filter CUDA中的FIR滤波器(作为一维卷积)

Filter CUDA中的FIR滤波器(作为一维卷积),filter,cuda,signal-processing,Filter,Cuda,Signal Processing,我试图在CUDA中实现FIR(有限脉冲响应)滤波器。我的方法非常简单,看起来有点像这样: #包括 __全局无效过滤器数据(常量浮点*数据, 常量浮点*d_分子, 浮动*d_过滤器数据, 常量整数长度, 常量整型过滤器(最大长度) { int i=blockDim.x*blockIdx.x+threadIdx.x; 浮动总和=0.0f; 如果(i

我试图在CUDA中实现FIR(有限脉冲响应)滤波器。我的方法非常简单,看起来有点像这样:

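#include  // (头文件名在原文中已丢失)
__global__ void filterData(const float *d_data,
                           const float *d_numerator,
                           float *d_filteredData,
                           const int numeratorLength,
                           const int filteredDataLength)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float sum = 0.0f;
    if (i  // (原文的代码在此处被截断)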
这个过滤器可以工作,但我是CUDA编程新手,不知道如何优化它

我看到的一个小问题是,`dataLength`、`filteredDataLength` 和 `numeratorLength` 在我打算使用该滤波器的应用程序中事先是未知的。此外,即使在上述代码中 `dataLength` 是 `32` 的倍数,也不能保证在最终应用中仍然如此。

当我将上面的代码与ArrayFire进行比较时,执行代码所需的时间大约是ArrayFire的三倍

有人对如何加快速度有什么想法吗


编辑:已将所有 `filterLength` 更改为 `numeratorLength`。

我可以提出以下几点建议来加快代码速度:

  • 使用共享内存:它是一块很小的、类似缓存的存储器,但比显卡的全局内存快得多。你可以在 CUDA 文档中查找 `__shared__` 关键字来了解它。例如,你可以把滤波器分子(系数)和大块数据预取到共享内存中,这会显著提升性能。在这种情况下需要特别注意数据对齐,因为它非常重要,处理不当反而会拖慢你的代码。
  • 考虑展开对分子求和的 for 循环。你可以查看 CUDA 文档中的向量归约(reduce vector)示例。
  • 你还可以考虑将分子循环本身并行化。这可以通过给线程块增加一个额外的维度(比如 “y”)来实现。你还需要把 sum 改成一个长度为 numeratorLength 的共享向量。同样可以参考向量归约示例,了解如何在最后快速计算该向量的和。

  • 您正试图通过CUDA内核直接计算1D卷积来计算滤波器输出

    在滤波器脉冲响应持续时间较长的情况下,你可以借助 FFT 直接在共轭域(频域)中进行计算,从而得到滤波后的输入信号。下面给出一个使用 CUDA Thrust 和 cuFFT 库的示例代码。它是对一个基于 Matlab 的示例(原文此处的链接已丢失)的直接翻译。

    需要说明的是,这段代码还可以做一些优化,但我更倾向于保持原样,以便与 Matlab 的对应代码进行比较。

    #include <stdio.h>
    #include <math.h>
    
    #include <cufft.h>
    
    #include <thrust\device_vector.h>
    #include <thrust\sequence.h>
    
    #define pi_f  3.14159265358979f                 // Greek pi in single precision
    
    /****************/
    /* SIN OPERATOR */
    /****************/
    // Unary functor mapping a sample index x to sin(2*pi*x*fk/Fs), i.e. one
    // sample of a sinusoid of frequency fk at sampling rate Fs.
    class sin_op {

        float fk_, Fs_;     // tone frequency and sampling rate, same units

        public:

            // Stores the tone frequency fk and the sampling rate Fs.
            sin_op(float fk, float Fs) : fk_(fk), Fs_(Fs) {}

            // Returns the sinusoid evaluated at sample index x.
            __host__ __device__ float operator()(float x) const
            {
                const float phase = 2.f*pi_f*x*fk_/Fs_;
                return sin(phase);
            }
    };
    
    /*****************/
    /* SINC OPERATOR */
    /*****************/
    // Unary functor evaluating (2*fc/Fs) * sin(a)/a with a = 2*pi*fc*x/Fs,
    // using the limit value 2*fc/Fs at x == 0.
    class sinc_op {

        float fc_, Fs_;     // cutoff frequency and sampling rate, same units

        public:

            // Stores the cutoff frequency fc and the sampling rate Fs.
            sinc_op(float fc, float Fs) : fc_(fc), Fs_(Fs) {}

            // Returns the functor value at (possibly negative) sample offset x.
            __host__ __device__ float operator()(float x) const
            {
                const float gain = 2.f*fc_/Fs_;

                // sin(a)/a is 0/0 at x == 0; return the limit instead.
                if (x == 0.f) return gain;

                const float arg = 2.f*pi_f*fc_*x/Fs_;
                return gain*sin(arg)/arg;
            }
    };
    
    /********************/
    /* HAMMING OPERATOR */
    /********************/
    // Unary functor returning the Hamming window coefficient
    // 0.54 - 0.46*cos(2*pi*x/(L-1)) for sample index x of an L-point window.
    class hamming_op {

        int L_;     // window length in samples

        public:

            // Stores the window length L.
            hamming_op(int L) : L_(L) {}

            // Returns the window coefficient for sample index x.
            __host__ __device__ float operator()(int x) const
            {
                // A one-point window would divide by zero below (0/0 -> NaN);
                // its single coefficient is 1.
                if (L_ == 1) return 1.f;

                // Single-precision literals and cosf keep the whole expression in
                // float; bare 0.54/0.46 would promote the computation to double.
                return 0.54f - 0.46f*cosf(2.f*pi_f*x/(L_-1));
            }
    };
    
    
    /*********************************/
    /* MULTIPLY CUFFTCOMPLEX NUMBERS */
    /*********************************/
    // Binary functor returning the complex product a*b of two cufftComplex
    // values (x is the real part, y the imaginary part). Device-only.
    struct multiply_cufftComplex {
        __device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
            const float re = a.x * b.x - a.y * b.y;
            const float im = a.x * b.y + a.y * b.x;

            cufftComplex product;
            product.x = re;
            product.y = im;
            return product;
        }
    };
    
    /********/
    /* MAIN */
    /********/
    // Builds a test signal (sum of four sinusoids), designs a windowed-sinc
    // low-pass FIR filter, and filters the signal by multiplying the spectra of
    // the zero-padded signal and impulse response (FFT convolution).
    // Returns 0 on success and 1 if any cuFFT call fails.
    int main(){

        // Signal parameters:
        int M = 256;                            // signal length
        const int N = 4;
        float f[N] = { 440, 880, 1000, 2000 };              // frequencies
        float Fs = 5000.f;                      // sampling rate

        // Generate a signal by adding up sinusoids:
        thrust::device_vector<float> d_x(M,0.f);            // pre-allocate 'accumulator'
        thrust::device_vector<float> d_n(M);                // discrete-time grid
        thrust::sequence(d_n.begin(), d_n.end(), 0, 1);

        thrust::device_vector<float> d_temp(M);
        for (int i=0; i<N; i++) {
            float fk = f[i];
            thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
            thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>());
        }

        // Filter parameters:
        int L = 257;                        // filter length
        float fc = 600.f;                   // cutoff frequency

        // Design the filter using the window method:
        thrust::device_vector<float> d_hsupp(L);
        thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
        thrust::device_vector<float> d_hideal(L);
        thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
        thrust::device_vector<float> d_l(L);
        thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
        thrust::device_vector<float> d_h(L);
        thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
        // h is our filter
        thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());

        // --- Smallest power of 2 that is >= L+M-1 (the linear convolution length).
        //     Computed with integers so float rounding in pow/log2 cannot shrink it.
        int Nfft = 1;
        while (Nfft < L+M-1) Nfft <<= 1;

        // Zero pad the signal and impulse response:
        thrust::device_vector<float> d_xzp(Nfft,0.f);
        thrust::device_vector<float> d_hzp(Nfft,0.f);
        thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
        thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());

        // One plan per direction, so that neither handle is overwritten and both
        // can be destroyed at the end.
        cufftHandle planForward, planInverse;
        if (cufftPlan1d(&planForward, Nfft, CUFFT_R2C, 1) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlan1d (R2C) failed\n");
            return 1;
        }
        if (cufftPlan1d(&planInverse, Nfft, CUFFT_C2R, 1) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlan1d (C2R) failed\n");
            cufftDestroy(planForward);
            return 1;
        }

        int status = 0;

        // Transform the signal and the filter:
        thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
        thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
        if (cufftExecR2C(planForward, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data())) != CUFFT_SUCCESS ||
            cufftExecR2C(planForward, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data())) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftExecR2C failed\n");
            status = 1;
        }

        thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
        thrust::device_vector<float> d_y(Nfft);

        if (status == 0) {
            // Pointwise product of the spectra == circular convolution in time.
            thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());

            if (cufftExecC2R(planInverse, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data())) != CUFFT_SUCCESS) {
                fprintf(stderr, "cufftExecC2R failed\n");
                status = 1;
            }
        }

        if (status == 0) {
            // cuFFT transforms are unnormalized: a forward transform followed by an
            // inverse one scales the data by Nfft, so divide it back out.
            thrust::device_vector<float> d_scale(Nfft, 1.f/Nfft);
            thrust::transform(d_y.begin(), d_y.end(), d_scale.begin(), d_y.begin(), thrust::multiplies<float>());
        }

        cufftDestroy(planForward);
        cufftDestroy(planInverse);

        getchar();

        return status;
    }
    
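    #include <stdio.h>
    #include <math.h>
    #include <cufft.h>
    #include <thrust\device_vector.h>
    #include <thrust\sequence.h>
    #define pi_f  3.14159265358979f                 // Greek pi in single precision
    /****************/
    /* SIN OPERATOR */
    /****************/
    class sin_op {
    float fk_, Fs_;
    public:
    sin_op(float fk, float Fs) { fk_ = fk; Fs_ = Fs; }
    __host__ __device__ float operator()(float x) const { return sin(2.f*pi_f*x*fk_/Fs_); }
    };
    /*****************/
    /* SINC OPERATOR */
    /*****************/
    class sinc_op {
    float fc_, Fs_;
    public:
    sinc_op(float fc, float Fs) { fc_ = fc; Fs_ = Fs; }
    __host__ __device__ float operator()(float x) const
    {
    if (x==0)   return (2.f*fc_/Fs_);
    else            return (2.f*fc_/Fs_)*sin(2.f*pi_f*fc_*x/Fs_)/(2.f*pi_f*fc_*x/Fs_);
    }
    };
    /********************/
    /* HAMMING OPERATOR */
    /********************/
    class hamming_op {
    int L_;
    public:
    hamming_op(int L) { L_ = L; }
    __host__ __device__ float operator()(int x) const
    {
    return 0.54-0.46*cos(2.f*pi_f*x/(L_-1));
    }
    };
    /*********************************/
    /* MULTIPLY CUFFTCOMPLEX NUMBERS */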
    
    #include <stdio.h>
    #include <stdlib.h>
    
    #include "TimingGPU.cuh"
    #include "Utilities.cuh"
    
    #define RG          10
    #define BLOCKSIZE   8
    
    /****************/
    /* CPU FUNCTION */
    /****************/
    // Host reference: centred 1D convolution of an N-sample signal with a
    // K-tap kernel. Output i accumulates h_Signal[i - K/2 + j] * h_ConvKernel[j]
    // over j in [0, K); taps falling outside [0, N) contribute nothing.
    void h_convolution_1D(const float * __restrict__ h_Signal, const float * __restrict__ h_ConvKernel, float * __restrict__ h_Result_CPU, 
                          const int N, const int K) {

        const int halfWidth = K / 2;

        for (int outIdx = 0; outIdx < N; ++outIdx) {

            // Signal index that lines up with tap 0 for this output sample.
            const int firstSample = outIdx - halfWidth;
            float acc = 0.f;

            for (int tap = 0; tap < K; ++tap) {
                const int sampleIdx = firstSample + tap;

                // Skip taps that would read before the start or past the end.
                if (sampleIdx < 0 || sampleIdx >= N) continue;

                acc += h_Signal[sampleIdx] * h_ConvKernel[tap];
            }

            h_Result_CPU[outIdx] = acc;
        }
    }
    
    /********************/
    /* BASIC GPU KERNEL */
    /********************/
    // Direct 1D convolution, one thread per output sample.
    // Expects a 1D launch with at least N threads in total; surplus threads in
    // the last block exit without touching memory.
    //   d_Signal      input signal, N floats
    //   d_ConvKernel  convolution kernel, K floats
    //   d_Result_GPU  output, N floats
    // Output i accumulates d_Signal[i - K/2 + j] * d_ConvKernel[j] over j in
    // [0, K); taps falling outside [0, N) contribute nothing.
    __global__ void d_convolution_1D_basic(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU, 
                                           const int N, const int K) {

        const int i = blockIdx.x * blockDim.x + threadIdx.x;

        // The grid is rounded up to whole blocks, so threads past the end of the
        // signal have no output element and must not write.
        if (i >= N) return;

        float temp = 0.f;

        const int N_start_point = i - (K / 2);

        for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
            temp += d_Signal[N_start_point+ j] * d_ConvKernel[j];
        }

        d_Result_GPU[i] = temp;
    }
    
    /***************************/
    /* GPU KERNEL WITH CACHING */
    /***************************/
    // 1D convolution, one thread per output sample, with the block's own slice
    // of the signal staged in shared memory.
    // Expects a 1D launch with blockDim.x == BLOCKSIZE (the shared tile holds
    // exactly BLOCKSIZE floats) and at least N threads in total.
    //   d_Signal      input signal, N floats
    //   d_ConvKernel  convolution kernel, K floats
    //   d_Result_GPU  output, N floats
    // Samples inside the block's tile are read from shared memory; samples that
    // belong to neighbouring blocks are read from global memory.
    __global__ void d_convolution_1D_caching(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU, 
                                             const int N, const int K) {

        const int i = blockIdx.x * blockDim.x + threadIdx.x;

        __shared__ float d_Tile[BLOCKSIZE];

        // Threads past the end of the signal have nothing to load; they store a
        // zero that is never read (tile reads below are restricted to indices < N).
        d_Tile[threadIdx.x] = (i < N) ? d_Signal[i] : 0.f;

        // Every thread of the block, including the surplus ones, must reach the
        // barrier, so the bounds check comes after it.
        __syncthreads();

        if (i >= N) return;

        float temp = 0.f;

        const int N_start_point = i - (K / 2);

        // Global index range [tileBegin, tileEnd) covered by this block's tile.
        const int tileBegin = blockIdx.x * blockDim.x;
        const int tileEnd   = tileBegin + blockDim.x;

        for (int j = 0; j < K; j++) {

            const int s = N_start_point + j;

            // Taps outside the signal contribute nothing.
            if (s < 0 || s >= N) continue;

            if (s >= tileBegin && s < tileEnd)

                // --- The signal element is in the tile loaded in the shared memory
                temp += d_Tile[s - tileBegin] * d_ConvKernel[j];

            else

                // --- The signal element is not in the tile loaded in the shared memory
                temp += d_Signal[s] * d_ConvKernel[j];

        }

        d_Result_GPU[i] = temp;
    }
    
    /********/
    /* MAIN */
    /********/
    // Runs both GPU convolution kernels on a small random signal and compares
    // each result with the CPU reference. Returns 0 if both match and 1 on a
    // mismatch or a host allocation failure. The caching test is skipped if the
    // basic test fails.
    int main(){

        const int N = 15;           // --- Signal length
        const int K = 5;            // --- Convolution kernel length

        float *h_Signal         = (float *)malloc(N * sizeof(float));
        float *h_Result_CPU     = (float *)malloc(N * sizeof(float));
        float *h_Result_GPU     = (float *)malloc(N * sizeof(float));
        float *h_ConvKernel     = (float *)malloc(K * sizeof(float));

        if (h_Signal == NULL || h_Result_CPU == NULL || h_Result_GPU == NULL || h_ConvKernel == NULL) {
            fprintf(stderr, "Host memory allocation failed\n");
            free(h_Signal); free(h_Result_CPU); free(h_Result_GPU); free(h_ConvKernel);
            return 1;
        }

        float *d_Signal;        gpuErrchk(cudaMalloc(&d_Signal,     N * sizeof(float)));
        float *d_Result_GPU;    gpuErrchk(cudaMalloc(&d_Result_GPU, N * sizeof(float)));
        float *d_ConvKernel;    gpuErrchk(cudaMalloc(&d_ConvKernel, K * sizeof(float)));

        // Inputs are small integers, so every product and partial sum is exactly
        // representable in float and CPU/GPU results can be compared with ==.
        for (int i=0; i < N; i++) { h_Signal[i] = (float)(rand() % RG); }

        for (int i=0; i < K; i++) { h_ConvKernel[i] = (float)(rand() % RG); }

        gpuErrchk(cudaMemcpy(d_Signal,      h_Signal,       N * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_ConvKernel,  h_ConvKernel,   K * sizeof(float), cudaMemcpyHostToDevice));

        h_convolution_1D(h_Signal, h_ConvKernel, h_Result_CPU, N, K);

        int status = 0;

        // --- Basic kernel
        d_convolution_1D_basic<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {
            // The values are floats, so they are printed with %f (passing them to %d is undefined behavior).
            printf("mismatch2 at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]);
            status = 1;
            break;
        }

        if (status == 0) {

            printf("Test basic passed\n");

            // --- Caching kernel
            d_convolution_1D_caching<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

            for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {
                printf("mismatch2 at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]);
                status = 1;
                break;
            }

            if (status == 0) printf("Test caching passed\n");
        }

        // Release device and host buffers on every exit path.
        gpuErrchk(cudaFree(d_Signal));
        gpuErrchk(cudaFree(d_Result_GPU));
        gpuErrchk(cudaFree(d_ConvKernel));

        free(h_Signal);
        free(h_Result_CPU);
        free(h_Result_GPU);
        free(h_ConvKernel);

        return status;
    }