在CUDA中使用cuFFT进行模板匹配?

在CUDA中使用cuFFT进行模板匹配?,cuda,cufft,Cuda,Cufft,下面是我的代码,使用cuFFT中提供的FFT实现模板匹配任务。核心逻辑与此处提供的Matlab代码类似: Matlab代码: #include <iostream> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <math.h> #include <chrono> #include <math_functions.h> #include <bi

下面是我的代码,使用cufft中提供的FFT实现模板匹配任务。核心逻辑与此处提供的Matlab代码类似:

Matlab代码

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <chrono>
#include <math_functions.h>
#include <bitmap_image.hpp>
#include <cufft.h>
using namespace std;

// Evaluates a CUDA runtime call once. If the result is not cudaSuccess, prints the
// source line and the CUDA error string and makes the ENCLOSING function return -1,
// so it may only be used inside functions whose return type accepts -1.
// The argument is parenthesized so that any expression can be passed safely, and the
// last line carries no continuation backslash so the following source line can never
// be spliced into the macro body.
#define errorHandler(stmt)                                                                                  \
    do {                                                                                                    \
        cudaError_t errorHandler_status_ = (stmt);                                                          \
        if (errorHandler_status_ != cudaSuccess) {                                                          \
            printf("[ERROR] Failed to run stmt %d, error body: %s\n", __LINE__,                             \
                   cudaGetErrorString(errorHandler_status_));                                               \
            return -1; }                                                                                    \
    } while (0)

#define M_PI 3.14159265

#define BLOCK_SIZE 1024
typedef float2 Complex;

int initiate_parallel_template_matching(bitmap_image, bitmap_image);
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float);

// Padding functions
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size);

int get_number_of_occurances(cufftComplex * arr, unsigned int size);

// Program entry point: loads the scene bitmap and the template bitmap from the
// "Input Files" directory, runs the FFT-based matcher on them and returns the
// matcher's status as the process exit code (0 on success, -1 on failure).
int main()
{
    bitmap_image main_image("Input Files/collection.bmp");
    bitmap_image template_image("Input Files/collection_coin.bmp");

    // Keep the result instead of discarding it so failures are visible to the caller.
    const int status = initiate_parallel_template_matching(main_image, template_image);

    // Runs the shell command `pause` before exiting.
    system("pause");
    return status;
}

// Prints a diagnostic for a failed cuFFT call.
// Returns true when `result` is CUFFT_SUCCESS, false otherwise.
static bool cufft_call_succeeded(cufftResult result, const char *operation)
{
    if (result == CUFFT_SUCCESS)
        return true;

    printf("[ERROR] %s failed, cuFFT status: %d\n", operation, (int)result);
    return false;
}

// Flattens `image` into a malloc'ed array of width * height * 3 complex samples.
// Layout is row-major with interleaved channels: sample (row * width + col) * 3 + c
// holds channel c (0 = red, 1 = green, 2 = blue) in its real part; the imaginary
// part is 0. Returns NULL if the allocation fails. The caller owns the buffer and
// must release it with free().
static cufftComplex *image_to_complex_signal(bitmap_image &image, int width, int height)
{
    const size_t sample_count = (size_t)width * (size_t)height * 3;
    cufftComplex *signal = (cufftComplex *)malloc(sizeof(cufftComplex) * sample_count);
    if (signal == NULL)
        return NULL;

    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            rgb_t colors;
            image.get_pixel(col, row, colors);

            cufftComplex *pixel = signal + ((size_t)row * width + col) * 3;
            pixel[0].x = (float)colors.red;
            pixel[1].x = (float)colors.green;
            pixel[2].x = (float)colors.blue;
            pixel[0].y = 0.0f;
            pixel[1].y = 0.0f;
            pixel[2].y = 0.0f;
        }
    }

    return signal;
}

// Runs the FFT-based matching pipeline on the GPU:
//   1. converts both bitmaps to interleaved-RGB complex signals,
//   2. pads them to a common length with PadData,
//   3. forward-transforms both with cuFFT, multiplies the spectra point-wise
//      (scaled by 1 / length) and inverse-transforms the product,
//   4. takes the absolute value of each component on the host and hands the
//      result to get_number_of_occurances, which prints the count.
//
// Returns EXIT_SUCCESS on success and -1 on any failure (empty image, failed
// allocation, CUDA error or cuFFT error). Every resource acquired here is
// released on all return paths.
//
// NOTE(review): the images are transformed as one flat 1-D sequence
// (cufftPlan1d over interleaved RGB samples), not with a 2-D transform per
// channel. Confirm this is the intended formulation for 2-D template matching.
int initiate_parallel_template_matching(bitmap_image main_image, bitmap_image template_image)
{
    // Small owning wrappers. errorHandler returns from this function on failure,
    // so cleanup has to happen in destructors rather than at the end of the body.
    struct HostSignal {
        cufftComplex *ptr;
        HostSignal() : ptr(NULL) {}
        ~HostSignal() { free(ptr); }
    };
    struct DeviceSignal {
        cufftComplex *ptr;
        DeviceSignal() : ptr(NULL) {}
        ~DeviceSignal() { if (ptr != NULL) cudaFree(ptr); }
    };
    struct FftPlan {
        cufftHandle handle;
        bool created;
        FftPlan() : handle(0), created(false) {}
        ~FftPlan() { if (created) cufftDestroy(handle); }
    };

    // Get sizes
    const int main_width = main_image.width();
    const int main_height = main_image.height();
    const int template_width = template_image.width();
    const int template_height = template_image.height();

    if (main_width <= 0 || main_height <= 0 || template_width <= 0 || template_height <= 0) {
        printf("[ERROR] Main image and template image must both be non-empty\n");
        return -1;
    }

    const int main_signal_size = main_width * main_height * 3;
    const int template_signal_size = template_width * template_height * 3;

    // Build the host-side complex signals straight from the bitmaps.
    HostSignal h_main_signal;
    HostSignal h_template_signal;
    h_main_signal.ptr = image_to_complex_signal(main_image, main_width, main_height);
    h_template_signal.ptr = image_to_complex_signal(template_image, template_width, template_height);
    if (h_main_signal.ptr == NULL || h_template_signal.ptr == NULL) {
        printf("[ERROR] Failed to allocate host memory for the image signals\n");
        return -1;
    }

    // Pad image signals to a common length
    HostSignal h_padded_main_signal;
    HostSignal h_padded_template_signal;
    const int NEW_SIZE = PadData(h_main_signal.ptr, &h_padded_main_signal.ptr, main_signal_size,
                                 h_template_signal.ptr, &h_padded_template_signal.ptr, template_signal_size);
    if (NEW_SIZE <= 0 || h_padded_main_signal.ptr == NULL || h_padded_template_signal.ptr == NULL) {
        printf("[ERROR] Failed to pad the image signals\n");
        return -1;
    }

    const size_t padded_bytes = sizeof(cufftComplex) * (size_t)NEW_SIZE;

    DeviceSignal d_main_signal;
    DeviceSignal d_template_signal;
    DeviceSignal d_main_signal_out;
    DeviceSignal d_template_signal_out;
    DeviceSignal d_inversed;

    errorHandler(cudaMalloc((void**)&d_main_signal.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_template_signal.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_main_signal_out.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_template_signal_out.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_inversed.ptr, padded_bytes));
    errorHandler(cudaMemcpy(d_main_signal.ptr, h_padded_main_signal.ptr, padded_bytes, cudaMemcpyHostToDevice));
    errorHandler(cudaMemcpy(d_template_signal.ptr, h_padded_template_signal.ptr, padded_bytes, cudaMemcpyHostToDevice));

    // All three transforms have the same length, type and batch count, so a
    // single plan is created and reused for each execution.
    FftPlan plan;
    if (!cufft_call_succeeded(cufftPlan1d(&plan.handle, NEW_SIZE, CUFFT_C2C, 1), "cufftPlan1d"))
        return -1;
    plan.created = true;

    // Perform forward FFT
    if (!cufft_call_succeeded(cufftExecC2C(plan.handle, d_main_signal.ptr, d_main_signal_out.ptr, CUFFT_FORWARD),
                              "cufftExecC2C (forward, main image)"))
        return -1;
    if (!cufft_call_succeeded(cufftExecC2C(plan.handle, d_template_signal.ptr, d_template_signal_out.ptr, CUFFT_FORWARD),
                              "cufftExecC2C (forward, template)"))
        return -1;

    // Multiply the coefficients together and normalize the result
    printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
    const unsigned int block_count = ((unsigned int)NEW_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;  // integer ceil-div
    dim3 gridDimensions(block_count, 1, 1);
    dim3 blockDimensions(BLOCK_SIZE, 1, 1);

    ComplexPointwiseMulAndScale<<<gridDimensions, blockDimensions>>>(d_main_signal_out.ptr, d_template_signal_out.ptr, NEW_SIZE, 1.0f / NEW_SIZE);

    // Kernel launches do not return a status; launch errors surface here.
    errorHandler(cudaGetLastError());

    // Perform the inverse fft on the product
    if (!cufft_call_succeeded(cufftExecC2C(plan.handle, d_main_signal_out.ptr, d_inversed.ptr, CUFFT_INVERSE),
                              "cufftExecC2C (inverse)"))
        return -1;

    // Copy data back to host; the padded main buffer is no longer needed as
    // input, so it is reused to hold the result.
    cufftComplex *h_correlation_signal = h_padded_main_signal.ptr;
    errorHandler(cudaMemcpy(h_correlation_signal, d_inversed.ptr, padded_bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < NEW_SIZE; i++) {
        h_correlation_signal[i].x = fabsf(h_correlation_signal[i].x);
        h_correlation_signal[i].y = fabsf(h_correlation_signal[i].y);
    }

    get_number_of_occurances(h_correlation_signal, NEW_SIZE);

    // Plan, device buffers and host buffers are released by the wrappers above.
    return EXIT_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////
// Function for padding original data
//////////////////////////////////////////////////////////////////////////////////
// Builds zero-padded copies of `signal` and `filter_kernel`, both of length
// new_size = signal_size + (filter_kernel_size - filter_kernel_size / 2).
//
//   *padded_signal        : the signal followed by zeros.
//   *padded_filter_kernel : the second part of the kernel (from index
//                           filter_kernel_size / 2 onward) at the front, the
//                           first part at the very end, zeros in between.
//
// Both output buffers are allocated with malloc; the caller must free() them.
//
// Returns new_size on success. Returns -1, with both outputs set to NULL, when
// an argument is NULL, a size is negative, the signal is shorter than half the
// kernel (the zero gap of the padded kernel would have negative length), or an
// allocation fails.
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size)
{
    if (padded_signal != NULL)
        *padded_signal = NULL;
    if (padded_filter_kernel != NULL)
        *padded_filter_kernel = NULL;

    if (signal == NULL || filter_kernel == NULL || padded_signal == NULL || padded_filter_kernel == NULL)
        return -1;
    if (signal_size < 0 || filter_kernel_size < 0)
        return -1;

    const int minRadius = filter_kernel_size / 2;
    const int maxRadius = filter_kernel_size - minRadius;
    const int new_size = signal_size + maxRadius;

    // The padded kernel holds (new_size - filter_kernel_size) == (signal_size - minRadius)
    // zeros in the middle; a negative count would become a huge size_t in memset.
    if (signal_size < minRadius)
        return -1;

    // Nothing to copy; NULL outputs are still valid arguments to free().
    if (new_size == 0)
        return 0;

    const size_t padded_bytes = sizeof(cufftComplex) * (size_t)new_size;
    cufftComplex *signal_out = (cufftComplex *)malloc(padded_bytes);
    cufftComplex *filter_out = (cufftComplex *)malloc(padded_bytes);
    if (signal_out == NULL || filter_out == NULL) {
        free(signal_out);
        free(filter_out);
        return -1;
    }

    // Pad signal
    memcpy(signal_out, signal, (size_t)signal_size * sizeof(cufftComplex));
    memset(signal_out + signal_size, 0, (size_t)(new_size - signal_size) * sizeof(cufftComplex));

    // Pad filter
    memcpy(filter_out, filter_kernel + minRadius, (size_t)maxRadius * sizeof(cufftComplex));
    memset(filter_out + maxRadius, 0, (size_t)(new_size - filter_kernel_size) * sizeof(cufftComplex));
    memcpy(filter_out + new_size - minRadius, filter_kernel, (size_t)minRadius * sizeof(cufftComplex));

    *padded_signal = signal_out;
    *padded_filter_kernel = filter_out;
    return new_size;
}


////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////

// Returns a + b, adding real parts (x) and imaginary parts (y) separately.
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
    // `a` is a by-value copy, so it can be used as the accumulator.
    a.x += b.x;
    a.y += b.y;
    return a;
}

// Returns `a` with both its real part (x) and imaginary part (y) multiplied by the real factor `s`.
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
    Complex scaled = a;
    scaled.x *= s;
    scaled.y *= s;
    return scaled;
}

// Returns the complex product a * b:
//   real = a.x * b.x - a.y * b.y
//   imag = a.x * b.y + a.y * b.x
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
    const float real_part = a.x * b.x - a.y * b.y;
    const float imag_part = a.x * b.y + a.y * b.x;

    Complex product;
    product.x = real_part;
    product.y = imag_part;
    return product;
}
// In-place point-wise product: for every i in [0, size), a[i] = (a[i] * b[i]) * scale.
// `b` is only read. Each thread starts at its global index and advances by the
// total number of launched threads (grid-stride loop), so any 1-D launch
// configuration covers all `size` elements.
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size, float scale)
{
    const int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < size)
    {
        const Complex product = ComplexMul(a[idx], b[idx]);
        a[idx] = ComplexScale(product, scale);
        idx += stride;
    }
}

// Scans `arr` for its running maximum and counts how many elements equal it,
// prints the count to wcout and returns it.
//
// An element replaces the running maximum only when BOTH its x and its y are
// strictly greater; an element counts as another occurrence when both
// components are exactly equal to the running maximum.
// NOTE(review): this component-wise test is a partial order (an element with a
// larger x but smaller y is ignored); confirm this is the intended peak criterion.
//
// Returns 0 (and prints 0) for a NULL or empty array.
int get_number_of_occurances(cufftComplex * arr, unsigned int size)
{
    int num_of_occurs = 0;

    if (arr != NULL && size > 0) {
        cufftComplex max = arr[0];
        num_of_occurs = 1;  // arr[0] is the first occurrence of the initial maximum

        for (unsigned int i = 1; i < size; i++) {
            if (arr[i].x > max.x && arr[i].y > max.y) {
                // New maximum: restart the count at exactly one occurrence.
                max = arr[i];
                num_of_occurs = 1;
            }
            else if (arr[i].x == max.x && arr[i].y == max.y) {
                num_of_occurs++;
            }
        }
    }

    wcout << "[Number of Occurances]: " << num_of_occurs << endl;

    return num_of_occurs;
}
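clear all; close all;
template = rgb2gray(imread('possum_ear.jpg'));
background = rgb2gray(imread('possum.jpg'));
%% calculate padding
bx = size(background, 2);
by = size(background, 1);
tx = size(template, 2); % used for bbox placement
ty = size(template, 1);
%% fft
c = real(ifft2(fft2(background) .* fft2(template, by, bx)));
%% find peak correlation
[max_c, imax] = max(abs(c(:)));
[ypeak, xpeak] = find(c == max(c(:)));
figure; surf(c), shading flat; % plot correlation
%% display best match
hFig = figure;
hAx = axes;
position = [xpeak(1)-tx, ypeak(1)-ty, tx, ty];
imshow(background, 'Parent', hAx);
imrect(hAx, position);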
我的CUDA代码

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <chrono>
#include <math_functions.h>
#include <bitmap_image.hpp>
#include <cufft.h>
using namespace std;

// Evaluates a CUDA runtime call once. If the result is not cudaSuccess, prints the
// source line and the CUDA error string and makes the ENCLOSING function return -1,
// so it may only be used inside functions whose return type accepts -1.
// The argument is parenthesized so that any expression can be passed safely, and the
// last line carries no continuation backslash so the following source line can never
// be spliced into the macro body.
#define errorHandler(stmt)                                                                                  \
    do {                                                                                                    \
        cudaError_t errorHandler_status_ = (stmt);                                                          \
        if (errorHandler_status_ != cudaSuccess) {                                                          \
            printf("[ERROR] Failed to run stmt %d, error body: %s\n", __LINE__,                             \
                   cudaGetErrorString(errorHandler_status_));                                               \
            return -1; }                                                                                    \
    } while (0)

#define M_PI 3.14159265

#define BLOCK_SIZE 1024
typedef float2 Complex;

int initiate_parallel_template_matching(bitmap_image, bitmap_image);
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float);

// Padding functions
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size);

int get_number_of_occurances(cufftComplex * arr, unsigned int size);

// Program entry point: loads the scene bitmap and the template bitmap from the
// "Input Files" directory, runs the FFT-based matcher on them and returns the
// matcher's status as the process exit code (0 on success, -1 on failure).
int main()
{
    bitmap_image main_image("Input Files/collection.bmp");
    bitmap_image template_image("Input Files/collection_coin.bmp");

    // Keep the result instead of discarding it so failures are visible to the caller.
    const int status = initiate_parallel_template_matching(main_image, template_image);

    // Runs the shell command `pause` before exiting.
    system("pause");
    return status;
}

// Prints a diagnostic for a failed cuFFT call.
// Returns true when `result` is CUFFT_SUCCESS, false otherwise.
static bool cufft_call_succeeded(cufftResult result, const char *operation)
{
    if (result == CUFFT_SUCCESS)
        return true;

    printf("[ERROR] %s failed, cuFFT status: %d\n", operation, (int)result);
    return false;
}

// Flattens `image` into a malloc'ed array of width * height * 3 complex samples.
// Layout is row-major with interleaved channels: sample (row * width + col) * 3 + c
// holds channel c (0 = red, 1 = green, 2 = blue) in its real part; the imaginary
// part is 0. Returns NULL if the allocation fails. The caller owns the buffer and
// must release it with free().
static cufftComplex *image_to_complex_signal(bitmap_image &image, int width, int height)
{
    const size_t sample_count = (size_t)width * (size_t)height * 3;
    cufftComplex *signal = (cufftComplex *)malloc(sizeof(cufftComplex) * sample_count);
    if (signal == NULL)
        return NULL;

    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            rgb_t colors;
            image.get_pixel(col, row, colors);

            cufftComplex *pixel = signal + ((size_t)row * width + col) * 3;
            pixel[0].x = (float)colors.red;
            pixel[1].x = (float)colors.green;
            pixel[2].x = (float)colors.blue;
            pixel[0].y = 0.0f;
            pixel[1].y = 0.0f;
            pixel[2].y = 0.0f;
        }
    }

    return signal;
}

// Runs the FFT-based matching pipeline on the GPU:
//   1. converts both bitmaps to interleaved-RGB complex signals,
//   2. pads them to a common length with PadData,
//   3. forward-transforms both with cuFFT, multiplies the spectra point-wise
//      (scaled by 1 / length) and inverse-transforms the product,
//   4. takes the absolute value of each component on the host and hands the
//      result to get_number_of_occurances, which prints the count.
//
// Returns EXIT_SUCCESS on success and -1 on any failure (empty image, failed
// allocation, CUDA error or cuFFT error). Every resource acquired here is
// released on all return paths.
//
// NOTE(review): the images are transformed as one flat 1-D sequence
// (cufftPlan1d over interleaved RGB samples), not with a 2-D transform per
// channel. Confirm this is the intended formulation for 2-D template matching.
int initiate_parallel_template_matching(bitmap_image main_image, bitmap_image template_image)
{
    // Small owning wrappers. errorHandler returns from this function on failure,
    // so cleanup has to happen in destructors rather than at the end of the body.
    struct HostSignal {
        cufftComplex *ptr;
        HostSignal() : ptr(NULL) {}
        ~HostSignal() { free(ptr); }
    };
    struct DeviceSignal {
        cufftComplex *ptr;
        DeviceSignal() : ptr(NULL) {}
        ~DeviceSignal() { if (ptr != NULL) cudaFree(ptr); }
    };
    struct FftPlan {
        cufftHandle handle;
        bool created;
        FftPlan() : handle(0), created(false) {}
        ~FftPlan() { if (created) cufftDestroy(handle); }
    };

    // Get sizes
    const int main_width = main_image.width();
    const int main_height = main_image.height();
    const int template_width = template_image.width();
    const int template_height = template_image.height();

    if (main_width <= 0 || main_height <= 0 || template_width <= 0 || template_height <= 0) {
        printf("[ERROR] Main image and template image must both be non-empty\n");
        return -1;
    }

    const int main_signal_size = main_width * main_height * 3;
    const int template_signal_size = template_width * template_height * 3;

    // Build the host-side complex signals straight from the bitmaps.
    HostSignal h_main_signal;
    HostSignal h_template_signal;
    h_main_signal.ptr = image_to_complex_signal(main_image, main_width, main_height);
    h_template_signal.ptr = image_to_complex_signal(template_image, template_width, template_height);
    if (h_main_signal.ptr == NULL || h_template_signal.ptr == NULL) {
        printf("[ERROR] Failed to allocate host memory for the image signals\n");
        return -1;
    }

    // Pad image signals to a common length
    HostSignal h_padded_main_signal;
    HostSignal h_padded_template_signal;
    const int NEW_SIZE = PadData(h_main_signal.ptr, &h_padded_main_signal.ptr, main_signal_size,
                                 h_template_signal.ptr, &h_padded_template_signal.ptr, template_signal_size);
    if (NEW_SIZE <= 0 || h_padded_main_signal.ptr == NULL || h_padded_template_signal.ptr == NULL) {
        printf("[ERROR] Failed to pad the image signals\n");
        return -1;
    }

    const size_t padded_bytes = sizeof(cufftComplex) * (size_t)NEW_SIZE;

    DeviceSignal d_main_signal;
    DeviceSignal d_template_signal;
    DeviceSignal d_main_signal_out;
    DeviceSignal d_template_signal_out;
    DeviceSignal d_inversed;

    errorHandler(cudaMalloc((void**)&d_main_signal.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_template_signal.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_main_signal_out.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_template_signal_out.ptr, padded_bytes));
    errorHandler(cudaMalloc((void**)&d_inversed.ptr, padded_bytes));
    errorHandler(cudaMemcpy(d_main_signal.ptr, h_padded_main_signal.ptr, padded_bytes, cudaMemcpyHostToDevice));
    errorHandler(cudaMemcpy(d_template_signal.ptr, h_padded_template_signal.ptr, padded_bytes, cudaMemcpyHostToDevice));

    // All three transforms have the same length, type and batch count, so a
    // single plan is created and reused for each execution.
    FftPlan plan;
    if (!cufft_call_succeeded(cufftPlan1d(&plan.handle, NEW_SIZE, CUFFT_C2C, 1), "cufftPlan1d"))
        return -1;
    plan.created = true;

    // Perform forward FFT
    if (!cufft_call_succeeded(cufftExecC2C(plan.handle, d_main_signal.ptr, d_main_signal_out.ptr, CUFFT_FORWARD),
                              "cufftExecC2C (forward, main image)"))
        return -1;
    if (!cufft_call_succeeded(cufftExecC2C(plan.handle, d_template_signal.ptr, d_template_signal_out.ptr, CUFFT_FORWARD),
                              "cufftExecC2C (forward, template)"))
        return -1;

    // Multiply the coefficients together and normalize the result
    printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
    const unsigned int block_count = ((unsigned int)NEW_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;  // integer ceil-div
    dim3 gridDimensions(block_count, 1, 1);
    dim3 blockDimensions(BLOCK_SIZE, 1, 1);

    ComplexPointwiseMulAndScale<<<gridDimensions, blockDimensions>>>(d_main_signal_out.ptr, d_template_signal_out.ptr, NEW_SIZE, 1.0f / NEW_SIZE);

    // Kernel launches do not return a status; launch errors surface here.
    errorHandler(cudaGetLastError());

    // Perform the inverse fft on the product
    if (!cufft_call_succeeded(cufftExecC2C(plan.handle, d_main_signal_out.ptr, d_inversed.ptr, CUFFT_INVERSE),
                              "cufftExecC2C (inverse)"))
        return -1;

    // Copy data back to host; the padded main buffer is no longer needed as
    // input, so it is reused to hold the result.
    cufftComplex *h_correlation_signal = h_padded_main_signal.ptr;
    errorHandler(cudaMemcpy(h_correlation_signal, d_inversed.ptr, padded_bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < NEW_SIZE; i++) {
        h_correlation_signal[i].x = fabsf(h_correlation_signal[i].x);
        h_correlation_signal[i].y = fabsf(h_correlation_signal[i].y);
    }

    get_number_of_occurances(h_correlation_signal, NEW_SIZE);

    // Plan, device buffers and host buffers are released by the wrappers above.
    return EXIT_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////
// Function for padding original data
//////////////////////////////////////////////////////////////////////////////////
// Builds zero-padded copies of `signal` and `filter_kernel`, both of length
// new_size = signal_size + (filter_kernel_size - filter_kernel_size / 2).
//
//   *padded_signal        : the signal followed by zeros.
//   *padded_filter_kernel : the second part of the kernel (from index
//                           filter_kernel_size / 2 onward) at the front, the
//                           first part at the very end, zeros in between.
//
// Both output buffers are allocated with malloc; the caller must free() them.
//
// Returns new_size on success. Returns -1, with both outputs set to NULL, when
// an argument is NULL, a size is negative, the signal is shorter than half the
// kernel (the zero gap of the padded kernel would have negative length), or an
// allocation fails.
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size)
{
    if (padded_signal != NULL)
        *padded_signal = NULL;
    if (padded_filter_kernel != NULL)
        *padded_filter_kernel = NULL;

    if (signal == NULL || filter_kernel == NULL || padded_signal == NULL || padded_filter_kernel == NULL)
        return -1;
    if (signal_size < 0 || filter_kernel_size < 0)
        return -1;

    const int minRadius = filter_kernel_size / 2;
    const int maxRadius = filter_kernel_size - minRadius;
    const int new_size = signal_size + maxRadius;

    // The padded kernel holds (new_size - filter_kernel_size) == (signal_size - minRadius)
    // zeros in the middle; a negative count would become a huge size_t in memset.
    if (signal_size < minRadius)
        return -1;

    // Nothing to copy; NULL outputs are still valid arguments to free().
    if (new_size == 0)
        return 0;

    const size_t padded_bytes = sizeof(cufftComplex) * (size_t)new_size;
    cufftComplex *signal_out = (cufftComplex *)malloc(padded_bytes);
    cufftComplex *filter_out = (cufftComplex *)malloc(padded_bytes);
    if (signal_out == NULL || filter_out == NULL) {
        free(signal_out);
        free(filter_out);
        return -1;
    }

    // Pad signal
    memcpy(signal_out, signal, (size_t)signal_size * sizeof(cufftComplex));
    memset(signal_out + signal_size, 0, (size_t)(new_size - signal_size) * sizeof(cufftComplex));

    // Pad filter
    memcpy(filter_out, filter_kernel + minRadius, (size_t)maxRadius * sizeof(cufftComplex));
    memset(filter_out + maxRadius, 0, (size_t)(new_size - filter_kernel_size) * sizeof(cufftComplex));
    memcpy(filter_out + new_size - minRadius, filter_kernel, (size_t)minRadius * sizeof(cufftComplex));

    *padded_signal = signal_out;
    *padded_filter_kernel = filter_out;
    return new_size;
}


////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////

// Returns a + b, adding real parts (x) and imaginary parts (y) separately.
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
    // `a` is a by-value copy, so it can be used as the accumulator.
    a.x += b.x;
    a.y += b.y;
    return a;
}

// Returns `a` with both its real part (x) and imaginary part (y) multiplied by the real factor `s`.
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
    Complex scaled = a;
    scaled.x *= s;
    scaled.y *= s;
    return scaled;
}

// Returns the complex product a * b:
//   real = a.x * b.x - a.y * b.y
//   imag = a.x * b.y + a.y * b.x
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
    const float real_part = a.x * b.x - a.y * b.y;
    const float imag_part = a.x * b.y + a.y * b.x;

    Complex product;
    product.x = real_part;
    product.y = imag_part;
    return product;
}
// In-place point-wise product: for every i in [0, size), a[i] = (a[i] * b[i]) * scale.
// `b` is only read. Each thread starts at its global index and advances by the
// total number of launched threads (grid-stride loop), so any 1-D launch
// configuration covers all `size` elements.
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size, float scale)
{
    const int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < size)
    {
        const Complex product = ComplexMul(a[idx], b[idx]);
        a[idx] = ComplexScale(product, scale);
        idx += stride;
    }
}

// Scans `arr` for its running maximum and counts how many elements equal it,
// prints the count to wcout and returns it.
//
// An element replaces the running maximum only when BOTH its x and its y are
// strictly greater; an element counts as another occurrence when both
// components are exactly equal to the running maximum.
// NOTE(review): this component-wise test is a partial order (an element with a
// larger x but smaller y is ignored); confirm this is the intended peak criterion.
//
// Returns 0 (and prints 0) for a NULL or empty array.
int get_number_of_occurances(cufftComplex * arr, unsigned int size)
{
    int num_of_occurs = 0;

    if (arr != NULL && size > 0) {
        cufftComplex max = arr[0];
        num_of_occurs = 1;  // arr[0] is the first occurrence of the initial maximum

        for (unsigned int i = 1; i < size; i++) {
            if (arr[i].x > max.x && arr[i].y > max.y) {
                // New maximum: restart the count at exactly one occurrence.
                max = arr[i];
                num_of_occurs = 1;
            }
            else if (arr[i].x == max.x && arr[i].y == max.y) {
                num_of_occurs++;
            }
        }
    }

    wcout << "[Number of Occurances]: " << num_of_occurs << endl;

    return num_of_occurs;
}
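#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <chrono>
#include <math_functions.h>
#include <bitmap_image.hpp>
#include <cufft.h>
using namespace std;
#define errorHandler(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
printf("[ERROR] Failed to run stmt %d, error body: %s\n", __LINE__, cudaGetErrorString(err)); \
return -1; } \
} while (0)
#define M_PI 3.14159265
#define BLOCK_SIZE 1024
typedef float2 Complex;
int initiate_parallel_template_matching(bitmap_image, bitmap_image);
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float);
// Padding functions
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size);
int get_number_of_occurances(cufftComplex * arr, unsigned int size);
int main()
{
bitmap_image main_image("Input Files/collection.bmp");
bitmap_image template_image("Input Files/collection_coin.bmp");
initiate_parallel_template_matching(main_image, template_image);
system("pause");
return 0;
}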
int启动并行模板匹配(位图图像主图像、位图图像模板图像)
{
//获取尺寸
int main_width=main_image.width();
int main_height=main_image.height();
int main_size=主_宽度*主_高度;
int template_width=template_image.width();
int template_height=template_image.height();
int template_size=模板宽度*模板高度;
无符号字符*h_main_image=新的无符号字符[3*main_size];
对于(int col=0;col