CUDA中使用cuFFT的模板匹配？_Cuda_Cufft

CUDA中使用cuFFT的模板匹配？

cuda

CUDA中使用cuFFT的模板匹配？,cuda,cufft,Cuda,Cufft,下面是我的代码，使用cufft中提供的FFT实现模板匹配任务。核心逻辑与此处提供的Matlab代码类似： Matlab代码： #include <iostream> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <math.h> #include <chrono> #include <math_functions.h> #include <bitmap_image.hpp> #include <cufft.h> …

下面是我的代码，使用cufft中提供的FFT实现模板匹配任务。核心逻辑与此处提供的Matlab代码类似：

Matlab代码：

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <chrono>
#include <math_functions.h>
#include <bitmap_image.hpp>
#include <cufft.h>
using namespace std;

// Evaluates a CUDA runtime call once. On failure it prints the source line and
// the CUDA error string, then makes the ENCLOSING function return -1, so it may
// only be used inside functions that return int. The macro intentionally ends
// without a line continuation so that the line following it is never spliced
// into the macro body.
#define errorHandler(stmt)                                                                                  \
    do {                                                                                                    \
        cudaError_t err = (stmt);                                                                           \
        if (err != cudaSuccess) {                                                                           \
            printf("[ERROR] Failed to run stmt %d, error body: %s\n", __LINE__, cudaGetErrorString(err));   \
            return -1; }                                                                                    \
    } while (0)

// Some <math.h> implementations already provide M_PI; only define it if missing.
#ifndef M_PI
#define M_PI 3.14159265
#endif

// Threads per block used for the pointwise-multiply kernel launch.
#define BLOCK_SIZE 1024
typedef float2 Complex;

int initiate_parallel_template_matching(bitmap_image, bitmap_image);
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float);

// Padding functions
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size);

int get_number_of_occurances(cufftComplex * arr, unsigned int size);

// Program entry point: loads the scene and template bitmaps, runs the FFT-based
// matching pipeline and reports failure through the process exit code.
int main()
{
    bitmap_image main_image("Input Files/collection.bmp");
    bitmap_image template_image("Input Files/collection_coin.bmp");

    int status = EXIT_FAILURE;

    // bitmap_image evaluates to "false" (operator!) when the file could not be loaded.
    if (!main_image || !template_image) {
        printf("[ERROR] Failed to open one of the input bitmaps\n");
    }
    else if (initiate_parallel_template_matching(main_image, template_image) == EXIT_SUCCESS) {
        status = EXIT_SUCCESS;
    }

    // Keeps the console window open (Windows shell command).
    system("pause");
    return status;
}

// Prints a diagnostic and returns false when a CUDA runtime call failed.
static bool cuda_call_ok(cudaError_t err, int line)
{
    if (err == cudaSuccess)
        return true;
    printf("[ERROR] Failed to run stmt %d, error body: %s\n", line, cudaGetErrorString(err));
    return false;
}

// Prints a diagnostic and returns false when a cuFFT call failed.
static bool cufft_call_ok(cufftResult status, int line)
{
    if (status == CUFFT_SUCCESS)
        return true;
    printf("[ERROR] cuFFT call failed at line %d, status code: %d\n", line, (int)status);
    return false;
}

// Flattens an image into a malloc'ed, row-major, interleaved R,G,B complex signal
// (channel value in .x, zero in .y). Stores the element count in *signal_size and
// returns NULL (with *signal_size == 0) for an empty image or on allocation failure.
static cufftComplex *image_to_signal(bitmap_image &image, int *signal_size)
{
    const int width = image.width();
    const int height = image.height();

    *signal_size = 0;
    if (width <= 0 || height <= 0)
        return NULL;

    const size_t count = (size_t)width * (size_t)height * 3;
    cufftComplex *signal = (cufftComplex *)malloc(sizeof(cufftComplex) * count);
    if (signal == NULL)
        return NULL;

    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            rgb_t colors;
            image.get_pixel(col, row, colors);

            cufftComplex *pixel = signal + ((size_t)row * width + col) * 3;
            pixel[0].x = (float)colors.red;
            pixel[1].x = (float)colors.green;
            pixel[2].x = (float)colors.blue;
            pixel[0].y = 0.0f;
            pixel[1].y = 0.0f;
            pixel[2].y = 0.0f;
        }
    }

    *signal_size = (int)count;
    return signal;
}

// Computes IFFT(FFT(h_main) * FFT(h_template)) / n on the GPU for two host
// signals of n elements each and writes the n results to h_result (which may
// alias h_main). All device resources are released on every path.
// Returns 0 on success, -1 on any CUDA/cuFFT failure.
static int fft_product_on_device(const cufftComplex *h_main, const cufftComplex *h_template,
    cufftComplex *h_result, int n)
{
    const size_t bytes = sizeof(cufftComplex) * (size_t)n;
    cufftComplex *d_main = NULL;
    cufftComplex *d_template = NULL;
    cufftHandle plan = 0;
    bool plan_created = false;
    bool ok = false;

    // Single-pass block: "break" jumps to the shared cleanup code below.
    do {
        if (!cuda_call_ok(cudaMalloc((void **)&d_main, bytes), __LINE__)) break;
        if (!cuda_call_ok(cudaMalloc((void **)&d_template, bytes), __LINE__)) break;
        if (!cuda_call_ok(cudaMemcpy(d_main, h_main, bytes, cudaMemcpyHostToDevice), __LINE__)) break;
        if (!cuda_call_ok(cudaMemcpy(d_template, h_template, bytes, cudaMemcpyHostToDevice), __LINE__)) break;

        // One plan serves all three transforms: same length, type and batch.
        if (!cufft_call_ok(cufftPlan1d(&plan, n, CUFFT_C2C, 1), __LINE__)) break;
        plan_created = true;

        // Forward transforms, done in place (input and output buffer are the same).
        if (!cufft_call_ok(cufftExecC2C(plan, d_main, d_main, CUFFT_FORWARD), __LINE__)) break;
        if (!cufft_call_ok(cufftExecC2C(plan, d_template, d_template, CUFFT_FORWARD), __LINE__)) break;

        // Multiply the coefficients together and normalize the result.
        printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
        const unsigned int blocks = ((unsigned int)n + BLOCK_SIZE - 1) / BLOCK_SIZE; // integer ceil-div
        ComplexPointwiseMulAndScale<<<blocks, BLOCK_SIZE>>>(d_main, d_template, n, 1.0f / n);
        if (!cuda_call_ok(cudaGetLastError(), __LINE__)) break;

        // Inverse transform of the product, then copy the result back to the host.
        if (!cufft_call_ok(cufftExecC2C(plan, d_main, d_main, CUFFT_INVERSE), __LINE__)) break;
        if (!cuda_call_ok(cudaMemcpy(h_result, d_main, bytes, cudaMemcpyDeviceToHost), __LINE__)) break;

        ok = true;
    } while (0);

    if (plan_created)
        cufftDestroy(plan);
    cudaFree(d_template); // cudaFree(NULL) is a no-op
    cudaFree(d_main);

    return ok ? 0 : -1;
}

// Runs the FFT-based matching pipeline on two images: flattens both to
// interleaved-RGB complex signals, pads them, multiplies their spectra on the
// GPU, takes component-wise absolute values of the inverse transform and prints
// the number of peak occurrences.
//
// NOTE(review): the transform is a 1-D FFT over the flattened interleaved-RGB
// buffer and the spectra are multiplied without conjugation; presumably a 2-D
// correlation like the MATLAB reference was intended - confirm before relying
// on the reported count.
//
// Returns EXIT_SUCCESS on success and -1 on any failure. All host and device
// resources are released on every path.
int initiate_parallel_template_matching(bitmap_image main_image, bitmap_image template_image)
{
    int main_signal_size = 0;
    int template_signal_size = 0;
    cufftComplex *h_main_signal = image_to_signal(main_image, &main_signal_size);
    cufftComplex *h_template_signal = image_to_signal(template_image, &template_signal_size);
    cufftComplex *h_padded_main_signal = NULL;
    cufftComplex *h_padded_template_signal = NULL;
    int status = -1;

    if (h_main_signal == NULL || h_template_signal == NULL) {
        printf("[ERROR] Could not build the input signals (empty image or out of memory)\n");
    }
    else if (template_signal_size > main_signal_size) {
        printf("[ERROR] The template image must not be larger than the main image\n");
    }
    else {
        // Pad image signals
        const int NEW_SIZE = PadData(h_main_signal, &h_padded_main_signal, main_signal_size,
            h_template_signal, &h_padded_template_signal, template_signal_size);

        if (NEW_SIZE <= 0 || h_padded_main_signal == NULL || h_padded_template_signal == NULL) {
            printf("[ERROR] Failed to pad the input signals\n");
        }
        else {
            // The result reuses the padded main buffer; it is freed once, below.
            cufftComplex *h_correlation_signal = h_padded_main_signal;

            status = fft_product_on_device(h_padded_main_signal, h_padded_template_signal,
                h_correlation_signal, NEW_SIZE);

            if (status == 0) {
                // fabsf avoids accidentally binding to the integer abs() overload.
                for (int i = 0; i < NEW_SIZE; i++) {
                    h_correlation_signal[i].x = fabsf(h_correlation_signal[i].x);
                    h_correlation_signal[i].y = fabsf(h_correlation_signal[i].y);
                }

                get_number_of_occurances(h_correlation_signal, NEW_SIZE);
            }
        }
    }

    // Free allocated host memory (free(NULL) is a no-op).
    free(h_padded_template_signal);
    free(h_padded_main_signal);
    free(h_template_signal);
    free(h_main_signal);

    return (status == 0) ? EXIT_SUCCESS : -1;
}

///////////////////////////////////////////////////////////////////////////////////
// Function for padding original data
//////////////////////////////////////////////////////////////////////////////////
// Builds zero-padded host copies of a signal and a filter kernel so that both
// have the same length and can be multiplied element-wise in the frequency domain.
//
//   signal / signal_size                - input signal and its element count
//   filter_kernel / filter_kernel_size  - input kernel and its element count
//   padded_signal, padded_filter_kernel - receive malloc-family buffers of the
//                                         returned length; the caller frees them
//                                         with free()
//
// The signal is copied to the front and zero-filled at the back. The kernel is
// rotated: its second half (from index filter_kernel_size / 2) goes to the front
// and its first half to the very end, with zeros in between.
//
// Returns the padded length, or -1 (with both outputs set to NULL) on invalid
// arguments or allocation failure.
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size)
{
    if (padded_signal != NULL)
        *padded_signal = NULL;
    if (padded_filter_kernel != NULL)
        *padded_filter_kernel = NULL;

    if (signal == NULL || filter_kernel == NULL || padded_signal == NULL || padded_filter_kernel == NULL ||
        signal_size < 0 || filter_kernel_size < 0)
        return -1;

    const int minRadius = filter_kernel_size / 2;
    const int maxRadius = filter_kernel_size - minRadius;
    int new_size = signal_size + maxRadius;

    // When the signal is shorter than half the kernel, signal_size + maxRadius
    // cannot hold the whole kernel; grow the buffer so the copies below stay in
    // bounds (the original computed a negative zero-fill length in that case).
    if (new_size < filter_kernel_size)
        new_size = filter_kernel_size;

    if (new_size == 0)
        return 0;

    // calloc zero-fills, so only the non-zero parts need to be copied in.
    cufftComplex *signal_buffer = (cufftComplex *)calloc((size_t)new_size, sizeof(cufftComplex));
    cufftComplex *kernel_buffer = (cufftComplex *)calloc((size_t)new_size, sizeof(cufftComplex));
    if (signal_buffer == NULL || kernel_buffer == NULL) {
        free(signal_buffer);
        free(kernel_buffer);
        return -1;
    }

    // Pad signal
    if (signal_size > 0)
        memcpy(signal_buffer, signal, (size_t)signal_size * sizeof(cufftComplex));

    // Pad filter
    if (maxRadius > 0)
        memcpy(kernel_buffer, filter_kernel + minRadius, (size_t)maxRadius * sizeof(cufftComplex));
    if (minRadius > 0)
        memcpy(kernel_buffer + new_size - minRadius, filter_kernel, (size_t)minRadius * sizeof(cufftComplex));

    *padded_signal = signal_buffer;
    *padded_filter_kernel = kernel_buffer;

    return new_size;
}


////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////

// Adds two complex values component by component (x with x, y with y).
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
    Complex sum = a;
    sum.x += b.x;
    sum.y += b.y;
    return sum;
}

// Multiplies both components of a complex value by the real factor s.
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
    Complex scaled = a;
    scaled.x *= s;
    scaled.y *= s;
    return scaled;
}

// Complex product of a and b, treating .x as the real part and .y as the
// imaginary part: (a.x*b.x - a.y*b.y) + i(a.x*b.y + a.y*b.x).
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
    const float real_part = a.x * b.x - a.y * b.y;
    const float imag_part = a.x * b.y + a.y * b.x;

    Complex product;
    product.x = real_part;
    product.y = imag_part;
    return product;
}
// Kernel: for every index i < size, overwrites a[i] with (a[i] * b[i]) * scale,
// where * between elements is the complex product. Uses a 1-D launch; each
// thread starts at its global index and advances by the total number of launched
// threads, so any grid size covers the whole array.
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size, float scale)
{
    const int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < size)
    {
        const Complex product = ComplexMul(a[idx], b[idx]);
        a[idx] = ComplexScale(product, scale);
        idx += stride;
    }
}

// Scans arr[0..size) for its peak element and counts how many elements are
// equal to that peak, prints the count and returns it.
//
// An element replaces the current peak only when BOTH its x and y components are
// strictly greater; an element counts as another occurrence when both components
// are equal to the peak's.
// NOTE(review): this component-wise ordering is only a partial order (an element
// with larger x but smaller y is ignored) - confirm this is the intended metric.
//
// Returns 0 for a NULL or empty array.
int get_number_of_occurances(cufftComplex * arr, unsigned int size)
{
    int num_of_occurs = 0;

    if (arr != NULL && size > 0) {
        cufftComplex max = arr[0];
        num_of_occurs = 1; // arr[0] itself is the first occurrence of the initial peak

        for (unsigned int i = 1; i < size; i++) {
            if (arr[i].x > max.x && arr[i].y > max.y) {
                // New peak: restart the count at this element.
                max = arr[i];
                num_of_occurs = 1;
            }
            else if (arr[i].x == max.x && arr[i].y == max.y) {
                // "else" keeps a freshly found peak from being counted twice.
                num_of_occurs++;
            }
        }
    }

    wcout << "[Number of Occurances]: " << num_of_occurs << endl;

    return num_of_occurs;
}

% Template matching with 2-D FFTs: multiplies the spectra of the background
% image and the (zero-padded) template, then marks the peak of the result.
clear all; close all;
template = rgb2gray(imread('possum_ear.jpg'));
background = rgb2gray(imread('possum.jpg'));
%% calculate padding
bx = size(background, 2);
by = size(background, 1);
tx = size(template, 2); % used for bbox placement
ty = size(template, 1);
%% fft
% fft2(template, by, bx) zero-pads the template to the background size.
c = real(ifft2(fft2(background) .* fft2(template, by, bx)));
%% find peak correlation
[max_c, imax] = max(abs(c(:)));
[ypeak, xpeak] = find(c == max(c(:)));
figure; surf(c), shading flat; % plot correlation
%% display best match
hFig = figure;
hAx = axes;
position = [xpeak(1)-tx, ypeak(1)-ty, tx, ty];
imshow(background, 'Parent', hAx);
imrect(hAx, position);

我的CUDA代码：

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <chrono>
#include <math_functions.h>
#include <bitmap_image.hpp>
#include <cufft.h>
using namespace std;

// Evaluates a CUDA runtime call once. On failure it prints the source line and
// the CUDA error string, then makes the ENCLOSING function return -1, so it may
// only be used inside functions that return int. The macro intentionally ends
// without a line continuation so that the line following it is never spliced
// into the macro body.
#define errorHandler(stmt)                                                                                  \
    do {                                                                                                    \
        cudaError_t err = (stmt);                                                                           \
        if (err != cudaSuccess) {                                                                           \
            printf("[ERROR] Failed to run stmt %d, error body: %s\n", __LINE__, cudaGetErrorString(err));   \
            return -1; }                                                                                    \
    } while (0)

// Some <math.h> implementations already provide M_PI; only define it if missing.
#ifndef M_PI
#define M_PI 3.14159265
#endif

// Threads per block used for the pointwise-multiply kernel launch.
#define BLOCK_SIZE 1024
typedef float2 Complex;

int initiate_parallel_template_matching(bitmap_image, bitmap_image);
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float);

// Padding functions
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size);

int get_number_of_occurances(cufftComplex * arr, unsigned int size);

// Program entry point: loads the scene and template bitmaps, runs the FFT-based
// matching pipeline and reports failure through the process exit code.
int main()
{
    bitmap_image main_image("Input Files/collection.bmp");
    bitmap_image template_image("Input Files/collection_coin.bmp");

    int status = EXIT_FAILURE;

    // bitmap_image evaluates to "false" (operator!) when the file could not be loaded.
    if (!main_image || !template_image) {
        printf("[ERROR] Failed to open one of the input bitmaps\n");
    }
    else if (initiate_parallel_template_matching(main_image, template_image) == EXIT_SUCCESS) {
        status = EXIT_SUCCESS;
    }

    // Keeps the console window open (Windows shell command).
    system("pause");
    return status;
}

// Prints a diagnostic and returns false when a CUDA runtime call failed.
static bool cuda_call_ok(cudaError_t err, int line)
{
    if (err == cudaSuccess)
        return true;
    printf("[ERROR] Failed to run stmt %d, error body: %s\n", line, cudaGetErrorString(err));
    return false;
}

// Prints a diagnostic and returns false when a cuFFT call failed.
static bool cufft_call_ok(cufftResult status, int line)
{
    if (status == CUFFT_SUCCESS)
        return true;
    printf("[ERROR] cuFFT call failed at line %d, status code: %d\n", line, (int)status);
    return false;
}

// Flattens an image into a malloc'ed, row-major, interleaved R,G,B complex signal
// (channel value in .x, zero in .y). Stores the element count in *signal_size and
// returns NULL (with *signal_size == 0) for an empty image or on allocation failure.
static cufftComplex *image_to_signal(bitmap_image &image, int *signal_size)
{
    const int width = image.width();
    const int height = image.height();

    *signal_size = 0;
    if (width <= 0 || height <= 0)
        return NULL;

    const size_t count = (size_t)width * (size_t)height * 3;
    cufftComplex *signal = (cufftComplex *)malloc(sizeof(cufftComplex) * count);
    if (signal == NULL)
        return NULL;

    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            rgb_t colors;
            image.get_pixel(col, row, colors);

            cufftComplex *pixel = signal + ((size_t)row * width + col) * 3;
            pixel[0].x = (float)colors.red;
            pixel[1].x = (float)colors.green;
            pixel[2].x = (float)colors.blue;
            pixel[0].y = 0.0f;
            pixel[1].y = 0.0f;
            pixel[2].y = 0.0f;
        }
    }

    *signal_size = (int)count;
    return signal;
}

// Computes IFFT(FFT(h_main) * FFT(h_template)) / n on the GPU for two host
// signals of n elements each and writes the n results to h_result (which may
// alias h_main). All device resources are released on every path.
// Returns 0 on success, -1 on any CUDA/cuFFT failure.
static int fft_product_on_device(const cufftComplex *h_main, const cufftComplex *h_template,
    cufftComplex *h_result, int n)
{
    const size_t bytes = sizeof(cufftComplex) * (size_t)n;
    cufftComplex *d_main = NULL;
    cufftComplex *d_template = NULL;
    cufftHandle plan = 0;
    bool plan_created = false;
    bool ok = false;

    // Single-pass block: "break" jumps to the shared cleanup code below.
    do {
        if (!cuda_call_ok(cudaMalloc((void **)&d_main, bytes), __LINE__)) break;
        if (!cuda_call_ok(cudaMalloc((void **)&d_template, bytes), __LINE__)) break;
        if (!cuda_call_ok(cudaMemcpy(d_main, h_main, bytes, cudaMemcpyHostToDevice), __LINE__)) break;
        if (!cuda_call_ok(cudaMemcpy(d_template, h_template, bytes, cudaMemcpyHostToDevice), __LINE__)) break;

        // One plan serves all three transforms: same length, type and batch.
        if (!cufft_call_ok(cufftPlan1d(&plan, n, CUFFT_C2C, 1), __LINE__)) break;
        plan_created = true;

        // Forward transforms, done in place (input and output buffer are the same).
        if (!cufft_call_ok(cufftExecC2C(plan, d_main, d_main, CUFFT_FORWARD), __LINE__)) break;
        if (!cufft_call_ok(cufftExecC2C(plan, d_template, d_template, CUFFT_FORWARD), __LINE__)) break;

        // Multiply the coefficients together and normalize the result.
        printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
        const unsigned int blocks = ((unsigned int)n + BLOCK_SIZE - 1) / BLOCK_SIZE; // integer ceil-div
        ComplexPointwiseMulAndScale<<<blocks, BLOCK_SIZE>>>(d_main, d_template, n, 1.0f / n);
        if (!cuda_call_ok(cudaGetLastError(), __LINE__)) break;

        // Inverse transform of the product, then copy the result back to the host.
        if (!cufft_call_ok(cufftExecC2C(plan, d_main, d_main, CUFFT_INVERSE), __LINE__)) break;
        if (!cuda_call_ok(cudaMemcpy(h_result, d_main, bytes, cudaMemcpyDeviceToHost), __LINE__)) break;

        ok = true;
    } while (0);

    if (plan_created)
        cufftDestroy(plan);
    cudaFree(d_template); // cudaFree(NULL) is a no-op
    cudaFree(d_main);

    return ok ? 0 : -1;
}

// Runs the FFT-based matching pipeline on two images: flattens both to
// interleaved-RGB complex signals, pads them, multiplies their spectra on the
// GPU, takes component-wise absolute values of the inverse transform and prints
// the number of peak occurrences.
//
// NOTE(review): the transform is a 1-D FFT over the flattened interleaved-RGB
// buffer and the spectra are multiplied without conjugation; presumably a 2-D
// correlation like the MATLAB reference was intended - confirm before relying
// on the reported count.
//
// Returns EXIT_SUCCESS on success and -1 on any failure. All host and device
// resources are released on every path.
int initiate_parallel_template_matching(bitmap_image main_image, bitmap_image template_image)
{
    int main_signal_size = 0;
    int template_signal_size = 0;
    cufftComplex *h_main_signal = image_to_signal(main_image, &main_signal_size);
    cufftComplex *h_template_signal = image_to_signal(template_image, &template_signal_size);
    cufftComplex *h_padded_main_signal = NULL;
    cufftComplex *h_padded_template_signal = NULL;
    int status = -1;

    if (h_main_signal == NULL || h_template_signal == NULL) {
        printf("[ERROR] Could not build the input signals (empty image or out of memory)\n");
    }
    else if (template_signal_size > main_signal_size) {
        printf("[ERROR] The template image must not be larger than the main image\n");
    }
    else {
        // Pad image signals
        const int NEW_SIZE = PadData(h_main_signal, &h_padded_main_signal, main_signal_size,
            h_template_signal, &h_padded_template_signal, template_signal_size);

        if (NEW_SIZE <= 0 || h_padded_main_signal == NULL || h_padded_template_signal == NULL) {
            printf("[ERROR] Failed to pad the input signals\n");
        }
        else {
            // The result reuses the padded main buffer; it is freed once, below.
            cufftComplex *h_correlation_signal = h_padded_main_signal;

            status = fft_product_on_device(h_padded_main_signal, h_padded_template_signal,
                h_correlation_signal, NEW_SIZE);

            if (status == 0) {
                // fabsf avoids accidentally binding to the integer abs() overload.
                for (int i = 0; i < NEW_SIZE; i++) {
                    h_correlation_signal[i].x = fabsf(h_correlation_signal[i].x);
                    h_correlation_signal[i].y = fabsf(h_correlation_signal[i].y);
                }

                get_number_of_occurances(h_correlation_signal, NEW_SIZE);
            }
        }
    }

    // Free allocated host memory (free(NULL) is a no-op).
    free(h_padded_template_signal);
    free(h_padded_main_signal);
    free(h_template_signal);
    free(h_main_signal);

    return (status == 0) ? EXIT_SUCCESS : -1;
}

///////////////////////////////////////////////////////////////////////////////////
// Function for padding original data
//////////////////////////////////////////////////////////////////////////////////
// Builds zero-padded host copies of a signal and a filter kernel so that both
// have the same length and can be multiplied element-wise in the frequency domain.
//
//   signal / signal_size                - input signal and its element count
//   filter_kernel / filter_kernel_size  - input kernel and its element count
//   padded_signal, padded_filter_kernel - receive malloc-family buffers of the
//                                         returned length; the caller frees them
//                                         with free()
//
// The signal is copied to the front and zero-filled at the back. The kernel is
// rotated: its second half (from index filter_kernel_size / 2) goes to the front
// and its first half to the very end, with zeros in between.
//
// Returns the padded length, or -1 (with both outputs set to NULL) on invalid
// arguments or allocation failure.
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size)
{
    if (padded_signal != NULL)
        *padded_signal = NULL;
    if (padded_filter_kernel != NULL)
        *padded_filter_kernel = NULL;

    if (signal == NULL || filter_kernel == NULL || padded_signal == NULL || padded_filter_kernel == NULL ||
        signal_size < 0 || filter_kernel_size < 0)
        return -1;

    const int minRadius = filter_kernel_size / 2;
    const int maxRadius = filter_kernel_size - minRadius;
    int new_size = signal_size + maxRadius;

    // When the signal is shorter than half the kernel, signal_size + maxRadius
    // cannot hold the whole kernel; grow the buffer so the copies below stay in
    // bounds (the original computed a negative zero-fill length in that case).
    if (new_size < filter_kernel_size)
        new_size = filter_kernel_size;

    if (new_size == 0)
        return 0;

    // calloc zero-fills, so only the non-zero parts need to be copied in.
    cufftComplex *signal_buffer = (cufftComplex *)calloc((size_t)new_size, sizeof(cufftComplex));
    cufftComplex *kernel_buffer = (cufftComplex *)calloc((size_t)new_size, sizeof(cufftComplex));
    if (signal_buffer == NULL || kernel_buffer == NULL) {
        free(signal_buffer);
        free(kernel_buffer);
        return -1;
    }

    // Pad signal
    if (signal_size > 0)
        memcpy(signal_buffer, signal, (size_t)signal_size * sizeof(cufftComplex));

    // Pad filter
    if (maxRadius > 0)
        memcpy(kernel_buffer, filter_kernel + minRadius, (size_t)maxRadius * sizeof(cufftComplex));
    if (minRadius > 0)
        memcpy(kernel_buffer + new_size - minRadius, filter_kernel, (size_t)minRadius * sizeof(cufftComplex));

    *padded_signal = signal_buffer;
    *padded_filter_kernel = kernel_buffer;

    return new_size;
}


////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////

// Adds two complex values component by component (x with x, y with y).
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
    Complex sum = a;
    sum.x += b.x;
    sum.y += b.y;
    return sum;
}

// Multiplies both components of a complex value by the real factor s.
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
    Complex scaled = a;
    scaled.x *= s;
    scaled.y *= s;
    return scaled;
}

// Complex product of a and b, treating .x as the real part and .y as the
// imaginary part: (a.x*b.x - a.y*b.y) + i(a.x*b.y + a.y*b.x).
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
    const float real_part = a.x * b.x - a.y * b.y;
    const float imag_part = a.x * b.y + a.y * b.x;

    Complex product;
    product.x = real_part;
    product.y = imag_part;
    return product;
}
// Kernel: for every index i < size, overwrites a[i] with (a[i] * b[i]) * scale,
// where * between elements is the complex product. Uses a 1-D launch; each
// thread starts at its global index and advances by the total number of launched
// threads, so any grid size covers the whole array.
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size, float scale)
{
    const int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < size)
    {
        const Complex product = ComplexMul(a[idx], b[idx]);
        a[idx] = ComplexScale(product, scale);
        idx += stride;
    }
}

// Scans arr[0..size) for its peak element and counts how many elements are
// equal to that peak, prints the count and returns it.
//
// An element replaces the current peak only when BOTH its x and y components are
// strictly greater; an element counts as another occurrence when both components
// are equal to the peak's.
// NOTE(review): this component-wise ordering is only a partial order (an element
// with larger x but smaller y is ignored) - confirm this is the intended metric.
//
// Returns 0 for a NULL or empty array.
int get_number_of_occurances(cufftComplex * arr, unsigned int size)
{
    int num_of_occurs = 0;

    if (arr != NULL && size > 0) {
        cufftComplex max = arr[0];
        num_of_occurs = 1; // arr[0] itself is the first occurrence of the initial peak

        for (unsigned int i = 1; i < size; i++) {
            if (arr[i].x > max.x && arr[i].y > max.y) {
                // New peak: restart the count at this element.
                max = arr[i];
                num_of_occurs = 1;
            }
            else if (arr[i].x == max.x && arr[i].y == max.y) {
                // "else" keeps a freshly found peak from being counted twice.
                num_of_occurs++;
            }
        }
    }

    wcout << "[Number of Occurances]: " << num_of_occurs << endl;

    return num_of_occurs;
}

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <chrono>
#include <math_functions.h>
#include <bitmap_image.hpp>
#include <cufft.h>
using namespace std;

// Evaluates a CUDA runtime call once. On failure it prints the source line and
// the CUDA error string, then makes the ENCLOSING function return -1, so it may
// only be used inside functions that return int.
#define errorHandler(stmt)                                                                                  \
    do {                                                                                                    \
        cudaError_t err = (stmt);                                                                           \
        if (err != cudaSuccess) {                                                                           \
            printf("[ERROR] Failed to run stmt %d, error body: %s\n", __LINE__, cudaGetErrorString(err));   \
            return -1; }                                                                                    \
    } while (0)

// Some <math.h> implementations already provide M_PI; only define it if missing.
#ifndef M_PI
#define M_PI 3.14159265
#endif

// Threads per block used for the pointwise-multiply kernel launch.
#define BLOCK_SIZE 1024
typedef float2 Complex;

int initiate_parallel_template_matching(bitmap_image, bitmap_image);
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float);

// Padding functions
int PadData(const cufftComplex *signal, cufftComplex **padded_signal, int signal_size,
    const cufftComplex *filter_kernel, cufftComplex **padded_filter_kernel, int filter_kernel_size);

int get_number_of_occurances(cufftComplex * arr, unsigned int size);
// Program entry point: loads the scene and template bitmaps, runs the FFT-based
// matching pipeline and reports failure through the process exit code.
int main()
{
    bitmap_image main_image("Input Files/collection.bmp");
    bitmap_image template_image("Input Files/collection_coin.bmp");

    int status = EXIT_FAILURE;

    // bitmap_image evaluates to "false" (operator!) when the file could not be loaded.
    if (!main_image || !template_image) {
        printf("[ERROR] Failed to open one of the input bitmaps\n");
    }
    else if (initiate_parallel_template_matching(main_image, template_image) == EXIT_SUCCESS) {
        status = EXIT_SUCCESS;
    }

    // Keeps the console window open (Windows shell command).
    system("pause");
    return status;
}
int启动并行模板匹配（位图图像主图像、位图图像模板图像）
{
//获取尺寸
int main_width=main_image.width（）；
int main_height=main_image.height（）；
int main_size=主_宽度*主_高度；
int template_width=template_image.width（）；
int template_height=template_image.height（）；
int template_size=模板宽度*模板高度；
无符号字符*h_main_image=新的无符号字符[3*main_size]；
对于（int col=0；col