基于CUDA的直线检测_Cuda_Hough Transform

基于CUDA的直线检测

cuda

基于CUDA的直线检测,cuda,hough-transform,Cuda,Hough Transform,我正在尝试使用CUDA进行实时直线检测。我计算了Hough变换，以及每个bin（累加器单元）中边缘点坐标的最小值和最大值。为了得到线段，我使用Bresenham直线算法从最小点到最大点逐像素追踪，从而得到每个bin上的线段。当Hough阈值较低且图像中存在大量直线时，线段追踪需要花费大量时间才能完成。在GTX 660上，Hough变换（hough_line_transform）每帧（1280x720）大约需要5-10ms（观察到比CPU实现快10倍），但从最小点到最大点追踪线段需要1ms-15ms。我有两个关于直线检测的问题：是否有

我正在尝试使用CUDA进行实时直线检测。我计算了Hough变换，以及每个bin（累加器单元）中边缘点坐标的最小值和最大值。为了得到线段，我使用Bresenham直线算法从最小点到最大点逐像素追踪，从而得到每个bin上的线段。当Hough阈值较低且图像中存在大量直线时，线段追踪需要花费大量时间才能完成。

在GTX 660上，Hough变换（hough_line_transform）每帧（1280x720）大约需要5-10ms（观察到比CPU实现快10倍）。但从最小点到最大点追踪线段需要1ms-15ms。

我有两个关于直线检测的问题：

1. 是否有更好的算法，可以根据Hough bin的最小点和最大点获取线段？

2. 是否可以进一步优化hough_line_transform（请参见下面的代码）？我使用的是原子操作，有没有可能避免使用原子操作？

我附上下面的代码

类头文件（hough_lines.h）

#ifndef _HOUGH_LINES_H_
#define _HOUGH_LINES_H_

#include <cuda_gl_interop.h>
#include <thrust/device_vector.h>

union Pos;
struct Line;

// Image geometry handed to the kernels through a one-element device vector
// (Hough_lines::d_param).
struct Hough_params
{
    int w;  // image width in pixels (row stride used by the kernels)
    int h;  // image height in pixels
    int r;  // number of rho bins per angle; the constructor sets it to ceil(0.5*sqrt(w*w + h*h))
};

// Hough-transform line-segment detector operating on an OpenGL edge texture.
//
// detect_lines() is the public entry point; the protected members are the
// individual pipeline stages it drives (edge extraction, Hough voting,
// segment tracing).
class Hough_lines
{
public:
    // Output element type selector. NOTE(review): the implementation accepts
    // this value but does not appear to read it -- confirm before relying on it.
    enum Type {INT, SHORT_INT, FLOAT};

    // _w/_h: dimensions of the edge image in pixels.
    Hough_lines(int _w, int _h);
    ~Hough_lines();

public:
    // Uploads the sine/cosine lookup tables on first use; returns true.
    bool init();
    // Detects line segments in the GL texture `tex_edge` and writes their
    // endpoints into the GL buffer object `line`; `count` receives the number
    // of segments written.
    //   threshold  - minimum number of votes for a Hough bin to be considered
    //   min_length - minimum segment length (|dx|+|dy|) to be reported
    //   min_gap    - run of non-edge pixels that terminates a segment
    bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);

protected:
    // Compacts the coordinates of all edge pixels into d_coords; `size` receives how many.
    void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
    // Runs the Hough transform and compacts the bins passing `threshold` into d_lines.
    void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
    // Maps the GL output buffer and runs the Hough + tracing stages into it.
    void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
    // Traces every candidate line and writes the resulting segments to d_line_coord.
    void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);

    // Fills the device-side sine/cosine tables.
    static void compute_trig_funcs();

protected:
    Hough_params params;                          // host copy of the image geometry
    thrust::device_vector<Hough_params> d_param;  // device copy (one element) passed to kernels

    static bool trig_init;                        // true once the trig tables have been uploaded
};

#endif

\ifndef\u HOUGH\u行_
#定义线_
#包括
#包括
联合销售点；
结构线；
结构Hough_参数
{
int w；
int-h；
INTR；
};
类Hough_线
{
公众：
枚举类型{INT，SHORT_INT，FLOAT}；
Hough_线（int_w，int_h）；
~Hough_line（）；
公众：
bool init（）；
布尔检测线（胶合纹理边缘、整数阈值、整数最小长度、整数最小间隙、胶合线、类型、整数和计数）；
受保护的：
无效获取边（推力：：设备向量和坐标、整数和大小）；
void get_hough_线（int阈值，推力：：设备向量和d_线，int和大小）；
void get_行（int threshold、int minu length、int minu gap、GLuint行、Hough_行：：Type Type、int&count）；
无效跟踪所有线（int-min-len、int-min-gap、推力：：设备向量和d线、int-size、int*d线坐标、int&count）；
静态void compute_trig_funcs（）；
受保护的：
霍夫参数；
推力：设备矢量d参数；
静态布尔触发器初始化；
};
#恩迪夫

类实现（.cu 源文件）

#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#定义角度大小360
#定义每个线程的最大线10
联合销售点
{
结构
{
uint16_t x；
uint16_t y；
};
uint32_t值；
};
结构Hough_信息
{
Pos端；
Pos启动；
整数计数；
};
结构线
{
Pos启动；
Pos端；
};
结构行信息
{
整数行计数；
线-线[每螺纹的最大线]；
};
__恒定浮动偏差[角度大小]；
__固定浮动开发成本[角度大小]；
纹理纹理纹理；
bool-Hough_线：：trig_init=false；
__全局无效标记边（常量Hough参数*参数，整数*边）
{
intx=（blockIdx.x*blockDim.x+threadIdx.x）；
int y=（blockIdx.y*blockDim.y+threadIdx.y）；
int pos=x+（参数->w*y）；
边[pos]=（255==tex2D（luma_-tex，x，y））？1:0；
}
__全局无效获取坐标（常数Hough参数*参数，整数*边，位置*坐标）
{
整数指数；
intx=（blockIdx.x*blockDim.x+threadIdx.x）；
int y=（blockIdx.y*blockDim.y+threadIdx.y）；
int pos=x+（参数->w*y）；
if（255==tex2D（luma_-tex，x，y））
{
索引=边[pos]；
坐标[index].y=y；
坐标[index].x=x；
}
}
__全局无效hough线变换（常数hough参数*参数，整数大小，常数位置*坐标，整数阈值，整数*标记，hough信息*输出）
{
int i；
内角；
int rdata；
__共享的Hough信息sh_rho_数据[1001]；
i=threadIdx.x；
而（ir）
{
sh_rho_data[i].end.value=0x0；
sh_rho_data[i].start.value=0xFFFFFFFF；
sh_rho_数据[i].计数=0；
i+=blockDim.x；
}
__同步线程（）；
i=threadIdx.x；
角度=块IDx.x；
常数浮动cos_角度=偏差成本[角度]；
常数浮点正弦角=偏差正弦角；
而（iw>>1））*cos_角）+（浮点）（（参数->h>>1）-coord[i].y）*sin_角）；
如果（rdata>=0）
{
atomicMax（&sh_rho_data[rdata].end.value，coord[i].value）；
atomicMin（&sh_rho_data[rdata].start.value，coord[i].value）；
原子添加（&sh_rho_数据[rdata]。计数，1）；
}
i+=blockDim.x；
}
__同步线程（）；
i=threadIdx.x；
rdata=（角度*参数->r）；
而（ir）
{
memcpy（&out[rdata+i]，&sh_rho_数据[i]，sizeof（Hough_信息））；
标记[rdata+i]=（sh_rho_数据[i]。计数>=阈值）？1:0；
i+=blockDim.x；
}
}
__全局无效获取行（常量Hough参数*param，int阈值，Hough信息*hdata，int*标记，行*Line）
{
int pos；
int i=threadIdx.x；
int offset=（blockIdx.x*param->r）；
而（ir）
{
如果（hdata[offset+i]。计数>=阈值）
{
pos=标记[偏移量+i]；
行[pos].start.value=hdata[offset+i].start.value；
行[pos].end.value=hdata[offset+i].end.value；
}
i+=blockDim.x；
}
}
__设备无效添加线（int-xs、int-ys、int-xe、int-ye、int-min\u-len、line\u-info*line）
{
intd=abs（xexs）+abs（yeys）；
如果（（d>=最小长度）和&（线->线计数<每线最大线数））
{
行->行[行->行计数].start.x=xs；
行->行[行->行计数].start.y=ys；
行->行[行->行计数].end.x=xe；
行->行[行->行计数].end.y=ye；
++行->行计数；
//printf（“\n（%d%d）（%d%d）%d”，xs，ys，xe，ye，d）；
}
} 
__全局无效跟踪线（常量线*输入，整数输入大小，整数最小长度，整数最小间距，线信息*线信息，整数*标记）
{
int d；
int dsub；
int dstep；
int-xstep；
int ystep；
int xs，ys，xe，ye；
inti=（blockIdx.x*blockDim.x+threadIdx.x）；
如果（i>=inp\u尺寸）
{
返回；
}
xs=输入[i].start.x；
ys=输入[i]。开始。y；
xe=输入[i].end.x；
ye=输入[i]。结束。y；
行信息[i]。行计数=0；
int dx=abs（xe-xs）；
int-dy=abs（ye-ys）；
int
#include <hough_lines.h>

#include <math.h>
#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cuda_gl_interop.h>

#include <thrust/host_vector.h>

#include <thrust/copy.h>
#include <thrust/scan.h>

// Number of entries in the sine/cosine lookup tables (one per degree).
#define ANGLE_SIZE 360
// Capacity of Line_info::line: the most segments one trace_lines thread can record.
#define MAX_LINE_PER_THREAD 10

// A pixel coordinate that can also be handled as a single 32-bit word.
// `value` aliases the {x, y} pair so kernels can copy a coordinate with one
// assignment and combine coordinates with 32-bit atomicMin/atomicMax.
// NOTE(review): assuming a little-endian layout, value == (y << 16) | x, so
// ordering by `value` orders by y first, then x -- confirm for the target.
union Pos
{
    struct
    {
        uint16_t x;  // column
        uint16_t y;  // row
    };
    uint32_t value;  // packed view of the same four bytes
};

// Accumulator cell for one (angle, rho) Hough bin.
struct Hough_info
{
    Pos end;    // largest packed coordinate voting for this bin (atomicMax; starts at 0)
    Pos start;  // smallest packed coordinate voting for this bin (atomicMin; starts at 0xFFFFFFFF)
    int count;  // number of edge pixels that voted for this bin
};

// A line segment given by its two endpoint pixels.
struct Line
{
    Pos start;  // first endpoint
    Pos end;    // second endpoint
};

// Per-thread output of trace_lines: the segments found along one candidate line.
struct Line_info
{
    int line_count;                  // number of valid entries in `line`
    Line line[MAX_LINE_PER_THREAD];  // recorded segments; extra segments are dropped by add_line
};

// Sine/cosine lookup tables in constant memory, indexed by angle in degrees;
// filled by Hough_lines::compute_trig_funcs().
__constant__ float dev_sint[ANGLE_SIZE];
__constant__ float dev_cost[ANGLE_SIZE];

// Texture reference bound to the edge image by Hough_lines::detect_lines();
// the kernels treat a texel value of 255 as an edge pixel.
texture<uint8_t, 2, cudaReadModeElementType> luma_tex;

// Set by Hough_lines::init() once the trig tables have been uploaded.
bool Hough_lines::trig_init = false;

/*
 * Writes a 0/1 flag per pixel into `edge`: 1 where the bound edge texture
 * (luma_tex) holds 255, 0 otherwise.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel.
 *   param - image geometry (w, h)
 *   edge  - output array of at least w*h ints, row-major
 */
__global__ void mark_edges(const Hough_params* param, int* edge)
{
    const int x = blockIdx.x*blockDim.x+threadIdx.x;
    const int y = blockIdx.y*blockDim.y+threadIdx.y;

    // Threads of a partially covered tile must not write outside the image
    // (an out-of-range x would also alias a pixel of the next row).
    if ((x >= param->w) || (y >= param->h))
    {
        return;
    }

    edge[x+(param->w*y)] = (255 == tex2D(luma_tex, x, y))?1:0;
}

/*
 * Stream-compacts the coordinates of all edge pixels.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel.
 *   param - image geometry (w, h)
 *   edge  - per-pixel output slot for edge pixels (the caller runs an
 *           exclusive scan over the 0/1 flags written by mark_edges)
 *   coord - output array receiving one Pos per edge pixel
 */
__global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
{
    const int x = blockIdx.x*blockDim.x+threadIdx.x;
    const int y = blockIdx.y*blockDim.y+threadIdx.y;

    // Ignore threads that fall outside the image in a partially covered tile.
    if ((x >= param->w) || (y >= param->h))
    {
        return;
    }

    if (255 == tex2D(luma_tex, x, y))
    {
        const int index = edge[x+(param->w*y)];
        coord[index].y = y;
        coord[index].x = x;
    }
}

/*
 * Hough voting for one angle per block.
 *
 * Launch layout: 1D grid with one block per angle (blockIdx.x is the angle in
 * degrees and indexes dev_cost/dev_sint); any 1D block size.
 * Shared memory: a static accumulator of MAX_RHO_BINS Hough_info cells.
 *
 *   param     - image geometry; param->r is the number of rho bins per angle
 *   size      - number of entries in coord
 *   coord     - compacted edge-pixel coordinates
 *   threshold - minimum vote count for a bin to be flagged
 *   mark      - output, r ints per angle: 1 where count >= threshold, else 0
 *   out       - output, r Hough_info cells per angle
 *
 * Only non-negative rho values are accumulated. Rho is measured from the
 * image centre (w/2, h/2) with the y axis pointing up.
 */
__global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
{
    // Capacity of the per-block accumulator.
    const int MAX_RHO_BINS = 1001;
    __shared__ Hough_info sh_rho_data[MAX_RHO_BINS];

    const int angle = blockIdx.x;
    // Uniform per block, so returning here cannot split a barrier.
    if (angle >= ANGLE_SIZE)
    {
        return;
    }

    const int r = param->r;
    // Never index past the shared array, even for images whose half diagonal
    // exceeds its capacity.
    const int bins = min(r, MAX_RHO_BINS);
    const int half_w = param->w>>1;
    const int half_h = param->h>>1;
    int i;

    // Reset the accumulator: empty min/max range and zero votes.
    for (i = threadIdx.x; i < bins; i += blockDim.x)
    {
        sh_rho_data[i].end.value = 0x0;
        sh_rho_data[i].start.value = 0xFFFFFFFF;
        sh_rho_data[i].count = 0;
    }
    __syncthreads();

    const float cos_angle = dev_cost[angle];
    const float sin_angle = dev_sint[angle];
    for (i = threadIdx.x; i < size; i += blockDim.x)
    {
        // One 32-bit load per pixel instead of re-reading coord[i] field by field.
        Pos p;
        p.value = coord[i].value;

        const int rdata = (int)ceilf(((float)(p.x-half_w)*cos_angle)+((float)(half_h-p.y)*sin_angle));
        // ceil() of a value just below the half diagonal can yield exactly r,
        // which is one past the last valid bin; reject it together with
        // negative rho.
        if ((rdata >= 0) && (rdata < bins))
        {
            atomicMax(&sh_rho_data[rdata].end.value, p.value);
            atomicMin(&sh_rho_data[rdata].start.value, p.value);

            atomicAdd(&sh_rho_data[rdata].count, 1);
        }
    }

    __syncthreads();

    // Publish this angle's row. Bins beyond the accumulator capacity (only
    // possible when r > MAX_RHO_BINS) are emitted as empty cells.
    const int base = angle*r;
    for (i = threadIdx.x; i < r; i += blockDim.x)
    {
        Hough_info info;
        if (i < bins)
        {
            info = sh_rho_data[i];
        }
        else
        {
            info.end.value = 0x0;
            info.start.value = 0xFFFFFFFF;
            info.count = 0;
        }
        out[base+i] = info;
        mark[base+i] = (info.count >= threshold)?1:0;
    }
}

/*
 * Compacts the Hough bins that reached `threshold` into `lines`.
 *
 * Launch layout: 1D grid, one block per angle; threads stride over the
 * param->r rho bins of that angle.
 *   hdata - r Hough_info cells per angle
 *   mark  - output slot for each qualifying bin
 *   lines - destination array of candidate lines
 */
__global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
{
    const int bins = param->r;
    const int base = blockIdx.x*bins;

    for (int bin = threadIdx.x; bin < bins; bin += blockDim.x)
    {
        const Hough_info& cell = hdata[base+bin];
        if (cell.count < threshold)
        {
            continue;
        }

        Line& dst = lines[mark[base+bin]];
        dst.start.value = cell.start.value;
        dst.end.value = cell.end.value;
    }
}

/*
 * Appends the segment (xs, ys)-(xe, ye) to `line` when its Manhattan length
 * is at least min_len and the record still has a free slot; otherwise the
 * segment is silently dropped.
 */
__device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
{
    const int manhattan = abs(xe-xs)+abs(ye-ys);
    if (manhattan < min_len)
    {
        return;
    }

    const int slot = line->line_count;
    if (slot >= MAX_LINE_PER_THREAD)
    {
        return;
    }

    Line& seg = line->line[slot];
    seg.start.x = xs;
    seg.start.y = ys;
    seg.end.x = xe;
    seg.end.y = ye;

    line->line_count = slot+1;
}

/*
 * Walks each candidate line from its start to its end pixel with Bresenham's
 * algorithm and splits it into segments of consecutive edge pixels.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per candidate line.
 *   input     - candidate lines (endpoints in pixel coordinates)
 *   inp_size  - number of entries in input / line_info / mark
 *   min_len   - minimum Manhattan length for a segment to be kept
 *   min_gap   - run of non-edge pixels that closes the current segment
 *   line_info - output: segments found along each candidate line
 *   mark      - output: number of segments recorded for each candidate line
 */
__global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
{
    const int i = blockIdx.x*blockDim.x+threadIdx.x;
    if (i >= inp_size)
    {
        return;
    }

    const int xs = input[i].start.x;
    const int ys = input[i].start.y;
    const int xe = input[i].end.x;
    const int ye = input[i].end.y;

    line_info[i].line_count = 0;

    const int dx = abs(xe-xs);
    const int dy = abs(ye-ys);
    int xinc = (xe > xs)?1:-1;
    int yinc = (ye > ys)?1:-1;

    // Bresenham set-up: (xstep, ystep) advances along the major axis on every
    // step, (xinc, yinc) along the minor axis whenever the error term is positive.
    int d;
    int dsub;
    int dstep;
    int xstep;
    int ystep;
    int steps;
    if (dx > dy)
    {
        dsub = (dx<<1);
        dstep = (dy<<1);
        d = dstep-dx;
        xstep = xinc;
        ystep = 0;
        xinc = 0;
        steps = dx;
    }
    else
    {
        dsub = (dy<<1);
        dstep = (dx<<1);
        d = dstep-dy;
        xstep = 0;
        ystep = yinc;
        yinc = 0;
        steps = dy;
    }

    // The start pixel opens the first segment.
    bool sflag = true;
    int gap = 0;
    int s_x = xs;
    int s_y = ys;
    int e_x = xs;
    int e_y = ys;

    int x = xs;
    int y = ys;

    // Exactly `steps` moves bring (x, y) from the start pixel to (xe, ye).
    // Counting the moves keeps the walk from sampling a pixel beyond the end
    // point, which could lie outside the image.
    for (int n = 0; n < steps; ++n)
    {
        x += xstep;
        y += ystep;
        if (d > 0)
        {
            x += xinc;
            y += yinc;

            d -= dsub;
        }
        d += dstep;

        if (255 == tex2D(luma_tex, x, y))
        {
            // Edge pixel: extend the current segment, or open a new one.
            e_x = x;
            e_y = y;
            gap = 0;

            if (!sflag)
            {
                s_x = x;
                s_y = y;
                sflag = true;
            }
        }
        else if (sflag)
        {
            // Non-edge pixel: tolerate short gaps, close the segment on long ones.
            ++gap;
            if (gap >= min_gap)
            {
                sflag = false;
                add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
            }
        }
    }

    // A segment still open at the end of the walk is closed at the end pixel.
    if (sflag)
    {
        add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
    }
    mark[i] = line_info[i].line_count;
}

/*
 * Flattens the per-thread segment lists into the output coordinate buffer.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per candidate line.
 *   param  - image geometry, used to re-centre the coordinates
 *   line   - per-candidate segment lists produced by trace_lines
 *   size   - number of entries in line / mark
 *   mark   - index of each candidate's first output segment (the caller runs
 *            an exclusive scan over the per-candidate segment counts)
 *   coords - output, 4 ints per segment: start x, start y, end x, end y,
 *            relative to the image centre with the y axis pointing up
 *   count  - output: total number of segments written
 */
__global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
{
    const int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index >= size)
    {
        return;
    }

    const int half_w = param->w>>1;
    const int half_h = param->h>>1;
    const int n_lines = line[index].line_count;
    const int first = mark[index];
    const Line* line_data = &line[index].line[0];

    for (int i = 0; i < n_lines; i++)
    {
        const int pos = 4*(first+i);
        coords[pos] = line_data[i].start.x-half_w;
        coords[pos+1] = half_h-line_data[i].start.y;
        coords[pos+2] = line_data[i].end.x-half_w;
        coords[pos+3] = half_h-line_data[i].end.y;
    }

    if ((index+1) == size)
    {
        // mark[] holds exclusive offsets, so the total is the last offset
        // plus the last candidate's own segment count.
        *count = first+n_lines;
    }
}

// Records the image geometry, derives the number of rho bins (half the image
// diagonal, rounded up) and uploads the parameters to the device.
Hough_lines::Hough_lines(int _w, int _h)
    :d_param(1)
{
    const int diag_sq = (_w*_w)+(_h*_h);

    params.w = _w;
    params.h = _h;
    params.r = (int)ceil(0.5*sqrt(diag_sq));

    thrust::copy_n(&params, 1, d_param.begin());
}

// Nothing to release explicitly: d_param frees its device storage itself.
Hough_lines::~Hough_lines() {}

// Uploads the trig lookup tables the first time any instance is initialised.
// Always returns true.
bool Hough_lines::init()
{
    if (!trig_init)
    {
        // Flag first, as before, so the upload is attempted only once.
        trig_init = true;
        compute_trig_funcs();
    }

    return true;
}

// Builds sine/cosine tables with one entry per degree and copies them to the
// dev_sint / dev_cost constant-memory arrays. Prints the CUDA error text if
// either copy fails.
void Hough_lines::compute_trig_funcs()
{
    static float sint[ANGLE_SIZE];
    static float cost[ANGLE_SIZE];

    for (int deg = 0; deg < ANGLE_SIZE; ++deg)
    {
        // Degrees to radians.
        const float theta = (M_PI*(float)deg)/180.0;
        sint[deg] = sin(theta);
        cost[deg] = cos(theta);
    }

    // The cosine table is only uploaded when the sine upload succeeded.
    cudaError_t err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
    if (cudaSuccess == err)
    {
        err = cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float));
    }

    if (cudaSuccess != err)
    {
        printf("\n%s", cudaGetErrorString(cudaGetLastError()));
    }
}

// Collects the coordinates of every edge pixel of the bound edge texture.
//   d_coords - receives the compacted edge coordinates (must hold w*h entries)
//   size     - receives the number of edge pixels found
void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
{
    size = 0;

    thrust::device_vector<int> d_mark(params.w*params.h);
    if (d_mark.empty())
    {
        // Empty image: nothing to scan and no last element to read.
        return;
    }

    // Floor division: pixels in a partial trailing tile are not processed
    // when w or h is not a multiple of the block size.
    const dim3 bsize(16, 16);
    const dim3 gsize(params.w/bsize.x, params.h/bsize.y);

    mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
        thrust::raw_pointer_cast(d_mark.data()));

    // The exclusive scan below overwrites the flags with output offsets, and
    // the last offset does not include the last pixel's own flag. Save that
    // flag first so the total can be reconstructed.
    const int last_flag = d_mark.back();

    thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
    get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
        thrust::raw_pointer_cast(d_mark.data()),
        thrust::raw_pointer_cast(d_coords.data()));

    const int last_offset = d_mark.back();
    size = last_offset+last_flag;
}

// Runs the Hough transform over the edge pixels and compacts every bin whose
// vote count reaches `threshold` into d_lines.
//   threshold - minimum number of votes for a bin to become a candidate line
//   d_lines   - receives the candidates (must hold params.r*ANGLE_SIZE entries)
//   size      - receives the number of candidates written
void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
{
    size = 0;

    int edge_count = 0;
    thrust::device_vector<Pos> d_coords(params.w*params.h);
    get_edges(d_coords, edge_count);

    // One block per table angle; the kernels index dev_sint/dev_cost by blockIdx.x.
    const int bin_count = params.r*ANGLE_SIZE;
    thrust::device_vector<int> d_mark(bin_count);
    thrust::device_vector<Hough_info> d_hough_data(bin_count);
    if (d_mark.empty())
    {
        // No bins at all: nothing to scan and no last element to read.
        return;
    }

    hough_line_transform<<<ANGLE_SIZE, 256>>>(thrust::raw_pointer_cast(d_param.data()),
        edge_count,
        thrust::raw_pointer_cast(d_coords.data()), threshold,
        thrust::raw_pointer_cast(d_mark.data()),
        thrust::raw_pointer_cast(d_hough_data.data()));

    // The exclusive scan turns the 0/1 flags into output offsets; its last
    // element excludes the last bin's own flag, so save that flag first.
    const int last_flag = d_mark.back();

    thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
    ::get_lines<<<ANGLE_SIZE, 256>>>(thrust::raw_pointer_cast(d_param.data()),
        threshold,
        thrust::raw_pointer_cast(d_hough_data.data()),
        thrust::raw_pointer_cast(d_mark.data()),
        thrust::raw_pointer_cast(d_lines.data()));

    const int last_offset = d_mark.back();
    size = last_offset+last_flag;
}

// Traces all candidate lines and writes the resulting segments to the output
// coordinate buffer.
//   min_len / min_gap - segment filters forwarded to trace_lines
//   d_lines           - candidate lines; the first `size` entries are used
//   d_line_coord      - device pointer receiving 4 ints per segment
//   count             - receives the value written by copy_line_coords
void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
{
    const int threads = 512;
    const int blocks = 1+(size/threads);

    thrust::device_vector<int> d_mark_line(size);
    thrust::device_vector<Line_info> d_nlines(size);

    int* mark_ptr = thrust::raw_pointer_cast(d_mark_line.data());
    Line_info* nlines_ptr = thrust::raw_pointer_cast(d_nlines.data());

    // Stage 1: one thread per candidate records its segments and their count.
    trace_lines<<<blocks, threads>>>(thrust::raw_pointer_cast(d_lines.data()),
        size, min_len, min_gap, nlines_ptr, mark_ptr);

    // Stage 2: per-candidate counts become output offsets.
    thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());

    // Stage 3: scatter the segments into the flat output buffer.
    thrust::device_vector<int> d_count(1);
    copy_line_coords<<<blocks, threads>>>(thrust::raw_pointer_cast(d_param.data()),
        nlines_ptr, size, mark_ptr, d_line_coord,
        thrust::raw_pointer_cast(d_count.data()));

    count = d_count[0];
}

// Maps the OpenGL buffer object `line` into CUDA, runs the Hough and tracing
// stages so that the segment coordinates are written into it, then releases
// the buffer again.
//   threshold / min_len / min_gap - detection parameters (see detect_lines)
//   line  - GL buffer object receiving 4 ints per detected segment
//   type  - currently unused
//   count - receives the number of segments; left untouched when the buffer
//           cannot be registered or mapped
void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
    (void)type;

    cudaError_t err = cudaGLRegisterBufferObject(line);
    if (cudaSuccess != err)
    {
        printf("\ncudaGLRegisterBufferObject Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();  // clear the recorded error so later calls start clean
        return;
    }

    int* d_line_coord = 0;
    err = cudaGLMapBufferObject((void **)&d_line_coord, line);
    if ((cudaSuccess != err) || (0 == d_line_coord))
    {
        // Without a valid mapping the kernels would write through a null pointer.
        printf("\ncudaGLMapBufferObject Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();
        cudaGLUnregisterBufferObject(line);
        return;
    }

    int size = 0;
    thrust::device_vector<Line> d_lines(params.r*360);
    get_hough_lines(threshold, d_lines, size);

    trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);

    err = cudaGLUnmapBufferObject(line);
    if (cudaSuccess != err)
    {
        printf("\ncudaGLUnmapBufferObject Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();
    }

    err = cudaGLUnregisterBufferObject(line);
    if (cudaSuccess != err)
    {
        printf("\ncudaGLUnregisterBufferObject Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();
    }
}

    // Detects line segments in the OpenGL edge texture `tex_edge`.
    //
    // The texture is registered with CUDA, mapped and bound to luma_tex for
    // the duration of the call; the detected segments are written into the GL
    // buffer object `line` and the elapsed GPU time is printed.
    //   tex_edge   - GL_TEXTURE_2D holding the edge image (255 marks an edge)
    //   threshold  - minimum number of Hough votes for a candidate line
    //   min_length - minimum segment length (|dx|+|dy|)
    //   min_gap    - run of non-edge pixels that terminates a segment
    //   line       - GL buffer object receiving 4 ints per segment
    //   type       - forwarded to get_lines
    //   count      - receives the number of segments (0 on failure)
    // Returns true on success. Returns false, after releasing whatever was
    // acquired, when the texture cannot be registered, mapped or bound.
    bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
    {
    cudaError_t err;
    cudaArray* array_edge = 0;
    cudaGraphicsResource* res_edge = 0;

    count = 0;

    err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
    if (err != cudaSuccess)
    {
        printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();  // clear the recorded error
        return false;
    }

    err = cudaGraphicsMapResources(1, &res_edge);
    if (err != cudaSuccess)
    {
        printf("cudaGraphicsMapResources Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();
        cudaGraphicsUnregisterResource(res_edge);
        return false;
    }

    cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
    err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
    if (err != cudaSuccess)
    {
        printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(err));
        cudaGetLastError();
        cudaGraphicsUnmapResources(1, &res_edge);
        cudaGraphicsUnregisterResource(res_edge);
        return false;
    }

    err = cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc);
    if (err != cudaSuccess)
    {
        printf("Failed to bind texture - %s\n", cudaGetErrorString(err));
        cudaGetLastError();
        cudaGraphicsUnmapResources(1, &res_edge);
        cudaGraphicsUnregisterResource(res_edge);
        return false;
    }

    // Time the whole detection pipeline with CUDA events.
    float time = 0.0;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    get_lines(threshold, min_length, min_gap, line, type, count);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);

    printf("\nElpased time: %f ms", time);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Release the texture in reverse order of acquisition.
    cudaUnbindTexture(luma_tex);
    cudaGraphicsUnmapResources(1, &res_edge);
    cudaGraphicsUnregisterResource(res_edge);

    return true;
}