基于CUDA的直线检测

基于CUDA的线路检测,cuda,hough-transform,Cuda,Hough Transform,我正在尝试使用CUDA进行实时线路检测。我计算了hough变换以及每个箱子的最小、最大线坐标。为了得到我正在跟踪的线段(使用Bresenham的直线算法),通过从最小到最大的点,得到每个箱子上的线段。当hough阈值较低且图像中存在大量线时,跟踪线需要花费大量时间才能完成 在GTX 660上,hough变换(hough_line_变换)计算每帧大约需要5-10ms(1280x720)(观察到比CPU实现快10倍)。但从最小点到最大点追踪线段需要1ms-15ms 我有两个关于在线检测的问题 是否有

我正在尝试使用CUDA进行实时直线检测。我计算了Hough变换,以及每个累加器单元(bin)中直线的最小、最大坐标。为了得到线段,我使用Bresenham直线算法从最小点追踪到最大点,从而得到每个bin上的线段。当Hough阈值较低且图像中存在大量直线时,追踪线段需要花费大量时间才能完成。

在GTX 660上,Hough变换(hough_line_transform)每帧(1280x720)的计算大约需要5-10ms(观察到比CPU实现快10倍)。但从最小点到最大点追踪线段需要1ms-15ms。

我有两个关于直线检测的问题:

  • 是否有更好的算法,可以根据Hough bin的最小、最大点获取线段?

  • 是否可以进一步优化hough_line_transform(请参见下面的代码)?我使用的是原子操作。有没有可能避免使用原子操作?

  • 我附上下面的代码

    类头文件

    #ifndef _HOUGH_LINES_H_
    #define _HOUGH_LINES_H_
    
    #include <cuda_gl_interop.h>
    #include <thrust/device_vector.h>
    
    union Pos;
    struct Line;
    
    // Dimensions shared between the host class and the kernels; a single copy is
    // kept on the device and passed to every kernel by pointer.
    struct Hough_params
    {
        // Width given to the Hough_lines constructor; the kernels use it as the
        // row stride when flattening (x, y) into x + w*y.
        int w;
        // Height given to the Hough_lines constructor.
        int h;
        // Number of rho bins per angle; the constructor sets it to
        // ceil(0.5*sqrt(w*w + h*h)).
        int r;
    };
    
    // Host-side driver for the CUDA Hough-transform line detector.
    class Hough_lines
    {
    public:
        // NOTE(review): a Type value is accepted by detect_lines()/get_lines() but
        // is not read by the implementation that accompanies this header —
        // confirm the intended use.
        enum Type {INT, SHORT_INT, FLOAT};
    
        // Stores _w/_h, derives the rho bin count and uploads the parameters to
        // the device.
        Hough_lines(int _w, int _h);
        ~Hough_lines();
    
    public:
        // Uploads the sine/cosine tables the first time it is called (guarded by
        // trig_init). Always returns true.
        bool init();
        // Runs one detection pass on the GL texture tex_edge and writes the
        // resulting segment coordinates into the GL buffer object `line`.
        // `count` is reset to 0 and then filled in by the pass.
        bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);
    
    protected:
        // Collects the coordinates of edge pixels into d_coords; `size` receives
        // how many were found.
        void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
        // Runs the Hough transform and compacts the bins whose vote count reaches
        // `threshold` into d_lines; `size` receives how many there are.
        void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
        // Maps the GL buffer `line`, runs get_hough_lines() and trace_all_lines(),
        // then unmaps the buffer.
        void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
        // Splits each candidate line into segments and writes their end points to
        // d_line_coord (a device pointer); `count` receives the segment count.
        void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);
    
        // Fills the device sine/cosine lookup tables.
        static void compute_trig_funcs();
    
    protected:
        // Host copy of the dimensions.
        Hough_params params;
        // One-element device copy of `params`, passed to the kernels by pointer.
        thrust::device_vector<Hough_params> d_param;
    
        // Set once the trigonometric tables have been requested; shared by all
        // instances.
        static bool trig_init;
    };
    
    #endif
    
    \ifndef\u HOUGH\u行_
    #定义线_
    #包括
    #包括
    联合销售点;
    结构线;
    结构Hough_参数
    {
    int w;
    int-h;
    INTR;
    };
    类Hough_线
    {
    公众:
    枚举类型{INT,SHORT_INT,FLOAT};
    Hough_线(int_w,int_h);
    ~Hough_line();
    公众:
    bool init();
    布尔检测线(胶合纹理边缘、整数阈值、整数最小长度、整数最小间隙、胶合线、类型、整数和计数);
    受保护的:
    无效获取边(推力::设备向量和坐标、整数和大小);
    void get_hough_线(int阈值,推力::设备向量和d_线,int和大小);
    void get_行(int threshold、int minu length、int minu gap、GLuint行、Hough_行::Type Type、int&count);
    无效跟踪所有线(int-min-len、int-min-gap、推力::设备向量和d线、int-size、int*d线坐标、int&count);
    静态void compute_trig_funcs();
    受保护的:
    霍夫参数;
    推力:设备矢量d参数;
    静态布尔触发器初始化;
    };
    #恩迪夫
    
    实现文件

    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #定义角度大小360
    #定义每个线程的最大线10
    联合销售点
    {
    结构
    {
    uint16_t x;
    uint16_t y;
    };
    uint32_t值;
    };
    结构Hough_信息
    {
    Pos端;
    Pos启动;
    整数计数;
    };
    结构线
    {
    Pos启动;
    Pos端;
    };
    结构行信息
    {
    整数行计数;
    线-线[每螺纹的最大线];
    };
    __恒定浮动偏差[角度大小];
    __固定浮动开发成本[角度大小];
    纹理纹理纹理;
    bool-Hough_线::trig_init=false;
    __全局无效标记边(常量Hough参数*参数,整数*边)
    {
    intx=(blockIdx.x*blockDim.x+threadIdx.x);
    int y=(blockIdx.y*blockDim.y+threadIdx.y);
    int pos=x+(参数->w*y);
    边[pos]=(255==tex2D(luma_-tex,x,y))?1:0;
    }
    __全局无效获取坐标(常数Hough参数*参数,整数*边,位置*坐标)
    {
    整数指数;
    intx=(blockIdx.x*blockDim.x+threadIdx.x);
    int y=(blockIdx.y*blockDim.y+threadIdx.y);
    int pos=x+(参数->w*y);
    if(255==tex2D(luma_-tex,x,y))
    {
    索引=边[pos];
    坐标[index].y=y;
    坐标[index].x=x;
    }
    }
    __全局无效hough线变换(常数hough参数*参数,整数大小,常数位置*坐标,整数阈值,整数*标记,hough信息*输出)
    {
    int i;
    内角;
    int rdata;
    __共享的Hough信息sh_rho_数据[1001];
    i=threadIdx.x;
    而(ir)
    {
    sh_rho_data[i].end.value=0x0;
    sh_rho_data[i].start.value=0xFFFFFFFF;
    sh_rho_数据[i].计数=0;
    i+=blockDim.x;
    }
    __同步线程();
    i=threadIdx.x;
    角度=块IDx.x;
    常数浮动cos_角度=偏差成本[角度];
    常数浮点正弦角=偏差正弦角;
    而(iw>>1))*cos_角)+(浮点)((参数->h>>1)-coord[i].y)*sin_角);
    如果(rdata>=0)
    {
    atomicMax(&sh_rho_data[rdata].end.value,coord[i].value);
    atomicMin(&sh_rho_data[rdata].start.value,coord[i].value);
    原子添加(&sh_rho_数据[rdata]。计数,1);
    }
    i+=blockDim.x;
    }
    __同步线程();
    i=threadIdx.x;
    rdata=(角度*参数->r);
    而(ir)
    {
    memcpy(&out[rdata+i],&sh_rho_数据[i],sizeof(Hough_信息));
    标记[rdata+i]=(sh_rho_数据[i]。计数>=阈值)?1:0;
    i+=blockDim.x;
    }
    }
    __全局无效获取行(常量Hough参数*param,int阈值,Hough信息*hdata,int*标记,行*Line)
    {
    int pos;
    int i=threadIdx.x;
    int offset=(blockIdx.x*param->r);
    而(ir)
    {
    如果(hdata[offset+i]。计数>=阈值)
    {
    pos=标记[偏移量+i];
    行[pos].start.value=hdata[offset+i].start.value;
    行[pos].end.value=hdata[offset+i].end.value;
    }
    i+=blockDim.x;
    }
    }
    __设备无效添加线(int-xs、int-ys、int-xe、int-ye、int-min\u-len、line\u-info*line)
    {
    intd=abs(xexs)+abs(yeys);
    如果((d>=最小长度)和&(线->线计数<每线最大线数))
    {
    行->行[行->行计数].start.x=xs;
    行->行[行->行计数].start.y=ys;
    行->行[行->行计数].end.x=xe;
    行->行[行->行计数].end.y=ye;
    ++行->行计数;
    //printf(“\n(%d%d)(%d%d)%d”,xs,ys,xe,ye,d);
    }
    } 
    __全局无效跟踪线(常量线*输入,整数输入大小,整数最小长度,整数最小间距,线信息*线信息,整数*标记)
    {
    int d;
    int dsub;
    int dstep;
    int-xstep;
    int ystep;
    int xs,ys,xe,ye;
    inti=(blockIdx.x*blockDim.x+threadIdx.x);
    如果(i>=inp\u尺寸)
    {
    返回;
    }
    xs=输入[i].start.x;
    ys=输入[i]。开始。y;
    xe=输入[i].end.x;
    ye=输入[i]。结束。y;
    行信息[i]。行计数=0;
    int dx=abs(xe-xs);
    int-dy=abs(ye-ys);
    int
    
    #include <hough_lines.h>
    
    #include <math.h>
    #include <stdio.h>
    
    #include <cuda.h>
    #include <cuda_runtime_api.h>
    
    #include <cuda_gl_interop.h>
    
    #include <thrust/host_vector.h>
    
    #include <thrust/copy.h>
    #include <thrust/scan.h>
    
    // Number of entries in the sine/cosine tables; compute_trig_funcs() fills them
    // in one-degree steps.
    #define ANGLE_SIZE 360
    // Capacity of Line_info::line, i.e. the most segments add_line() will store
    // for one traced line.
    #define MAX_LINE_PER_THREAD 10
    
    // Pixel coordinate packed into 32 bits. The two 16-bit fields can be accessed
    // individually or together through `value`; hough_line_transform passes
    // `value` to atomicMin/atomicMax to keep the extreme point of each bin.
    // NOTE(review): which of x/y forms the more significant half of `value`
    // depends on byte order (on a little-endian target y would dominate the
    // ordering) — confirm this is the intended ordering of points.
    union Pos
    {
        struct
        {
            uint16_t x;
            uint16_t y;
        };
        uint32_t value;
    };
    
    // Accumulator cell for one (angle, rho) bin of the Hough transform.
    struct Hough_info
    {
        // Largest Pos::value voted into the bin (reset to 0, updated with atomicMax).
        Pos end;
        // Smallest Pos::value voted into the bin (reset to 0xFFFFFFFF, updated
        // with atomicMin).
        Pos start;
        // Number of votes the bin received.
        int count;
    };
    
    // A line given by its two end points in pixel coordinates.
    struct Line
    {
        Pos start;
        Pos end;
    };
    
    // Segments produced by trace_lines for one candidate line.
    struct Line_info
    {
        // Number of valid entries in `line`.
        int line_count;
        // Segment storage; add_line() stops storing once it is full.
        Line line[MAX_LINE_PER_THREAD];
    };
    
    // Sine and cosine lookup tables in constant memory, indexed by angle in
    // degrees and filled by Hough_lines::compute_trig_funcs().
    __constant__ float dev_sint[ANGLE_SIZE];
    __constant__ float dev_cost[ANGLE_SIZE];
    
    // Texture reference through which the kernels read the image;
    // detect_lines() binds it to the mapped GL texture before the kernels run.
    // The kernels treat a texel value of 255 as an edge pixel.
    texture<uint8_t, 2, cudaReadModeElementType> luma_tex;
    
    // Nothing has been uploaded until init() is called.
    bool Hough_lines::trig_init = false;
    
    // Flags edge pixels: writes 1 into edge[x + w*y] when the texel of luma_tex at
    // (x, y) equals 255 and 0 otherwise.
    //
    // Launch layout: 2D grid of 2D blocks, one thread per pixel.
    //   param - device pointer to the image dimensions
    //   edge  - device array of at least w*h ints
    __global__ void mark_edges(const Hough_params* param, int* edge)
    {
        const int x = blockIdx.x*blockDim.x+threadIdx.x;
        const int y = blockIdx.y*blockDim.y+threadIdx.y;

        // Threads past the image border must not touch edge[]; this keeps the
        // kernel safe when the grid is rounded up to cover a partial tile.
        if ((x >= param->w) || (y >= param->h))
        {
            return;
        }

        edge[x+(param->w*y)] = (255 == tex2D(luma_tex, x, y))?1:0;
    }
    
    // Scatters the coordinates of edge pixels into a dense array.
    //
    // For every pixel whose texel in luma_tex equals 255, stores (x, y) into
    // coord[edge[x + w*y]]. get_edges() passes the exclusive prefix sum of the
    // flags written by mark_edges as `edge`, so each edge pixel gets its own slot.
    //
    // Launch layout: 2D grid of 2D blocks, one thread per pixel.
    __global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
    {
        const int x = blockIdx.x*blockDim.x+threadIdx.x;
        const int y = blockIdx.y*blockDim.y+threadIdx.y;

        // Threads past the image border have no slot in edge[]; skipping them
        // keeps the kernel safe for grids rounded up to cover a partial tile.
        if ((x >= param->w) || (y >= param->h))
        {
            return;
        }

        if (255 == tex2D(luma_tex, x, y))
        {
            const int index = edge[x+(param->w*y)];
            coord[index].y = y;
            coord[index].x = x;
        }
    }
    
    // Hough transform for one angle per block.
    //
    // Launch layout: 1D grid with one block per angle (blockIdx.x indexes the
    // sine/cosine tables); the threads of a block stride over the edge points.
    //
    //   param     - image dimensions and rho bin count r
    //   size      - number of valid entries in coord
    //   coord     - edge pixel coordinates
    //   threshold - minimum vote count for a bin to be flagged in mark
    //   mark      - out: mark[angle*r + rho] is 1 when the bin reaches threshold,
    //               otherwise 0
    //   out       - out: out[angle*r + rho] receives the bin (min point, max point,
    //               vote count)
    //
    // Votes are gathered in a shared-memory accumulator holding MAX_RHO_BINS
    // entries. Bins at or beyond that capacity cannot be accumulated; they are
    // reported as empty rather than overflowing shared memory.
    __global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
    {
        const int MAX_RHO_BINS = 1001;
        __shared__ Hough_info sh_rho_data[MAX_RHO_BINS];

        const int rho_bins = param->r;
        const int sh_bins = (rho_bins < MAX_RHO_BINS)?rho_bins:MAX_RHO_BINS;
        const int half_w = (param->w>>1);
        const int half_h = (param->h>>1);

        const int angle = blockIdx.x;
        const float cos_angle = dev_cost[angle];
        const float sin_angle = dev_sint[angle];

        // Reset the accumulator: the minimum starts at the largest value and the
        // maximum at the smallest so that the first vote replaces both.
        for (int i = threadIdx.x; i < sh_bins; i += blockDim.x)
        {
            sh_rho_data[i].end.value = 0x0;
            sh_rho_data[i].start.value = 0xFFFFFFFF;
            sh_rho_data[i].count = 0;
        }
        __syncthreads();

        for (int i = threadIdx.x; i < size; i += blockDim.x)
        {
            const Pos p = coord[i];
            // rho is measured from the image centre, with y pointing up.
            const int rdata = (int)ceilf(((float)(p.x-half_w)*cos_angle)+((float)(half_h-p.y)*sin_angle));

            // Negative rho is skipped here. The upper bound rejects rho == r
            // (possible after ceil) and anything beyond the accumulator's
            // capacity.
            if ((rdata >= 0) && (rdata < sh_bins))
            {
                atomicMax(&sh_rho_data[rdata].end.value, p.value);
                atomicMin(&sh_rho_data[rdata].start.value, p.value);
                atomicAdd(&sh_rho_data[rdata].count, 1);
            }
        }
        __syncthreads();

        const int base = angle*rho_bins;
        for (int i = threadIdx.x; i < rho_bins; i += blockDim.x)
        {
            Hough_info bin;
            if (i < sh_bins)
            {
                bin = sh_rho_data[i];
            }
            else
            {
                // No accumulator slot exists for this bin: report it as empty.
                bin.end.value = 0x0;
                bin.start.value = 0xFFFFFFFF;
                bin.count = 0;
            }
            out[base+i] = bin;
            mark[base+i] = (bin.count >= threshold)?1:0;
        }
    }
    
    // Compacts the Hough bins that reached the threshold into a dense Line array.
    //
    // Launch layout: one block per angle; the threads of a block stride over the
    // r rho bins of that angle. For every bin with count >= threshold the bin's
    // start/end points are copied to lines[mark[bin]], where mark holds the
    // destination slot of each bin.
    __global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
    {
        const int rho_bins = param->r;
        const int base = blockIdx.x*rho_bins;

        for (int bin = threadIdx.x; bin < rho_bins; bin += blockDim.x)
        {
            const Hough_info& info = hdata[base+bin];
            if (info.count < threshold)
            {
                continue;
            }

            Line& dst = lines[mark[base+bin]];
            dst.start.value = info.start.value;
            dst.end.value = info.end.value;
        }
    }
    
    // Appends the segment (xs, ys)-(xe, ye) to `line` when its Manhattan length is
    // at least min_len and there is still room (fewer than MAX_LINE_PER_THREAD
    // segments stored). Otherwise the segment is dropped.
    __device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
    {
        const int manhattan = abs(xe-xs)+abs(ye-ys);
        if (manhattan < min_len)
        {
            return;
        }

        const int slot = line->line_count;
        if (slot >= MAX_LINE_PER_THREAD)
        {
            return;
        }

        Line& seg = line->line[slot];
        seg.start.x = xs;
        seg.start.y = ys;
        seg.end.x = xe;
        seg.end.y = ye;

        line->line_count = slot+1;
    }
    
    // Splits each candidate line into the segments that are actually present in
    // the image.
    //
    // One thread handles one entry of `input` (1D launch; threads with
    // i >= inp_size exit). The thread steps from the line's start point towards
    // its end point with an integer error term (Bresenham-style), sampling
    // luma_tex at every step. Runs of texels equal to 255 that are interrupted by
    // fewer than min_gap consecutive other texels form one segment; finished
    // segments are handed to add_line(), which applies min_len and the
    // per-line capacity.
    //
    //   input     - candidate lines (start/end points)
    //   inp_size  - number of entries in input
    //   min_len   - minimum segment length, forwarded to add_line()
    //   min_gap   - number of consecutive non-edge samples that ends a segment
    //   line_info - out: segments found for each input line
    //   mark      - out: mark[i] = number of segments stored for line i
    __global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
    {
        int d;      // error term of the stepping
        int dsub;   // subtracted from d whenever the minor axis advances
        int dstep;  // added to d on every step
        int xstep;  // unconditional per-step increment (major axis)
        int ystep;
    
        int xs, ys, xe, ye;
        int i = (blockIdx.x*blockDim.x+threadIdx.x);
        if (i >= inp_size)
        {
            return;
        }
    
        xs = input[i].start.x;
        ys = input[i].start.y;
        xe = input[i].end.x;
        ye = input[i].end.y;
    
        line_info[i].line_count = 0;
    
        int dx = abs(xe-xs);
        int dy = abs(ye-ys);
        int xinc = (xe > xs)?1:-1;
        int yinc = (ye > ys)?1:-1;
    
        int gap = 0;              // consecutive non-edge samples inside the open segment
        bool sflag;               // true while a segment is open
        int s_x, s_y, e_x, e_y;   // start of the open segment / last edge sample seen
    
        // Choose the major axis. After this, xstep/ystep advance the major axis on
        // every step, while xinc/yinc (one of them zeroed) advance the minor axis
        // only when the error term is positive.
        if (dx > dy)
        {
            dsub = (dx<<1);
            dstep = (dy<<1);
            d = dstep-dx;
            xstep = xinc;
            ystep = 0;
            xinc = 0;
        }
        else
        {
            dsub = (dy<<1);
            dstep = (dx<<1);
            d = dstep-dy;
            xstep = 0;
            ystep = yinc;
            yinc = 0;
        }
    
        // The first segment is considered open at the line's start point.
        sflag = true;
        s_x = xs;
        s_y = ys;
        e_x = xs;
        e_y = ys;
    
        int x = xs;
        int y = ys;
    
        // NOTE(review): the position is advanced before the texture is sampled, so
        // the start pixel itself is never sampled and the last iteration samples
        // one step past the end point along the major axis — confirm this is
        // intended.
        while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
        {
            x += xstep;
            y += ystep;
            if (d > 0)
            {
                x += xinc;
                y += yinc;
    
                d -= dsub;
            }
            d += dstep;
    
            if (255 == tex2D(luma_tex, x, y))
            {
                // Edge sample: extend the open segment, or open a new one here.
                e_x = x;
                e_y = y;
                gap = 0;
    
                if (!sflag)
                {
                    s_x = x;
                    s_y = y;
                    sflag = true;
                }
            }
            else if (sflag)
            {
                // Non-edge sample inside an open segment: close the segment at the
                // last edge sample once the gap reaches min_gap.
                ++gap;
                if (gap >= min_gap)
                {
                    sflag = false;
                    add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
                }
            }
        }
    
        // A segment still open at the end of the walk is closed at the line's end
        // point.
        // NOTE(review): this uses (xe, ye) rather than the last edge sample
        // (e_x, e_y), so a trailing gap shorter than min_gap is included in the
        // segment — confirm this is intended.
        if (sflag)
        {
            add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
        }
        mark[i] = line_info[i].line_count;
    }
    
    // Writes the segments of every traced line into a flat coordinate buffer and
    // reports the total number of segments.
    //
    // Launch layout: 1D, one thread per entry of `line` (threads with
    // index >= size exit).
    //
    //   param  - image dimensions; coordinates are re-centred on the image centre
    //            with y pointing up
    //   line   - per-line segment lists
    //   size   - number of entries in line / mark
    //   mark   - exclusive prefix sum of line[].line_count, i.e. the index of the
    //            first output segment of each entry
    //   coords - out: four ints per segment (start x, start y, end x, end y)
    //   count  - out: total number of segments written
    __global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
    {
        const int index = blockIdx.x*blockDim.x+threadIdx.x;
        if (index >= size)
        {
            return;
        }

        const int half_w = (param->w>>1);
        const int half_h = (param->h>>1);
        const int first = mark[index];
        const int segments = line[index].line_count;
        const Line* line_data = line[index].line;

        for (int i = 0; i < segments; i++)
        {
            int* dst = coords+(4*(first+i));
            dst[0] = line_data[i].start.x-half_w;
            dst[1] = half_h-line_data[i].start.y;
            dst[2] = line_data[i].end.x-half_w;
            dst[3] = half_h-line_data[i].end.y;
        }

        if ((index+1) == size)
        {
            // mark[] is an exclusive prefix sum, so the last entry's own segments
            // must be added to obtain the grand total.
            *count = first+segments;
        }
    }
    
    // Records the dimensions, derives the number of rho bins (half the image
    // diagonal, rounded up) and uploads the parameters to the one-element device
    // vector d_param.
    Hough_lines::Hough_lines(int _w, int _h)
        :d_param(1)
    {
        const int diag_sq = (_w*_w)+(_h*_h);

        params.w = _w;
        params.h = _h;
        params.r = static_cast<int>(ceil(0.5*sqrt(diag_sq)));

        d_param[0] = params;
    }
    
    // Nothing to release explicitly: the body is empty and the members clean up
    // after themselves.
    Hough_lines::~Hough_lines()
    {
    }
    
    // Uploads the trigonometric tables the first time it is called; later calls
    // are no-ops. The flag is set before the upload is attempted. Always returns
    // true.
    bool Hough_lines::init()
    {
        if (trig_init)
        {
            return true;
        }

        trig_init = true;
        compute_trig_funcs();
        return true;
    }
    
    // Builds sine and cosine tables for 0..ANGLE_SIZE-1 degrees on the host and
    // copies them to the constant-memory tables dev_sint / dev_cost. The cosine
    // table is only uploaded when the sine upload succeeded; a failure is printed
    // and otherwise ignored.
    void Hough_lines::compute_trig_funcs()
    {
        static float sin_table[ANGLE_SIZE];
        static float cos_table[ANGLE_SIZE];

        for (int deg = 0; deg < ANGLE_SIZE; ++deg)
        {
            // Degrees to radians.
            const float theta = (M_PI*(float)deg)/180.0;
            sin_table[deg] = sin(theta);
            cos_table[deg] = cos(theta);
        }

        cudaError_t err = cudaMemcpyToSymbol(dev_sint, sin_table, ANGLE_SIZE*sizeof(float));
        if (cudaSuccess == err)
        {
            err = cudaMemcpyToSymbol(dev_cost, cos_table, ANGLE_SIZE*sizeof(float));
        }

        if (cudaSuccess != err)
        {
            printf("\n%s", cudaGetErrorString(cudaGetLastError()));
        }
    }
    
    // Collects the coordinates of all edge pixels of the bound texture.
    //
    //   d_coords - out: device vector receiving the edge coordinates; must hold at
    //              least w*h entries
    //   size     - out: number of edge pixels found
    //
    // Steps: flag every pixel (mark_edges), turn the flags into output slots with
    // an exclusive prefix sum, then scatter the coordinates (get_coords).
    //
    // NOTE(review): the grid is (w/16, h/16) with integer division, so pixels
    // beyond the last full 16x16 tile are not scanned when w or h is not a
    // multiple of 16.
    void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
    {
        size = 0;

        const int pixel_count = params.w*params.h;
        if (pixel_count <= 0)
        {
            // Nothing to scan; also avoids indexing an empty vector below.
            return;
        }

        dim3 bsize(16, 16);
        dim3 gsize(params.w/bsize.x, params.h/bsize.y);
        thrust::device_vector<int> d_mark(pixel_count);

        mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
            thrust::raw_pointer_cast(d_mark.data()));

        // The exclusive scan below leaves the last pixel's own flag out of the
        // final element, so remember it now to obtain the true total afterwards.
        const int last_flag = d_mark.back();

        thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
        get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
            thrust::raw_pointer_cast(d_mark.data()),
            thrust::raw_pointer_cast(d_coords.data()));

        const int before_last = d_mark.back();
        size = before_last+last_flag;
    }
    
    // Runs the Hough transform on the current edge image and compacts every bin
    // whose vote count reaches `threshold` into d_lines.
    //
    //   threshold - minimum number of votes for a bin to become a candidate line
    //   d_lines   - out: candidate lines; must hold at least r*ANGLE_SIZE entries
    //   size      - out: number of candidate lines written
    void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
    {
        size = 0;

        int edge_count = 0;
        thrust::device_vector<Pos> d_coords(params.w*params.h);
        get_edges(d_coords, edge_count);

        // One block per angle; the kernels index the ANGLE_SIZE-entry
        // trigonometric tables with the block index.
        const int bin_count = params.r*ANGLE_SIZE;
        if (bin_count <= 0)
        {
            // No bins at all; also avoids indexing an empty vector below.
            return;
        }

        thrust::device_vector<int> d_mark(bin_count);
        thrust::device_vector<Hough_info> d_hough_data(bin_count);
        hough_line_transform<<<ANGLE_SIZE, 256>>>(thrust::raw_pointer_cast(d_param.data()),
            edge_count,
            thrust::raw_pointer_cast(d_coords.data()), threshold,
            thrust::raw_pointer_cast(d_mark.data()),
            thrust::raw_pointer_cast(d_hough_data.data()));

        // The exclusive scan below leaves the last bin's own flag out of the final
        // element, so remember it now to obtain the true total afterwards.
        const int last_flag = d_mark.back();

        thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
        ::get_lines<<<ANGLE_SIZE, 256>>>(thrust::raw_pointer_cast(d_param.data()),
            threshold,
            thrust::raw_pointer_cast(d_hough_data.data()),
            thrust::raw_pointer_cast(d_mark.data()),
            thrust::raw_pointer_cast(d_lines.data()));

        const int before_last = d_mark.back();
        size = before_last+last_flag;
    }
    
    // Splits the first `size` candidate lines of d_lines into segments and writes
    // the segment end points to d_line_coord.
    //
    //   min_len / min_gap - segment filters, forwarded to trace_lines
    //   d_lines           - candidate lines
    //   size              - number of valid entries in d_lines
    //   d_line_coord      - device pointer receiving four ints per segment
    //   count             - out: value reported by copy_line_coords
    void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
    {
        const int threads = 512;
        const int blocks = 1+(size/threads);

        thrust::device_vector<int> d_mark_line(size);
        thrust::device_vector<Line_info> d_nlines(size);
        thrust::device_vector<int> d_count(1);

        const Line* lines_ptr = thrust::raw_pointer_cast(d_lines.data());
        Line_info* nlines_ptr = thrust::raw_pointer_cast(d_nlines.data());
        int* mark_ptr = thrust::raw_pointer_cast(d_mark_line.data());

        // Per-line segment lists plus the number of segments of each line.
        trace_lines<<<blocks, threads>>>(lines_ptr, size, min_len, min_gap, nlines_ptr, mark_ptr);

        // Segment counts -> index of each line's first output segment.
        thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());

        copy_line_coords<<<blocks, threads>>>(thrust::raw_pointer_cast(d_param.data()),
            nlines_ptr, size, mark_ptr, d_line_coord,
            thrust::raw_pointer_cast(d_count.data()));

        count = d_count[0];
    }
    
    // Detects line segments and writes their coordinates into the GL buffer
    // object `line`.
    //
    //   threshold - Hough vote threshold, forwarded to get_hough_lines()
    //   min_len   - minimum segment length, forwarded to trace_all_lines()
    //   min_gap   - gap that splits a segment, forwarded to trace_all_lines()
    //   line      - GL buffer object that receives four ints per segment
    //   type      - currently unused
    //   count     - out: number of segments; left untouched when the buffer cannot
    //               be registered or mapped
    //
    // The buffer is registered and mapped for the duration of the call. If either
    // step fails the error is printed and the function returns without launching
    // any kernel, so nothing is written through an invalid pointer.
    void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
    {
        (void)type;

        cudaError_t err = cudaGLRegisterBufferObject(line);
        if (cudaSuccess != err)
        {
            printf("cudaGLRegisterBufferObject Failed: %s\n", cudaGetErrorString(err));
            return;
        }

        int* d_line_coord = 0;
        err = cudaGLMapBufferObject((void **)&d_line_coord, line);
        if (cudaSuccess != err)
        {
            printf("cudaGLMapBufferObject Failed: %s\n", cudaGetErrorString(err));
            cudaGLUnregisterBufferObject(line);
            return;
        }

        int size = 0;
        thrust::device_vector<Line> d_lines(params.r*360);
        get_hough_lines(threshold, d_lines, size);

        trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);

        cudaGLUnmapBufferObject(line);
        cudaGLUnregisterBufferObject(line);
    }
    
        // Runs one complete detection pass.
        //
        //   tex_edge   - GL 2D texture holding the edge image (texel value 255 marks
        //                an edge pixel)
        //   threshold  - Hough vote threshold
        //   min_length - minimum segment length
        //   min_gap    - gap that splits a segment
        //   line       - GL buffer object receiving the segment coordinates
        //   type       - forwarded to get_lines()
        //   count      - out: number of segments found; 0 on failure
        //
        // Returns true when the pass ran, false when the texture could not be
        // registered, mapped or bound. On failure everything acquired so far is
        // released again before returning. The elapsed GPU time of the pass is
        // printed on every successful call.
        bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
        {
            count = 0;

            cudaGraphicsResource* res_edge = 0;
            cudaError_t err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
            if (err != cudaSuccess)
            {
                printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(err));
                return false;
            }

            err = cudaGraphicsMapResources(1, &res_edge);
            if (err != cudaSuccess)
            {
                printf("cudaGraphicsMapResources Failed: %s", cudaGetErrorString(err));
                cudaGraphicsUnregisterResource(res_edge);
                return false;
            }

            cudaArray* array_edge = 0;
            err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
            if (err != cudaSuccess)
            {
                printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(err));
                cudaGraphicsUnmapResources(1, &res_edge);
                cudaGraphicsUnregisterResource(res_edge);
                return false;
            }

            cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
            err = cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc);
            if (err != cudaSuccess)
            {
                printf("Failed to bind texture - %s\n", cudaGetErrorString(err));
                cudaGraphicsUnmapResources(1, &res_edge);
                cudaGraphicsUnregisterResource(res_edge);
                return false;
            }

            // Time the whole pass with CUDA events.
            float time = 0.0;
            cudaEvent_t start, stop;

            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start);

            get_lines(threshold, min_length, min_gap, line, type, count);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&time, start, stop);

            printf("\nElpased time: %f ms", time);

            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            cudaUnbindTexture(luma_tex);
            cudaGraphicsUnmapResources(1, &res_edge);
            cudaGraphicsUnregisterResource(res_edge);

            return true;
        }