Python 如何使用OpenCV按行和列对裁剪的表格单元格进行排序?

Python 如何使用OpenCV按行和列对裁剪的表格单元格进行排序?,python,opencv,ocr,tesseract,data-science,Python,Opencv,Ocr,Tesseract,Data Science,在构建一个OCR管道以便从一个相当混乱的扫描文档中自动提取数据时,我陷入了图像预处理阶段,在对其运行Tesseract之前,我需要从表中裁剪所有单元格。一个主要的复杂性是必须以某种方式标记行和列,因为提取的数据需要结构来保存任何意义。此外,如果能够只提取特定的行和/或列,那么在处理多个具有类似组合的文档时,我的管道将更快,这是它的预期用途 下面是我目前使用的代码,几乎直接取自于Github上的Kanan Vyas的开源代码 import cv2 import numpy as np import

我正在构建一个OCR管道,用于从一份相当混乱的扫描文档中自动提取数据,目前卡在了图像预处理阶段:在对图像运行Tesseract之前,我需要把表格中的所有单元格裁剪出来。一个主要的难点是必须以某种方式标记出行和列,因为提取出的数据只有保留表格结构才有意义。此外,如果能够只提取特定的行和/或列,那么在处理多个版式相似的文档时,我的管道会更快,而这正是它的预期用途

下面是我目前使用的代码,几乎直接取自于Github上的Kanan Vyas的开源代码

import cv2
import numpy as np
import os
import glob

def sort_contours(cnts, method="left-to-right"):
    """Sort contours by one coordinate of their bounding rectangles.

    Args:
        cnts: Sequence of contours accepted by ``cv2.boundingRect``.
        method: One of ``"left-to-right"``, ``"right-to-left"``,
            ``"top-to-bottom"`` or ``"bottom-to-top"``. The two horizontal
            methods sort on the rectangle's x value, the two vertical ones
            on its y value. Any other string behaves like
            ``"left-to-right"``.

    Returns:
        A pair ``(contours, bounding_boxes)`` of equally long tuples in the
        requested order, where each bounding box is the ``(x, y, w, h)``
        tuple from ``cv2.boundingRect``. Both tuples are empty when ``cnts``
        is empty.
    """
    reverse = method in ("right-to-left", "bottom-to-top")
    # Index into (x, y, w, h): 1 selects y for vertical orderings, 0 selects x.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    if not bounding_boxes:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError on an image in which no contour was found.
        return ((), ())

    ordered = sorted(zip(cnts, bounding_boxes),
                     key=lambda pair: pair[1][axis], reverse=reverse)
    sorted_cnts, sorted_boxes = zip(*ordered)
    return (sorted_cnts, sorted_boxes)

def box_extraction(img_for_box_extraction_path, cropped_dir_path):
    """Crop the cells of a ruled table out of a page image.

    The image is loaded as grayscale and binarised. Long vertical and
    horizontal strokes are isolated by eroding and then dilating with thin
    line-shaped kernels, the two line images are blended, and the contours
    of the result are taken as candidate cells. Candidates that pass a
    size/shape filter are written to ``cropped_dir_path + "<n>.png"``, with
    ``n`` counting from 1 in the order returned by
    ``sort_contours(..., method="top-to-bottom")``.

    Intermediate images are also written: ``Image_bin.jpg``,
    ``vertical_lines.jpg``, ``horizontal_lines.jpg`` and
    ``img_final_bin.png`` in the current directory, plus
    ``./Temp/img_contour.jpg`` with all contours drawn on the page.

    Args:
        img_for_box_extraction_path: Path of the image to process.
        cropped_dir_path: String prefix for the crop files; the index and
            ``".png"`` are appended to it verbatim, so a directory must end
            with a path separator (e.g. ``"./Cropped/"``).

    Raises:
        FileNotFoundError: If OpenCV cannot read the input image.
    """
    img = cv2.imread(img_for_box_extraction_path, 0)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        raise FileNotFoundError(
            "Could not read image: " + str(img_for_box_extraction_path))

    # Whiten the 5 leftmost pixel columns. The image is single-channel, so a
    # scalar is required: a 3-element list cannot broadcast to an (H, 5) slice.
    img[:, :5] = 255

    _, img_bin = cv2.threshold(img, 128, 255,
                               cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    img_bin = 255 - img_bin  # invert so that ink and rulings become white
    cv2.imwrite("Image_bin.jpg", img_bin)

    # Line kernels are 1/40 of the image width; keep at least 1 px because a
    # zero-sized structuring element is rejected by OpenCV.
    kernel_length = max(1, img.shape[1] // 40)

    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length))
    hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    # Erode then dilate with a thin kernel: only strokes at least as long as
    # the kernel in that direction survive.
    img_temp1 = cv2.erode(img_bin, vertical_kernel, iterations=3)
    vertical_lines_img = cv2.dilate(img_temp1, vertical_kernel, iterations=3)
    cv2.imwrite("vertical_lines.jpg", vertical_lines_img)

    img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=3)
    horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=3)
    cv2.imwrite("horizontal_lines.jpg", horizontal_lines_img)

    # Blend both line images with equal weight, invert, and re-binarise.
    alpha = 0.5
    beta = 1.0 - alpha
    img_final_bin = cv2.addWeighted(vertical_lines_img, alpha,
                                    horizontal_lines_img, beta, 0.0)
    img_final_bin = cv2.erode(~img_final_bin, kernel, iterations=2)
    _, img_final_bin = cv2.threshold(img_final_bin, 128, 255,
                                     cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite("img_final_bin.png", img_final_bin)

    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); the contours are second from the end in both.
    contours = cv2.findContours(
        img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) > 0:
        contours, _ = sort_contours(contours, method="top-to-bottom")

    # cv2.imwrite does not create missing directories.
    crop_dir = os.path.dirname(cropped_dir_path)
    if crop_dir:
        os.makedirs(crop_dir, exist_ok=True)

    idx = 0
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Keep only boxes wider than 30 px, between 20 and 150 px tall, and
        # wider than they are tall.
        if (w > 30 and h > 20) and w > h and h < 150:
            idx += 1
            new_img = img[y:y+h, x:x+w]
            cv2.imwrite(cropped_dir_path + str(idx) + '.png', new_img)

    os.makedirs("./Temp", exist_ok=True)
    cv2.drawContours(img, contours, -1, (0, 0, 255), 3)
    cv2.imwrite("./Temp/img_contour.jpg", img)

output_dir = "./Cropped/"

# Start every run from an empty output directory so crops left over from a
# previous run cannot be mistaken for cells of the current page.
os.makedirs(output_dir, exist_ok=True)  # glob/imwrite do not create it
files = glob.glob(output_dir + "*")
for f in files:
    # os.remove raises on directories, so only plain files are deleted.
    if os.path.isfile(f):
        os.remove(f)

box_extraction("./prototype/page_cropped.png", output_dir)
这种方法的问题在于排序——此处使用的简单方向排序(例如"从上到下")无法保留表格结构,特别是当源图像中的表格略有倾斜时;之所以会倾斜,是因为我在处理表格边框和为OCR保留表格内容之间做了权衡。另外说明一下,我目前不需要处理跨多行或多列的表格单元格

我的实际图像包含敏感的业务数据,所以我用GIMP制作了这个虚拟图像。它应该足够用于演示目的,您只需要将 box_extraction 函数指向它即可

考虑到框提取算法应该忽略右侧不完整的单元格,我希望得到 9 x 4 = 36 个图像:1.png 对应单元格 0,0,2.png 对应单元格 0,1,依此类推。这样每 4 个图像对应一行的单元格;从第一行选取某一列的标题单元格,再取之后的每五幅图像,就应该可以获得该列的所有单元格


但是,现在输出图像的顺序非常奇怪:1.png 到 4.png 以相反的顺序保存了第一行的单元格,5.png 是第二行的第一个单元格,6.png 是第二行的最后一个单元格,之后这种规律就完全被打乱了。

请提供示例图像、您的预期输出,以及您当前得到的输出。@HansHirse 已按要求添加;如果您还需要其他内容,请留言告知。
# Detected boxes, one list per box. The sorting below reads index 0 as the
# horizontal position and index 1 as the vertical position. Indices 2-4 look
# like width, height and a detection score -- TODO confirm with the detector.
contours = [
    [1727.0220947265625, 151.22364807128906, 761.0980224609375, 91.216552734375, 0.9920176267623901],
    [1340.4588623046875, 1518.04248046875, 270.90380859375, 71.301025390625, 0.9890508651733398],
    [2110.046630859375, 1843.77099609375, 278.02294921875, 74.4261474609375, 0.9886908531188965],
    [2055.577392578125, 1226.511474609375, 311.0478515625, 82.458740234375, 0.984758734703064],
    [186.29859924316406, 1517.8702392578125, 569.6539764404297, 76.9754638671875, 0.9835792183876038],
    [1354.387451171875, 1321.7530517578125, 240.595947265625, 72.9495849609375, 0.9832095503807068],
    [1325.5714111328125, 1420.0745849609375, 285.83837890625, 73.9566650390625, 0.9797852635383606],
    [2059.18212890625, 1319.1517333984375, 308.333984375, 82.51513671875, 0.9795728325843811],
    [1232.687744140625, 631.7418823242188, 312.6070556640625, 77.5128173828125, 0.9789395928382874],
    [2079.22216796875, 1416.3896484375, 283.681640625, 81.77978515625, 0.9754523038864136],
    [321.4118347167969, 636.4778442382812, 329.8473205566406, 81.48944091796875, 0.9666035771369934],
    [1353.6107177734375, 1227.5003662109375, 238.04345703125, 75.3751220703125, 0.9659966826438904],
    [1228.7125244140625, 722.1092529296875, 316.5693359375, 69.8770751953125, 0.960330069065094],
    [2066.78076171875, 1511.6800537109375, 296.3779296875, 80.0389404296875, 0.9578678607940674],
    [163.09405517578125, 1417.7535400390625, 610.5952758789062, 83.1680908203125, 0.930923342704773],
    [1923.8992919921875, 640.7562255859375, 259.6851806640625, 74.0850830078125, 0.9276629090309143],
    [156.9827880859375, 1224.2708740234375, 676.9793701171875, 87.8233642578125, 0.9211469292640686],
    [168.77809143066406, 1322.410888671875, 620.0236663818359, 80.5582275390625, 0.8804788589477539],
    [1716.60595703125, 1414.467041015625, 283.1617431640625, 81.5452880859375, 0.8343544602394104],
    [1718.3673095703125, 1510.7855224609375, 304.5853271484375, 81.029541015625, 0.7616285681724548],
    [1718.100830078125, 1226.4912109375, 287.2305908203125, 78.2430419921875, 0.745871365070343],
    [1722.0452880859375, 1319.096923828125, 277.3297119140625, 81.481201171875, 0.7207437753677368],
]


def sort_boxes_row_major(boxes, row_tolerance=None):
    """Order boxes row by row, and left to right inside each row.

    Boxes are visited by ascending ``box[1]``. The first box of a row is its
    anchor; every following box whose ``box[1]`` is less than the tolerance
    below the anchor joins that row, and the first box that is not starts a
    new row. Each row is then sorted by ``box[0]``.

    Args:
        boxes: Iterable of indexable boxes. ``box[0]`` and ``box[1]`` are
            the sort coordinates.
        row_tolerance: Maximum vertical offset from the row anchor. When
            ``None``, half of the anchor's ``box[3]`` is used, which assumes
            ``box[3]`` is the box height -- TODO confirm. A fixed 6 gives
            the narrow window this script used before, which split the
            sample row near y=1510..1518 in two.

    Returns:
        A new list with the same box objects in row-major order.

    Raises:
        IndexError: If ``row_tolerance`` is ``None`` and a row anchor has
            fewer than four elements.
    """
    rows = []
    anchor_y = 0.0
    limit = 0.0
    for box in sorted(boxes, key=lambda b: b[1]):
        if rows and box[1] - anchor_y < limit:
            rows[-1].append(box)
            continue
        # This box is too far below the current anchor: it opens a new row.
        rows.append([box])
        anchor_y = box[1]
        limit = box[3] / 2 if row_tolerance is None else row_tolerance

    ordered = []
    for row in rows:
        ordered.extend(sorted(row, key=lambda b: b[0]))
    return ordered


sorted_contours = sort_boxes_row_major(contours)

print(sorted_contours)