Python 尝试tesseract文本结果中的每个加权字母组合_Python_Tesseract_Pillow

Python 尝试tesseract文本结果中的每个加权字母组合

python

Python 尝试tesseract文本结果中的每个加权字母组合,python,tesseract,pillow,Python,Tesseract,Pillow,我一直在使用pyocr（tesseract-ocr和libeteseract）测试图像中的文本识别。我一直在应用各种PIL.ImageFilters，并得到图像中一个特定字符串的结果。它并不准确，但我有14个不同的结果。在它们之间，图像中字符串的所有正确字母都存在。因此，我枚举了每个字符串，并创建了一个dict，其中包含字符作为键的位置，该键包含出现在该位置的每个字符的dict，出现次数作为值。这里有一个简短的例子图像中的字符串：结果：字典： { 0: { u'2'

我一直在使用

pyocr

（

tesseract-ocr

和

libtesseract

）测试图像中的文本识别。我一直在应用各种

PIL.ImageFilter

s，并得到图像中一个特定字符串的结果。它并不准确，但我有14个不同的结果。在它们之间，图像中字符串的所有正确字母都存在。因此，我枚举了每个字符串，并创建了一个

dict

，其中包含字符作为键的位置，该键包含出现在该位置的每个字符的

dict

，出现次数作为值。这里有一个简短的例子

图像中的字符串：结果：字典：

{
    0: {
        u'2': 10, 
        u'Z': 4
    }, 1: {
        u'H': 13, 
        u'N': 1
    }, 2: {
        u'C': 3, 
        u'O': 4, 
        u'G': 5, 
        u'6': 2
    }, 3: {
        u'2': 2, 
        u'Z': 11, 
        u'z': 1
    }
}

我想尝试每个位置的字母组合，直到得到

2HG2

。任何帮助都将不胜感激

编辑： 我试图实现的目标是扫描汽车登记，从中获取文本，然后用数据填充表单。作为概念证明，我正在尝试从我的个人注册中获取VIN编号。目前，我（很可能是天真地）正在应用一系列

PIL.ImageFilter

s，并从中获取文本。下面是我的脚本

import re
from itertools import permutations

from PIL import Image, ImageFilter
import pyocr
from pyocr import builders

# Candidate strings collected from OCR output; get_text() appends every
# recognised word that is exactly 17 characters long after clean-up.
vins = []
# Per-position tally filled by count_occurrences():
# {position: {character: number of times it was seen at that position}}.
characters = {}


def validate(vincode):
    """
    Check a 17-character VIN against its check digit (the ninth character).

    Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number

    Every character is transliterated to a number, multiplied by the weight
    of its position and summed; the sum modulo 11 (10 is written "X") must
    equal the ninth character.

    :param vincode: candidate VIN; letters are compared case-insensitively.
    :return: True only when ``vincode`` is a string of exactly 17 valid VIN
        characters whose check digit matches; False in every other case
        (wrong type, wrong length, forbidden or unknown characters, or a
        check digit mismatch).
    """
    maps = "0123456789X"
    weights = [
        8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
    ]
    table = {
        "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
    }

    # ``basestring`` covers str and unicode on Python 2; it does not exist on
    # Python 3, where str is the only text type.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(vincode, string_types):
        return False

    if len(vincode) != 17:
        return False

    vincode = vincode.upper()
    if "I" in vincode or "O" in vincode or "Q" in vincode:
        return False

    total = 0
    for value, weight in zip(vincode, weights):
        if value not in table:
            # A character outside the VIN alphabet can never be valid.
            # Stopping the sum early and still comparing the partial total
            # would let strings such as "0000000000000000-" pass.
            return False
        total += table[value] * weight

    return maps[total % 11] == vincode[8]


def get_text(tools_, img_):
    """
    Run each OCR tool over an image and collect 17-character candidates.

    Every recognised word box has its non-word characters and underscores
    stripped; words that are exactly 17 characters long afterwards are
    appended to the module-level ``vins`` list.

    :param tools_: iterable of pyocr tool objects.
    :param img_: image passed to each tool's ``image_to_string``.
    :return: None; results are accumulated in ``vins``.
    """
    # Compiled once, outside the loop; the raw string keeps ``\W`` a regex
    # escape instead of an (invalid) Python string escape.
    pattern = re.compile(r'[\W_]+')
    for tool in tools_:
        # NOTE(review): the Cuneiform backend is skipped on purpose by the
        # original author; the reason is not recorded here.
        if tool.get_name() == 'Cuneiform (sh)':
            continue
        boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder())
        cleaned = (pattern.sub('', box.content) for box in boxes)
        # In-place extend mutates the shared list, so no ``global`` is needed.
        vins.extend(word for word in cleaned if len(word) == 17)


def apply_filters_and_get_text(img_, filter_):
    """
    Apply a size-parameterised image filter at sizes 1-4 and OCR each result.

    For each size the filtered image is saved as ``tmp<filter>-<size>.jpg``
    in the current directory and handed to ``get_text`` together with the
    module-level ``tools``. Sizes for which filtering raises ``ValueError``
    are reported and skipped.

    NOTE(review): ``img_`` is reassigned on every successful pass, so each
    size is applied on top of the previous filtered image rather than the
    original. Confirm this stacking is intended.

    :param img_: image object exposing ``filter`` and ``save``.
    :param filter_: callable taking a size and returning a filter object,
        e.g. ``ImageFilter.MedianFilter``.
    :return: None.
    """
    # str() of a class is "<class '...'>", which is unreadable in the log and
    # puts characters such as '<' and '>' into the file name; use the bare
    # name instead and only fall back to str() for callables without one.
    label = getattr(filter_, '__name__', str(filter_))
    for x in range(1, 5):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Applying {} size: {}'.format(label, x))
        try:
            img_ = img_.filter(filter_(x))
        except ValueError:
            print('error on {} size: {}'.format(label, x))
            continue
        img_.save('tmp{}-{}.jpg'.format(label, x))
        get_text(tools, img_)


def count_occurrences(value):
    """
    Record one sighting of every character of ``value`` at its position.

    Updates the module-level ``characters`` mapping, which has the shape
    ``{position: {character: count}}``.

    :param value: string (or other iterable of characters) to tally.
    :return: None.
    """
    for position, letter in enumerate(value):
        # First sighting at this position creates its tally dict.
        tally = characters.setdefault(position, {})
        tally[letter] = tally.get(letter, 0) + 1


tools = pyocr.get_available_tools()

img = Image.open('images/test18.jpg')
# Convert to mode 'L', then threshold at 128 into a mode '1' image:
# pixels below 128 become 0, everything else 255.
img = img.convert('L')
img = img.point(lambda x: 0 if x < 128 else 255, '1')

# Run OCR over the image after each filter family, at several sizes.
apply_filters_and_get_text(img, ImageFilter.MedianFilter)
apply_filters_and_get_text(img, ImageFilter.MinFilter)
apply_filters_and_get_text(img, ImageFilter.MaxFilter)
apply_filters_and_get_text(img, ImageFilter.ModeFilter)

# Tally, per position, how often each character was recognised.
for vin in vins:
    count_occurrences(vin)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(characters)

重新导入
从itertools导入置换
从PIL导入图像，图像过滤器
导入pyocr
来自pyocr导入生成器
VIN=[]
字符={}
def验证（vincode）：
"""
来自的验证代码https://en.wikipedia.org/wiki/Vehicle_identification_number
"""
maps=“0123456789X”
权重=[
8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
]
表={
"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
“A”：1，“B”：2，“C”：3，“D”：4，“E”：5，“F”：6，“G”：7，“H”：8，
“J”：1，“K”：2，“L”：3，“M”：4，“N”：5，“P”：7，“R”：9，
“S”：2，“T”：3，“U”：4，“V”：5，“W”：6，“X”：7，“Y”：8，“Z”：9，
}
如果不是isinstance（vincode，str）也不是isinstance（vincode，unicode）：
返回错误
如果len（vincode）！=17:
返回错误
vincode=vincode.upper（）
如果vincode中的“I”或vincode中的“O”或vincode中的“Q”：
返回错误
总数=0
对于索引，枚举中的值（vincode）：
尝试：
产品=表[值]*权重[指数]
除KeyError外：
打破
总数+=产品
索引=总数%11
返回映射[index]==vincode[8]
def get_文本（工具、img）：
对于工具中的工具：
如果tool.get_name（）=“楔形文字（sh）”：
持续
#打印“==================================\n使用{}\n=================================='.格式（tool.get\u name（））
box=tool.image\u to\u字符串（img\u，lang='eng'，builder=builders.WordBoxBuilder（））
全球葡萄酒
pattern=re.compile（'[\W\u]+'））
如果len（pattern.sub（“”，x.content））==17，则框中x的VIN+=[pattern.sub（“”，x.content））为x
#方框=[x代表方框中的x，如果len（x.content.strip（））！=0]
#打印框[3]。内容
#对于盒子中的盒子：
#打印框.内容
def应用过滤器和获取文本（img过滤器）：
对于范围（1,5）内的x：
打印“应用{}大小：{}”。格式（str（filter），x）
尝试：
img_ux=img_ux.filter（filter_x））
除值错误外：
打印{}大小错误：{}。格式（str（filter），x）
持续
img_uux.save（'tmp{}-{}.jpg'.格式（str（filter_ux），x））
获取文本（工具、img）
def计数_出现次数（值）：
全局字符
对于索引，枚举中的c（值）：
如果索引为字符，c为字符[索引]：
字符[索引][c]+=1
持续
如果索引以字符和isinstance表示（字符[索引]，dict）：
字符[索引][c]=1
持续
字符[索引]={c:1}
tools=pyocr.get\u available\u tools（）
img=Image.open（'images/test18.jpg'））
#获取文本（工具）
#img=img.filter（ImageFilter.MaxFilter（5））
#img=img.filter（ImageFilter.锐化）
#img=img.filter（ImageFilter.SMOOTH\u MORE）
#获取文本（工具）
#获取文本（工具）
img=img.convert（'L'）
#获取文本（工具）
#img=img.filter（ImageFilter.MaxFilter（5））
#img=img.filter（ImageFilter.锐化）
#img=img.filter（ImageFilter.SMOOTH\u MORE）
#获取文本（工具）
#获取文本（工具）
img=img.点（λx:0，如果x<128，则为255，'1'）
应用过滤器和获取文本（img，ImageFilter.MedianFilter）
应用过滤器和获取文本（img，ImageFilter.MinFilter）
应用过滤器和获取文本（img，ImageFilter.MaxFilter）
应用过滤器和获取文本（img，ImageFilter.ModeFilter）
对于vin中的vin：
计数（vin）
#打印vin
#打印验证（vin）
打印字符

我找到了一个递归函数，它可以尝试每个字母组合，并优先选择权重更高的字符

def determine_character(characters_, tried=None):
    """
    Pick the most frequently seen character that has not been tried yet.

    :param characters_: mapping of character -> occurrence count for one
        position.
    :param tried: collection of characters already attempted; these are
        skipped. Defaults to nothing tried.
    :return: the untried character with the highest count (the first one
        encountered wins a tie), or ``""`` when no untried character with a
        positive count remains.
    """
    # Avoid a shared mutable default argument.
    tried = () if tried is None else tried
    next_character = ""
    current_rank = 0
    for ch, rank in characters_.items():
        if rank > current_rank and ch not in tried:
            next_character = ch
            # Remember the best count so far; without this the loop simply
            # returns the last untried character instead of the heaviest.
            current_rank = rank
    return next_character


def determine_weight(word):
    """
    Sum the occurrence counts of each character of ``word`` at its position.

    Counts are read from the module-level ``characters`` mapping
    (``{position: {character: count}}``); a character that was never seen at
    its position raises ``KeyError``.

    :param word: candidate string to score.
    :return: total of the per-position counts (0 for an empty string).
    """
    return sum(
        characters[position][letter] for position, letter in enumerate(word)
    )


def descramble(word="", index=0):
    """
    Recursively build every candidate string from the per-position tallies.

    One character recorded for position ``index`` is appended to ``word``
    per recursion level, in the order chosen by ``determine_character``.
    When a full-length candidate passes ``validate`` it is appended to
    ``valid_vins`` together with its weight, and ``vin_count`` is
    incremented.

    NOTE(review): ``vin_count`` and ``valid_vins`` must already exist at
    module level; they are not defined in the visible part of this file.

    :param word: prefix built so far.
    :param index: position to fill next.
    :return: ``{'word': word, 'done': True}`` for a complete candidate that
        validates, ``False`` for a complete candidate that does not, and
        ``None`` for partial prefixes (results of the recursive calls are
        discarded).
    """
    global characters, vin_count, valid_vins

    if index == len(characters):
        # Complete candidate: keep it only if the check digit holds.
        if not validate(word):
            return False
        vin_count += 1
        valid_vins.append({'vin': word, 'weight': determine_weight(word)})
        return {'word': word, 'done': True}

    candidates = characters[index]
    attempted = []
    # One pick per recorded character, so every option at this position
    # is explored exactly once.
    while len(attempted) < len(candidates):
        picked = determine_character(candidates, attempted)
        attempted.append(picked)
        descramble("{word}{ch}".format(word=word, ch=picked), index + 1)

def define_字符（字符，尝试=[]）：
next_character=“”
当前排名=0
对于输入字符：
如果字符\uU[ch]>当前\u秩，且ch未尝试：
下一个字符=ch
返回下一个字符
def确定重量（字）：
全局字符
重量=0
对于索引，请参阅枚举（word）：
权重+=字符[索引][ch]
回重
def解扰（word=”“，索引=0）：
全局字符
计数=长度（字符）
如果索引==计数和验证（word）：
全局vin\u计数，有效vin
vin_计数+=1
有效的\u-vin.append（{'vin'：单词，'weight'：确定\u-weight（单词）}）
返回{'word'：单词'done'：True}
如果索引==计数：
返回错误
已尝试=[]
而len（尝试）

我找到了一个递归函数，可以尝试字母的每一个组合

import re
from itertools import permutations

from PIL import Image, ImageFilter
import pyocr
from pyocr import builders

# Candidate strings collected from OCR output; get_text() appends every
# recognised word that is exactly 17 characters long after clean-up.
vins = []
# Per-position tally filled by count_occurrences():
# {position: {character: number of times it was seen at that position}}.
characters = {}


def validate(vincode):
    """
    Check a 17-character VIN against its check digit (the ninth character).

    Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number

    Every character is transliterated to a number, multiplied by the weight
    of its position and summed; the sum modulo 11 (10 is written "X") must
    equal the ninth character.

    :param vincode: candidate VIN; letters are compared case-insensitively.
    :return: True only when ``vincode`` is a string of exactly 17 valid VIN
        characters whose check digit matches; False in every other case
        (wrong type, wrong length, forbidden or unknown characters, or a
        check digit mismatch).
    """
    maps = "0123456789X"
    weights = [
        8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
    ]
    table = {
        "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
    }

    # ``basestring`` covers str and unicode on Python 2; it does not exist on
    # Python 3, where str is the only text type.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(vincode, string_types):
        return False

    if len(vincode) != 17:
        return False

    vincode = vincode.upper()
    if "I" in vincode or "O" in vincode or "Q" in vincode:
        return False

    total = 0
    for value, weight in zip(vincode, weights):
        if value not in table:
            # A character outside the VIN alphabet can never be valid.
            # Stopping the sum early and still comparing the partial total
            # would let strings such as "0000000000000000-" pass.
            return False
        total += table[value] * weight

    return maps[total % 11] == vincode[8]


def get_text(tools_, img_):
    """
    Run each OCR tool over an image and collect 17-character candidates.

    Every recognised word box has its non-word characters and underscores
    stripped; words that are exactly 17 characters long afterwards are
    appended to the module-level ``vins`` list.

    :param tools_: iterable of pyocr tool objects.
    :param img_: image passed to each tool's ``image_to_string``.
    :return: None; results are accumulated in ``vins``.
    """
    # Compiled once, outside the loop; the raw string keeps ``\W`` a regex
    # escape instead of an (invalid) Python string escape.
    pattern = re.compile(r'[\W_]+')
    for tool in tools_:
        # NOTE(review): the Cuneiform backend is skipped on purpose by the
        # original author; the reason is not recorded here.
        if tool.get_name() == 'Cuneiform (sh)':
            continue
        boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder())
        cleaned = (pattern.sub('', box.content) for box in boxes)
        # In-place extend mutates the shared list, so no ``global`` is needed.
        vins.extend(word for word in cleaned if len(word) == 17)


def apply_filters_and_get_text(img_, filter_):
    """
    Apply a size-parameterised image filter at sizes 1-4 and OCR each result.

    For each size the filtered image is saved as ``tmp<filter>-<size>.jpg``
    in the current directory and handed to ``get_text`` together with the
    module-level ``tools``. Sizes for which filtering raises ``ValueError``
    are reported and skipped.

    NOTE(review): ``img_`` is reassigned on every successful pass, so each
    size is applied on top of the previous filtered image rather than the
    original. Confirm this stacking is intended.

    :param img_: image object exposing ``filter`` and ``save``.
    :param filter_: callable taking a size and returning a filter object,
        e.g. ``ImageFilter.MedianFilter``.
    :return: None.
    """
    # str() of a class is "<class '...'>", which is unreadable in the log and
    # puts characters such as '<' and '>' into the file name; use the bare
    # name instead and only fall back to str() for callables without one.
    label = getattr(filter_, '__name__', str(filter_))
    for x in range(1, 5):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Applying {} size: {}'.format(label, x))
        try:
            img_ = img_.filter(filter_(x))
        except ValueError:
            print('error on {} size: {}'.format(label, x))
            continue
        img_.save('tmp{}-{}.jpg'.format(label, x))
        get_text(tools, img_)


def count_occurrences(value):
    """
    Record one sighting of every character of ``value`` at its position.

    Updates the module-level ``characters`` mapping, which has the shape
    ``{position: {character: count}}``.

    :param value: string (or other iterable of characters) to tally.
    :return: None.
    """
    for position, letter in enumerate(value):
        # First sighting at this position creates its tally dict.
        tally = characters.setdefault(position, {})
        tally[letter] = tally.get(letter, 0) + 1


tools = pyocr.get_available_tools()

img = Image.open('images/test18.jpg')
# Convert to mode 'L', then threshold at 128 into a mode '1' image:
# pixels below 128 become 0, everything else 255.
img = img.convert('L')
img = img.point(lambda x: 0 if x < 128 else 255, '1')

# Run OCR over the image after each filter family, at several sizes.
apply_filters_and_get_text(img, ImageFilter.MedianFilter)
apply_filters_and_get_text(img, ImageFilter.MinFilter)
apply_filters_and_get_text(img, ImageFilter.MaxFilter)
apply_filters_and_get_text(img, ImageFilter.ModeFilter)

# Tally, per position, how often each character was recognised.
for vin in vins:
    count_occurrences(vin)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(characters)

def determine_character(characters_, tried=None):
    """
    Pick the most frequently seen character that has not been tried yet.

    :param characters_: mapping of character -> occurrence count for one
        position.
    :param tried: collection of characters already attempted; these are
        skipped. Defaults to nothing tried.
    :return: the untried character with the highest count (the first one
        encountered wins a tie), or ``""`` when no untried character with a
        positive count remains.
    """
    # Avoid a shared mutable default argument.
    tried = () if tried is None else tried
    next_character = ""
    current_rank = 0
    for ch, rank in characters_.items():
        if rank > current_rank and ch not in tried:
            next_character = ch
            # Remember the best count so far; without this the loop simply
            # returns the last untried character instead of the heaviest.
            current_rank = rank
    return next_character


def determine_weight(word):
    """
    Sum the occurrence counts of each character of ``word`` at its position.

    Counts are read from the module-level ``characters`` mapping
    (``{position: {character: count}}``); a character that was never seen at
    its position raises ``KeyError``.

    :param word: candidate string to score.
    :return: total of the per-position counts (0 for an empty string).
    """
    return sum(
        characters[position][letter] for position, letter in enumerate(word)
    )


def descramble(word="", index=0):
    """
    Recursively build every candidate string from the per-position tallies.

    One character recorded for position ``index`` is appended to ``word``
    per recursion level, in the order chosen by ``determine_character``.
    When a full-length candidate passes ``validate`` it is appended to
    ``valid_vins`` together with its weight, and ``vin_count`` is
    incremented.

    NOTE(review): ``vin_count`` and ``valid_vins`` must already exist at
    module level; they are not defined in the visible part of this file.

    :param word: prefix built so far.
    :param index: position to fill next.
    :return: ``{'word': word, 'done': True}`` for a complete candidate that
        validates, ``False`` for a complete candidate that does not, and
        ``None`` for partial prefixes (results of the recursive calls are
        discarded).
    """
    global characters, vin_count, valid_vins

    if index == len(characters):
        # Complete candidate: keep it only if the check digit holds.
        if not validate(word):
            return False
        vin_count += 1
        valid_vins.append({'vin': word, 'weight': determine_weight(word)})
        return {'word': word, 'done': True}

    candidates = characters[index]
    attempted = []
    # One pick per recorded character, so every option at this position
    # is explored exactly once.
    while len(attempted) < len(candidates):
        picked = determine_character(candidates, attempted)
        attempted.append(picked)
        descramble("{word}{ch}".format(word=word, ch=picked), index + 1)