Search code examples
python tesseract python-imaging-library

Try every weighted combination of letters from the text result of tesseract


I've been testing text recognition from images using pyocr (tesseract-ocr and libtesseract). I've been applying various PIL.ImageFilters and reading back one specific string from the image after each filter. No single result has been accurate, but I have 14 different results, and between them all of the correct characters of the string are present. So I enumerated each string and built a dict keyed by character position; each value is itself a dict whose keys are the characters that appeared at that position and whose values are the number of occurrences. Here's a shortened example:

String In Image:

2HG2

Results:

#Note: this is not the actual order in which the strings are produced
2HC2
2HC2
2HCZ
2HOZ
2HOZ
2HOZ
2HOZ
2HGZ
2HGZ
2HGZ
ZHGZ
ZHGZ
ZH6Z
ZN6z

Dictionary:

{
    0: {
        u'2': 10, 
        u'Z': 4
    }, 1: {
        u'H': 13, 
        u'N': 1
    }, 2: {
        u'C': 3, 
        u'O': 4, 
        u'G': 5, 
        u'6': 2
    }, 3: {
        u'2': 2, 
        u'Z': 11, 
        u'z': 1
    }
}

I'd like to try each combination of letters in each position until I get 2HG2. Any help would be appreciated.

EDIT: The goal I'm trying to achieve is to scan a car registration, get the text from it, and then populate a form with the data. As a proof of concept, I'm trying to get the VIN number from my personal registration. At the moment, I'm (most likely naively) applying a series of PIL.ImageFilters and getting the text from each result. Below is my script.

import re
from itertools import permutations

from PIL import Image, ImageFilter
import pyocr
from pyocr import builders

# 17-character candidate strings gathered from every OCR pass (extended by get_text).
vins = []
# Per-position tally built by count_occurrences: {position: {character: occurrences}}.
characters = {}


def validate(vincode):
    """
    Check a candidate VIN against its check digit (the ninth character).

    Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number

    :param vincode: candidate VIN; letters are compared case-insensitively.
    :return: True only when ``vincode`` is a 17-character string made up
        solely of characters present in the transliteration table and its
        ninth character equals the computed check digit; False otherwise.
    """
    maps = "0123456789X"
    weights = [
        8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
    ]
    table = {
        "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
    }

    # ``unicode`` only exists on Python 2; fall back to ``str`` alone so a
    # non-string argument yields False instead of a NameError on Python 3.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)

    if not isinstance(vincode, string_types):
        return False

    if len(vincode) != 17:
        return False

    vincode = vincode.upper()

    total = 0
    for value, weight in zip(vincode, weights):
        # Any character missing from the table (I, O, Q, punctuation, ...)
        # makes the whole VIN invalid. Stopping early and checking a partial
        # sum would let garbage pass whenever that sum happened to match.
        if value not in table:
            return False
        total += table[value] * weight

    return maps[total % 11] == vincode[8]


def get_text(tools_, img_):
    """
    OCR ``img_`` with every tool and collect the 17-character candidates.

    For each word box a tool returns, every run of non-word characters and
    underscores is stripped from its text; results that are exactly 17
    characters long are appended to the module-level ``vins`` list.

    :param tools_: iterable of OCR tool objects exposing ``get_name()`` and
        ``image_to_string()``; the tool named 'Cuneiform (sh)' is skipped.
    :param img_: image handed unchanged to each tool.
    :return: None; results are accumulated in ``vins``.
    """
    global vins
    # Compile once per call; the raw string avoids the invalid "\W" escape.
    pattern = re.compile(r'[\W_]+')
    for tool in tools_:
        if tool.get_name() == 'Cuneiform (sh)':
            continue
        boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder())
        # Clean each box once, then keep only the VIN-length results.
        cleaned = (pattern.sub('', box.content) for box in boxes)
        vins += [text for text in cleaned if len(text) == 17]


def apply_filters_and_get_text(img_, filter_):
    """
    Apply ``filter_`` at sizes 1 through 4 and OCR the image after each pass.

    Each successful pass filters the output of the previous successful pass
    (``img_`` is rebound), saves that image as ``tmp<name>-<size>.jpg`` and
    runs ``get_text`` on it with the module-level ``tools``. A size for which
    building or applying the filter raises ValueError is reported and skipped.

    :param img_: image object exposing ``filter()`` and ``save()``.
    :param filter_: callable taking a size and returning a filter, e.g. one
        of the ``ImageFilter`` classes.
    :return: None.
    """
    # str() of a class renders as "<class '...'>", which puts angle brackets
    # and quotes into the saved file name; use the bare name when available.
    label = getattr(filter_, '__name__', str(filter_))
    for x in range(1, 5):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Applying {} size: {}'.format(label, x))
        try:
            img_ = img_.filter(filter_(x))
        except ValueError:
            print('error on {} size: {}'.format(label, x))
            continue
        img_.save('tmp{}-{}.jpg'.format(label, x))
        get_text(tools, img_)


def count_occurrences(value):
    """
    Add every character of ``value`` to the per-position tally.

    The module-level ``characters`` dict maps a position to a dict of
    ``{character: occurrences}``; the entry for each character of ``value``
    at its position is created on first sight and incremented afterwards.

    :param value: string whose characters are tallied by position.
    :return: None; ``characters`` is updated in place.
    """
    global characters
    for position, letter in enumerate(value):
        # Fetch (or create) the tally for this position, then bump the count.
        tally = characters.setdefault(position, {})
        tally[letter] = tally.get(letter, 0) + 1


tools = pyocr.get_available_tools()

img = Image.open('images/test18.jpg')
# Convert to 8-bit grayscale, then threshold at 128 into a 1-bit image.
img = img.convert('L')
img = img.point(lambda x: 0 if x < 128 else 255, '1')

# Each call OCRs the image after every filter size that applies cleanly;
# 17-character hits accumulate in ``vins``.
apply_filters_and_get_text(img, ImageFilter.MedianFilter)
apply_filters_and_get_text(img, ImageFilter.MinFilter)
apply_filters_and_get_text(img, ImageFilter.MaxFilter)
apply_filters_and_get_text(img, ImageFilter.ModeFilter)

# Tally, per position, how often each character was read.
for vin in vins:
    count_occurrences(vin)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(characters)

Solution

  • I was able to figure out a recursive function that tries every combination of the letters with priority to characters with higher weight.

    def determine_character(characters_, tried=None):
        """
        Pick the most frequent character that has not been tried yet.

        :param characters_: dict mapping each candidate character to the
            number of times it was seen at one position.
        :param tried: characters to skip; ``None`` (the default) skips none.
        :return: the untried character with the highest count (the first one
            encountered wins a tie), or ``""`` when no untried character
            with a positive count remains.
        """
        # A None default avoids sharing one mutable list between calls.
        if tried is None:
            tried = ()
        next_character = ""
        current_rank = 0
        for ch, rank in characters_.items():
            if rank > current_rank and ch not in tried:
                next_character = ch
                # Raise the bar so a later, less frequent character cannot
                # displace the current best.
                current_rank = rank
        return next_character
    
    
    def determine_weight(word):
        """
        Score ``word`` by how often its characters were seen in place.

        :param word: string whose character at each position must already be
            present in the module-level ``characters`` tally.
        :return: the sum, over all positions, of the occurrence count recorded
            for that position's character (0 for an empty string).
        """
        global characters
        return sum(
            characters[position][letter]
            for position, letter in enumerate(word)
        )
    
    
    def descramble(word="", index=0):
        """
        Recursively build every candidate string and record the valid ones.

        At each position the characters tallied in the module-level
        ``characters`` dict are tried in the order ``determine_character``
        yields them. A completed candidate that passes ``validate`` is
        appended to ``valid_vins`` with its weight, and ``vin_count`` is
        incremented.

        :param word: prefix built so far.
        :param index: position to fill next.
        :return: ``{'word': word, 'done': True}`` for a complete valid
            candidate, ``False`` for a complete invalid one, and ``None``
            otherwise. Recursive results are discarded, so callers should
            read ``valid_vins`` rather than the return value.
        """
        global characters, vin_count, valid_vins
        total_positions = len(characters)

        # Leaf: every position has been filled.
        if index == total_positions:
            if not validate(word):
                return False
            vin_count += 1
            valid_vins.append({'vin': word, 'weight': determine_weight(word)})
            return {'word': word, 'done': True}

        candidates = characters[index]
        attempted = []
        # One pick per tallied character at this position.
        for _ in range(len(candidates)):
            pick = determine_character(candidates, attempted)
            attempted.append(pick)
            descramble("{word}{ch}".format(word=word, ch=pick), index + 1)