Search code examples
pythonocrtesseractimage-preprocessing

Digit OCR using Tesseract


I'm trying to ocr some numbers:

enter image description here enter image description here enter image description here enter image description here enter image description here enter image description here

I wrote this code to test different PSM arguments (6, 7, 8, 13), but I don't see much difference between them.

import os
import pytesseract
import matplotlib.pyplot as plt

import cv2
import numpy as np

# Tell pytesseract which Tesseract executable to run; replace the placeholder
# below with the actual path to the binary on this machine.
pytesseract.pytesseract.tesseract_cmd = (
    r"path/to/tesseract"
)
def apply_tesseract(image_path, psm):
    """Run Tesseract digit recognition on one image file.

    Args:
        image_path: Path of the image file to load with OpenCV.
        psm: Tesseract page segmentation mode, passed as ``--psm``.

    Returns:
        A ``(image, text)`` tuple: the image as loaded by ``cv2.imread`` and
        the string recognised by Tesseract.

    Raises:
        OSError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; fail
    # here with a clear message instead of deep inside pytesseract.
    if image is None:
        raise OSError(f"Could not read image: {image_path}")
    text = pytesseract.image_to_string(image, config=f"--psm {psm} digits")
    return image, text

def display_images_with_text(images, texts):
    """Show the images in a grid, each titled with its recognised text.

    The grid uses at most three rows and as many columns as are needed to
    hold every image. Nothing is shown when ``images`` is empty.

    Args:
        images: Sequence of image arrays accepted by ``Axes.imshow``.
        texts: Sequence of titles, paired with ``images`` by position.
    """
    num_images = len(images)
    if num_images == 0:
        # A grid with zero rows is not a valid subplot layout.
        return

    num_rows = min(3, num_images)
    num_cols = (num_images + num_rows - 1) // num_rows

    # squeeze=False keeps ``axes`` two-dimensional even when the grid has a
    # single row or column (or a single cell), so one indexing scheme works
    # for every image count.
    _fig, axes = plt.subplots(
        num_rows,
        num_cols,
        figsize=(12, 8),
        subplot_kw={'xticks': [], 'yticks': []},
        squeeze=False,
    )

    # ravel() walks the grid row by row, matching the original
    # (i // num_cols, i % num_cols) placement.
    flat_axes = axes.ravel()
    for ax, image, text in zip(flat_axes, images, texts):
        ax.imshow(image)
        ax.axis("off")
        ax.set_title(text)

    # Blank out grid cells that have no image to show.
    for ax in flat_axes[num_images:]:
        ax.axis("off")

    plt.show()

def main(folder_path, psm_values=(6,)):
    """OCR every PNG in a folder and display the results for each PSM mode.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose
            name ends in ``.png``, compared case-insensitively.
        psm_values: Tesseract page segmentation modes to try; one figure is
            shown per mode. Defaults to ``(6,)``.
    """
    # List the folder once rather than once per mode. Sorting makes the
    # display order reproducible, since os.listdir returns entries in
    # arbitrary order.
    image_paths = [
        os.path.join(folder_path, filename)
        for filename in sorted(os.listdir(folder_path))
        if filename.lower().endswith(".png")
    ]
    if not image_paths:
        print(f"No .png images found in {folder_path}")
        return

    for psm in psm_values:
        images = []
        texts = []
        for image_path in image_paths:
            image, text = apply_tesseract(image_path, psm)
            images.append(image)
            texts.append(text)
        display_images_with_text(images, texts)

# Run the OCR demo only when this file is executed as a script.
if __name__ == "__main__":
    # Folder holding the digit images, relative to the working directory.
    folder_path = r"./digitImages"
    main(folder_path)

This is the output of --psm 6

enter image description here

As you can see, it's not that good.

How can I improve this? The number images are already black and white and quite small. I've tried some processing, but I end up with the same black-and-white image.

# Read the original image (image_path must already be defined)
original_image = cv2.imread(image_path)

new_width = original_image.shape[1] * 2  # Double the width
new_height = original_image.shape[0] * 2  # Double the height
resized_image = cv2.resize(original_image, (new_width, new_height))


# Convert the resized image to grayscale
gray = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

# Sharpen the grayscale image (no blur is applied before this step); the
# kernel weights sum to 1, so flat regions keep their brightness
sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
sharpen = cv2.filter2D(gray, -1, sharpen_kernel)

# Apply Otsu's thresholding to the sharpened image; [1] picks the
# thresholded image out of the (threshold value, image) pair returned
thresh = cv2.threshold(sharpen, 0, 255, cv2.THRESH_OTSU)[1]

Update:

Turns out simply adding some borders helped a ton; not perfect, but better.

enter image description here


Solution

  • Problem statement: Trying to OCR brief sequences of 2 or 3 digits yields sub-par recognition performance.

    Solution summary: Beginning each digit sequence with a short preamble that is "easy" to OCR gives Tesseract a hint about font size and will improve recognition performance.

    def apply_tesseract(image_path: Path, psm: int) -> tuple[np.ndarray, str]:
        """OCR a digit image after prefixing it with an easy-to-read word.

        The image is converted to grayscale and a rendered preamble word is
        placed to its left before the combined image is handed to Tesseract.

        Args:
            image_path: Path of the image file to load with OpenCV.
            psm: Tesseract page segmentation mode, passed as ``--psm``.

        Returns:
            A ``(both, text)`` tuple: the combined preamble-plus-digits
            grayscale image and the string recognised by Tesseract.

        Raises:
            OSError: If OpenCV cannot read ``image_path``.
            ValueError: If the grayscale image holds values above 255.
        """
        image = cv2.imread(f"{image_path}")
        # cv2.imread reports failure by returning None rather than raising.
        if image is None:
            raise OSError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Explicit check instead of ``assert`` so it still runs under -O.
        if np.max(image) > 255:
            raise ValueError("Expected pixel values in the range 0-255")
        # The preamble must be as tall as the digit image so the two can be
        # joined side by side (axis=1).
        height = image.shape[0]
        both = np.concatenate((_get_hello(height), image), axis=1)
        text = pytesseract.image_to_string(both, config=f"--psm {psm} digits")
        return both, text
    
    
    def _get_hello(
        height: int, word: str = "Hello", width: int = 70
    ) -> np.ndarray:
        """Render ``word`` in black on a white single-channel canvas.

        Args:
            height: Height of the canvas in pixels.
            word: Text to draw. Defaults to ``"Hello"``.
            width: Width of the canvas in pixels. The default of 70 is the
                size used for ``"Hello"``; pass a larger value for longer
                words so they are not clipped.

        Returns:
            A ``(height, width)`` ``uint8`` array containing the drawn text.
        """
        font = cv2.FONT_HERSHEY_PLAIN
        # Text origin: 6 px from the left edge, 3 px above the bottom edge.
        bottom_left = 6, height - 3
        font_scale = 1.35
        font_color = (0, 0, 0)
        thickness = 1
        line_type = cv2.LINE_AA

        # Start from an all-white canvas; putText draws onto it in place.
        img = 255 * np.ones((height, width), dtype=np.uint8)
        cv2.putText(
            img, word, bottom_left, font, font_scale, font_color, thickness, line_type
        )
        return img
    

    This yields zero errors with PSM set to either 6 or 7, on the example number images you supplied.

    BTW, one can obtain much the same effect by simply concatenating those number images. This produces a "986368798212196" recognition result.

    (I was going to try to recognize e.g. "Hello 212 world", but stopped when it turned out that a preamble suffices.)