I'm trying to OCR some numbers:
I wrote the following code to test different `--psm` arguments (6, 7, 8, 13), but I don't see much difference between them.
import os
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
# Tell pytesseract which Tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = r"path/to/tesseract"
def apply_tesseract(image_path, psm):
    """Run Tesseract digit recognition on a single image file.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        psm: Tesseract page segmentation mode, passed as ``--psm``.

    Returns:
        A ``(image, text)`` tuple: the image as loaded by OpenCV and the
        string returned by ``pytesseract.image_to_string``.

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, so
    # check here instead of letting the None travel on into pytesseract.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    text = pytesseract.image_to_string(image, config=f"--psm {psm} digits")
    return image, text
def display_images_with_text(images, texts):
    """Show the images in a grid, each titled with its recognised text.

    The grid has at most three rows; the number of columns grows as needed
    to hold every image.

    Args:
        images: Sequence of image arrays accepted by ``Axes.imshow``.
        texts: Sequence of title strings, paired with ``images`` by position.
    """
    num_images = len(images)
    if num_images == 0:
        # With no images the column computation below would divide by zero.
        return
    num_rows = min(3, num_images)
    num_cols = (num_images + num_rows - 1) // num_rows
    # squeeze=False makes plt.subplots always return a 2-D array of axes.
    # Without it a 1x1 grid yields a bare Axes and an Nx1 grid a 1-D array,
    # both of which break row/column indexing.
    _, axes = plt.subplots(
        num_rows,
        num_cols,
        figsize=(12, 8),
        subplot_kw={"xticks": [], "yticks": []},
        squeeze=False,
    )
    # axes.flat walks the grid row by row, matching (i // cols, i % cols).
    for ax, image, text in zip(axes.flat, images, texts):
        # cmap only affects 2-D (grayscale) arrays; it is ignored for RGB(A).
        ax.imshow(image, cmap="gray")
        ax.axis("off")
        ax.set_title(text)
    # Hide the empty frames of any grid cells left over after the last image.
    for ax in axes.flat[num_images:]:
        ax.axis("off")
    plt.show()
def main(folder_path, psms=(6,)):
    """OCR every PNG in a folder and display the results for each PSM.

    Args:
        folder_path: Directory containing the ``.png`` images to process.
        psms: Tesseract page segmentation modes to try. One figure is shown
            per mode. Defaults to ``(6,)``.
    """
    # os.listdir returns entries in arbitrary order; sort so that the grid
    # layout is the same on every run and across PSM values.
    filenames = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".png")
    )
    for psm in psms:
        images = []
        texts = []
        for filename in filenames:
            image_path = os.path.join(folder_path, filename)
            image, text = apply_tesseract(image_path, psm)
            images.append(image)
            texts.append(text)
        display_images_with_text(images, texts)
# Script entry point: runs only when the file is executed directly, not on import.
if __name__ == "__main__":
    # Folder that is scanned for .png digit images.
    folder_path = r"./digitImages"
    main(folder_path)
This is the output of --psm 6
As you can see, it's not that good.
How can I improve this? The number images are already black and white and quite small. I've tried some preprocessing, but I end up with the same black-and-white image.
# Load the image and upscale it to twice its original width and height.
original_image = cv2.imread(image_path)
height, width = original_image.shape[:2]
new_width, new_height = width * 2, height * 2
resized_image = cv2.resize(original_image, (new_width, new_height))

# Convert the upscaled image to grayscale.
gray = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

# Sharpen with a 3x3 kernel: 9 at the centre, -1 everywhere else.
sharpen_kernel = np.full((3, 3), -1)
sharpen_kernel[1, 1] = 9
sharpen = cv2.filter2D(gray, -1, sharpen_kernel)

# Binarise the sharpened image with Otsu's threshold selection.
_, thresh = cv2.threshold(sharpen, 0, 255, cv2.THRESH_OTSU)
Update:
It turns out that simply adding some borders helped a ton; not perfect, but better.
Problem statement: Trying to OCR brief sequences of 2 or 3 digits yields sub-par recognition performance.
Solution summary: Beginning each digit sequence with a short preamble that is "easy" to OCR gives Tesseract a hint about font size and will improve recognition performance.
def apply_tesseract(image_path: Path, psm: int) -> tuple[np.ndarray, str]:
    """Recognise digits in an image after prepending a rendered preamble.

    The image is converted to grayscale and the strip produced by
    ``_get_hello`` is attached to its left edge before OCR.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        psm: Tesseract page segmentation mode, passed as ``--psm``.

    Returns:
        A ``(both, text)`` tuple: the combined preamble-plus-image array and
        the string ``pytesseract.image_to_string`` returned for it.

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``.
        ValueError: If the grayscale image holds values above 255.
    """
    image = cv2.imread(str(image_path))
    # cv2.imread reports failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Explicit check instead of assert, so it still runs under ``python -O``.
    if np.max(image) > 255:
        raise ValueError(f"Expected 8-bit pixel values in {image_path}")
    # The preamble strip must match the image height to concatenate on axis 1.
    height = image.shape[0]
    both = np.concatenate((_get_hello(height), image), axis=1)
    text = pytesseract.image_to_string(both, config=f"--psm {psm} digits")
    return both, text
def _get_hello(height: int, word: str = "Hello"):
    """Render ``word`` in black on a white strip, for use as an OCR preamble.

    Args:
        height: Height of the strip in pixels.
        word: Text to draw on the strip.

    Returns:
        A ``(height, 70)`` ``uint8`` array: a white canvas with ``word``
        drawn on it.
    """
    canvas = np.full((height, 70), 255, dtype=np.uint8)
    # Bottom-left corner of the text: 6 px from the left edge, 3 px above
    # the bottom of the strip.
    origin = (6, height - 3)
    cv2.putText(
        canvas,
        word,
        origin,
        cv2.FONT_HERSHEY_PLAIN,
        1.35,  # font scale
        (0, 0, 0),  # black text
        1,  # stroke thickness
        cv2.LINE_AA,
    )
    return canvas
This yields zero errors with PSM set to either 6 or 7, on the example number images you supplied.
BTW, one can obtain much the same effect by simply concatenating those number images. This produces a "986368798212196" recognition result.
(I was going to try to recognize e.g. "Hello 212 world", but stopped when it turned out that a preamble suffices.)