python opencv tesseract image-segmentation python-tesseract

Segmenting image files with text (and pictures) into blocks

I'm trying to create bounding boxes for the text in an image I have. An example is the one below.

I would like to add a bounding box around each This is a test line. Unfortunately I'm not sure why this method is not automatically identifying the bounding boxes

import re
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt


# Plot character boxes on image using pytesseract.image_to_boxes() function
image = cv2.imread('Image.jpg')
b, g, r = cv2.split(image)
image = cv2.merge([r,g,b])
d = pytesseract.image_to_data(image, output_type=Output.DICT)
print('DATA KEYS: \n', d.keys())

n_boxes = len(d['text'])
for i in range(n_boxes):
    # condition to only pick boxes with a confidence > 60%
    if int(d['conf'][i]) > 60:
        (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        image = cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

b, g, r = cv2.split(image)
rgb_img = cv2.merge([r, g, b])
plt.figure(figsize=(16, 12))
plt.imshow(rgb_img)
plt.title('SAMPLE IMAGE WITH WORD LEVEL BOXES')
plt.show()

Solution

Here is a different way to do that with Python/OpenCV.

Read the input
Convert to gray
(OTSU) Threshold (white text on black background)
Apply morphology dilate with horizontal kernel longer than letter spacing and then smaller vertical kernel to remove thin horizontal lines remaining from line in page.
Find contours
Draw bounding boxes of contours on input
Save result

Input:

import cv2
import numpy as np

# load image
img = cv2.imread("test_text.jpg")

# convert to gray
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# threshold the grayscale image
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# use morphology erode to blur horizontally
#kernel = np.ones((500,3), np.uint8)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (250, 3))
morph = cv2.morphologyEx(thresh, cv2.MORPH_DILATE, kernel)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 17))
morph = cv2.morphologyEx(morph, cv2.MORPH_OPEN, kernel)

# find contours
cntrs = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cntrs = cntrs[0] if len(cntrs) == 2 else cntrs[1]

# Draw contours
result = img.copy()
for c in cntrs:
    x,y,w,h = cv2.boundingRect(c)
    cv2.rectangle(result, (x, y), (x+w, y+h), (0, 0, 255), 2)

# write result to disk
cv2.imwrite("test_text_threshold.png", thresh)
cv2.imwrite("test_text_morph.png", morph)
cv2.imwrite("test_text_lines.jpg", result)

cv2.imshow("GRAY", gray)
cv2.imshow("THRESH", thresh)
cv2.imshow("MORPH", morph)
cv2.imshow("RESULT", result)
cv2.waitKey(0)
cv2.destroyAllWindows()

Thresholded image:

Dilated image:

Result: