I'm trying to write a Python script that would "clean up" scanned images before they are processed with Tesseract. Apart from text, the images also contain dust, scanning artifacts, weird lines at the page margins, and so on. Here's what a typical page looks like:
So far, here's what I have. It tries to remove little specks of dust using cv2.connectedComponentsWithStats, removes horizontal and vertical lines using morphological structuring elements, and then tries to crop the image to the text. It's better than nothing, since it does remove some noise, but at times it also removes actual text and leaves some lines at the page margins:
import logging

import cv2
import numpy as np

logging.info('Opening image ' + path)
image = cv2.imread(path, 0)  # flag 0: read directly as grayscale
logging.info('Thresholding to black and white...')
_, blackAndWhite = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV)

# Find and exclude small elements
logging.info('Removing small dotted regions (dust, etc.)...')
nlabels, labels, stats, centroids = cv2.connectedComponentsWithStats(blackAndWhite, None, None, None, 8, cv2.CV_32S)
sizes = stats[1:, -1]  # CC_STAT_AREA of each component, background excluded
img2 = np.zeros(labels.shape, np.uint8)
for i in range(nlabels - 1):
    if sizes[i] >= 40:  # keep only components large enough to be text
        img2[labels == i + 1] = 255
image = cv2.bitwise_not(img2)
logging.info('Writing the modified image...')
cv2.imwrite(out_filename, image)
# ------ START CROPPING ----- #
image = cv2.imread(out_filename)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Gaussian blur, then Otsu's threshold
logging.info("Applying Otsu's threshold")
blur = cv2.GaussianBlur(gray, (5, 5), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Detect horizontal and vertical lines with morphological opening
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 4))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 32))
detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
detected_vlines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)

# Paint the detected lines out of both the threshold image and the original
for l in [detected_lines, detected_vlines]:
    cnts = cv2.findContours(l, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3/4 compatibility
    for c in cnts:
        cv2.drawContours(thresh, [c], -1, (0, 0, 0), 50)
        cv2.drawContours(image, [c], -1, (255, 255, 255), 50)
# Create a rectangular structuring element and dilate to merge text into blobs
logging.info('Dilating text regions')
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
dilate = cv2.dilate(thresh, kernel, iterations=4)

try:
    # Find contours of the dilated text blobs
    logging.info('Extracting contours')
    cnts, hierarchy = cv2.findContours(dilate, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Collect the corner coordinates of plausible text regions
    arr = []
    for i, c in enumerate(cnts):
        x, y, w, h = cv2.boundingRect(c)
        # Exclude oddly shaped elements
        if w / h > 8 or h / w > 1.6:
            continue
        arr.append((x, y))
        arr.append((x + w, y + h))

    # Calculate the bounding box of all kept regions and crop the image
    # (boundingRect raises cv2.error if arr is empty, which the except below swallows)
    logging.info('Cropping the image')
    x, y, w, h = cv2.boundingRect(np.asarray(arr))
    image = image[y:y + h, x:x + w]
    if debug:
        logging.info('Showing the image (press "q" to continue)')
        label = "STAGE FOUR: CROPPED IMAGE"
        cv2.imshow(label, image)
        while cv2.waitKey(0) & 0xFF != ord('q'):
            pass
        cv2.destroyWindow(label)
    logging.info('Writing to ' + out_filename)
except cv2.error:
    pass
cv2.imwrite(out_filename, image)
I'm fairly new to image processing and don't have a lot of experience. I'd like to hear some suggestions on how the algorithm can be improved!
I would first call pytesseract.image_to_data() on the entire image. This gives you the position and OCR confidence of every detected word (including the invalid characters at the page edges). Then determine the region containing valid text based on the positions of the high-confidence words. Finally, call pytesseract.image_to_string() on that region to obtain the text, or filter the results from the pytesseract.image_to_data() call you already have (a sketch of that follows the code below).
This approach works for the given example. If you want to remove the specks of dust as well, you could look into "salt and pepper noise filtering", but it seems to be unnecessary here; a median-filter sketch is included at the end anyway.
import cv2
import pandas as pd
import pytesseract
from io import StringIO

# Obtain OCR data as a tab-separated table
img_bgr = cv2.imread("XVePx.jpg")
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
ocr_data = pytesseract.image_to_data(img_rgb, lang="deu")
ocr_df = pd.read_table(StringIO(ocr_data), quoting=3)  # quoting=3 is QUOTE_NONE, so quote characters in the OCR text don't break parsing

# Determine the text region based on the words (2+ characters) of high confidence (>90%)
confident_words_df = ocr_df[
    (ocr_df["conf"] > 90)
    & (ocr_df["text"].str.len() - ocr_df["text"].str.count(" ") > 1)
]
top = confident_words_df["top"].min()
left = confident_words_df["left"].min()
bot = (confident_words_df["top"] + confident_words_df["height"]).max()
right = (confident_words_df["left"] + confident_words_df["width"]).max()

# Obtain OCR string from the cropped region
ocr_string = pytesseract.image_to_string(img_rgb[top:bot, left:right, :], lang="deu")
print(ocr_string)
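If you prefer to reuse the results you already have instead of running OCR a second time, here is a minimal sketch of that alternative. It assumes confident_words_df from the snippet above; grouping by Tesseract's block/paragraph/line numbers is my assumption about how you would restore the reading order.
# Sketch: rebuild the text from the filtered image_to_data() results
# (assumes confident_words_df from above). Tesseract's TSV output numbers
# every word by block, paragraph, line and word, so sorting and grouping
# by those columns restores the reading order.
lines = (
    confident_words_df
    .sort_values(["block_num", "par_num", "line_num", "word_num"])
    .groupby(["block_num", "par_num", "line_num"])["text"]
    .apply(" ".join)
)
print("\n".join(lines))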
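And should the dust specks still bother you, a median filter is the usual remedy for salt-and-pepper noise. A minimal sketch, assuming the same sample image; the kernel size of 3 is an assumption to tune (larger kernels remove bigger specks but start eating thin letter strokes):
import cv2

# Sketch: median filtering for salt-and-pepper noise. The kernel size (3)
# is an assumed starting point; keep it smaller than the stroke width.
img = cv2.imread("XVePx.jpg", 0)  # same sample image, read as grayscale
denoised = cv2.medianBlur(img, 3)
cv2.imwrite("denoised.jpg", denoised)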