I am OCR-ing a bunch of PDF files. This works fine, but parts of the PDFs are blacked out. Actually, they are not really blacked out but 'boxed in rectangles with some text inside the rectangles'. This text is messing up my OCR, even when using a word list to target the various combinations of '(10)(2e)'.
I am working with .jpg files, converted from PDFs that contain both text and images (with text in them). Here's a sample:
Since many variations of '(10)(2e)' are messing up my OCR, my goal is to find all rectangles - which most likely contain '(10)(2e)' - and fill them. For finding the rectangles I followed this great answer from nathancy: How to detect all rectangular boxes python opencv without missing anything
However - as you can see in the upper green rectangle - sometimes the green rectangles overlap part of the data I need. In this case "@leiden.nl" and "@" in the second line.
I have experimented with many combinations of both (a) other settings for image processing (erode/dilate/blur/threshold) and (b) other settings as suggested in the answer by Nathancy (kernel settings/number of iterations).
What would be best practice for finding the smaller rectangles?
FYI: My code for finding the rectangles is more or less similar to Nathancy's answer:
# https://stackoverflow.com/questions/59979760/how-to-detect-all-rectangular-boxes-python-opencv-without-missing-anything
import cv2
import os
# Script: locate box-like regions on a page image and outline each one in
# green, then save the annotated page. Paths are relative to the current
# working directory.
path = os.getcwd()
print(path+'/test_ocr3/_stuff_IN/')

# Load image, grayscale, adaptive threshold
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'1.png')
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1.jpg')
src_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_opt.jpg'
image = cv2.imread(src_file)
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1_A_erode_551.jpg')
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1_B_dilate_551.jpg')
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1_D_threshold_177255.jpg')
if image is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError('could not read image: ' + src_file)

# Copy of the input taken before any drawing; not used further in this script.
result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted binary output: dark strokes on the page become white foreground.
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 51, 9)

# Fill rectangular contours
# CHECK OTHER CONTOUR SETTINGS ? TO EXCLUDE OUTER ?
# https://docs.opencv.org/master/d9/d8b/tutorial_py_contours_hierarchy.html
# https://medium.com/analytics-vidhya/opencv-findcontours-detailed-guide-692ee19eeb18
# cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 values in some OpenCV versions and 3 in others;
# pick the contour list in either case.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    # Negative thickness fills the contour interior.
    cv2.drawContours(thresh, [c], -1, (255, 255, 255), -1)

# Morph open
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 4))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=4)
# opening = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=4)

# Draw rectangles
# cnts = cv2.findContours(opening, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 3)
    # filled
    # cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), -1)

# cv2.imwrite(path+'/test_ocr3/_stuff_OUT/'+'1_OUT.png', image)
out_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_0_TST_OUT.jpg'
# cv2.imwrite reports failure through its boolean return value.
if not cv2.imwrite(out_file, image):
    raise OSError('could not write image: ' + out_file)
# https://stackoverflow.com/questions/59979760/how-to-detect-all-rectangular-boxes-python-opencv-without-missing-anything
import cv2
import os
# Script: locate box-like regions on a page image and outline each one in
# green, then save the annotated page. Compared with a plain "fill every
# contour" approach, each filled contour is also re-outlined in black so that
# touching boxes are not merged into one large region.
path = os.getcwd()
print(path + '/test_ocr3/_stuff_IN/')

# Load image, grayscale, adaptive threshold
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'1.png')
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1.jpg')
src_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_opt.jpg'
image = cv2.imread(src_file)
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1_A_erode_551.jpg')
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1_B_dilate_551.jpg')
# image = cv2.imread(path+'/test_ocr3/_stuff_OUT/'+'page_1_D_threshold_177255.jpg')
if image is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError('could not read image: ' + src_file)

# Copy of the input taken before any drawing; not used further in this script.
result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted binary output: dark strokes on the page become white foreground.
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 51, 9)

# Fill rectangular contours
# CHECK OTHER CONTOUR SETTINGS ? TO EXCLUDE OUTER ?
# https://docs.opencv.org/master/d9/d8b/tutorial_py_contours_hierarchy.html
# https://medium.com/analytics-vidhya/opencv-findcontours-detailed-guide-692ee19eeb18
# cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 values in some OpenCV versions and 3 in others;
# pick the contour list in either case.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    # Negative thickness fills the contour interior.
    cv2.drawContours(thresh, [c], -1, (255, 255, 255), -1)
    # Key step: redraw the contour outline in black (1 px) so adjacent
    # filled regions stay separated instead of fusing into one big box.
    cv2.drawContours(thresh, [c], -1, (0, 0, 0), 1)

# Morph open
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 4))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=4)
# opening = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=4)

# Draw rectangles
# cnts = cv2.findContours(opening, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 3)
    # filled
    # cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), -1)

# cv2.imwrite(path+'/test_ocr3/_stuff_OUT/'+'1_OUT.png', image)
out_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_0_TST_OUT.jpg'
# cv2.imwrite reports failure through its boolean return value.
if not cv2.imwrite(out_file, image):
    raise OSError('could not write image: ' + out_file)
Modified binary: Because I did not have a higher-resolution image, I modified the image. I erased the large box by hand and sharpened the edges to 1px (if this image does not match your raw image, please upload the correct, higher-resolution one).
The key point is cv2.drawContours(thresh, [c], -1, (0, 0, 0), 1). This splits a large box (which you wanted to remove) into small boxes. Without it, the connected area is recognized as one large box, and this will erase information you want to keep.
Image 2 compares the large box in your question with the one in my answer. Image 3 shows my answer.