I want to extract the text from this image. I tried removing the rectangular boxes by detecting the horizontal and vertical lines that form them, but I ran into a problem: some character pixels were mistakenly identified as vertical lines. My goal is to obtain a clean image without the rectangular boxes, containing only the lines of text, so that I can then apply pytesseract for text extraction.
Can you help with any suggestions to remove the rectangular boxes?
Thank you!
import cv2
from PIL import Image
import matplotlib.pylab as plt
# Erase the horizontal and vertical box lines from "sample.png" by detecting
# them with morphological opening and painting over them in white.

# Load with OpenCV (no `io` module is imported in this script); cv2.imread
# yields BGR channel order, which is what COLOR_BGR2GRAY below expects.
image = cv2.imread("sample.png")
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError("sample.png could not be read")
result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted Otsu threshold: dark ink (text and box lines) becomes white (255)
# on a black background, as required by the morphology/contour steps.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Remove horizontal lines
# Opening with a wide, 1-px-tall kernel keeps only long horizontal runs.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 values in OpenCV 2.x/4.x and 3 values in OpenCV 3.x.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    # Paint each detected line white, 5 px thick, on the output image.
    cv2.drawContours(result, [c], -1, (255, 255, 255), 5)
# matplotlib expects RGB, so convert from OpenCV's BGR for display only.
plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))

# Remove vertical lines
# Opening with a tall, 1-px-wide kernel keeps only long vertical runs.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(result, [c], -1, (255, 255, 255), 5)
plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
You can try to find connected components in the image and filter out those that are too wide or tall. For example:
import cv2
import numpy as np
# Remove box lines by discarding connected components whose bounding box is
# wider or taller than 150 px; the output is black text on a white background.
im = cv2.imread('0AASU.png', cv2.IMREAD_GRAYSCALE)
if im is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError("'0AASU.png' could not be read")
# Inverted threshold: dark ink becomes foreground (255) so it gets labelled.
im_monochrome = cv2.threshold(im, 127, 255, cv2.THRESH_BINARY_INV)[1]
_, labels, stats, _ = cv2.connectedComponentsWithStats(im_monochrome)
# Select components with bounding-box width > 150 or height > 150 px.
oversized = (stats[:, cv2.CC_STAT_WIDTH] > 150) | (stats[:, cv2.CC_STAT_HEIGHT] > 150)
# Label 0 is the background; always select it so the page stays white even
# when the image itself is not larger than 150 px in either dimension.
oversized[0] = True
idx = np.flatnonzero(oversized)
# Selected labels (background + box lines) become white (255); everything
# else, i.e. the remaining small components (the text), stays black (0).
result = 255 * np.uint8(np.isin(labels, idx))
cv2.imwrite('result.png', result)