import pytesseract
from PIL import Image
def textFromTesseractOCR(croppedImage):
    """Run Tesseract on ``croppedImage`` once per page-segmentation mode.

    Diagnostic helper: for every ``--psm`` value from 0 to 13 it calls
    ``pytesseract.image_to_string`` with the English language pack and
    ``--oem 3``, then prints the mode number and the detected text.

    Args:
        croppedImage: the image to recognise; it is passed to
            ``pytesseract.image_to_string`` unchanged.

    Returns:
        None. Results are only printed.
    """
    for psm in range(14):
        config = '--psm ' + str(psm) + ' --oem 3'
        # The former ``boxes=False`` keyword is intentionally not passed:
        # it was the default, and current pytesseract releases no longer
        # accept it (the call would raise TypeError).
        text = pytesseract.image_to_string(croppedImage, lang='eng', config=config)
        print("PSM Mode", psm)
        print("Text detected: ", text)
imgPath = "ImagePath"  # placeholder: replace with the path of the image to OCR
# Open the image through a context manager so the underlying file handle is
# closed as soon as the OCR run has finished.
with Image.open(imgPath) as img:
    textFromTesseractOCR(img)
I am working on extracting table data from a PDF. To do this I convert the PDF to PNG, detect lines, identify the table from the line intersections, and then crop the individual cells to get their text.
This all works fine, except that Tesseract does not work on cell images whose text is 2 characters long or shorter.
Works for this image:
Result from tesseract:
Does not work for this image:
Result from Tesseract: it returns an empty string. It also returns an empty string for numbers that are 2 digits long or shorter.
I have tried resizing the image (which I knew wouldn't work), and I also tried appending dummy text to the image, but the results were bad (it worked for only a few images, and I didn't know the exact location at which to append the dummy text in the image).
It would be great if someone could help me with this.
So I finally came up with a workaround for this situation, the situation being Tesseract OCR returning an empty string when the image contains only a 1- or 2-character string (e.g. "1" or "25").
To get output in this situation, I appended the same image to the original image multiple times so that the text becomes longer than 2 characters. For example, if the original image contained only "3", I appended the "3" image (the same image) 4 more times, producing an image that contains the text "33333". We then give this image to Tesseract, which outputs "33333" (most of the time). Then we just have to remove the spaces from Tesseract's text output and divide the resulting string length by 5 to get the index up to which we want to take the text from the whole output.
import cv2 ## pip3 install opencv-python
import numpy as np ## pip3 install numpy
import pytesseract ## pip3 install pytesseract
def textFromTesseractOCR(croppedImage):
    """Return the text Tesseract reads from ``croppedImage``, retrying for very short text.

    The image is first passed to Tesseract unchanged. If that yields only
    whitespace, the text region is cropped out, tiled horizontally five
    times and recognised again; the first fifth of the whitespace-free
    result is returned.

    Args:
        croppedImage: image of a single cell. The retry path applies ``in``,
            ``cv2.threshold`` and NumPy concatenation to it, so it is
            presumably a single-channel NumPy array -- TODO confirm against
            the callers.

    Returns:
        Tesseract's raw output when the first pass finds text. Otherwise the
        string recovered from the tiled image, or ``""`` when the image
        contains no zero-valued pixel.
    """
    text = pytesseract.image_to_string(croppedImage)
    if text.strip():
        return text

    # Retry path: the first pass found nothing.
    if 0 not in croppedImage:
        # No zero-valued pixel, so there is nothing to treat as text.
        return ""

    # Number of side-by-side copies handed to Tesseract; the same value is
    # used below to cut the recognised string back to a single copy.
    repeats = 5

    yDir = 3
    xDir = 3
    iterations = 4
    img = generate_blocks_dilation(croppedImage, yDir, xDir, iterations)
    # Dilate further so that only the text portion of the image is cropped,
    # not the whole image.
    kernelH = np.ones((1, 5), np.uint8)
    kernelV = np.ones((5, 1), np.uint8)
    img = cv2.dilate(img, kernelH, iterations=1)
    img = cv2.dilate(img, kernelV, iterations=1)
    image = cropOutMyImg(img, croppedImage)

    concateImg = np.concatenate([image] * repeats, axis=1)
    # The crop is the source pixels multiplied by a 0/1 mask, which promotes
    # the array to float. Cast back to the source dtype (lossless, since the
    # factor is 0 or 1) so the OCR call receives an ordinary pixel array.
    concateImg = concateImg.astype(np.asarray(croppedImage).dtype, copy=False)

    textA = pytesseract.image_to_string(concateImg)
    # Drop every whitespace character (spaces and line breaks alike) so the
    # length reflects recognised glyphs only.
    textA = "".join(textA.split())
    return textA[:len(textA) // repeats]
def generate_blocks_dilation(img, yDir, xDir, iterations):
    """Inverse-binarise ``img`` and dilate the result.

    ``img`` is thresholded at 0 with ``cv2.THRESH_BINARY_INV`` and a maximum
    value of 1, so pixels above 0 become 0 and all others become 1. The
    binary image is then dilated ``iterations`` times with an all-ones
    ``uint8`` kernel of ``yDir`` rows by ``xDir`` columns.

    Returns:
        The dilated binary image as returned by ``cv2.dilate``.
    """
    structuring_element = np.ones((yDir, xDir), np.uint8)
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    binarised = cv2.threshold(img, 0, 1, cv2.THRESH_BINARY_INV)[1]
    dilated = cv2.dilate(binarised, structuring_element, iterations=iterations)
    return dilated
def cropOutMyImg(gray, OrigImg):
    """Crop ``OrigImg`` to the bounding box of a contour found in ``gray``.

    Every contour of ``gray`` with non-zero area is filled into a shared
    mask. For each of them ``OrigImg`` is cropped to the contour's bounding
    rectangle and multiplied by the matching mask region scaled to 0/1, so
    pixels outside the filled contours become 0. The crop produced for the
    last such contour is returned.

    Args:
        gray: image searched for contours; assumed to be the single-channel
            8-bit binary image ``cv2.findContours`` expects -- TODO confirm.
        OrigImg: image to crop; it is sliced with the row/column coordinates
            computed on ``gray``.

    Returns:
        The masked crop of ``OrigImg``, or ``OrigImg`` unchanged when no
        contour with non-zero area exists.
    """
    # Accumulates the filled contours; small zero-area pieces are left out.
    mask = np.zeros(gray.shape, np.uint8)

    # OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 and 4
    # return (contours, hierarchy); the contour list is second from the end
    # in every version.
    contours = cv2.findContours(gray, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Fallback when nothing qualifies, instead of failing on an unbound name.
    result = OrigImg
    for cnt in contours:
        if cv2.contourArea(cnt) == 0:
            continue
        # [cnt] with index 0 draws just this contour; thickness -1 fills it.
        cv2.drawContours(mask, [cnt], 0, 255, -1)
        x, y, w, h = cv2.boundingRect(cnt)
        roi = mask[y:y + h, x:x + w]
        cropped = OrigImg[y:y + h, x:x + w]
        # Scale the 0/255 mask to 0/1 and use it to blank everything outside
        # the filled contours.
        result = cropped * (roi / 255)
    return result