Search code examples
python tesseract python-tesseract

How do I get Tesseract to properly detect text in an image?


I have written a simple Python script that returns the text from an image.

import cv2
import pytesseract
import numpy
from PIL import Image

def getText(img):  # accept PIL.Image
    """Return the text that Tesseract recognises in a PIL image.

    The image is converted to a numpy array, reduced to greyscale with
    OpenCV and then handed to ``pytesseract.image_to_string``.

    Args:
        img: The image to read; converted with ``numpy.array(img)``.
            Assumes an RGB(A)-mode PIL image -- TODO confirm for other modes.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    cvimg = numpy.array(img)  # convert to an array OpenCV can work with
    # Arrays built from PIL images are RGB-ordered (OpenCV's own loaders
    # produce BGR), so the RGB conversion code gives the right grey weights.
    greyscale = cv2.cvtColor(cvimg, cv2.COLOR_RGB2GRAY)  # make greyscale
    textFromImg = pytesseract.image_to_string(greyscale)  # get text

    return textFromImg

I've tested my code on this image but the output is an empty string.

Why does Tesseract not work properly with certain images, and how can I fix this?


Solution

  • Image quality and size are important. See also what you can do with the page segmentation mode (--psm 0-13, or see the Tesseract documentation):

    import cv2
    import pytesseract
    
    def getText(img):
        """Return the text Tesseract recognises in the image file ``img``.

        The file is loaded with OpenCV, converted to grayscale, thresholded
        to pure black and white, cropped to a fixed region and resized
        before being passed to ``pytesseract.image_to_string``. The
        preprocessed image is also shown in a window for up to 1200 ms.

        Args:
            img: Path of the image file, as accepted by ``cv2.imread``.

        Returns:
            The string returned by ``pytesseract.image_to_string``.

        Raises:
            FileNotFoundError: If ``cv2.imread`` could not load ``img``.
        """
        # First step prepare image
        image = cv2.imread(img, cv2.IMREAD_UNCHANGED)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # rather than raising, so fail here with a clear message.
            raise FileNotFoundError("could not read image: %r" % (img,))
        if image.ndim == 2:
            # IMREAD_UNCHANGED keeps a single-channel file single-channel;
            # it is already grayscale and BGR2GRAY would reject it.
            grayImage = image
        else:
            grayImage = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # B&W image is best
        _, blackAndWhiteImage = cv2.threshold(grayImage, 170, 255, cv2.THRESH_BINARY)
        # focus on text area (coordinates are specific to the sample image)
        cropped_image = blackAndWhiteImage[59:96, 314:560]  # img[y:y+h, x:x+w]
        # resize the character if necessary
        scale_percent = 100  # percent of original size
        width = int(cropped_image.shape[1] * scale_percent / 100)
        height = int(cropped_image.shape[0] * scale_percent / 100)
        dim = (width, height)
        resized = cv2.resize(cropped_image, dim, interpolation=cv2.INTER_AREA)
        # Second ocr the text
        custom_config = r'--psm 3 --oem 3  -l eng'
        textFromImg = pytesseract.image_to_string(resized, config=custom_config)
        # Show result of image transformation
        cv2.imshow("Black & White", resized)
        cv2.waitKey(1200)
        cv2.destroyAllWindows()
        return textFromImg
             
    if __name__ == '__main__':
        # Run the OCR pipeline on the sample image and print what was read.
        print(getText("sign.png"))
    

    Output:

    SPIKE PLANTED