the image:
The image is a handwritten line of text. The code below extracts the text to some extent, but the result is not exactly the same as what is written in the image.
The code is:
import cv2
# Load the scanned line of handwriting. cv2.imread does not raise when the
# file is missing or unreadable; it returns None, which would otherwise
# surface as an opaque error inside cvtColor below.
img = cv2.imread("a.jpg")
if img is None:
    raise FileNotFoundError('cv2.imread could not read "a.jpg"')

# Work on a single-channel greyscale image from here on.
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# cv2.resize takes (width, height): the result is 6000 px wide, 100 px tall.
# NOTE(review): a fixed target size ignores the source aspect ratio and may
# stretch the strokes -- confirm this is intended.
img = cv2.resize(img, (6000, 100))

# Global thresholds at 100: result1 is the plain binary image,
# result2 is its inverse.
_, result1 = cv2.threshold(img, 100, 255, cv2.THRESH_BINARY)
_, result2 = cv2.threshold(img, 100, 255, cv2.THRESH_BINARY_INV)

# Adaptive-threshold variants used as alternative OCR inputs.
# NOTE(review): these run on the already-binarised result1/result2 rather
# than on the greyscale img -- confirm that is intended.
adaptive_result1 = cv2.adaptiveThreshold(
    result1, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 199, 5
)
adaptive_result2 = cv2.adaptiveThreshold(
    result1, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 199, 5
)
# NOTE(review): maxValue=100 makes the "on" pixels mid-grey (100) instead of
# white (255); left unchanged -- confirm intended.
adaptive_result3 = cv2.adaptiveThreshold(
    result2, 100, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 199, 5
)
# maxValue was 256, which is outside the 8-bit range; 255 is the largest
# value an 8-bit image can hold.
adaptive_result4 = cv2.adaptiveThreshold(
    result2, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 999, 1
)
# cv2.imshow("title",result)
# cv2.waitKey(0)
import pytesseract
from PIL import Image
from pytesseract import Output
# Tesseract options: page-segmentation mode 6, OCR engine mode 3.
# NOTE(review): the input is described as a single line of handwriting;
# Tesseract also offers --psm 7 for single text lines -- worth comparing.
configs = r'--psm 6 --oem 3'
pytesseract.pytesseract.tesseract_cmd =r'C:/Users/ASUS/OneDrive/Desktop/ubuntu file/tesseract.exe'

# Run OCR on each preprocessing variant and keep every result. Previously
# each call overwrote `data`, so the first three OCR passes were computed
# and immediately discarded.
ocr_inputs = {
    "img": img,
    "result2": result2,
    "adaptive_result3": adaptive_result3,
    "adaptive_result4": adaptive_result4,
}
ocr_results = {
    name: pytesseract.image_to_data(
        image, config=configs, output_type=Output.DICT
    )
    for name, image in ocr_inputs.items()
}

# `data` keeps its previous meaning: the result for adaptive_result4,
# which was the last assignment in the original sequence.
data = ocr_results["adaptive_result4"]
# data = pytesseract.image_to_data(img, config=configs, output_type=Output.DICT)
# print(*data['text'])
# print(data.keys())
The output:
buttered off before. there, twinkling like new Yar sytemy, hung cuter of tempting Uwitationx they beaged hi to contre the extera
I have tried some image preprocessing techniques such as using dilation and manipulating the threshold. The image became crystal clear.
However, the confidence score is sometimes as low as 51, which means that you cannot really rely on the output. This is related to how the engine was trained.