Search code examples
python ocr opencv python-tesseract

Extracting selected text by bounding box from an image


image

I am trying to extract the text selected by a bounding box on an image. For example, if only one word is selected by a bounding box, I want to fetch that text and write it to a text file. Please review my code and suggest how I can implement that functionality.

So far, I have converted the PDF file to images and drawn bounding boxes over the text.

import numpy as np
import csv
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
from pytesseract import Output
import cv2

# Render every page of the PDF to "<page number>.jpg", outline each entry
# that Tesseract reports for the page, and overwrite the JPEG with the
# annotated image.
source_pdf = wi(filename="samplecompany.pdf", resolution=100)
jpg_pages = source_pdf.convert('jpg')
for page_no, raw_page in enumerate(jpg_pages.sequence, start=1):
    jpg_name = str(page_no) + ".jpg"
    wi(image=raw_page).save(filename=jpg_name)
    canvas = cv2.imread(jpg_name)

    # image_to_data returns parallel lists, one entry per detected box.
    ocr = pytesseract.image_to_data(canvas, output_type=Output.DICT)
    print(len(ocr['level']))
    boxes = zip(ocr['left'], ocr['top'], ocr['width'], ocr['height'])
    for left, top, width, height in boxes:
        print((left, top, width, height))
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(canvas, top_left, bottom_right, (0, 255, 0), 2)

    # Replace the plain page image with the one carrying the green boxes.
    cv2.imwrite(jpg_name, canvas)

    cv2.waitKey(0)

This code works fine. Now I need to fetch the desired text from the images I've created, using the bounding box location.


Solution

  • You can use this code to extract specific text from an image and modify it as needed; it also saves the extracted text to a text file.

    import io
    import cv2
    import numpy as np
    import pytesseract
    from PIL import Image
    from pytesseract import Output
    from wand.image import Image as wi
    import sys
    
    
    # Index of the Tesseract entry whose text and bounding box are taken
    # from every page. Inspect the printed OCR dict to pick the entry you want.
    BOX_INDEX = 9

    # Render each PDF page to "<page number>.jpg" and load it back with OpenCV.
    # The Wand images are used as context managers so their resources are
    # released as soon as the page files have been written.
    img1 = []
    with wi(filename="Resume.pdf", resolution=100) as pdf:
        with pdf.convert('jpg') as pdfImg:
            for j, img in enumerate(pdfImg.sequence, start=1):
                with wi(image=img) as page:
                    page.save(filename=str(j)+".jpg")
                img1.append(cv2.imread(str(j)+".jpg"))

    extracted_text = []

    for img2 in img1:
        d = pytesseract.image_to_data(img2, output_type=Output.DICT)
        n_boxes = len(d['level'])
        print(n_boxes)
        print(d)

        if BOX_INDEX >= n_boxes:
            # This page has fewer OCR entries than BOX_INDEX: record an empty
            # line (keeps one output line per page) instead of raising
            # IndexError.
            extracted_text.append('')
            continue

        extracted_text.append(d['text'][BOX_INDEX])
        (x, y, w, h) = (d['left'][BOX_INDEX], d['top'][BOX_INDEX],
                        d['width'][BOX_INDEX], d['height'][BOX_INDEX])
        cv2.rectangle(img2, (x, y), (x + w, y + h), (0, 255, 0), 2)

        cv2.imshow('img', img2)
        # imshow only paints the window once the GUI event loop runs;
        # block here until a key is pressed so the page is actually visible.
        cv2.waitKey(0)

    cv2.destroyAllWindows()

    # Write one extracted entry per line, in page order.
    with open('Prototype.txt', 'w') as filehandle:
        for listitem in extracted_text:
            filehandle.write('%s\n' % listitem)