Search code examples
pythonstringpdftextpdfminer

How to exctract text from a PDF and manipulate it (python)


Hi i used to following code to exctract the text(as a string) from the following insurance contract:

    import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage



def extract_text_by_page(pdf_path):
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=False):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()

            converter = TextConverter(resource_manager,
                                      fake_file_handle)

            page_interpreter = PDFPageInterpreter(resource_manager,
                                                  converter)

            page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()

            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()


def extract_text(pdf_path):
    text = ""
    for page in extract_text_by_page(pdf_path):
        #print(page)
            text= page+" "+text
        return text
        # Driver code
    
    
if __name__ == '__main__':
    text=extract_text('document.pdf')
    print(text)

i would like to extract the following values highlithed in red and the corralated value(the ones highlighted in blue)IMAGE

some exmaple outputs:

print(oneri_di_registrazione_atti_giudiziari)

"Valore in lite minimo 300 Limite di indennizzo 500" (as string)

print(tutela_dati_personali)

"Massimale per evento 25000" (as string)

The red part will not change if i change file but the blue part could so would like to link the numeric values to their red counterpart does anyone know how? I also share the raw string i exctracted if can help

Valoreinliteminimoeuro300,00OneridiregistrazionediattigiudiziariLimitediindennizzoeuro500,00PerlagaranziaATTIVITA'AZIENDALECOMPLETAMassimalepereventoeuro50.000,00SpeseperunsecondolegaledomiciliatarioLimitediindennizzoeuro2.000,00ControversiecontrattualiconifornitoriValoreinlitemassimoEuro50.000,00ControversiecontrattualiconifornitoriScoperto20%PerlagaranziaSALUTEESICUREZZASULLAVOROMassimalepereventoeuro25.000,00PerlagaranziaTUTELADEIDATIPERSONALIMassimalepereventoeuro25.000,00

comment if you need any more information thanks in advance to anyone who is able to solve it


Solution

  • disclaimer: I am the author of borb, the library used in this answer

    I describe a scenario similar to yours in the examples repository of the library. For the sake of completeness I'll repeat the answer here.

    #!chapter_005/src/snippet_008.py
    import typing
    from decimal import Decimal
    
    from borb.pdf.canvas.geometry.rectangle import Rectangle
    from borb.pdf import Document
    from borb.pdf import PDF
    from borb.toolkit import LocationFilter
    from borb.toolkit import RegularExpressionTextExtraction
    from borb.toolkit import PDFMatch
    from borb.toolkit import SimpleTextExtraction
    
    
    def main():
    
        # set up RegularExpressionTextExtraction
        # fmt: off
        l0: RegularExpressionTextExtraction = RegularExpressionTextExtraction("[nN]isi .* aliquip")
        # fmt: on
    
        # process Document
        doc: typing.Optional[Document] = None
        with open("output.pdf", "rb") as in_file_handle:
            doc = PDF.loads(in_file_handle, [l0])
        assert doc is not None
    
        # find match
        m: typing.Optional[PDFMatch] = next(iter(l0.get_matches_for_page(0)), None)
        assert m is not None
    
        # get page width
        w: Decimal = doc.get_page(0).get_page_info().get_width()
    
        # change rectangle to get more text
        r0: Rectangle = m.get_bounding_boxes()[0]
        r1: Rectangle = Rectangle(
            r0.get_x() + r0.get_width(), r0.get_y(), w - r0.get_x(), r0.get_height()
        )
    
        # process document (again) filtering by rectangle
        l1: LocationFilter = LocationFilter(r1)
        l2: SimpleTextExtraction = SimpleTextExtraction()
        l1.add_listener(l2)
        doc: typing.Optional[Document] = None
        with open("output.pdf", "rb") as in_file_handle:
            doc = PDF.loads(in_file_handle, [l1])
        assert doc is not None
    
        # get text
        print(l2.get_text_for_page(0))
    
    
    if __name__ == "__main__":
        main()
    

    The idea is that you use RegularExpressionTextExtraction to find text inside the PDF. This class can then return a list of PDFMatch objects, which contain the bounding boxes of the matching text.

    You can then do something with those Rectangle objects (in your case, move them to the rightmost side of the Page) and extract the text from the PDF on those given Rectangle(s).