Search code examples
python pandas text-extraction pymupdf

Efficiently extract the highlighted portion from PDFs using PyMuPDF python?


I have a use case where I have to highlight a table in a PDF document and then extract the highlighted part using Python. Once it is highlighted, I have to transform the extracted part into a dataframe that looks like this:

name      |   value
apple inc     0.84
google inc    0.95

I have implemented logic to extract the highlighted text from PDFs. The extraction works; however, the highlighted part appears twice in my list, and if I put a comma in place of each space I lose the company names, e.g. Apple Inc, Google Inc comes out as Apple,Inc,Google,Inc, which I do not want. Here is the code:

from typing import List, Tuple

import fitz  # install with 'pip install pymupdf'
import pandas as pd

def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    """Collect the text of every word that touches a highlight annotation.

    The annotation's vertices are consumed four at a time, each group forming
    one quad. For every quad, the words from ``wordlist`` whose rectangle
    (the first four items of the tuple) intersects the quad's bounding
    rectangle are joined with single spaces; the per-quad strings are then
    joined with single spaces as well.

    Each quad is tested against the whole word list independently, so a word
    that intersects several quads is emitted once per quad.

    Args:
        annot: The annotation whose ``vertices`` describe the highlighted quads.
        wordlist: Word entries; items 0-3 are the rectangle, item 4 the text.

    Returns:
        The space-separated text gathered from all quads.
    """
    vertices = annot.vertices
    # Vertices left over after the last complete group of four are ignored.
    n_quads = len(vertices) // 4

    def quad_text(index: int) -> str:
        # Text of all words intersecting the bounding rectangle of one quad.
        first = index * 4
        area = fitz.Quad(vertices[first:first + 4]).rect
        return " ".join(w[4] for w in wordlist if fitz.Rect(w[:4]).intersects(area))

    return " ".join(quad_text(k) for k in range(n_quads))


def handle_page(page):
    """Return the text of every highlight annotation on ``page``.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are chained on the page, as produced by
        ``_parse_highlight``.
    """
    # PyMuPDF renamed its camelCase API to snake_case (getText -> get_text,
    # firstAnnot -> first_annot) and newer releases no longer expose the old
    # names by default. Prefer the new names, fall back for old installs.
    get_text = getattr(page, "get_text", None) or page.getText
    wordlist = get_text("words")  # list of words on page
    wordlist.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x

    highlight_type = 8  # numeric type code of a highlight annotation

    highlights = []
    # first_annot may legitimately be None, so test for the attribute itself
    # rather than for its truthiness.
    annot = page.first_annot if hasattr(page, "first_annot") else page.firstAnnot
    while annot:
        if annot.type[0] == highlight_type:
            highlights.append(_parse_highlight(annot, wordlist))
        annot = annot.next
    return highlights


def main(filepath: str) -> List:
    """Print and return the highlighted text of a PDF, comma-separated.

    Args:
        filepath: Path of the PDF file to open.

    Returns:
        One string per highlight annotation, pages in document order, with
        every space replaced by a comma.
    """
    doc = fitz.open(filepath)
    try:
        highlights = []
        for page in doc:
            highlights += handle_page(page)
    finally:
        # Release the file even when a page fails to parse.
        doc.close()

    # Every single space becomes a comma, so multi-word names are split too.
    highlighted_text = [text.replace(" ", ",") for text in highlights]
    print(highlighted_text)
    # The signature promises a list; hand it back instead of returning None.
    return highlighted_text


if __name__ == "__main__":
    # Script entry point: process the hard-coded file name.
    main("pdfname.pdf")

The output that I get is:

['Security,Name,%,to,Net,Assets*,Assets*,DEBENtURES,0.04,Britannia,Industries,Ltd.,0.04,Britannia,Industries,Ltd.,0.04,EQUity,&,EQUity,RELAtED,96.83,EQUity,&,EQUity,RELAtED,96.83,HDFC,Ban
k,Ltd.,6.98,HDFC,Bank,Ltd.,6.98,ICICI,Bank,Ltd.,4.82,ICICI,Bank,Ltd.,4.82,Infosys,Ltd.,4.37,Infosys,Ltd.,4.37,Reliance,Industries,Ltd.,4.05,Reliance,Industries,Ltd.,4.05,Bajaj\tFinance\tL
td.,3.82,Bajaj\tFinance\tLtd.,3.82,Housing,Development,Finance,Corpn.,Ltd.,3.23,Housing,Development,Finance,Corpn.,Ltd.,3.23,Grindwell,Norton,Ltd.,3.22,SRF,Ltd.,3.22,SRF,Ltd.,3.22,Sun,Pha
rmaceutical,Industries,Ltd.,2.85,Sun,Pharmaceutical,Industries,Ltd.,2.85,Bharti,Airtel,Ltd.,2.82,DLF,Ltd.,2.64,DLF,Ltd.,2.64,Ultratech,Cement,Ltd.,2.62,Ultratech,Cement,Ltd.,2.62,SKF,Indi
a,Ltd.,2.45,Crompton,Greaves,Consumer,Electricals,Ltd.,2.42,Crompton,Greaves,Consumer,Electricals,Ltd.,2.42,Avenue,Supermarts,Ltd.,2.41,Avenue,Supermarts,Ltd.,2.41,Axis,Bank,Ltd.,2.41,ABB
,India,Ltd.,2.35,ABB,India,Ltd.,2.35,Titan,Co.,Ltd.,2.29,Titan,Co.,Ltd.,2.29,Kotak,Mahindra,Bank,Ltd.,2.09,Cipla,Ltd.,2.05,Cipla,Ltd.,2.05,Laurus,Labs,Ltd.,2.04,Laurus,Labs,Ltd.,2.04,Wipr
o,Ltd.,1.77,Happiest,Minds,Technologies,Ltd.,1.68,Happiest,Minds,Technologies,Ltd.,1.68,Canara,Bank,1.67,Canara,Bank,1.67,Shree,Cement,Ltd.,1.63,Security,Name,%,to,Net,Assets*,Assets*,Mah
indra,&,Mahindra,Ltd.,1.59,Pidilite,Industries,Ltd.,1.50,Pidilite,Industries,Ltd.,1.50,ICICI,Lombard,General,Insurance,Co.,Ltd.,1.48,ICICI,Lombard,General,Insurance,Co.,Ltd.,1.48,Cholaman
dalam,Investment,&,Finance,Co.,Ltd.,1.45,Cholamandalam,Investment,&,Finance,Co.,Ltd.,1.45,Tech,Mahindra,Ltd.,1.35,Tech,Mahindra,Ltd.,1.35,State,Bank,of,India,1.31,State,Bank,of,India,1.31
,Hindustan,Unilever,Ltd.,1.30,Hindustan,Unilever,Ltd.,1.30,Vardhman,Textiles,Ltd.,1.30,Vardhman,Textiles,Ltd.,1.30,Larsen,&,Toubro,Ltd.,1.24,Larsen,&,Toubro,Ltd.,1.24,Dabur,India,Ltd.,1.2
2,Neogen,Chemicals,Ltd.,1.10,Neogen,Chemicals,Ltd.,1.10,Eicher,Motors,Ltd.,1.09,Eicher,Motors,Ltd.,1.09,Thermax,Ltd.,1.08,TATA,Consultancy,Services,Ltd.,1.05,TATA,Consultancy,Services,Ltd
.,1.05,Indian,Railway,Catering,&,Tourism,Corpn.,Ltd.,0.98,Indian,Railway,Catering,&,Tourism,Corpn.,Ltd.,0.98,Firstsource,Solutions,Ltd.,0.97,Nestle,India,Ltd.,0.86,Nestle,India,Ltd.,0.86,
Asian,Paints,Ltd.,0.84,Asian,Paints,Ltd.,0.84,Welspun,India,Ltd.,0.72,IndusInd,Bank,Ltd.,0.63,IndusInd,Bank,Ltd.,0.63,SBI,Life,Insurance,Co.,Ltd.,0.50,SBI,Life,Insurance,Co.,Ltd.,0.50,Dee
pak,Nitrite,Ltd.,0.46,Adani,Ports,and,Special,Economic,Zone,Ltd.,0.36,Adani,Ports,and,Special,Economic,Zone,Ltd.,0.36,Gateway,Distriparks,Ltd.,0.33,Gateway,Distriparks,Ltd.,0.33,Bharat,Fo
rge,Ltd.,0.22,tREPS,on,G-Sec,or,t-Bills,2.81,tREPS,on,G-Sec,or,t-Bills,2.81,Cash,&,Cash,Receivables,0.32,Cash,&,Cash,Receivables,0.32,tOtAL']

To sum it up: 1. How do I ensure that I do not get duplicates while extracting the highlighted text from a PDF? 2. How do I make sure that the values stay the same as they appear in the PDF, i.e. Google Inc, Apple Inc and not Google,Inc,Apple,Inc?

Please help me on this.

EDIT:- implemented Counter from collections library, still could not remove redundant names.

from collections import Counter
def handle_page(page):
    """Return the text of every highlight annotation on ``page``.

    Variant that passes the word list through ``Counter`` before sorting.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are chained on the page, as produced by
        ``_parse_highlight``.
    """
    # PyMuPDF renamed its camelCase API to snake_case (getText -> get_text,
    # firstAnnot -> first_annot) and newer releases no longer expose the old
    # names by default. Prefer the new names, fall back for old installs.
    get_text = getattr(page, "get_text", None) or page.getText
    wordlist = get_text("words")  # list of words on page
    # Iterating a Counter yields each distinct entry once, so this only drops
    # entries that are identical in every field (coordinates, text, indices);
    # the same text at a different position is kept.
    new_wordlist = sorted(Counter(wordlist), key=lambda w: (w[3], w[0]))  # ascending y, then x

    highlight_type = 8  # numeric type code of a highlight annotation

    highlights = []
    # first_annot may legitimately be None, so test for the attribute itself
    # rather than for its truthiness.
    annot = page.first_annot if hasattr(page, "first_annot") else page.firstAnnot
    while annot:
        if annot.type[0] == highlight_type:
            highlights.append(_parse_highlight(annot, new_wordlist))
        annot = annot.next
    return highlights

EDIT-2: I implemented this logic and am now able to remove the redundant words from the list; however, words like "LTD" are also getting removed.

def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    """Return the distinct whitespace-separated tokens under a highlight.

    The annotation's vertices are consumed four at a time, each group forming
    one quad. Words from ``wordlist`` whose rectangle intersects a quad's
    bounding rectangle contribute their text, which is then split on
    whitespace into tokens.

    Only the first occurrence of each token is kept, in order of first
    appearance, so a token that repeats anywhere in the highlighted text
    appears once in the result.

    Args:
        annot: The annotation whose ``vertices`` describe the highlighted quads.
        wordlist: Word entries; items 0-3 are the rectangle, item 4 the text.

    Returns:
        The de-duplicated tokens joined by single spaces.
    """
    vertices = annot.vertices
    usable = (len(vertices) // 4) * 4  # ignore vertices past the last full quad

    tokens = []
    for start in range(0, usable, 4):
        # where the highlighted part is
        area = fitz.Quad(vertices[start:start + 4]).rect
        quad_text = " ".join(w[4] for w in wordlist if fitz.Rect(w[:4]).intersects(area))
        tokens.extend(quad_text.split())

    # dict keys keep insertion order, giving first-occurrence de-duplication.
    return " ".join(dict.fromkeys(tokens))

Solution

  • This was solved by using re.sub()

    def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
        """Return the highlighted text with a comma appended to every number.

        The annotation's vertices are consumed four at a time, each group
        forming one quad. A word from ``wordlist`` is selected when its
        rectangle (first four tuple items) intersects a quad's bounding
        rectangle. Each word entry is selected at most once, even when it
        intersects several quads, so text is not repeated; two entries with
        the same text at different positions are both kept.

        Args:
            annot: The annotation whose ``vertices`` describe the highlighted
                quads.
            wordlist: Word entries; items 0-3 are the rectangle, item 4 the
                text.

        Returns:
            The selected words joined by single spaces, with ``,`` inserted
            after every integer or decimal number.
        """
        import re  # local import: no module-level import of re is visible here

        points = annot.vertices or []  # treat missing vertices as "nothing highlighted"
        seen = set()
        picked = []
        # Step through complete groups of four vertices; leftovers are ignored.
        for start in range(0, len(points) - 3, 4):
            # where the highlighted part is
            r = fitz.Quad(points[start:start + 4]).rect
            for w in wordlist:
                key = tuple(w)  # whole entry, so equal text elsewhere stays distinct
                if key in seen or not fitz.Rect(w[:4]).intersects(r):
                    continue
                seen.add(key)
                picked.append(w[4])

        sent = " ".join(picked)
        # A comma after each number marks the end of a "name value" pair.
        output = re.sub(r'\b(\d+(?:\.\d+)?)\b', r'\1,', sent)
        return output