Search code examples

Efficiently extract the highlighted portion from PDFs using PyMuPDF python?

I have a use case where I have to highlight a table in a PDF document and then extract the highlighted part using Python. Once it is highlighted, I have to transform the extracted part into a dataframe that looks like this:

name      |   value
apple inc     0.84
google inc    0.95

I have implemented logic to extract the highlighted text from PDFs. The extraction works; however, the highlighted part appears twice in my list, and if I put a comma in place of each space I lose the company names, e.g. Apple Inc, Google Inc come out as Apple,Inc,Google,Inc, which I do not want. Here is the code:

from typing import List, Tuple

import fitz  # install with 'pip install pymupdf'
import pandas as pd

def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    """Return the text of every word that touches the highlight ``annot``.

    The annotation's vertices are consumed four at a time; each group of four
    is turned into a quad whose bounding rectangle is tested against the
    rectangle of every entry in ``wordlist``.

    Args:
        annot: Annotation whose ``vertices`` describe the highlighted quads.
        wordlist: Word tuples; the first four items are the word rectangle
            and item 4 is the word text.

    Returns:
        The matching words of each quad joined by spaces, with the per-quad
        strings joined by spaces as well. A word that intersects more than
        one quad appears once per quad.
    """
    vertices = annot.vertices
    per_quad_text = []
    # Walk the vertex list in strides of four; trailing leftovers are ignored.
    for start in range(0, 4 * int(len(vertices) / 4), 4):
        # Bounding rectangle of the current highlighted quad.
        quad_rect = fitz.Quad(vertices[start : start + 4]).rect
        hits = (
            word[4]
            for word in wordlist
            if fitz.Rect(word[:4]).intersects(quad_rect)
        )
        per_quad_text.append(" ".join(hits))
    return " ".join(per_quad_text)

def handle_page(page):
    """Collect the text under every highlight annotation on ``page``.

    Args:
        page: Page-like object providing ``getText("words")`` (a list of word
            tuples) and ``firstAnnot`` (the head of the annotation chain).

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are chained on the page. Empty if there are none.
    """
    # NOTE(review): newer PyMuPDF releases spell these get_text / first_annot;
    # the names used by the original snippet are kept here - confirm against
    # the installed version.
    wordlist = page.getText("words")  # list of words on page
    wordlist.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x

    highlights = []
    annot = page.firstAnnot
    while annot:
        # Type code 8 is what this script treats as a highlight annotation.
        if annot.type[0] == 8:
            highlights.append(_parse_highlight(annot, wordlist))
        # Advance along the annotation chain; without this step the loop
        # would never terminate.
        annot = annot.next
    return highlights

def main(filepath: str) -> List:
    """Extract the highlighted text of every page of the PDF at ``filepath``.

    Args:
        filepath: Path of the PDF document to open.

    Returns:
        One string per highlight annotation, with every space replaced by a
        comma.
    """
    doc = fitz.open(filepath)

    highlights = []
    for page in doc:
        highlights += handle_page(page)

    # Adds a comma in place of spaces. Note that this also splits multi-word
    # names ("Apple Inc" -> "Apple,Inc").
    highlighted_text = [",".join(i.split(" ")) for i in highlights]
    # df = pd.DataFrame(highlights)
    # print(df.head())
    return highlighted_text

if __name__ == "__main__":
    import sys

    # The PDF path is taken from the first command-line argument.
    print(main(sys.argv[1]))

The output that I get is:


To sum it up: 1. How do I ensure that I do not get duplicates while extracting the highlighted text from the PDF? 2. How do I make sure that the values stay the same as they appear in the PDF, i.e. Google Inc, Apple Inc and not Google,Inc,Apple,Inc?

Please help me on this.

EDIT:- implemented Counter from collections library, still could not remove redundant names.

from collections import Counter
def handle_page(page):
    """Collect the text under every highlight annotation on ``page``.

    Identical word tuples are collapsed (via ``Counter`` keys) before the
    words are sorted, so an exact duplicate entry is only considered once.

    Args:
        page: Page-like object providing ``getText("words")`` (a list of
            hashable word tuples) and ``firstAnnot``.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are chained on the page. Empty if there are none.
    """
    wordlist = page.getText("words")  # list of words on page
    # Counter keeps one key per distinct tuple; sort ascending y, then x.
    new_wordlist = sorted(Counter(wordlist), key=lambda w: (w[3], w[0]))
    #wordlist.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x

    highlights = []
    annot = page.firstAnnot
    while annot:
        # Type code 8 is what this script treats as a highlight annotation.
        if annot.type[0] == 8:
            highlights.append(_parse_highlight(annot, new_wordlist))
        # Advance along the annotation chain; without this step the loop
        # would never terminate.
        annot = annot.next
    return highlights

EDIT-2: I implemented this logic and am now able to remove the redundant words from the list; however, words like "LTD" are also getting removed.

def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    """Return the distinct words touching the highlight ``annot``.

    Words are gathered quad by quad (four vertices per quad) and then
    de-duplicated by their text, keeping the first occurrence of each. Because
    the comparison is on text alone, a word that legitimately occurs more
    than once in the highlighted area (e.g. "LTD") is kept only once.

    Args:
        annot: Annotation whose ``vertices`` describe the highlighted quads.
        wordlist: Word tuples; the first four items are the word rectangle
            and item 4 is the word text.

    Returns:
        The unique words, in first-seen order, joined by single spaces.
    """
    vertices = annot.vertices
    per_quad_text = []
    # Walk the vertex list in strides of four; trailing leftovers are ignored.
    for start in range(0, 4 * int(len(vertices) / 4), 4):
        # Bounding rectangle of the current highlighted quad.
        quad_rect = fitz.Quad(vertices[start : start + 4]).rect
        hits = (
            word[4]
            for word in wordlist
            if fitz.Rect(word[:4]).intersects(quad_rect)
        )
        per_quad_text.append(" ".join(hits))
    tokens = " ".join(per_quad_text).split()
    # dict keys remember insertion order, so this keeps each token's first
    # occurrence and drops later repeats.
    unique_tokens = dict.fromkeys(tokens)
    return " ".join(unique_tokens)


  • This was solved by using re.sub()

    def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
        """Return the highlighted text with a comma appended after each number.

        Words are gathered quad by quad (four vertices per quad) and joined
        with spaces. The joined text is split on commas, repeated pieces are
        dropped (first occurrence wins), and finally every integer or decimal
        number gets a trailing comma so the values can be told apart from the
        names that precede them.

        Args:
            annot: Annotation whose ``vertices`` describe the highlighted quads.
            wordlist: Word tuples; the first four items are the word rectangle
                and item 4 is the word text.

        Returns:
            The processed text, e.g. a number ``0.84`` becomes ``0.84,``.
        """
        # Imported locally because the snippet uses ``re`` without importing it.
        import re

        points = annot.vertices
        quad_count = len(points) // 4
        sentences = []
        for i in range(quad_count):
            # where the highlighted part is
            r = fitz.Quad(points[i * 4 : i * 4 + 4]).rect
            words = [w for w in wordlist if fitz.Rect(w[:4]).intersects(r)]
            sentences.append(" ".join(w[4] for w in words))
        sentence = " ".join(sentences)
        string_split = sentence.split(",")
        # Order-preserving removal of repeated comma-separated pieces.
        sent = " ".join(sorted(set(string_split), key=string_split.index))
        # Append a comma after every standalone integer/decimal number.
        output = re.sub(r'\b(\d+(?:\.\d+)?)\b', r'\1,', sent)
        return output