Search code examples
python pdf highlight pymupdf

Use PyMuPDF to bold parts of text


I am trying to use PyMuPDF to bold portions of each word in a PDF file.

So, for example, a file with the string "There are many pies" will result in the same string with the leading part of each word in bold, e.g. "**Th**ere **a**re **ma**ny **pi**es".

I have seen that you can use Page.get_textpage().extractWORDS() to sort of extract a tuple of the various words. However, I'm not sure exactly how to bold portions of them.

I had thought maybe you could erase them and then re-write them, but I'm not sure if PyMuPDF can erase words.


Solution

  • It looks like PyMuPDF does not allow the deleting of text, like you noted:

    In a nutshell, this is what you can do with PyMuPDF:

    • Modify page rotation and the visible part (“cropbox”) of the page.
    • Insert images, other PDF pages, text and simple geometrical objects.
    • Add annotations and form fields.

    So, following @K_J's suggestion of using redaction, I created this script which will:

    1. inspect the text of a page character-by-character
    2. build up lists of contiguous PyMuPDF Character Dictionaries, each taken from the start of a word; I call these lists of contiguous chars "segments"
    3. create a super Rect that covers each segment, along with the string of chars, and use those two to create a redaction annotation, with a bold font, that's essentially "placed over" the original chars

    The effect is incomplete as the redaction annotation tries to center itself vertically (see align property); and I try to compensate for that.

    Original Highlighted
    Original text Highlighted text

    I bet the effect would be complete by creating new text on top of the original (maybe still redacted) text, but I've run out of time, for now, to try that out.

    import fitz
    
    from fitz import Document, Page
    from fitz import Matrix, Point, Rect
    
    
    # Keyword arguments shared by the text-insertion and redaction calls in this script.
    Normal_style = {"fontname": "helv", "fontsize": 24}  # regular weight
    Bold_style = {"fontname": "hebo", "fontsize": 24}  # bold weight, same size
    
    # One entry of a span's "chars" list from page.get_text("rawdict");
    # see "Character Dictionary for extractRAWDICT()" in the PyMuPDF docs.
    RawDictChar = dict
    # The leading characters of a single word, in reading order.
    CharSegment = list[RawDictChar]
    
    
    def main():
        """Build a three-line demo page, embolden the start of each word, and save before/after images.

        Writes "page-orig.png" (untouched page) and "page-edit.png" (after the
        redactions were applied) using the paths given to `page_to_image`.
        """
        # The document only exists in memory; the context manager makes sure it is
        # closed once both images have been written, even if a step fails.
        with fitz.open() as doc:
            page: Page = doc.new_page()

            page.insert_text(Point(50, 72), "A number of words and things on line 1", **Normal_style)
            page.insert_text(Point(50, 144), "A number of words on line 2", **Normal_style)
            page.insert_text(Point(50, 216), "Line 3", **Normal_style)

            page_to_image(page, "page-orig.png")

            char_segments = get_char_segments(page)

            apply_segment_redactions(page, char_segments)

            page_to_image(page, "page-edit.png")
    
    
    def get_char_segments(page: Page, num_chars: int = 3) -> list[CharSegment]:
        """
        Break a page down into groups ("segments") of individual characters.

        Each segment holds at most `num_chars` leading characters of one word, where
        words are delimited by a space character within a single span. Image blocks
        are ignored.

        Empty segments are never returned: leading, trailing or repeated spaces (and
        a `num_chars` that selects no characters) produce nothing, so callers can
        safely index the first character of every segment.
        """
        char_segments: list[CharSegment] = []

        rawdict = page.get_text("rawdict")
        for block in rawdict["blocks"]:
            if block["type"] == 1:
                continue  # skip "image" block

            for line in block["lines"]:
                for span in line["spans"]:
                    word_chars = []
                    for char in span["chars"]:
                        if char["c"] != " ":
                            word_chars.append(char)
                            continue

                        # A space ends the current word; keep its leading chars only
                        # if there are any (consecutive spaces yield no word at all).
                        segment = word_chars[:num_chars]
                        if segment:
                            char_segments.append(segment)
                        word_chars = []

                    # Flush the word that runs up to the end of the span
                    segment = word_chars[:num_chars]
                    if segment:
                        char_segments.append(segment)

        return char_segments
    
    
    def apply_segment_redactions(page: Page, char_segments: list[CharSegment]):
        """Turn each character segment into a redaction annotation that re-draws the same characters in a bold font.

        For every non-empty segment a rectangle covering all of its characters is
        built, nudged downwards, and registered as a redaction whose replacement
        text is the segment's own characters styled with `Bold_style`. Empty
        segments are skipped. All redactions are applied in one pass at the end,
        with the image handling set to `fitz.PDF_REDACT_IMAGE_NONE`.
        """
        M_shift_down = Matrix(1, 1).pretranslate(0, 2.5)  # try to compensate for redactions being vertically centered

        for char_segment in char_segments:
            if not char_segment:
                continue  # nothing to redact; also avoids indexing an empty list below

            # Replacement/redaction text: the segment's characters, in order
            highlight_txt = "".join(cs["c"] for cs in char_segment)

            # Build up "super rect" of redaction area through rectangle unions of each char in segment
            highlight_rect: Rect = Rect(*char_segment[0]["bbox"])
            for cs in char_segment[1:]:
                highlight_rect = highlight_rect | Rect(*cs["bbox"])

            highlight_rect.transform(M_shift_down)  # modifies the rect in place

            page.add_redact_annot(highlight_rect, text=highlight_txt, fill=False, **Bold_style)

        page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
    
    
    def page_to_image(page: Page, fname):
        """Render `page` with a zoom factor of 2 in each dimension and save the image to `fname`."""
        scale = 2.0  # one factor used for both the horizontal and the vertical zoom
        zoom = fitz.Matrix(scale, scale)

        rendered = page.get_pixmap(matrix=zoom)  # render with the zoom matrix rather than the default
        rendered.save(fname)  # this script passes ".png" file names
    
    
    # Run the demo only when this file is executed as a script, not when it is imported.
    if __name__ == "__main__":
        main()