Search code examples
python-3.x canvas pypdf reportlab

Adding Text to a long pdf


I am using PyPDF2 and reportlab, and I want to insert text (a number) after every 10 lines. I am using drawString(x, y, text), but the y coordinate has to be dynamic so that the text can be appended at the end of every tenth line. How can I achieve this?

import PyPDF2
import pdfplumber
import io

from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4

# Open the PDF file with pdfplumber; it is closed again when the `with` exits.
with pdfplumber.open(r'C:\PyCharm Projects\AirRumi\pdfs\test.pdf') as pdf:
  # Iterate through each pdfplumber page
  for page in pdf.pages:
    print(page)
    # Count pages: the same file is opened a second time with PyPDF2.
    # NOTE(review): this open() runs once per pdfplumber page and the handle
    # is never closed in the code shown here.
    pdf_file = doc = open(r'C:\PyCharm Projects\AirRumi\pdfs\test.pdf', 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)
    # print(total_pages)
    # A fresh writer is created for every pdfplumber page.
    pdfWriter = PyPDF2.PdfWriter()

    for page_num in range(total_pages):
        # PyPDF2 page that receives the overlay further down.
        pag = pdf_reader.pages[page_num]
        # Extract text from the page.
        # NOTE(review): this reads the outer pdfplumber `page`, not `pag`, so
        # the text does not depend on `page_num`.
        text = page.extract_text()
        # Split the text into lines
        lines = text.split('\n')

        # Label for the first block; increased by 10 after each block.
        block_number = 10
        # Walk the lines in blocks of 10 (the last block may be shorter)
        for i in range(0, len(lines), 10):
            block = lines[i:i + 10]
            desired_line = block[- 1]  # last line of the current block
            # print(desired_line)
            # print(block_number, block)
            # block_number += 10
            # The labelled line is only printed; it is not what gets drawn.
            modified_text = desired_line + str(block_number)
            block_number += 10
            print(modified_text)
            # In-memory buffer that receives the one-page reportlab overlay.
            packet = io.BytesIO()
            line_num = 1
            can = canvas.Canvas(packet, pagesize=A4)
            # `line_num` is always 1, so y is fixed at 9 inches and the drawn
            # text is always the literal "10" -- this is the coordinate the
            # question wants to make dynamic.
            can.drawString(580, 10 * inch - line_num * 1 * inch, "10")
            can.save()

            # move to the beginning of the BytesIO buffer
            packet.seek(0)

            # read the overlay generated by reportlab back in with PyPDF2
            new_pdf = PyPDF2.PdfReader(packet)

            # Merge the overlay into the page, then add the page and write the
            # output file.
            # NOTE(review): all three calls sit in the innermost loop, so they
            # run once per block of lines rather than once per page.
            pag.merge_page(new_pdf.pages[0])
            pdfWriter.add_page(pag)
            pdfWriter.write(r'C:\PyCharm Projects\AirRumi\pdfs\text.pdf')

Solution

  • I have simplified your example a bit to only run on one page, but it should be easy enough to extend this to all pages. The following code uses pypdf for most of the functionality, while relying on reportlab to generate the appropriate text commands for the overlay:

    import sys
    from collections import defaultdict
    from io import BytesIO
    
    from pypdf import PdfReader, PdfWriter
    from reportlab.pdfgen.canvas import Canvas
    
    
    # Smallest x offset seen so far for each y coordinate. Unseen keys default
    # to sys.maxsize so that the first real offset always compares smaller.
    POSITIONS = defaultdict(lambda: sys.maxsize)
    # Font size reported together with the leftmost operator of each y.
    SIZES = {}


    def record_line_positions(text, cm, tm, font_dict, font_size):
        """Remember the leftmost x offset and its font size per y coordinate.

        Intended as a ``visitor_text`` callback: it reads the x and y offsets
        from ``tm[4]`` and ``tm[5]`` and updates the module-level ``POSITIONS``
        and ``SIZES`` tables. ``text``, ``cm`` and ``font_dict`` are accepted
        but not used.

        The font size is stored as reported; in some cases it may be reported
        as 1.0 and need further fixing, and the font data itself is not kept.
        """
        left, baseline = tm[4], tm[5]
        leftmost_so_far = POSITIONS[baseline]
        if left < leftmost_so_far:
            # New leftmost operator on this line: keep its offset and size.
            POSITIONS[baseline] = left
            SIZES[baseline] = font_size
    
    
    
    # Load the source document into a writer so its pages can be modified.
    writer = PdfWriter(clone_from='file.pdf')

    # Only one page (index 3) is handled in this example.
    page = writer.pages[3]

    # Start from empty tables, then let the visitor fill them for this page.
    SIZES.clear()
    POSITIONS.clear()
    page.extract_text(visitor_text=record_line_positions)

    # Walk the recorded lines in ascending y order and label every tenth entry
    # (indices 0, 10, 20, ...) with its index.
    # NOTE(review): the first label is therefore "0", not "10" -- confirm that
    # this is the numbering you want.
    ordered_lines = sorted(POSITIONS.items())
    for index in range(0, len(ordered_lines), 10):
        line_y, line_x = ordered_lines[index]

        # Each label is rendered onto its own in-memory, page-sized overlay.
        buffer = BytesIO()
        overlay_canvas = Canvas(
            filename=buffer,
            pagesize=(page.mediabox.width, page.mediabox.height),
        )

        # A text object is used so that font and font size can be set.
        # NOTE(review): the label is drawn 20 units left of the line start at
        # `height - y`; whether that flip is right presumably depends on how
        # the source PDF orients its text coordinates -- verify on real input.
        label = overlay_canvas.beginText(
            x=line_x - 20,
            y=page.mediabox.height - line_y,
        )
        label.setFont(psfontname='Helvetica', size=SIZES[line_y])
        label.textLine(text=str(index))
        overlay_canvas.drawText(label)

        # Finish the overlay PDF and rewind the buffer for reading.
        overlay_canvas.save()
        buffer.seek(0)

        # Stamp the overlay onto the page.
        page.merge_page(PdfReader(buffer).pages[0])

    # Write the generated/updated file.
    writer.write('out.pdf')
    

    Please note that this uses the most basic approach for retrieving the text itself. Similar results can be achieved by using MuPDF (mutool convert with stext output, or PyMuPDF if you do not have to care about the AGPL implications), pdftotext or pdfminer.six, although they might require different post-processing (parsing (X)HTML, merging character level results etc.)