I am using PyPDF2 and reportlab, and I want to insert text (a number) after every 10 lines using drawString(x, y, text),
but the y coordinate has to be dynamic so that the text is appended at the end of every tenth line. How can I achieve this?
import PyPDF2
import pdfplumber
import io
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
# Stamp a marker onto every page of the source PDF and save the result.
# The input is read with PyPDF2; reportlab renders the one-page overlay that
# is merged onto each page.
SOURCE_PDF = r'C:\PyCharm Projects\AirRumi\pdfs\test.pdf'
OUTPUT_PDF = r'C:\PyCharm Projects\AirRumi\pdfs\text.pdf'

# Open the PDF file with pdfplumber and list its pages (diagnostic output).
with pdfplumber.open(SOURCE_PDF) as pdf:
    # Iterate through each page
    for page in pdf.pages:
        print(page)

# Keep the source file open until the writer has finished: the reader's pages
# are merged and written below, and the context manager closes the handle
# afterwards instead of leaking it.
with open(SOURCE_PDF, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pdfWriter = PyPDF2.PdfWriter()

    for pag in pdf_reader.pages:
        # Extract text from the PyPDF2 page currently being processed (not
        # from the leftover pdfplumber loop variable `page`, whose document
        # has already been closed).
        text = pag.extract_text()
        # Split the text into lines
        lines = text.split('\n')

        # Print the last line of every block of 10 lines, followed by the
        # running line number (10, 20, 30, ...).
        block_number = 10
        for i in range(0, len(lines), 10):
            block = lines[i:i + 10]
            desired_line = block[-1]
            modified_text = desired_line + str(block_number)
            block_number += 10
            print(modified_text)

        # Create a new single-page PDF with reportlab that carries the marker.
        packet = io.BytesIO()
        # NOTE: the vertical position is still hard-coded (one inch below the
        # 10-inch mark); the extracted text carries no coordinates, so the
        # marker is not aligned with the tenth line yet.
        line_num = 1
        can = canvas.Canvas(packet, pagesize=A4)
        can.drawString(580, 10 * inch - line_num * 1 * inch, "10")
        can.save()

        # Move to the beginning of the in-memory buffer before reading it.
        packet.seek(0)
        new_pdf = PyPDF2.PdfReader(packet)

        # Lay the overlay on top of the original page and queue it for output.
        pag.merge_page(new_pdf.pages[0])
        pdfWriter.add_page(pag)

    pdfWriter.write(OUTPUT_PDF)
I have simplified your example a bit to only run on one page, but it should be easy enough to extend this to all pages. The following code uses pypdf for most of the functionality, while relying on reportlab to generate the appropriate text commands for the overlay:
import sys
from collections import defaultdict
from io import BytesIO
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen.canvas import Canvas
# Leftmost x offset seen so far for each text baseline, keyed by the y value
# of the text matrix. Unknown baselines start at sys.maxsize so that the first
# fragment recorded for them always wins the "leftmost" comparison.
POSITIONS = defaultdict(lambda: sys.maxsize)
# Font size reported by the most recent fragment on each baseline (same keys).
SIZES = {}


def record_line_positions(text, cm, tm, font_dict, font_size):
    """Record where each visible text line starts on the page.

    Meant to be passed as ``visitor_text`` to ``extract_text``; the results
    are stored in the module-level ``POSITIONS`` and ``SIZES`` mappings.

    Args:
        text: The text fragment reported by the extractor.
        cm: Current transformation matrix (unused here).
        tm: Text matrix; ``tm[4]`` and ``tm[5]`` are used as the x and y
            offsets of the fragment.
        font_dict: Font dictionary of the fragment (unused here).
        font_size: Font size reported for the fragment.
    """
    # The extractor may invoke the visitor with empty or whitespace-only text.
    # Such fragments are not visible lines; recording them would create
    # phantom baselines and throw off the every-tenth-line count.
    if not (text and text.strip()):
        return

    x, y = tm[4], tm[5]
    # Record the x offset of the leftmost text operator.
    if POSITIONS[y] > x:
        POSITIONS[y] = x
    # Record the font size for further usage.
    # You might want to keep track of the font data as well.
    # Please note that in some cases the font size might be reported as 1.0
    # and thus might need further fixing.
    SIZES[y] = font_size
# Number every tenth text line of one page and write the result to 'out.pdf'.

# Read the original file.
writer = PdfWriter(clone_from='file.pdf')
# Look at a specific page.
page = writer.pages[3]

# Clear the global status so data recorded for another page cannot leak in.
POSITIONS.clear()
SIZES.clear()
# Record the page data; the visitor fills POSITIONS and SIZES as a side effect.
page.extract_text(visitor_text=record_line_positions)

# Draw all line numbers onto ONE overlay page and merge it once, instead of
# building, saving, re-parsing and merging a separate PDF for every label.
page_width, page_height = page.mediabox.width, page.mediabox.height
overlay = BytesIO()
canvas = Canvas(filename=overlay, pagesize=(page_width, page_height))
labels_drawn = 0

# NOTE(review): lines are visited in ascending order of the recorded y value,
# and the label is placed at `page_height - y`. Both presume that the recorded
# y grows from the top of the page downwards - confirm for your documents.
for i, (y, x) in enumerate(sorted(POSITIONS.items())):
    # Only label every tenth line (indices 0, 10, 20, ...).
    if i % 10:
        continue
    # As we want to set the font and font size, use the more complex approach.
    # The label sits 20 units to the left of the line's leftmost text.
    text = canvas.beginText(x=x - 20, y=page_height - y)
    text.setFont(psfontname='Helvetica', size=SIZES[y])
    text.textLine(text=str(i))
    canvas.drawText(text)
    labels_drawn += 1

# Skip the merge when nothing was drawn, so no empty overlay gets parsed.
if labels_drawn:
    # Generate the overlay page.
    canvas.save()
    overlay.seek(0)
    # Add the overlay itself.
    page.merge_page(PdfReader(overlay).pages[0])

# Write the generated/updated file.
writer.write('out.pdf')
Please note that this uses the most basic approach for retrieving the text itself. Similar results can be achieved by using MuPDF (mutool convert with stext output, or PyMuPDF if you do not have to care about the AGPL implications), pdftotext, or pdfminer.six, although they might require different post-processing (parsing (X)HTML, merging character-level results, etc.).