Search code examples
pythonpdfextractpypdfpython-pdfreader

Extract consecutive two pages from a pdf document and save each file with a text from each first page as the filenames


I have a 100-page PDF document. Each pair of consecutive pages contains the data of one employee. I need Python code that extracts each two-page pair and saves it as a separate file, with the filename taken from text extracted from the first page of that pair. For example:

  • The 100-page PDF document will be saved as 50 separate files
  • The first page of each file contains the text Dear Miles Wood, Dear Kate Aaron etc,
  • The first extracted filename should be Miles_Wood.pdf and second Kate_Aaron.pdf and so on..

Will be most pleased with a python solution

Thanks in advance

I have tried to adapt a seemingly similar python solution by the following but it doesn't appear to work for me

from PyPDF2 import PdfReader, PdfWriter
import re
import argparse
import os
 
# Split PDFs are written to an "output" folder below the directory the
# script is launched from.
cwd = os.getcwd()
output_dir = os.path.join(cwd, 'output')
# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
 
def get_arguments():
    """Parse the command-line options of the script.

    Returns:
        argparse.Namespace: holds ``pdf`` (str), ``count`` (int) and
        ``regex`` (str). All three options are mandatory.
    """
    # (short flag, long flag, extra keyword arguments) for every option.
    option_specs = (
        ("-p", "--pdf", {"dest": "pdf"}),
        ("-c", "--count", {"dest": "count", "type": int}),
        ("-r", "--regex", {"dest": "regex"}),
    )
    cli = argparse.ArgumentParser()
    for short_flag, long_flag, extra in option_specs:
        cli.add_argument(short_flag, long_flag, required=True, **extra)
    return cli.parse_args()
 
def split_pdf(file, page_count, regex):
    """Split a PDF into consecutive chunks of ``page_count`` pages.

    Every chunk is written to ``output_dir`` as ``<name>.pdf``, where
    ``<name>`` is capture group 1 of ``regex`` searched in the text of the
    chunk's first page. The last chunk is shorter when the page total is
    not a multiple of ``page_count``.

    Args:
        file: Path or stream of the source PDF, handed to ``PdfReader``.
        page_count: Number of pages per output file; must be >= 1.
        regex: Pattern with at least one capture group; group 1 becomes
            the output file name (without extension).

    Raises:
        ValueError: If ``page_count`` is below 1, or if ``regex`` does not
            match the text of a chunk's first page.
    """
    if page_count < 1:
        raise ValueError("page_count must be at least 1")

    reader = PdfReader(file)
    pattern = re.compile(regex)
    # len(reader.pages) replaces reader.numPages, which PyPDF2 3.x removed.
    total_pages = len(reader.pages)

    for start in range(0, total_pages, page_count):
        text = reader.pages[start].extract_text() or ""
        match = pattern.search(text)
        if match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                f"regex {regex!r} did not match the text of page {start + 1}"
            )

        writer = PdfWriter()
        # Add each page of the chunk exactly once and stop at the end of
        # the document so a short final chunk cannot raise IndexError.
        stop = min(start + page_count, total_pages)
        for index in range(start, stop):
            writer.add_page(reader.pages[index])

        # Build the full target path instead of os.chdir(), which would
        # change the working directory for the whole process.
        target = os.path.join(output_dir, match.group(1) + ".pdf")
        with open(target, "wb") as output_stream:
            writer.write(output_stream)
 
 
if __name__ == "__main__":
    # Run the splitter only when executed as a script, not when imported.
    arguments = get_arguments()
    split_pdf(arguments.pdf, arguments.count, arguments.regex)

Credit https://pastebin.com/mDRV77pp

Solution

  • A solution based on PyMuPDF:

    import fitz  # PyMuPDF

    # Split "input.pdf" into one file per letter. Each letter spans
    # PAGES_PER_LETTER pages and is named after the addressee found in the
    # greeting on its first page.
    PAGES_PER_LETTER = 2

    doc = fitz.open("input.pdf")
    for first in range(0, len(doc), PAGES_PER_LETTER):
        # Item 4 of every "words" tuple is the word text; sort=True orders
        # the words by y, then x coordinate.
        words = [w[4] for w in doc[first].get_text("words", sort=True)]

        # Fallback name so a page without a usable greeting neither raises
        # NameError nor silently reuses (and overwrites) the previous file.
        filename = f"pages-{first + 1}.pdf"
        if "Dear" in words:
            # Only the first greeting counts. "+ 2" skips "Dear" and the
            # honorific ("Mr.", "Mrs." etc.) that is assumed to follow it.
            start = words.index("Dear") + 2
            # Collect name components up to the one ending with a comma;
            # slicing keeps this safe when the page ends early.
            for offset, text in enumerate(words[start:]):
                if text.endswith(","):
                    names = words[start:start + offset + 1]
                    # [:-1] drops the trailing comma of the last name.
                    filename = "-".join(names)[:-1] + ".pdf"
                    break

        # Clamp the last page so an incomplete final letter still saves.
        last = min(first + PAGES_PER_LETTER - 1, len(doc) - 1)
        new = fitz.open()
        new.insert_pdf(doc, from_page=first, to_page=last)
        new.ez_save(filename)
        new.close()
    doc.close()
    

    The whole thing works on the assumption that the greeting contains "Dear", followed by an honorific (Mr./Mrs./Ms. or similar), followed by any number of names, the last of which ends with a comma. We extract the words of every odd-numbered page (the first page of each pair) in "words" format and sort them (just to be sure) by y, then by x coordinate.