Search code examples
python, pdf, extract, text-extraction, pdfminer

Working with single pages with PDFMiner


I have some PDF documents from which I cannot extract text with PyPDF, only with PDFMiner. The following code works fine for extracting all the text from the PDFs: it goes through the whole document and then returns all the text. Is there a way to work with only certain pages of the PDF? The PDFs I have are all 2000-3000 pages long, and I only need to work with every second page.

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file with PDFMiner.

    Each page yielded by ``PDFPage.get_pages`` is fed through a
    ``PDFPageInterpreter`` whose ``TextConverter`` device writes into an
    in-memory buffer; the accumulated buffer content is returned.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The text collected from all processed pages, as produced by the
        ``StringIO`` buffer's ``getvalue()``.

    Raises:
        Whatever ``open`` or PDFMiner raise while reading or parsing; the
        file, the converter device and the buffer are closed in every case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        # NOTE(review): with maxpages=0 and an empty pagenos set this walks
        # the whole document (as observed by the author) - presumably these
        # values mean "no limit" / "no page filter"; confirm against the
        # PDFMiner documentation before relying on it.
        maxpages = 0
        caching = True
        pagenos = set()

        # open() instead of the Python-2-only file() builtin; the context
        # manager closes the handle even when parsing fails midway.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)

        # Read the buffer before the finally-clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

Solution

  • Couldn't you use enumerate to get the page number and the page content while iterating through all the pages? If you only want every second page, use modulus. If you want specific pages only, use ranges.

    Example:

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    
    def convert_pdf_to_txt(path):
        """Extract text from every second page of a PDF file with PDFMiner.

        All pages are iterated, but only those whose zero-based index (as
        counted by ``enumerate``) is even are passed to the interpreter, i.e.
        the 1st, 3rd, 5th, ... page of the document. Progress messages are
        printed for every page, and a demonstration message is printed for
        indices 5 through 10 to show how a page range can be selected.

        Args:
            path: Filesystem path of the PDF to read (opened in binary mode).

        Returns:
            The text collected from the processed pages, as produced by the
            ``StringIO`` buffer's ``getvalue()``.

        Raises:
            Whatever ``open`` or PDFMiner raise while reading or parsing; the
            file, the converter device and the buffer are closed in every case.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()

            # open() instead of the Python-2-only file() builtin; the context
            # manager closes the handle even when parsing fails midway.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True)
                # enumerate() counts from 0, so pagenumber is a zero-based index.
                for pagenumber, page in enumerate(pages):
                    # Function-call form prints the same text on Python 2 and
                    # is valid syntax on Python 3, unlike the print statement.
                    print(pagenumber)
                    if pagenumber % 2 == 0:
                        print("even page number")
                        # Only even-indexed pages contribute to the output text.
                        interpreter.process_page(page)
                    else:
                        print("odd page number")
                    if 5 <= pagenumber <= 10:
                        print("pages 5 to 10")

            # Read the buffer before the finally-clause closes it.
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()