Search code examples
python pdfminer

How can I adjust the 'word_margin' for reading PDFs with pdfminer in python?


I've tried to adjust the 'word_margin' in Python using the following code, but it throws an error: TypeError: get_pages() got an unexpected keyword argument 'word_margin'. pdfminer reads the document fine if I remove word_margin=word_margin from the arguments.

Code:

def convert_pdf_to_txt(path):
    """Read the PDF at ``path`` with pdfminer and return its text.

    NOTE: this is the failing version from the question. The call to
    ``PDFPage.get_pages`` below raises
    ``TypeError: get_pages() got an unexpected keyword argument 'word_margin'``.
    """
    rsrcmgr = PDFResourceManager()
    # Buffer handed to the converter; its contents are read back at the end.
    retstr = StringIO()
    codec = 'utf-8'
    # Layout parameters are created with their defaults here.
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    word_margin = 1

    # This is the line that fails: get_pages() is given word_margin, which it
    # does not accept. Removing that keyword makes the document read fine.
    for page in PDFPage.get_pages(fp, pagenos, word_margin=word_margin,maxpages=maxpages,password=password,caching=caching, check_extractable=True):

        interpreter.process_page(page)

    text = retstr.getvalue()

    # NOTE(review): these closes are skipped if anything above raises.
    fp.close()
    device.close()
    retstr.close()
    return text

Solution

  • word_margin is a parameter of the LAParams class, not of PDFPage.get_pages(). If I understand correctly, the code should look like this:

    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams
    from pdfminer.converter import TextConverter
    
    from StringIO import StringIO
    
    def convert_pdf_to_txt(path, word_margin=1):
        """Extract the text of the PDF at ``path`` and return it as a string.

        Args:
            path: Filesystem path of the PDF to read.
            word_margin: Layout setting forwarded to
                ``LAParams(word_margin=...)``. It belongs to ``LAParams``;
                passing it to ``PDFPage.get_pages()`` raises ``TypeError``.
                Defaults to 1, the value that used to be hard-coded here.

        Returns:
            Everything the ``TextConverter`` wrote to its buffer while the
            pages were processed.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = None
        try:
            laparams = LAParams(word_margin=word_margin)
            device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # The with-block closes the file even if a page fails to process.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)

            # Read the buffer before the finally clause closes it.
            return retstr.getvalue()
        finally:
            # Release the converter and buffer on success and on error alike.
            if device is not None:
                device.close()
            retstr.close()