Search code examples
python python-3.x python-2.7 pdfminer

Python Pdfminer exclude lines from parser


Hi, I'm using a typical way to parse text from a PDF. The problem is that each page has headers, footers, etc. that I don't want to write to the text file output. Is there a way to exclude, e.g., the first 3 lines of each page (I know how many lines the header takes up)?

My code is here:

from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from io import StringIO
import os
import re



def F_update(pdf_paths,number_upd,dir):
    """Extract the text of every PDF in *pdf_paths* into a fixed text file.

    Each PDF is run page by page through a pdfminer ``TextConverter`` and the
    accumulated text is written to the hard-coded output path below.

    Args:
        pdf_paths: Iterable of paths of the PDF files to convert.
        number_upd: Not used by this function; kept for caller compatibility.
        dir: Not used by this function; kept for caller compatibility.

    NOTE(review): the output file is opened with mode ``'w+'`` once per PDF,
    which truncates it, so after the loop it only holds the text of the last
    PDF. Confirm whether appending / one file per PDF was intended.
    """
    out_path = "C:\\Users\\vagos\\Desktop\\trelovagos\\text\\databases\\testnewafrica.txt"

    for pdf_path in pdf_paths:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(char_margin = 35,word_margin = 2 ,line_margin = 0.3 ,all_texts = True)#detect_vertical = True)

        # Context managers guarantee that the output file, the PDF handle and
        # the in-memory buffer are released even if opening or parsing fails.
        # The output file is opened first, as before, so it is truncated
        # before the PDF is touched.
        with open(out_path, mode = 'w+', buffering = 1, encoding = 'utf-8') as out_file, \
                open(pdf_path, 'rb') as fp, \
                StringIO() as retstr:
            device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # An empty page-number set is passed through unchanged from
                # the original call.
                pages = PDFPage.get_pages(fp, set(), password="", caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)

                out_file.write(retstr.getvalue())
            finally:
                device.close()

Solution

  • It was harder than I thought, so using regex was a more practical way to solve this problem.