Search code examples
Tags: python, exception, error-handling, pdfminer, pdf-parsing

Ensure loop runs through every file even when errors are raised


I am iterating over a bunch of PDFs in a folder, parsing their content and appending it to a list. It works on a subset of the PDF files. I don't want to manually remove some of the PDFs, run the code, and then add a few back and run it again until I have found the malfunctioning PDFs. Since some PDFs cannot be opened or may have corrupted content, I did the following to ensure the loop runs through: I pass check_extractable=True to PDFPage.get_pages. As I understand it, pdfminer should then raise its PDFTextExtractionNotAllowed exception if a PDF is not extractable, which should prevent it from trying to process a PDF it actually cannot handle.

Question: What do I need to do to make the code keep running even if there is a PDF that cannot be opened or has no content (assuming this is why the error is thrown at that specific point in the code)?

import pdfminer
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
import os
import io
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTFigure, LTImage, 
LTTextLine, LTTextContainer, LTChar, LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser, PDFSyntaxError

# Walk every "*.pdf" file in `directory`, lay out each page with pdfminer,
# collect the text of every text container together with its integer
# bbox[1] coordinate, and append the filtered text for that file to `data`.
directory = 'C:/Users/'
data = []
for file in os.listdir(directory):
    if not file.endswith(".pdf"):
        continue
    # Nothing in this loop ever writes to this buffer (the device below is a
    # PDFPageAggregator that is not given the handle), so getvalue() further
    # down always returns an empty string.
    fake_file_handle = io.StringIO()


    with open(os.path.join(directory, file), 'rb') as fh:
        resource_manager = PDFResourceManager()
        laparams = LAParams(line_margin = 0.6)
        device = PDFPageAggregator(resource_manager, laparams = laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, device)

        positions = []  # one bbox per inner text line (never read again in this snippet)
        raw_text = []   # [coord, word] pairs for every text container on every page
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()  # unused; see note on fake_file_handle
            layout = device.get_result()
            for lobj in layout:
                
                if isinstance(lobj, LTTextContainer) or isinstance(lobj, LTTextBox) or isinstance(lobj, pdfminer.layout.LTTextBoxHorizontal):
                    # coord is bbox[1] truncated to int (treated as y0 by the
                    # filter below); word is the container's stripped text.
                    coord, word = int(lobj.bbox[1]), lobj.get_text().strip()
                    raw_text.append([coord, word])

                    # The reported "TypeError: 'LTChar' object is not iterable"
                    # is raised by the inner loop: it assumes every child of
                    # lobj is itself iterable.
                    # NOTE(review): presumably lobj is sometimes a single text
                    # line whose children are already LTChar objects - confirm.
                    for text_line in lobj:
                        for character in text_line:
                            if isinstance(character, LTChar):
                                if character.matrix[0]>0 :
                                    position = character.bbox 
                        # `position` is only bound once a matching LTChar has
                        # been seen; before that this line raises NameError,
                        # and afterwards it may re-append a stale bbox.
                        positions.append(position)

                # Figures are skipped: this branch is a placeholder and no
                # recursion into the figure is implemented.
                elif isinstance(lobj, LTFigure):
                    pass

        # Keep the entries whose coordinate is below 781 and above 57
        # (58 <= coord <= 780, both bounds inclusive).
        text_pos = []
        maxFontpos = 780
        minFontpos = 58
        for coord, word in raw_text:
            if coord <= maxFontpos and coord >= minFontpos:
                text_pos.append(word)
            else:
                pass
 
        # If text_pos is empty the IndexError is swallowed and `wap` keeps
        # whatever value it had for the previous file (or stays undefined for
        # the first file, which makes data.append below raise NameError).
        try:
            wap = text_pos[0]
        except:
            pass
        
    data.append([text_pos, wap])
    fake_file_handle.close()

The specific error is thrown at

---> 28                         for character in text_line:
     29                             if isinstance(character, LTChar):
     30                                 if character.matrix[0]>0 :

TypeError: 'LTChar' object is not iterable

Solution

  • If this is just a quick and dirty script, I would recommend simply surrounding the entire `with` block in a general `try`/`except`. Typically you don't want to blindly catch exceptions without specifying which type you are looking for, because that also hides any different exception/error you were not expecting, but in a situation like this I think it would be okay:

    from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
    
    # Parse every "*.pdf" file in `directory` and keep going when a single
    # file cannot be processed: the failing file is reported and recorded in
    # `failed`, and the loop moves on to the next one.
    directory = 'C:/Users/'
    data = []    # one [text_pos, wap] entry per PDF that could be parsed
    failed = []  # (file name, exception) for every PDF that had to be skipped

    # Only text whose integer y0 lies within these inclusive bounds is kept.
    maxFontpos = 780
    minFontpos = 58

    for file in os.listdir(directory):
        if not file.endswith(".pdf"):
            continue

        positions = []  # bbox of the last matching character of each text line
        raw_text = []   # [coord, word] pairs for every text container
        try:
            with open(os.path.join(directory, file), 'rb') as fh:
                resource_manager = PDFResourceManager()
                laparams = LAParams(line_margin=0.6)
                device = PDFPageAggregator(resource_manager, laparams=laparams)
                page_interpreter = PDFPageInterpreter(resource_manager, device)

                for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                    page_interpreter.process_page(page)
                    layout = device.get_result()
                    for lobj in layout:
                        # Anything that is not a text container is skipped.
                        # LTFigure containers are not descended into, so text
                        # nested inside figures is ignored.
                        if not isinstance(lobj, (LTTextContainer, LTTextBox,
                                                 pdfminer.layout.LTTextBoxHorizontal)):
                            continue

                        coord, word = int(lobj.bbox[1]), lobj.get_text().strip()
                        raw_text.append([coord, word])

                        # A text container can itself be a single text line,
                        # whose children are characters rather than lines.
                        # Iterating those children as if they were lines is
                        # what raises "'LTChar' object is not iterable".
                        if isinstance(lobj, LTTextLine):
                            text_lines = [lobj]
                        else:
                            text_lines = [child for child in lobj
                                          if isinstance(child, LTTextLine)]

                        for text_line in text_lines:
                            # Reset per line so a bbox from an earlier line is
                            # never appended again.
                            position = None
                            for character in text_line:
                                if isinstance(character, LTChar) and character.matrix[0] > 0:
                                    position = character.bbox  # font position
                            if position is not None:
                                positions.append(position)
        except Exception as exc:
            # `except Exception` still lets KeyboardInterrupt/SystemExit
            # through, so the run can be aborted. The failing file is named
            # so the broken PDFs can be identified afterwards.
            failed.append((file, exc))
            print("Skipping {}: {!r}".format(file, exc))
            continue  # Move on to next loop iteration

        # Extract elements below y0=781 and above y0=57.
        text_pos = [word for coord, word in raw_text
                    if minFontpos <= coord <= maxFontpos]
        # None marks "no text in range" instead of silently reusing the value
        # left over from the previous file.
        wap = text_pos[0] if text_pos else None

        data.append([text_pos, wap])