Search code examples
pythonpdftext

Read and extract text from multiple PDFs in multiple folders using Python


I have a folder named SOURCE. It contains multiple folders: A, B, C, D, E, F, G, and H. Each of these folders contains multiple PDF files. I want to read a single PDF file from each folder: one from A, one from B, one from C, and so on through H. In other words, I want to read the first PDF file in each of the 8 folders and extract its text. Extracting text from one PDF works fine, but how do I extract text from multiple PDFs? Here's my code for extracting text from a single PDF.


from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.converter import TextConverter
import io
import glob as g

# Build the pdfminer pipeline: the interpreter feeds each page to the
# converter, which writes the extracted text into the in-memory buffer.
resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open('F:/technophile/Proj/SOURCE/A/abc.pdf', 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    # Read the buffer before it is closed below; getvalue() fails on a
    # closed StringIO.
    text = fake_file_handle.getvalue()
finally:
    # Close the handles even when opening or parsing the PDF raises.
    converter.close()
    fake_file_handle.close()

print(text)

Solution

  • Maybe you could try something like this:

    # your code (the imports plus resource_manager, fake_file_handle,
    # converter and page_interpreter set up exactly as in the question)

    import os

    source_dir = 'F:/technophile/Proj/SOURCE'
    folder = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    allyourpdf = []  # one extracted-text string per folder that has a PDF

    for fold in folder:
        # List the sub-folder inside SOURCE, not relative to the current
        # working directory.
        folder_path = os.path.join(source_dir, fold)

        # os.listdir() returns names in arbitrary order, so sort to make
        # "the first PDF" deterministic. Match on the extension only,
        # case-insensitively, so 'x.PDF' counts and 'x.pdf.bak' does not.
        pdf_names = sorted(
            name for name in os.listdir(folder_path)
            if name.lower().endswith('.pdf')
        )
        if not pdf_names:
            # No PDF in this folder: skip it instead of trying to open
            # the folder path itself.
            continue

        with open(os.path.join(folder_path, pdf_names[0]), 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)

        text = fake_file_handle.getvalue()
        allyourpdf.append(text)

        # The converter keeps writing into the same buffer, so empty it
        # here; otherwise every later entry would also contain the text
        # of all earlier PDFs.
        fake_file_handle.seek(0)
        fake_file_handle.truncate(0)

    # your code (converter.close() and fake_file_handle.close())
    

    I think it should work