Search code examples
pythonpdfpdfplumber

Extract text from pdf file using pdfplumber


I want to extract text from a PDF file. I tried:

# Folder that is scanned for PDF files.
directory = r'C:\Users\foo\folder'

for x in os.listdir(directory):
    print(x)
    # Removes every '.pdf' occurrence from the name, so the endswith('.pdf')
    # test below can no longer match for this entry.
    x = x.replace('.pdf','')
    filename = os.fsdecode(x)
    print(x)

    if filename.endswith('.pdf'):
        # x is a bare name without the directory part, so it is resolved
        # relative to the current working directory, not `directory`.
        with pdfplumber.open(x) as pdf1:
            # Only the first page of the document is read.
            page1 = pdf1.pages[0]
            text1 = page1.extract_text()
            print(text1)

and it printed:

20170213091544343.pdf
20170213091544343

Seeing the file has a name of 20170213091544343, I added:


    else:
        with pdfplumber.open(x) as pdf1:
                page1 = pdf1.pages[0]
                text1 = page1.extract_text()
                print(text1)
            

to read the file in case the file name doesn't end with .pdf, and it raised an error:


20170213091544343.pdf
20170213091544343
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-34-e370b214f9ba> in <module>
     16 
     17     else:
---> 18         with pdfplumber.open(x) as pdf1:
     19                 page1 = pdf1.pages[0]
     20                 text1 = page1.extract_text()

C:\Python38\lib\site-packages\pdfplumber\pdf.py in open(cls, path_or_fp, **kwargs)
     56     def open(cls, path_or_fp, **kwargs):
     57         if isinstance(path_or_fp, (str, pathlib.Path)):
---> 58             fp = open(path_or_fp, "rb")
     59             inst = cls(fp, **kwargs)
     60             inst.close = fp.close

FileNotFoundError: [Errno 2] No such file or directory: '20170213091544343'

Solution

  • os.listdir() returns only the file names, so you have to join each one with the directory to get a path that can be opened:

    for filename in os.listdir(directory):
    
        # Prefix the bare name with its directory so the path resolves
        # no matter what the current working directory is.
        fullpath = os.path.join(directory, filename)
    
        #print(fullpath)
    

    And you have to keep the .pdf extension in the file name:

    import os
    import pdfplumber
    
    # Folder that is scanned for PDF files.
    directory = r'C:\Users\foo\folder'
    
    for filename in os.listdir(directory):
        # Compare case-insensitively so files named "*.PDF" are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
    
        # os.listdir() yields bare names: keep the extension and prepend the
        # directory so the file opens regardless of the working directory.
        fullpath = os.path.join(directory, filename)
    
        with pdfplumber.open(fullpath) as pdf:
            for page in pdf.pages:
                # Older pdfplumber releases return None for a page without
                # extractable text; fall back to an empty string so the
                # result is always a str.
                text = page.extract_text() or ''
                print(text)
                # To keep the whole document instead of only printing it,
                # append `text` to a list here and '\n'.join() it after the
                # `with` block.
    

    or with page number

            with pdfplumber.open(fullpath) as pdf:
                # Count from 1 so the printed header matches the usual
                # human page numbering rather than the 0-based list index.
                for number, page in enumerate(pdf.pages, 1):
                    print('--- page', number, '---')
                    # Older pdfplumber releases return None for a page
                    # without extractable text; print an empty line instead
                    # of the literal "None".
                    text = page.extract_text() or ''
                    print(text)