Search code examples
python, pdf, pymupdf, python-pdfreader

Comparing keywords with PDF files


Here is a program that opens the PDF files in a folder and extracts their data. Now I want to compare the extracted data with the keywords used in the program below, but it gives me this error:

pdfReader = pdfFileObj.loadPage(0)
AttributeError: '_io.BufferedReader' object has no attribute 'loadPage'

I want to fix the error and compare the keywords with the extracted data. I am using the PyMuPDF library for this program.

import fitz
import os

# Script from the question: walk the PDFs in 'resume/', extract their text and
# try to report which search keywords occur. Kept as posted; the comments
# below mark the lines that cause the reported error and other visible defects.
pdfFiles = []
for filename in os.listdir('resume/'):
    if filename.endswith('.pdf'):
        print(filename)
        # pdfFiles.append(filename)
        # The directory listing above was taken relative to the starting
        # working directory; from here on file names resolve against this path.
        os.chdir('C:/Users/M. Abrar Hussain/Desktop/cv/resume')
        print('Current working dir : %s' % os.getcwd())
        pdfFileObj = open(filename, 'rb')
        # This is the line that raises the reported AttributeError: pdfFileObj
        # is the plain binary file object returned by open(), which has no
        # loadPage attribute.
        pdfReader = pdfFileObj.loadPage(0)
        with fitz.open(pdfFileObj) as doc:
            text = ""
            for page in doc:
                # text accumulates across pages, so each print below repeats
                # the text of all earlier pages as well.
                text += page.getText()
                print(text)
                # split the docs
                # Always asks for page 0, regardless of which page the
                # surrounding loop is currently on.
                pageObj = pdfReader.getpage(0)
                t1 = (pageObj.getText())
                t1 = t1.split(",")
                search_keywords = ['python', 'Laravel', 'Java']
                for sentence in t1:
                    lst = []
                    for word in search_keywords:
                        # word is drawn from search_keywords, so this test is
                        # always true; `sentence` is never examined.
                        if word in search_keywords:
                            # `list` here is the builtin type, not the local
                            # `lst` created above.
                            list.append(word)
                        # Indented inside the keyword loop, so this prints once
                        # per keyword rather than once per sentence.
                        print('{0} key word(s) in sentence: {1}'.format(len(lst), ', '.join(lst)))
        # Not reached if anything above raises, leaving the file open.
        pdfFileObj.close()

Solution

  • You missed two lines: import PyPDF2 and pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    Note that getPage(0) always returns the page object for page 0, so your for loop reads the same page on every iteration. If you want to read a new page on each iteration, check how many pages the document has and use an index i that runs from 0 to pdfReader.numPages - 1.

    import fitz
    import os
    import PyPDF2
    
    # For every PDF in the resume folder: print its full text (via PyMuPDF),
    # then walk its pages with PyPDF2 and report which search keywords occur
    # in each comma-separated fragment of each page.
    #
    # NOTE(review): the calls below use the legacy camelCase APIs
    # (fitz Page.getText, PyPDF2 PdfFileReader/numPages/getPage/extractText)
    # to match the rest of this script; newer releases of both libraries
    # rename them -- confirm against the installed versions.
    pdfFiles = []
    search_keywords = ['python', 'Laravel', 'Java']  # matched case-sensitively
    for filename in os.listdir('resume/'):
        if not filename.endswith('.pdf'):
            continue
        print(filename)
        # The listing above was taken relative to the starting directory;
        # file names are opened relative to this folder from here on.
        os.chdir('C:/Users/M. Abrar Hussain/Desktop/cv/resume')
        print('Current working dir : %s' % os.getcwd())

        # fitz.open is documented to take a path (or in-memory bytes via
        # stream=), so open the document by name instead of handing it the
        # file object.
        with fitz.open(filename) as doc:
            text = "".join(page.getText() for page in doc)
        print(text)

        # `with` guarantees the handle is closed even if parsing fails.
        with open(filename, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            # Visit every page rather than re-reading page 0 each time.
            for page_number in range(pdfReader.numPages):
                pageObj = pdfReader.getPage(page_number)
                # PyPDF2 page objects expose extractText(), not getText().
                t1 = pageObj.extractText().split(",")
                for sentence in t1:
                    # Keep only the keywords that actually occur in this
                    # fragment (the old test compared the keyword list with
                    # itself and appended to the builtin `list`).
                    lst = [word for word in search_keywords if word in sentence]
                    # One report per fragment, after all keywords are checked.
                    print('{0} key word(s) in sentence: {1}'.format(len(lst), ', '.join(lst)))
    

    working-with-pdf-files-in-python