Tags: python, pdf, xlsx, xlsxwriter, pdf-extraction

Get all PDF file names under the same folder and save each as an Excel file named after the PDF file


I have PDF files in the same folder. How do I get all the PDF file names and save each as an Excel file named after its PDF file? This is what I have tried:

def get_files(pdf_path):
    import os
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files 
    
files = get_files(pdf_path)

for i in files:
    save_as_excel(pdf_path, i)

Solution

  • As discussed in chat, this is the continuation of your previous question, which I answered. In the previous question I showed how you can extract text from a pdf file that contains multiple data entities. Now you want to extract the text, parse the content and save the data as csv/xlsx for all the pdf files present in the folder.

    Please go through all the steps below; the only thing you need to change is path_of_pdf_files, the path to the directory that contains your pdf files.

    The assumptions and logic remain the same as in my previous answer.

    I have moved the data and methods into a class, PdfExtractor, to encapsulate them.

    Please follow the steps below to extract text from the pdfs and save it as xlsx. Before moving ahead, install the packages pdfplumber and xlsxwriter.
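    For reference, a minimal install command (assuming pip is available for the Python environment you will run this with) is:

    pip install pdfplumber xlsxwriter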

    1. Save the code below in a file named PdfExtractor.py:
    import pdfplumber
    import xlsxwriter
    import re
    
    # regex pattern for keys in line1 of data entity
    my_regex_dict_line1 = {
        'Our Ref' : r'Our Ref :(.*?)Name',
        'Name' : r'Name:(.*?)Ref 1',
        'Ref 1' : r'Ref 1 :(.*?)Ref 2',
        'Ref 2' : r'Ref 2:(.*?)$'
    }
    
    # regex pattern for keys in line2 of data entity
    my_regex_dict_line2 = {
        'Amount' : r'Amount:(.*?)Total Paid',
        'Total Paid' : r'Total Paid:(.*?)Balance',
        'Balance' : r'Balance:(.*?)Date of A/C',
        'Date of A/C' : r'Date of A/C:(.*?)Date Received',
        'Date Received' : r'Date Received:(.*?)$'
    }
    
    # regex pattern for keys in line3 of data entity
    my_regex_dict_line3 ={
        'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
        'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
        'A/C Status': r'A/C\s+Status:(.*?)Collector',
        'Collector' : r'Collector :(.*?)$'
    }
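
    # For illustration only: a hypothetical data entity whose lines would satisfy the
    # patterns above and the Date/Notes handling below (the real layout comes from the
    # PDF in the previous question):
    #   Our Ref :12345 Name: John Doe Ref 1 :A1 Ref 2:B2
    #   Amount: 100.00 Total Paid: 50.00 Balance: 50.00 Date of A/C: 01/01/2020 Date Received: 02/01/2020
    #   Last Paid: 03/01/2020 Amt Last Paid: 10.00 A/C Status: OPEN Collector : Mr X
    #   Date Notes
    #   04/01/2020 Letter sent to client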
    
    class PdfExtractor:
        data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'
        
        def __init__(self, pdf_path):
            self.pdf_path = pdf_path
            self.json_data = {}
            self.pdf_text = ''
    
        def __preprocess_data(self, data):
            return [el.strip() for el in data.splitlines() if el.strip()]
        
        def __get_header_data(self, text):
            header_data_list = self.__preprocess_data(text)
            # third line in text of header contains Date Created field
            self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
            # fourth line in text contains Number of Pages
            self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
            # fifth line in text contains Client Code and ClientName
            self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
            self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()
        
        def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
            ''' For each key/regex pattern in regex_dict, this function searches the given text and adds the captured value to the data_dict dictionary '''
            for key, regex in regex_dict.items():
                matched_value = re.search(regex, text)
                if matched_value is not None:
                    data_dict[key] = matched_value.group(1).strip()
                    
        def __populate_date_notes(self, data_dict, text):
            ''' This function populates the Date and Notes values of the data chunk, as lists, into the data_dict dictionary '''
            data_dict['Date'] = []
            data_dict['Notes'] = []
            # the date/notes rows start from the fifth line of the data entity
            for line in text[4:]:
                date_match = re.search(r'(\d{2}/\d{2}/\d{4})', line)
                data_dict['Date'].append(date_match.group(1).strip())
                notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', line)
                data_dict['Notes'].append(notes_match.group(1).strip())
        
        def get_pdf_text(self):
            data_index = 1
            with pdfplumber.open(self.pdf_path) as pdf:
                index = 0
                while(index < len(pdf.pages)):
                    page = pdf.pages[index]
                    self.pdf_text += '\n' + page.extract_text()
                    index += 1
                            
            split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
            # first data in the split_on_data_entity list will contain the header information
            self.__get_header_data(split_on_data_entity[0])
            while(data_index < len(split_on_data_entity)):
                data_entity = {}
                data_processed = self.__preprocess_data(split_on_data_entity[data_index])
                self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
                self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
                self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
                if(len(data_processed) > 3 and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
                    self.__populate_date_notes(data_entity, data_processed)
                # store every parsed data entity, even when it has no Date/Notes lines
                self.json_data['data_entity' + str(data_index)] = data_entity
                data_index += 1
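
            # Illustrative shape of the populated dictionary (all values here are hypothetical):
            # {
            #     'Date Created': '...', 'Number of Pages': '...',
            #     'Client Code': '...', 'ClientName': '...',
            #     'data_entity1': {'Our Ref': '...', 'Name': '...', 'Ref 1': '...', 'Ref 2': '...',
            #                      'Amount': '...', 'Date': [...], 'Notes': [...]},
            #     'data_entity2': {...},
            # }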
                            
            return self.json_data 
            
        def save_as_xlsx(self, file_name):
            
            if(not self.json_data):
                print("Data was not read from PDF")
                return
            
            workbook = xlsxwriter.Workbook(file_name)
            worksheet = workbook.add_worksheet("Sheet 1")
            row = 0
            col = 0
    
            # write column
            columns = ['Account History Report', 'All Notes'] + [ key for key in self.json_data.keys() if 'data_entity' not in key ] + list(self.json_data['data_entity1'].keys())
            worksheet.write_row(row, col,  tuple(columns))
            row += 1
    
            column_index_map = {}
            for index, col in enumerate(columns):
                column_index_map[col] = index
    
            # write the header
            worksheet.write(row, column_index_map['Date Created'],  self.json_data['Date Created'])
            worksheet.write(row, column_index_map['Number of Pages'],  self.json_data['Number of Pages'])
            worksheet.write(row, column_index_map['Client Code'],  self.json_data['Client Code'])
            worksheet.write(row, column_index_map['ClientName'],  self.json_data['ClientName'])
            data_entity_index = 1
    
    
            #iterate through each data entity and for each key insert the values in the sheet
            while True:
                data_entity_key = 'data_entity' + str(data_entity_index)
                row_size = 1
                if(self.json_data.get(data_entity_key) is not None):
                    for key, value in self.json_data.get(data_entity_key).items():
                        if(isinstance(value, list)):
                            worksheet.write_column(row, column_index_map[key],  tuple(value))
                            row_size = len(value)
                        else:
                            worksheet.write(row, column_index_map[key], value)
                else:
                    break
                data_entity_index += 1
                row += row_size
                    
            workbook.close()
            print(file_name + " saved successfully")
    
    2. Execute the code below; it reads all the pdf files inside the folder path_of_pdf_files and saves the data in an xlsx file (one per pdf) in the same directory. Also note that the code below should be executed from the same folder where you saved the file PdfExtractor.py, so that the import works.
    import os
    from PdfExtractor import PdfExtractor
    
    path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp' # Directory path for your pdf files
    files = os.listdir(path_of_pdf_files)
    
    for file in files:
        if(not file.endswith(".pdf")):
            continue
        filename = os.path.splitext(file)[0]
        pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
        pdf_text = pdf_obj.get_pdf_text()
        pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
    

    Output :

    C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
    C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
    C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
    


    Let's say you have the following pdf files in the directory: sample.pdf, sample2.pdf, sample3.pdf. The xlsx files will be created in the same folder with the filenames sample.xlsx, sample2.xlsx and sample3.xlsx.
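
    If you want to confirm which xlsx files were written, a quick check (a small sketch, reusing the same directory path as above) could be:

    import os
    path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp'  # same directory as in the script above
    print(sorted(f for f in os.listdir(path_of_pdf_files) if f.endswith('.xlsx')))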

    Let me know if you have any doubts in the above code.