Tags: python, pdf, xlsx, xlsxwriter, pdf-extraction

Get all PDF file names under the same folder and save each as an Excel file named after the PDF file


I have PDF files in the same folder. How do I get all the PDF file names and save each as an Excel file named after its PDF file? This is what I have tried:

def get_files(pdf_path):
    import os
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files 
    
files = get_files(pdf_path)

for i in files:
    save_as_excel(pdf_path, i)

Solution

  • As discussed in chat, this is the continuation of your previous question, which I answered. In the previous question I showed how you can extract text from a pdf file that contains multiple data entities. Now you want to extract the text, parse the content and save the data as csv/xlsx for all the pdf files present in the folder.

    Please go through all the steps below; the only thing you need to change is path_of_pdf_files, the path to the directory that contains your pdf files.

    The assumptions and logic remain the same as in my previous answer.

    I have moved the data and methods into a class, PdfExtractor, to encapsulate them.

    Please follow the steps below to extract text from the pdfs and save it as xlsx. Before moving ahead, install the packages pdfplumber and xlsxwriter.
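    For reference, a minimal install command (assuming pip is available for the Python environment you will run this with) is:

    pip install pdfplumber xlsxwriter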

    1. Save the code below in a file named PdfExtractor.py:
    import pdfplumber
    import xlsxwriter
    import re
    
    # regex pattern for keys in line1 of data entity
    my_regex_dict_line1 = {
        'Our Ref' : r'Our Ref :(.*?)Name',
        'Name' : r'Name:(.*?)Ref 1',
        'Ref 1' : r'Ref 1 :(.*?)Ref 2',
        'Ref 2' : r'Ref 2:(.*?)$'
    }
    
    # regex pattern for keys in line2 of data entity
    my_regex_dict_line2 = {
        'Amount' : r'Amount:(.*?)Total Paid',
        'Total Paid' : r'Total Paid:(.*?)Balance',
        'Balance' : r'Balance:(.*?)Date of A/C',
        'Date of A/C' : r'Date of A/C:(.*?)Date Received',
        'Date Received' : r'Date Received:(.*?)$'
    }
    
    # regex pattern for keys in line3 of data entity
    my_regex_dict_line3 ={
        'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
        'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
        'A/C Status': r'A/C\s+Status:(.*?)Collector',
        'Collector' : r'Collector :(.*?)$'
    }
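
    # For illustration only: a hypothetical data entity whose lines would satisfy the
    # patterns above and the Date/Notes handling below (the real layout comes from the
    # PDF in the previous question):
    #   Our Ref :12345 Name: John Doe Ref 1 :A1 Ref 2:B2
    #   Amount: 100.00 Total Paid: 50.00 Balance: 50.00 Date of A/C: 01/01/2020 Date Received: 02/01/2020
    #   Last Paid: 03/01/2020 Amt Last Paid: 10.00 A/C Status: OPEN Collector : Mr X
    #   Date Notes
    #   04/01/2020 Letter sent to client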
    
    class PdfExtractor:
        data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'
        
        def __init__(self, pdf_path):
            self.pdf_path = pdf_path
            self.json_data = {}
            self.pdf_text = ''
    
        def __preprocess_data(self, data):
            return [el.strip() for el in data.splitlines() if el.strip()]
        
        def __get_header_data(self, text):
            header_data_list = self.__preprocess_data(text)
            # third line in text of header contains Date Created field
            self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
            # fourth line in text contains Number of Pages
            self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
            # fifth line in text contains Client Code and ClientName
            self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
            self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()
        
        def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
            ''' For each key/regex pattern in regex_dict, this function searches the given text and adds the captured value to the data_dict dictionary '''
            for key, regex in regex_dict.items():
                matched_value = re.search(regex, text)
                if matched_value is not None:
                    data_dict[key] = matched_value.group(1).strip()
                    
        def __populate_date_notes(self, data_dict, text):
            ''' This function populates the Date and Notes values of the data chunk, as lists, into the data_dict dictionary '''
            data_dict['Date'] = []
            data_dict['Notes'] = []
            # the date/notes rows start from the fifth line of the data entity
            for line in text[4:]:
                date_match = re.search(r'(\d{2}/\d{2}/\d{4})', line)
                data_dict['Date'].append(date_match.group(1).strip())
                notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', line)
                data_dict['Notes'].append(notes_match.group(1).strip())
        
        def get_pdf_text(self):
            data_index = 1
            with pdfplumber.open(self.pdf_path) as pdf:
                index = 0
                while(index < len(pdf.pages)):
                    page = pdf.pages[index]
                    self.pdf_text += '\n' + page.extract_text()
                    index += 1
                            
            split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
            # first data in the split_on_data_entity list will contain the header information
            self.__get_header_data(split_on_data_entity[0])
            while(data_index < len(split_on_data_entity)):
                data_entity = {}
                data_processed = self.__preprocess_data(split_on_data_entity[data_index])
                self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
                self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
                self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
                if(len(data_processed) > 3 and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
                    self.__populate_date_notes(data_entity, data_processed)
                # store every parsed data entity, even when it has no Date/Notes lines
                self.json_data['data_entity' + str(data_index)] = data_entity
                data_index += 1
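
            # Illustrative shape of the populated dictionary (all values here are hypothetical):
            # {
            #     'Date Created': '...', 'Number of Pages': '...',
            #     'Client Code': '...', 'ClientName': '...',
            #     'data_entity1': {'Our Ref': '...', 'Name': '...', 'Ref 1': '...', 'Ref 2': '...',
            #                      'Amount': '...', 'Date': [...], 'Notes': [...]},
            #     'data_entity2': {...},
            # }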
                            
            return self.json_data 
            
        def save_as_xlsx(self, file_name):
            
            if(not self.json_data):
                print("Data was not read from PDF")
                return
            
            workbook = xlsxwriter.Workbook(file_name)
            worksheet = workbook.add_worksheet("Sheet 1")
            row = 0
            col = 0
    
            # write column
            columns = ['Account History Report', 'All Notes'] + [ key for key in self.json_data.keys() if 'data_entity' not in key ] + list(self.json_data['data_entity1'].keys())
            worksheet.write_row(row, col,  tuple(columns))
            row += 1
    
            column_index_map = {}
            for index, col in enumerate(columns):
                column_index_map[col] = index
    
            # write the header
            worksheet.write(row, column_index_map['Date Created'],  self.json_data['Date Created'])
            worksheet.write(row, column_index_map['Number of Pages'],  self.json_data['Number of Pages'])
            worksheet.write(row, column_index_map['Client Code'],  self.json_data['Client Code'])
            worksheet.write(row, column_index_map['ClientName'],  self.json_data['ClientName'])
            data_entity_index = 1
    
    
            #iterate through each data entity and for each key insert the values in the sheet
            while True:
                data_entity_key = 'data_entity' + str(data_entity_index)
                row_size = 1
                if(self.json_data.get(data_entity_key) is not None):
                    for key, value in self.json_data.get(data_entity_key).items():
                        if(isinstance(value, list)):
                            worksheet.write_column(row, column_index_map[key],  tuple(value))
                            row_size = len(value)
                        else:
                            worksheet.write(row, column_index_map[key], value)
                else:
                    break
                data_entity_index += 1
                row += row_size
                    
            workbook.close()
            print(file_name + " saved successfully")
    
    2. Execute the code below; it reads all the pdf files inside the folder path_of_pdf_files and saves the data in an xlsx file (one per pdf) in the same directory. Also note that the code below should be executed from the same folder where you saved the file PdfExtractor.py, so that the import works.
    import os
    from PdfExtractor import PdfExtractor
    
    path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp' # Directory path for your pdf files
    files = os.listdir(path_of_pdf_files)
    
    for file in files:
        if(not file.endswith(".pdf")):
            continue
        filename = os.path.splitext(file)[0]
        pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
        pdf_text = pdf_obj.get_pdf_text()
        pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
    

    Output :

    C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
    C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
    C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
    


    Let's say you have the following pdf files in the directory: sample.pdf, sample2.pdf, sample3.pdf. The xlsx files will be created in the same folder with the filenames sample.xlsx, sample2.xlsx and sample3.xlsx.
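
    If you want to confirm which xlsx files were written, a quick check (a small sketch, reusing the same directory path as above) could be:

    import os
    path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp'  # same directory as in the script above
    print(sorted(f for f in os.listdir(path_of_pdf_files) if f.endswith('.xlsx')))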

    Let me know if you have any doubts in the above code.