Search code examples
pythonpython-3.xcsvimportconcatenation

Reading multiple CSV files from different subfolders based on date range (in Python)


I have a Main Folder in which a new subfolder gets placed every day. In these subfolders, there are multiple CSV files. Of these multiple CSV files I need to concatenate one specific file from each subfolder based on a pre-defined date interval.

the subfolders are always named the date (etc. "20230227") and the files are always named "Some_File-/*.csv"

I'm assuming the answer would be something in line of this:

import pandas as pd
import glob
from pathlib import Path

Start_date = '20230101'
End_date = '20230227'

directory = "\Main-Folder-Path\subfolders....."

#Read CSV file
df_temp = []
for sub_folder in #all subfolders in the above date range: 
    for file in Path(directory).glob("Some_File-/*.csv"):
        df_temp.append(pd.read_csv(file))

#concatenate into a single data frame
df = pd.concat(df_temp)

Solution

  • I found the solution:

    def read_csv_file(start_date, end_date, location):
        """Function to read csv file from a given location and return a dataframe"""
        parsed_start_date = datetime.datetime.strptime(start_date, "%Y%m%d").date()
        parsed_end_date = datetime.datetime.strptime(end_date, "%Y%m%d").date()
        final_dataframe = pd.DataFrame()
    
        # loop over sub-folders
        for folder_name in os.listdir(location):
            try:
                # try to parse the folder name as a date, if not possible, skip it
                parsed_date = datetime.datetime.strptime(folder_name, "%Y%m%d").date()
            except:
                continue
    
            # if the sub-folder fulfills the date criteria
            if parsed_start_date <= parsed_date <= parsed_end_date:
                print(f"processing folder {folder_name}")
                # go into the folder and load csv file starting with name 'front'
                for file_name in os.listdir(location + '/' + folder_name):
                    # if the file name starts with 'front' and ends with '.csv'
                    if file_name.startswith('CSV_FILE_NAME-') and file_name.endswith('.csv'):
                        df = pd.read_csv(location + '/' + folder_name + '/' + file_name, encoding= 'unicode_escape', low_memory=False)
                        # concat the dataframe into the final_dataframe
                        final_dataframe = pd.concat([final_dataframe, df])
                        break
    
        print("--- the job is done ---")
        return final_dataframe
    
    
    # Define the start and end date
    my_start_date = '20230101'
    my_end_date = '20230301'
    
    # Run the script
    result = read_csv_file(my_start_date, my_end_date, '/MAIN/FOLDER/LOCATION')
    # Save result pandas dataframe into csv file inside code folder
    result.to_csv('CONCATENATED_FILE.csv', index=False)