I have a Main Folder in which a new subfolder gets placed every day. In these subfolders, there are multiple CSV files. Of these multiple CSV files I need to concatenate one specific file from each subfolder based on a pre-defined date interval.
the subfolders are always named the date (etc. "20230227") and the files are always named "Some_File-/*.csv"
I'm assuming the answer would be something in line of this:
import pandas as pd
import glob
from pathlib import Path
Start_date = '20230101'
End_date = '20230227'
directory = "\Main-Folder-Path\subfolders....."
#Read CSV file
df_temp = []
for sub_folder in #all subfolders in the above date range:
for file in Path(directory).glob("Some_File-/*.csv"):
df_temp.append(pd.read_csv(file))
#concatenate into a single data frame
df = pd.concat(df_temp)
I found the solution:
def read_csv_file(start_date, end_date, location):
"""Function to read csv file from a given location and return a dataframe"""
parsed_start_date = datetime.datetime.strptime(start_date, "%Y%m%d").date()
parsed_end_date = datetime.datetime.strptime(end_date, "%Y%m%d").date()
final_dataframe = pd.DataFrame()
# loop over sub-folders
for folder_name in os.listdir(location):
try:
# try to parse the folder name as a date, if not possible, skip it
parsed_date = datetime.datetime.strptime(folder_name, "%Y%m%d").date()
except:
continue
# if the sub-folder fulfills the date criteria
if parsed_start_date <= parsed_date <= parsed_end_date:
print(f"processing folder {folder_name}")
# go into the folder and load csv file starting with name 'front'
for file_name in os.listdir(location + '/' + folder_name):
# if the file name starts with 'front' and ends with '.csv'
if file_name.startswith('CSV_FILE_NAME-') and file_name.endswith('.csv'):
df = pd.read_csv(location + '/' + folder_name + '/' + file_name, encoding= 'unicode_escape', low_memory=False)
# concat the dataframe into the final_dataframe
final_dataframe = pd.concat([final_dataframe, df])
break
print("--- the job is done ---")
return final_dataframe
# Define the start and end date
my_start_date = '20230101'
my_end_date = '20230301'
# Run the script
result = read_csv_file(my_start_date, my_end_date, '/MAIN/FOLDER/LOCATION')
# Save result pandas dataframe into csv file inside code folder
result.to_csv('CONCATENATED_FILE.csv', index=False)