I am currently opening a single CSV file like this:
request_csv = s3_client.get_object(Bucket='bucketname', Key='dw/file.csv')
I'd like to change this so that it reads all of the files inside the dw/ folder
(they are all CSVs) into a single DataFrame. How can I approach this? Any pointers would be appreciated.
This should work: list the objects under the prefix, read each CSV into its own DataFrame, and concatenate them at the end:
import boto3
import pandas as pd
from io import StringIO
# Initialize S3 client
s3_client = boto3.client('s3')
# Define the bucket and folder
bucket_name = 'bucketname'
folder_prefix = 'dw/' # The prefix to read from; make sure it ends with a '/'
# List the objects under the prefix (a single call returns at most
# 1,000 keys; see the paginator note below the script)
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
# Filter to get only .csv files
csv_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]
# Initialize an empty list to store DataFrames
dataframes = []
# Loop through each file and read its content into a DataFrame
for file_key in csv_files:
    # Fetch the object and decode its body (assumes UTF-8 encoding)
    obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    csv_content = obj['Body'].read().decode('utf-8')
    # Parse the CSV content into a DataFrame
    df = pd.read_csv(StringIO(csv_content))
    dataframes.append(df)
# Combine all DataFrames into a single DataFrame
final_dataframe = pd.concat(dataframes, ignore_index=True)
# Display the final DataFrame
print(final_dataframe)
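One thing to watch: list_objects_v2 returns at most 1,000 keys per call, so if the folder can ever grow past that, the single call above will silently miss files. A minimal sketch of the listing step using boto3's built-in paginator (reusing the same bucket_name and folder_prefix as above):

paginator = s3_client.get_paginator('list_objects_v2')
csv_files = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix):
    # Each page carries up to 1,000 keys under 'Contents'
    for obj in page.get('Contents', []):
        if obj['Key'].endswith('.csv'):
            csv_files.append(obj['Key'])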
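As an aside, if you have the optional s3fs package installed (pip install s3fs), pandas can resolve s3:// URLs itself, so you can drop the explicit get_object/decode steps. A sketch under that assumption, reusing the csv_files list from above:

# Assumes s3fs is installed; pandas then reads s3:// paths directly
dataframes = [pd.read_csv(f's3://{bucket_name}/{key}') for key in csv_files]
final_dataframe = pd.concat(dataframes, ignore_index=True)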