Search code examples
pythonpandasdataframetkinterconcatenation

How to run function on multiple dataframes of variable row sizes, then generate a new dataframe with just the function results


I have a folder full of CSVs of equal columns but variable rows. I want to convert each to a dataframe and run a simple function on them, and create one new dataframe with just the function values and the file names as the index.

So far I have:

import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)

for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function
    pd.concat(df2[df.index == 'total'])

df.to_csv('file_path')

I'm sure there are several ways in which this is messed up, but any advice is appreciated


Solution

  • OK I figured it out:

    import os.path
    import tkinter.filedialog as filedialog
    import glob
    import pandas as pd
    
    file_path = filedialog.askdirectory()
    pattern = os.path.join(file_path, '*.csv')
    files = glob.glob(pattern)
    
    filename = pd.DataFrame(columns=['Filename'])
    filename['Filename'] = pd.Series([file for file in files]).reset_index(drop=True)
    
    dfs = []
    for index, file in enumerate(files):
        df = pd.read_csv(file, sep=',', index_col=[0])
        # Would remove the .loc, but it does no harm
        df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function  
        dfs.append(df)
    
    dfs = pd.concat(dfs)
    total = dfs[dfs.index == 'total'][['dfcolumn1','dfcolumn2',etc]]#write column names exactly as they appear on csv
    total_named = filename.join(total.set_index(filename.index))
    total_named.to_csv('file_path')