Search code examples
Tags: python, pandas, dataframe, parallel-processing

How to save two separate CSVs when you are parallelizing your code?


I currently have this code running

def single_iteration(iter: int):
   """Run one iteration and return its results as a single DataFrame.

   Args:
      iter: Iteration number supplied by the caller; it is not used in
         the body.

   Returns:
      A DataFrame with columns Col1..Col4 and 100 identical rows, each
      holding the four values returned by ``do_something()``.
   """
   data1, data2, data3, data4 = do_something()
   # Col2 is filled from data2: the original used data1 twice and left
   # data2 unused, which does not match the intended Col1/Col2 pairing.
   row = {'Col1': data1, 'Col2': data2, 'Col3': data3, 'Col4': data4}
   # The row does not depend on the loop index, so build it once and repeat.
   return pd.DataFrame([row] * 100)

if __name__ == "__main__":
   run_stop = 100
   # One worker process per CPU reported in SLURM_CPUS_PER_TASK.
   number_of_cores = int(os.environ['SLURM_CPUS_PER_TASK'])
   with multiprocessing.Pool(number_of_cores) as pool:
      # pool.map returns one DataFrame per iteration, in input order.
      results = pool.map(single_iteration, range(run_stop))
   df = pd.concat(results, ignore_index=True)
   # The second positional parameter of to_csv is `sep`, not a file name,
   # so the directory and file name are joined into one target path.
   # NOTE(review): assumes `path` is a directory and `file_name` a file
   # name, both defined elsewhere -- confirm.
   df.to_csv(os.path.join(path, file_name))

However, I now want two different CSVs: for example, df1 = pd.DataFrame([{'Col1': data1, 'Col2': data2}]) and df2 = pd.DataFrame([{'Col3': data3, 'Col4': data4}]). Each run should return both DataFrames, and afterwards I want to concatenate the df1 results and the df2 results separately and save each to its own file.


Solution

  • import os
    import multiprocessing
    import pandas as pd
    
    
    def single_iteration(iter: int):
        """Build the two result DataFrames for one iteration.

        Args:
            iter: Iteration number supplied by the caller; it is not used
                in the body because the example data is constant.

        Returns:
            A tuple ``(df1, df2)``. ``df1`` has columns Col1 and Col2,
            ``df2`` has columns Col3 and Col4, and each has 100 rows.
        """
        n_rows = 100

        # Placeholder data standing in for the real per-iteration results.
        data1 = [1] * n_rows
        data2 = [2] * n_rows
        data3 = [3] * n_rows
        data4 = [4] * n_rows

        # Build each frame column-wise instead of appending one dict per
        # row; the resulting rows and column order are the same.
        df1 = pd.DataFrame({'Col1': data1, 'Col2': data2})
        df2 = pd.DataFrame({'Col3': data3, 'Col4': data4})
        return df1, df2
    
    
    if __name__ == "__main__":
        run_stop = 100
        # Use the SLURM CPU allocation when present, otherwise 4 workers.
        number_of_cores = int(os.environ.get('SLURM_CPUS_PER_TASK', 4))

        with multiprocessing.Pool(number_of_cores) as pool:
            # Each element of `results` is the (df1, df2) pair of one iteration.
            results = pool.map(single_iteration, range(run_stop))

        # Transpose the list of pairs into one sequence per output file.
        first_frames, second_frames = map(list, zip(*results))

        # Stack the per-iteration frames, renumbering rows from zero.
        df1 = pd.concat(first_frames, ignore_index=True)
        df2 = pd.concat(second_frames, ignore_index=True)

        # Write each combined frame to its own CSV without the index column.
        df1.to_csv('df1_file_name.csv', index=False)
        df2.to_csv('df2_file_name.csv', index=False)