Search code examples
pythonpandasjoinmergefunctools

How to put values on a single raw from multiple columns in Pandas


I have been scratching my head for days about this problem. Please, find below the structure of my input data and the output that I want. I color-coded per ID, Plot, Survey, Trial and the 3 estimation methods. In the output, I want to get all the scorings for each group, which are represented by color, on the same row. By doing that, we should get rid of the Estimation Method column in the output. I kept it for sake of clarity.

This is my code. Thank you in advance for your time.

import pandas as pd
import functools
data_dict = {'ID': {0: 'id1',
  1: 'id1',
  2: 'id1',
  3: 'id1',
  4: 'id1',
  5: 'id1',
  6: 'id1',
  7: 'id1',
  8: 'id1',
  9: 'id1',
  10: 'id1',
  11: 'id1',
  12: 'id1',
  13: 'id1',
  14: 'id1',
  15: 'id1',
  16: 'id1',
  17: 'id1',
  18: 'id1',
  19: 'id1',
  20: 'id1',
  21: 'id1',
  22: 'id1',
  23: 'id1'},
 'Plot': {0: 'p1',
  1: 'p1',
  2: 'p1',
  3: 'p1',
  4: 'p1',
  5: 'p1',
  6: 'p1',
  7: 'p1',
  8: 'p1',
  9: 'p1',
  10: 'p1',
  11: 'p1',
  12: 'p1',
  13: 'p1',
  14: 'p1',
  15: 'p1',
  16: 'p1',
  17: 'p1',
  18: 'p1',
  19: 'p1',
  20: 'p1',
  21: 'p1',
  22: 'p1',
  23: 'p1'},
 'Survey': {0: 'Sv1',
  1: 'Sv1',
  2: 'Sv1',
  3: 'Sv1',
  4: 'Sv1',
  5: 'Sv1',
  6: 'Sv2',
  7: 'Sv2',
  8: 'Sv2',
  9: 'Sv2',
  10: 'Sv2',
  11: 'Sv2',
  12: 'Sv1',
  13: 'Sv1',
  14: 'Sv1',
  15: 'Sv1',
  16: 'Sv1',
  17: 'Sv1',
  18: 'Sv2',
  19: 'Sv2',
  20: 'Sv2',
  21: 'Sv2',
  22: 'Sv2',
  23: 'Sv2'},
 'Trial': {0: 't1',
  1: 't1',
  2: 't1',
  3: 't2',
  4: 't2',
  5: 't2',
  6: 't1',
  7: 't1',
  8: 't1',
  9: 't2',
  10: 't2',
  11: 't2',
  12: 't1',
  13: 't1',
  14: 't1',
  15: 't2',
  16: 't2',
  17: 't2',
  18: 't1',
  19: 't1',
  20: 't1',
  21: 't2',
  22: 't2',
  23: 't2'},
 'Mission': {0: 'mission1',
  1: 'mission1',
  2: 'mission1',
  3: 'mission1',
  4: 'mission1',
  5: 'mission1',
  6: 'mission1',
  7: 'mission1',
  8: 'mission1',
  9: 'mission1',
  10: 'mission1',
  11: 'mission2',
  12: 'mission2',
  13: 'mission2',
  14: 'mission2',
  15: 'mission2',
  16: 'mission2',
  17: 'mission2',
  18: 'mission2',
  19: 'mission2',
  20: 'mission2',
  21: 'mission2',
  22: 'mission2',
  23: 'mission2'},
 'Estimation Method': {0: 'MCARI2',
  1: 'NDVI',
  2: 'NDRE',
  3: 'MCARI2',
  4: 'NDVI',
  5: 'NDRE',
  6: 'MCARI2',
  7: 'NDVI',
  8: 'NDRE',
  9: 'MCARI2',
  10: 'NDVI',
  11: 'NDRE',
  12: 'MCARI2',
  13: 'NDVI',
  14: 'NDRE',
  15: 'MCARI2',
  16: 'NDVI',
  17: 'NDRE',
  18: 'MCARI2',
  19: 'NDVI',
  20: 'NDRE',
  21: 'MCARI2',
  22: 'NDVI',
  23: 'NDRE'},
 'MCARI2_sd': {0: 1.5,
  1: np.nan,
  2: np.nan,
  3: 10.0,
  4: np.nan,
  5: np.nan,
  6: 1.5,
  7: np.nan,
  8: np.nan,
  9: 10.0,
  10: np.nan,
  11: np.nan,
  12: 101.0,
  13: np.nan,
  14: np.nan,
  15: 23.5,
  16: np.nan,
  17: np.nan,
  18: 111.0,
  19: np.nan,
  20: np.nan,
  21: 72.0,
  22: np.nan,
  23: np.nan},
 'MACRI2_50': {0: 12.4,
  1: np.nan,
  2: np.nan,
  3: 11.0,
  4: np.nan,
  5: np.nan,
  6: 12.4,
  7: np.nan,
  8: np.nan,
  9: 11.0,
  10: np.nan,
  11: np.nan,
  12: 102.0,
  13: np.nan,
  14: np.nan,
  15: 2.1,
  16: np.nan,
  17: np.nan,
  18: 112.0,
  19: np.nan,
  20: np.nan,
  21: 74.0,
  22: np.nan,
  23: np.nan},
 'MACRI2_AVG': {0: 15.0,
  1: np.nan,
  2: np.nan,
  3: 12.0,
  4: np.nan,
  5: np.nan,
  6: 15.0,
  7: np.nan,
  8: np.nan,
  9: 12.0,
  10: np.nan,
  11: np.nan,
  12: 103.0,
  13: np.nan,
  14: np.nan,
  15: 24.0,
  16: np.nan,
  17: np.nan,
  18: 113.0,
  19: np.nan,
  20: np.nan,
  21: 77.0,
  22: np.nan,
  23: np.nan},
 'NDVI_sd': {0: np.nan,
  1: 2.9,
  2: np.nan,
  3: np.nan,
  4: 20.0,
  5: np.nan,
  6: np.nan,
  7: 2.9,
  8: np.nan,
  9: np.nan,
  10: 20.0,
  11: np.nan,
  12: np.nan,
  13: 201.0,
  14: np.nan,
  15: np.nan,
  16: 11.0,
  17: np.nan,
  18: np.nan,
  19: 200.0,
  20: np.nan,
  21: np.nan,
  22: 32.0,
  23: np.nan},
 'NDVI_50': {0: np.nan,
  1: 21.0,
  2: np.nan,
  3: np.nan,
  4: 21.0,
  5: np.nan,
  6: np.nan,
  7: 21.0,
  8: np.nan,
  9: np.nan,
  10: 21.0,
  11: np.nan,
  12: np.nan,
  13: 201.0,
  14: np.nan,
  15: np.nan,
  16: 12.0,
  17: np.nan,
  18: np.nan,
  19: 300.0,
  20: np.nan,
  21: np.nan,
  22: 39.0,
  23: np.nan},
 'NDVI_AVG': {0: np.nan,
  1: 27.0,
  2: np.nan,
  3: np.nan,
  4: 22.0,
  5: np.nan,
  6: np.nan,
  7: 27.0,
  8: np.nan,
  9: np.nan,
  10: 22.0,
  11: np.nan,
  12: np.nan,
  13: 203.0,
  14: np.nan,
  15: np.nan,
  16: 13.0,
  17: np.nan,
  18: np.nan,
  19: 400.0,
  20: np.nan,
  21: np.nan,
  22: 40.0,
  23: np.nan},
 'NDRE_sd': {0: np.nan,
  1: np.nan,
  2: 3.1,
  3: np.nan,
  4: np.nan,
  5: 31.0,
  6: np.nan,
  7: np.nan,
  8: 3.1,
  9: np.nan,
  10: np.nan,
  11: 31.0,
  12: np.nan,
  13: np.nan,
  14: 301.0,
  15: np.nan,
  16: np.nan,
  17: 15.0,
  18: np.nan,
  19: np.nan,
  20: 57.0,
  21: np.nan,
  22: np.nan,
  23: 21.0},
 'NDRE_50': {0: np.nan,
  1: np.nan,
  2: 33.0,
  3: np.nan,
  4: np.nan,
  5: 32.0,
  6: np.nan,
  7: np.nan,
  8: 33.0,
  9: np.nan,
  10: np.nan,
  11: 32.0,
  12: np.nan,
  13: np.nan,
  14: 302.0,
  15: np.nan,
  16: np.nan,
  17: 16.0,
  18: np.nan,
  19: np.nan,
  20: 58.0,
  21: np.nan,
  22: np.nan,
  23: 22.0},
 'NDRE_AVG': {0: np.nan,
  1: np.nan,
  2: 330.0,
  3: np.nan,
  4: np.nan,
  5: 33.0,
  6: np.nan,
  7: np.nan,
  8: 330.0,
  9: np.nan,
  10: np.nan,
  11: 33.0,
  12: np.nan,
  13: np.nan,
  14: 303.0,
  15: np.nan,
  16: np.nan,
  17: 17.0,
  18: np.nan,
  19: np.nan,
  20: 59.0,
  21: np.nan,
  22: np.nan,
  23: 32.0}}

df_test = pd.DataFrame(data_dict)


def generate_data_per_EM(df):
    data_survey = []
    for (survey,mission,trial,em),data in df.groupby(['Survey','Mission','Trial','Estimation Method']):               
        df_em = data.set_index('ID').dropna(axis=1)
        df_em.to_csv(f'tmp_data_{survey}_{mission}_{trial}_{em}.csv') #This generates 74 files, but not sure how to join/merge them
        data_survey.append(df_em)

    #Merge the df_em column-wise
    df_final = functools.reduce(lambda left, right: pd.merge(left, right, on=['ID','Survey','Mission','Trial']), data_survey)
    df_final.to_csv(f'final_{survey}_{mission}_{em}.csv') #Output is not what I expected

generate_data_per_EM(df_test) 

my data and the output


Solution

  • You need a groupby:

    (df_test
    .groupby(['ID', 'Plot', 'Survey', 'Trial','Mission'], as_index=False, sort=False)
    .first(numeric_only=True)
    
        ID Plot Survey Trial   Mission  MCARI2_sd  MACRI2_50  MACRI2_AVG  NDVI_sd  NDVI_50  NDVI_AVG  NDRE_sd  NDRE_50  NDRE_AVG
    0  id1   p1    Sv1    t1  mission1        1.5       12.4        15.0      2.9     21.0      27.0      3.1     33.0     330.0
    1  id1   p1    Sv1    t2  mission1       10.0       11.0        12.0     20.0     21.0      22.0     31.0     32.0      33.0
    2  id1   p1    Sv2    t1  mission1        1.5       12.4        15.0      2.9     21.0      27.0      3.1     33.0     330.0
    3  id1   p1    Sv2    t2  mission1       10.0       11.0        12.0     20.0     21.0      22.0      NaN      NaN       NaN
    4  id1   p1    Sv2    t2  mission2       72.0       74.0        77.0     32.0     39.0      40.0     31.0     32.0      33.0
    5  id1   p1    Sv1    t1  mission2      101.0      102.0       103.0    201.0    201.0     203.0    301.0    302.0     303.0
    6  id1   p1    Sv1    t2  mission2       23.5        2.1        24.0     11.0     12.0      13.0     15.0     16.0      17.0
    7  id1   p1    Sv2    t1  mission2      111.0      112.0       113.0    200.0    300.0     400.0     57.0     58.0      59.0