Search code examples
pythonpython-3.xtuplestext-filesdata-files

How to add data fields in a tuple with calculations involved in python


The snippet below works with an old data format; however, I am trying to read an updated datasource.txt that has an additional data field. I tried using a regex but can't seem to get it working.

# Accumulate per-team totals from datasource.txt.
# `data` maps team -> (sum of a, sum of b, sum of c).
data = {}
with open('datasource.txt') as f:
    for line in f:
        parts = line.split()
        # NOTE(review): this line is where the snippet breaks on the data shown below.
        # Four names are unpacked, so a line that splits into exactly 5 tokens
        # raises ValueError (too many values to unpack). In the else branch,
        # `+ parts[-1]` concatenates a str onto a list, which raises TypeError.
        team, a, b, c = parts if len(parts) == 5 else parts[:-1] + ['($0)'] + parts[-1]
        # Convert the three fields to numbers (commas removed; c[2:-1] drops the
        # leading '($' and the trailing ')'), then add them element-wise to the
        # totals stored so far for this team (starting from (0, 0, 0)).
        data[team] = tuple(map(sum, zip((int(a), float(b.replace(',', '')), float(c[2:-1].replace(',', ''))), data.get(team, (0, 0, 0)))))

# Rebuild the dict so its insertion order is descending by (a, b, c, team).
data = {t: (a, b, c) for a, b, c, t in reversed(sorted((a, b, c, t) for t, (a, b, c) in data.items()))}

# Print one row per team: name padded to 8 columns, a padded to 4 columns,
# b and c with thousands separators, c wrapped as ($...).
for team, (a, b, c) in data.items():
    print(f'{team:8} {a:4} {b:,} (${c:,})')

datasource.txt

alpha 1 54,00.01 ABC DSW2S
bravo 3 500,000.00 ACDEF
charlie 1 27,722.29 ($250.45) DGAS-CAS
charlie 10 252,336,733.383 ($492.06) DGAS-CAS
delta 2 11 ($10) SWSDSASS-CCSSW
echo 5 143,299.00 ($101) ACS34S1
echo 8 145,300 ($125.01) ACS34S1
falcon 3 0.1234 DSS2SFS3
falcon 5 9.19 DSS2SFS3
lima 6 45.00181 ($38.9) FGF5GGD-DDD
romeo 12 980 ASDS SSSS SDSD

Expected Output:

echo       13 288,599.0 ($226.01)            ACS34S1
romeo      12 980.0 ($0.0)                   ASDS SSSS SDSD    
charlie    11 252,364,455.67299998 ($742.51) DGAS-CAS
falcon      8 9.3134 ($0.0)                  DSS2SFS3   
lima        6 45.00181 ($38.9)               FGF5GGD-DDD
bravo       3 500,000.0 ($0.0)               ACDEF    
delta       2 11.0 ($10.0)                   SWSDSASS-CCSSW
alpha       1 54,000.01 ($0.0)               ABC DSW2S

Solution

  • You can do that with pandas.

    • First, I did some preprocessing to get the data into a stable format: converting values to int/float, adding a placeholder ($0) value where that field is missing, joining the last column's values, etc.
    • Then I used pandas to group the rows and sum up the values.
    import pandas as pd

    # Parse every data line into a five-field row matching `headers` below.
    rows = []
    # Read the same file the question uses (datasource.txt).
    with open('datasource.txt') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Skip blank lines (e.g. a trailing empty line at the end of
                # the file); indexing parts[3] on them would raise IndexError.
                continue
            # Lines without a "($...)" field get a '0' placeholder in that
            # position so every row has the same columns.
            if not parts[3].startswith('($'):
                parts.insert(3, '0')
            # The last field may itself contain spaces; re-join everything
            # from index 4 onwards into a single value.
            if len(parts) > 5:
                parts[4:] = [' '.join(parts[4:])]
            # Convert the numeric fields: drop thousands separators and the
            # surrounding "($" / ")" before parsing.
            parts[1] = int(parts[1])
            parts[2] = float(parts[2].replace(',', ''))
            parts[3] = float(parts[3].strip('($)'))

            rows.append(parts)

    headers = ['col1', 'col2', 'col3', 'col4', 'col5']
    df = pd.DataFrame(rows, columns=headers)
    # Sum the numeric columns for each (col1, col5) pair.
    df = df.groupby(['col1', 'col5']).sum().reset_index()
    # Largest col2 total first.
    df = df.sort_values('col2', ascending=False)
    # Put the summed col4 value back into its "($...)" display form.
    df['col4'] = '($' + df['col4'].astype(str) + ')'
    # Restore the original column order (groupby moved col5 next to col1).
    df = df[headers]
    print(df)
    
    
          col1  col2          col3       col4            col5
    4     echo    13  2.885990e+05  ($226.01)         ACS34S1
    7    romeo    12  9.800000e+02     ($0.0)  ASDS SSSS SDSD
    2  charlie    11  2.523645e+08  ($742.51)        DGAS-CAS
    5   falcon     8  9.313400e+00     ($0.0)        DSS2SFS3
    6     lima     6  4.500181e+01    ($38.9)     FGF5GGD-DDD
    1    bravo     3  5.000000e+05     ($0.0)           ACDEF
    3    delta     2  1.100000e+01    ($10.0)  SWSDSASS-CCSSW
    0    alpha     1  5.400010e+03     ($0.0)       ABC DSW2S