Search code examples
pythondataframe

Import .dat file containing index, as pandas dataframe


I have a .dat file, containing column data separated by indices, for example here i show just 2 indices:

# t= 400000
1601 -0.207587E-02   0.454623E-03   0.193855E-02   0.996661E+00
1602 -0.202550E-02   0.447789E-03   0.191164E-02   0.996661E+00
1603 -0.197461E-02   0.440763E-03   0.188373E-02   0.996660E+00
1604 -0.192326E-02   0.433553E-03   0.185483E-02   0.996660E+00
1605 -0.187150E-02   0.426169E-03   0.182494E-02   0.996659E+00

 # t= 410000
1601 -0.207587E-02   0.454623E-03   0.193855E-02   0.996661E+00
1602 -0.202550E-02   0.447789E-03   0.191164E-02   0.996661E+00
1603 -0.197461E-02   0.440763E-03   0.188373E-02   0.996660E+00
1604 -0.192326E-02   0.433553E-03   0.185483E-02   0.996660E+00
1605 -0.187150E-02   0.426169E-03   0.182494E-02   0.996659E+00

I would like to import them as panda dataframes and give names at each column. Also, I want to keep the index information (probably in a new column). I have tried the following but it does not work

 import pandas as pd
 from io import StringIO
 df = pd.read_csv(StringIO(filename), 
                   sep="\t", 
                   skiprows=1, 
                   usecols=[0,1,2,3,4], 
                   names=['position','ux', 'uy', 'uz', 'rho'])
  
  print(df)

It has multiple entries. Each index "t" corresponds to different times of a simulation output. And at each time I have the 5 columns. I want to save the .dat file in the following dataframe form

t       pos       ux              uy            uz       rho
400000 1601 -0.207587E-02   0.454623E-03   0.193855E-02   
0.996661E+00
400000 1602 -0.202550E-02   0.447789E-03   0.191164E-02   
0.996661E+00 ....
410000 1603 -0.197461E-02   0.440763E-03   0.188373E-02   
0.996660E+00
410000 1604 -0.192326E-02   0.433553E-03   0.185483E-02   
0.996660E+00
410000  1605 -0.187150E-02   0.426169E-03   0.182494E-02 
0.996659E+00

like merging everything together.


Solution

  • Using your supplied dataset, below a trivial example , parse and transform to a format suited to simple conversion a df, save results as a csv. If you want to save the actual df ... then you chose the method ... parquet/pickled/json etc, I'll leave that to your discretion

    cat riri.py
    import sys
    import os
    import pandas as pd
    
    def parseInput(filename):
        data = []  # store rows for the DF in here
        simTime = None
    
        #
        # need to organise the input file to a simplifed format to transform
        #
        with open(filename, 'r') as file:
            for line in file:
                line = line.strip()             
                if line.startswith('# t='):     # extract value from the header
                    simTime = int(line.split('=')[1].strip())  # entries like # t=....
                elif line:                      # parse each row
                    cols = line.split()
                    pos = int(cols[0])  # Position
                    ux, uy, uz, rho = map(float, cols[1:])  # columns are all floats ... ux, uy, uz, rho
                    data.append([simTime, pos, ux, uy, uz, rho])
        return data
    
    def cvtToDf( parsedList ):
        df = pd.DataFrame(parsedList, columns=['t', 'pos', 'ux', 'uy', 'uz', 'rho'])
        return df
    
    input = ""
    output = ""
    
    if len(sys.argv) < 3:
        print(f'USAGE:python inputFile outputFile')
        sys.exit(1)
    
    input = sys.argv[1]
    
    output = sys.argv[2]
    
    print( f'processing {input}')
    
    df = cvtToDf( parseInput(input) )
    print(df)
    
    print( f'saving to {output}' )
    df.to_csv( output , index=False)
    
    #
    # execute
    # 
    python riri.py input.dat output.csv
    processing input.dat
            t   pos        ux        uy        uz       rho
    0  400000  1601 -0.002076  0.000455  0.001939  0.996661
    1  400000  1602 -0.002025  0.000448  0.001912  0.996661
    2  400000  1603 -0.001975  0.000441  0.001884  0.996660
    3  400000  1604 -0.001923  0.000434  0.001855  0.996660
    4  400000  1605 -0.001871  0.000426  0.001825  0.996659
    5  410000  1601 -0.002076  0.000455  0.001939  0.996661
    6  410000  1602 -0.002025  0.000448  0.001912  0.996661
    7  410000  1603 -0.001975  0.000441  0.001884  0.996660
    8  410000  1604 -0.001923  0.000434  0.001855  0.996660
    9  410000  1605 -0.001871  0.000426  0.001825  0.996659
    saving to output.csv
    
    
    cat output.csv
    t,pos,ux,uy,uz,rho
    400000,1601,-0.00207587,0.000454623,0.00193855,0.996661
    400000,1602,-0.0020255,0.000447789,0.00191164,0.996661
    400000,1603,-0.00197461,0.000440763,0.00188373,0.99666
    400000,1604,-0.00192326,0.000433553,0.00185483,0.99666
    400000,1605,-0.0018715,0.000426169,0.00182494,0.996659
    410000,1601,-0.00207587,0.000454623,0.00193855,0.996661
    410000,1602,-0.0020255,0.000447789,0.00191164,0.996661
    410000,1603,-0.00197461,0.000440763,0.00188373,0.99666
    410000,1604,-0.00192326,0.000433553,0.00185483,0.99666
    410000,1605,-0.0018715,0.000426169,0.00182494,0.996659