Search code examples
pythonnumpydata-cleaning

Python clean text file to make it searchable


I have a very messy text file that consists of both comma- and space-separated data and looks like the following:

NBLOCK,3,,13
(1i9,3e20.9e3)
        1     4.000000000E+01    -6.000000000E+01     0.000000000E+00
        2     4.000000000E+01     6.000000000E+01     0.000000000E+00
        3     4.000000000E+01    -2.000000000E+01     0.000000000E+00
        4     4.000000000E+01     2.000000000E+01     0.000000000E+00

I need to clean up the file such that I get an easily searchable 2d array using the following rules:

  • make letters lower case
  • delete repeating spaces
  • replace spaces by commas
  • delete comma if line is starting with a comma
  • convert to numpy 2d array

The output should look something like the following

my_array = [['nblock','3','','13'],
['(1i9','3e20.9e3)','',''],
['1','4.000000000e+01','-6.000000000e+01','0.000000000e+00'],
['2','4.000000000e+01','6.000000000e+01','0.000000000e+00'],
['3','4.000000000e+01','-2.000000000e+01','0.000000000e+00'],
['4','4.000000000e+01','2.000000000e+01','0.000000000e+00']]

Update: a more realistic example of the text file:

ET,       1, 42
KEYOP,   1, 2,        1
KEYOP,   1, 3,        3
RLBLOCK,       1,       1,       6,       7
N,R5.3,LOC, -1,
NBLOCK,3,,13
(1i9,3e20.9e3)
1     4.000000000E+01    -6.000000000E+01     0.000000000E+00
        2     4.000000000E+01     6.000000000E+01     0.000000000E+00
        3     4.000000000E+01    -2.000000000E+01     0.000000000E+00
        4     4.000000000E+01     2.000000000E+01     0.000000000E+00
-1
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,EX  ,       1, 1, 2.100000000E+11,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,NUXY,       1, 1, 0.300000000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,DENS,       1, 1,  7800.00000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,PRXY,       1, 1, 0.300000000    ,
EXTOPT,ATTR,  0,  0,  0
EXTOPT,ESIZE,  0,  0.0000    
EXTOPT,ACLEAR,  0
BFUNIF,TEMP,_TINY

Another example:

DMPOPT,EMAT,NO
*IF,_CDRDOFF,EQ,1,THEN     !if solid model was read in
_CDRDOFF=             !reset flag, numoffs already performed
*ELSE              !offset database for the following FE model
NUMOFF,NODE,       12
NUMOFF,ELEM,        8
NUMOFF,MAT ,        1
NUMOFF,REAL,        1
NUMOFF,TYPE,        2
NUMOFF,CSYS,       12
*ENDIF
KUSE,     0
TIME,  0.00000000
NBLOCK,6,SOLID,        12,        12
(3i9,6e21.13e3)
        1        0        0 4.0000000000000E+001-6.0000000000000E+001
        2        0        0 4.0000000000000E+001 6.0000000000000E+001
        3        0        0 4.0000000000000E+001-2.0000000000000E+001
        4        0        0 4.0000000000000E+001 2.0000000000000E+001   
EBLOCK,19,SOLID,    1250
(19i8)
       1       1       1       1
       1       1       1       1       
       1       1       1       1
-1
N,R5.3,LOC,     -1,
MPDATA,R5.0, 1,EX  ,       1, 1, 2.100000000E+11,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,NUXY,       1, 1, 0.300000000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,

MPDATA,R5.0, 1,DENS,       1, 1,  7800.00000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,PRXY,       1, 1, 0.300000000    

Solution

  • Consider this kind of solution:

    import numpy as np
    from pprint import pprint
    
    class Block:
        """Collect the lines of a command-style text file into a dict.

        Lines are classified by their first character:

        * ``A``-``Z``: a comma-separated command line. The first field,
          lower-cased, is the key; the remaining fields are stored as one
          argument list under that key.
        * ``(``: a format line. It is stored under ``'nblock'`` and followed
          by an empty list that receives the data rows.
        * anything else: a whitespace-separated data row (or the ``-1``
          terminator, which is ignored).
        """

        def __init__(self):
            # Lower-cased keyword -> list of argument lists, one per occurrence.
            self.data = {}
            # Not filled by ingest(); kept so existing attribute access still works.
            self.array = []

        def ingest(self, lines):
            """Parse *lines* and accumulate the result in ``self.data``.

            Args:
                lines: any iterable of strings (an open text file, a list of
                    lines, ...). Trailing newlines are tolerated.

            Raises:
                KeyError: if a format line or data row appears before any
                    line whose keyword is ``NBLOCK``.
                ValueError: if a field of a data row cannot be converted
                    to ``float``.
            """
            for line in lines:
                # Blank or whitespace-only lines carry no data, and indexing
                # into them below would raise IndexError.
                if not line.strip():
                    continue

                if 'A' <= line[0] <= 'Z':
                    parts = [k.strip() for k in line.split(',')]
                    # Only unsigned digit strings become ints; everything else
                    # (floats, negatives, empty fields) stays a string.
                    parts = [int(k) if k.isdigit() else k for k in parts]
                    self.data.setdefault(parts[0].lower(), []).append(parts[1:])
                elif line[0] == '(':
                    self.data['nblock'].append([line.strip().split(',')])
                    # Container for the data rows that follow the format line.
                    self.data['nblock'].append([])
                else:
                    parts = line.split()
                    # A lone -1 marks the end of the data rows.
                    if parts[0] != '-1':
                        # The first column is dropped; the remaining columns
                        # are stored as floats.
                        self.data['nblock'][2].append([float(k) for k in parts[1:]])
    
    # Parse x.txt and pretty-print the resulting dictionary.
    blk = Block()
    # The context manager closes the file handle once parsing is finished.
    with open('x.txt') as f:
        blk.ingest(f)
    pprint(blk.data)
    

    Output:

    {'bfunif': [['TEMP', '_TINY']],
     'et': [[1, 42]],
     'extopt': [['ATTR', 0, 0, 0], ['ESIZE', 0, '0.0000'], ['ACLEAR', 0]],
     'keyop': [[1, 2, 1], [1, 3, 3]],
     'mpdata': [['R5.0', 1, 'EX', 1, 1, '2.100000000E+11', ''],
                ['R5.0', 1, 'NUXY', 1, 1, '0.300000000', ''],
                ['R5.0', 1, 'DENS', 1, 1, '7800.00000', ''],
                ['R5.0', 1, 'PRXY', 1, 1, '0.300000000', '']],
     'mptemp': [['R5.0', 1, 1, '0.00000000', ''],
                ['R5.0', 1, 1, '0.00000000', ''],
                ['R5.0', 1, 1, '0.00000000', ''],
                ['R5.0', 1, 1, '0.00000000', '']],
     'n': [['R5.3', 'LOC', '-1', '']],
     'nblock': [[3, '', 13],
                [['(1i9', '3e20.9e3)']],
                [[40.0, -60.0, 0.0],
                 [40.0, 60.0, 0.0],
                 [40.0, -20.0, 0.0],
                 [40.0, 20.0, 0.0]]],
     'rlblock': [[1, 1, 6, 7]]}
    

    Note that `ingest` is given an open file here, but you could also pass a list of strings. Anything that iterates over lines of text will work.