Search code examples
pythonparsingcomplex-networks

Python: how to parse and add contents to a text file


Hello, I have a network stored in a particular format, i.e. .gdf. It is a plain text file in the following format:

network:
nodedef>name VARCHAR,label VARCHAR
0,' 0 '
1,' 1 '
2,' 2 '
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333

where the first part refers to nodes and the second part to edges.

I want to read the file, add a feature (attribute) to the nodes, and return the following:

network:
nodedef>name VARCHAR,label VARCHAR, att1 VARCHAR
0,' 0 ', 'Paul'
1,' 1 ', 'Jack'
2,' 2 ', 'John'
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333

Solution

  • Here is some code that does the first half of what you asked for. It will parse the .GDF file and make the information available to you. Adding attributes and writing them is left as an exercise for the reader.

    import ast
    import collections
    import re
    
    
    def main():
        """Parse ``network.gdf`` and print every parsed record on its own line."""
        gdf = GDFParser()
        with open('network.gdf') as handle:
            gdf.read(handle)
        # Unpack the records so print() separates them with newlines.
        records = gdf.data
        print(*records, sep='\n')
    
    
    def pivot(iterable):
        """Turn an iterable of sized rows into a list of column lists.

        The n-th cell of every row is appended to the n-th column.  Columns
        are created on demand when a row is wider than any row seen so far;
        shorter rows simply contribute nothing to the trailing columns (no
        padding is inserted).
        """
        transposed = []
        for record in iterable:
            missing = len(record) - len(transposed)
            if missing > 0:
                # Grow the column list so every cell of this row has a home.
                transposed += [[] for _ in range(missing)]
            for index, value in enumerate(record):
                transposed[index].append(value)
        return transposed
    
    
    class GDFParser:
        """Line-oriented parser for ``.gdf`` network text files.

        Three kinds of lines are recognised:

        * a header such as ``network:`` (at most one per parser);
        * a definition such as ``nodedef>name VARCHAR,label VARCHAR`` that
          names a record type and declares its typed columns;
        * a data row such as ``0,' 0 '`` that is interpreted with the most
          recently seen definition.

        Blank lines are ignored.  Parsed rows are exposed through ``data`` as
        named tuples whose type name is the definition name (e.g. ``nodedef``).
        """

        # Raw strings keep ``\w`` / ``\s`` from being treated as (invalid)
        # string escape sequences.
        HEADER = re.compile(r'\w+:')
        # ``\s*`` after the comma tolerates "label VARCHAR, att1 VARCHAR".
        DEF = re.compile(
            r'\w+>\w+ (?:DOUBLE|VARCHAR)(?:,\s*\w+ (?:DOUBLE|VARCHAR))*')
        # Maps a declared column type to the callable used to convert a cell.
        CAST = dict(DOUBLE=float, VARCHAR=str)

        def __init__(self):
            """Create an empty parser with no header, definitions or data."""
            self.__header = None
            self.__type = []
            self.__data = []

        @property
        def header(self):
            """Header name without the trailing colon, or ``None`` if unset."""
            return self.__header

        @property
        def data(self):
            """All records parsed so far, as an immutable tuple snapshot."""
            return tuple(self.__data)

        def read(self, file):
            """Parse every line of *file*, an iterable of text lines.

            Raises:
                ValueError: on a second header line, or on a data row that
                    appears before any definition line.
            """
            for line in file:
                self.__read_line(line.strip())

        def __read_line(self, line):
            """Dispatch one stripped line to the matching handler."""
            if not line:
                # Blank lines carry no information; skipping them avoids
                # feeding an empty string to ast.literal_eval.
                return
            if self.HEADER.fullmatch(line):
                self.__process_header(line)
            elif self.DEF.fullmatch(line):
                self.__process_def(line)
            else:
                self.__process_data(line)

        def __process_header(self, line):
            """Store the header name; a parser accepts only one header."""
            if self.header:
                raise ValueError('header was previously set')
            self.__header = line[:-1]  # drop the trailing ':'

        def __process_def(self, line):
            """Register a record type from a ``name>col TYPE,...`` line."""
            name, fields = line.split('>')
            # DEF guarantees every field is exactly "<column> <TYPE>", so the
            # pairs transpose cleanly into column names and type names.
            columns, casts = zip(*(field.split() for field in fields.split(',')))
            self.__type.append((collections.namedtuple(name, columns),
                                tuple(self.CAST[cast] for cast in casts)))

        def __process_data(self, line):
            """Convert one data row using the most recent definition."""
            if not self.__type:
                raise ValueError('a definition must come before its data')
            kind, casts = self.__type[-1]
            values = ast.literal_eval(line)
            if not isinstance(values, (tuple, list)):
                # A single-column row evaluates to a bare scalar, not a tuple.
                values = (values,)
            self.__data.append(kind(*(cast(item)
                                      for cast, item in zip(casts, values))))
    
    
    # Run main() only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()