Search code examples
pythonstringparsingmonadsparsec

Parsing string using parsec.py


I want to parse a string like this:

Object A -> Object B [AB_name] Object B -> Object C [BC_name] ...

My goal is to obtain three lists:

Parents = ['Object A', 'Object B', ...]

Children = ['Object B', 'Object C', ...]

PC_names = ['AB_name','BC_name', ...]

I already have a working solution, but it is incomprehensible and thus difficult to maintain, and it is not very robust. Basically, my code iterates over the string with two nested loops and appends the substrings to the respective lists.

To fix this I read about the parsec.py lib, but so far I couldn't find a good example for complete newbies like me. I already tried to figure out how it works with the help of other articles and the documentation but with little success by now.

I'm glad about every hint.

The test_input.txt:

Society {
    A man -> a child [was once]
    A man -> an automobile [has]
    A woman -> a person [is]
    A man -> a person [is]
    A man -> a child [was once] 
}

My current code:

from typing import List
from parsec import *

class Type:
    """A named type of the olog; declares a single ``label`` attribute."""

    label: str

class Aspect:
    """A labelled relation between a domain Type and a codomain Type."""

    domain: Type
    codomain: Type
    label: str

    def __init__(self) -> None:
        # Each aspect starts with its own fresh, still unlabelled end points;
        # ``label`` is only declared above and must be assigned by the caller.
        self.domain, self.codomain = Type(), Type()
    
class Olog:
    """A named collection of Aspect objects."""

    name: str
    aspects: List[Aspect]

    def __init__(self):
        # ``name`` is only declared above; the caller assigns it afterwards.
        self.aspects = list()

# Hand-rolled parser: reads the olog file and fills `first` (an Olog) with one
# Aspect per "domain -> codomain [label]" relation. The domain/codomain labels
# stored on each Aspect are integer positions into a running list of type names.
# NOTE(review): the accompanying text calls the sample file test_input.txt, but
# 'testinput.txt' is opened here — confirm which name is correct.
with open ('testinput.txt', 'r') as f:
        f_content = f.read()
        # Everything in front of the first "{" is used as the olog's name.
        olog_name = f_content.split("{")[0]
        first = Olog()
        first.aspects = []
        first.name = olog_name
        # Text between the first and the second "{" (normally the whole body).
        olog_data = f_content.split("{")[1]
        # Splitting on "]" yields one chunk per relation (without its closing
        # bracket); the last chunk holds whatever follows the final "]".
        olog_data_lines = olog_data.split(']')

        # Placeholder (the `str` type itself); set to "->" once an arrow is seen.
        orientation = str

        # counter1 counts the "->" tokens seen so far, counter2 the aspects
        # already stored. While they are equal the parser is reading a domain;
        # while they differ it is reading the codomain and label.
        counter1 = 0 
        counter2 = 0
        # Start out as empty strings but are assigned integer indexes below.
        domain_str = ''
        codomain_str = ''
        # Not referenced again within this block.
        type_comma = Type()
        type_comma.label = ","
        # Running ", "-separated list of every type name encountered so far.
        string_store = str
        string_store = ''
        type_store = Type()
        type_store_split = [Type]
        

        for lines in olog_data_lines:
            # Per-relation state, reset for every chunk.
            first_type = ''
            second_type = ''
            aspect_label = str
            first_T = Type()
            second_T = Type()
            # Put back the "]" that split() removed, then tokenise on whitespace.
            lines += ']'
            lines_split = lines.split()
            type_in_list = False

            for word in lines_split:
                # A closing brace before any arrow was seen means no relations.
                if word == "}" and counter1 == 0:
                        print("Olog is empty")
                # The closing brace ends the processing of this chunk.
                if word == "}":
                        print(">>>Olog was saved")
                        break

                if word == "->":

                    counter1 +=1
                # Still in front of the arrow: collect the words of the domain.
                # index() returns the position of the FIRST occurrence of the
                # word, so a repeat of the first token is treated as position 0.
                if counter1 == counter2 and lines_split.index(word) == 0:

                    first_type = word
                if counter1 == counter2 and not lines_split.index(word) == 0:
                    first_type = first_type + (" " + word)
                # Arrow reached: the domain name is complete, so record it and
                # work out its index in the running type list.
                if  word == "->": 
                        orientation = "->"
                        string_store = string_store + first_type + ", "

                        type_store.label = string_store
                        # Entries after the first keep the leading space left
                        # over from the ", " separator; the final entry is " ".
                        type_store_split = type_store.label.split(",")
                        
                        
                        for types in type_store_split: 
                                                
                            if types == first_type:

                                domain_str = int(type_store_split.index(types))
                                type_in_list = True
                                break
                        # No exact match: use the position of the entry that was
                        # just appended (the last one before the trailing " ").
                        if not type_in_list:

                            domain_str = int(len(type_store_split)-2)

                # Behind the arrow: collect the codomain and the bracketed label.
                if not counter1 == counter2:
                    if word[0] == "[":
                        # The label is the text between the first "[" and the
                        # following "]" of the whole chunk.
                        aspect_label = (lines.split('[', 1)[1].split(']')[0])
                    # Drops the "->" token that was appended on the arrow word.
                    # NOTE(review): label words that do not start with "[" also
                    # reach this branch and are appended to second_type —
                    # confirm whether that is intended.
                    else: second_type = second_type.replace('->','', 1) + " " + word
                    
                    # A word ending in "]" closes the relation.
                    if (word[len(word)-1]=="]"):
                        second_T.label = second_type
                        string_store = string_store + second_type + ", "
                        type_store.label = string_store
                        type_store_split = type_store.label.split(",")

                        # Same lookup as for the domain: index of an exact match,
                        # otherwise the position of the entry appended just now.
                        for types in type_store_split:               
                            if types == second_type:
                                codomain_str = int(type_store_split.index(types))
                                second_T.label = codomain_str
                                break
                            elif types == type_store_split[len(type_store_split)-1]:
                                codomain_str = int(len(type_store_split)-2)
                                second_T.label = codomain_str

                        # Store the finished relation; its end points carry the
                        # integer indexes computed above rather than the names.
                        aspect_A = Aspect()
                        aspect_A.label = aspect_label
                        aspect_A.domain = Type()
                        aspect_A.codomain = Type()
                        aspect_A.domain.label = domain_str
                        aspect_A.codomain.label = codomain_str
                        first.aspects.append(aspect_A)
                        counter2 += 1
                    
``


Solution

  • This solution uses re and recursion to parse the input lines and traverse the result, yielding back parents, children, and pc_names:

    import re, collections
    def parse_line(l):
        """Split one ``parent -> child [name]`` line into its tokens.

        The line is cut at every ``->`` arrow. Each piece is stripped and then
        tokenised into bracketed groups (``[...]``) and runs of word/whitespace
        characters.

        Returns a list holding one token list per arrow-separated piece, e.g.
        ``'A man -> a child [was once]'`` gives
        ``[['A man'], ['a child ', '[was once]']]``. The run in front of a
        bracketed group keeps its trailing whitespace.
        """
        # Raw strings stop the regex escapes from being read as (invalid)
        # string escapes, which newer Python versions warn about.
        pieces = re.split(r'\s*->\s*', l)
        return [re.findall(r'\[.*?\]|[\w\s]+', piece.strip()) for piece in pieces]
    
    # Parse every relation line of the input file. Lines containing a brace
    # (the "Society {" header and the closing "}") are skipped, and so are
    # blank lines, which would otherwise parse to a single empty token list
    # and break the two-value unpacking done on each entry later on.
    # The ``with`` block makes sure the file handle is closed again.
    with open('test_input.txt') as source:
        lines = [
            parse_line(i)
            for i in source
            if i.strip() and not re.search(r'[{}]', i)
        ]
    def get_vals(d, s=None):
        """Walk the parsed relations from ``d`` and yield ``(kind, value)`` pairs.

        ``d`` is one token list as produced by ``parse_line``: ``d[0]`` is the
        object name and, when there is more than one token, ``d[-1]`` is the
        bracketed relation name. ``s`` is the list of names already visited on
        the current path (defaults to an empty list). The relations themselves
        are read from the module-level ``lines``.

        Yields ``('pc_names', name)`` with the brackets removed,
        ``('parents', name)`` when ``d[0]`` occurs as the left side of at least
        one relation, and ``('children', name)`` otherwise.
        """
        # ``None`` sentinel instead of a mutable default list.
        seen = [] if s is None else s

        if len(d) > 1:
            # Strip the surrounding "[" and "]" from the relation name.
            yield ('pc_names', d[-1][1:-1])

        targets = [b for a, b in lines if d[0] == a[0]]
        if targets:
            yield ('parents', d[0])
            for target in targets:
                yield from get_vals(target, seen + [d[0]])
            return

        yield ('children', d[0])
        # Leaf reached: carry on with the first left-hand side that has not
        # been visited on this path yet, if there is one.
        unvisited = [a for a, _ in lines if a[0] not in seen]
        if unvisited:
            yield from get_vals(unvisited[0], seen + [d[0]])
    
    # Collect the yielded values per kind; sets drop the duplicates.
    result = collections.defaultdict(set)
    for kind, value in get_vals(lines[0][0]):
        result[kind].add(value)

    # Show each set as a plain list.
    print({kind: list(values) for kind, values in result.items()})
    

    Output:

    {'parents': ['A woman', 'A man'], 
     'pc_names': ['was once', 'is', 'has'], 
     'children': ['a person ', 'an automobile ', 'a child ']}
    

    Second test_input.txt content:

    Object A -> Object B [AB_name] 
    Object B -> Object C [BC_name]
    

    Result:

    {'parents': ['Object B', 'Object A'], 
     'pc_names': ['AB_name', 'BC_name'], 
     'children': ['Object B ', 'Object C ']}