Search code examples
python pyparsing

Parsing nested brace/bracket groups


I'm attempting to parse files that look like this:

MSH
[  PD1  ]
[{ ROL }]
[
  { ROL }
]
[
    {
        PR1
        [{ ROL }]
    }
]
[
    {
        IN1
        [  IN2  ]
        [{ IN3 }]
    }
]
[ ACC ]

Where:

  • Three consecutive alphanumeric characters represent a SEGMENT
  • [ SEGMENT ] represents an Optional Segment
  • { SEGMENT } represents a Repeating Segment
  • [{ SEGMENT }] represents an Optional Repeating Segment
  • Any of the above SEGMENT profiles can be grouped together in nesting Optional ([]) and/or Repeating ({}) GROUPS.
  • Examples of nested repeating groups are lines 4 - 19 in the code above.

The ideal result would be something like this:

    {
  "MSH": {
    "name": "placeholder",
    "opt": false,
    "rep": false,
    "description": "Plain Segment"
  },
  "PD1": {
    "name": "placeholder",
    "opt": true,
    "rep": false,
    "description": "Optional Segment"
  },
  // some segments here
  "group": {
    "opt": true,
    "rep": false,
    "description": "Optionals group placeholder text",
    "segment0": {
      "ROL": {
        "name": "placeholder",
        "opt": false,
        "rep": true,
        "description": "Repeating Segment"
      }
    }
  }
}

I've read most of the pyparsing posts on SO and the Pyparsing wiki, including the fourFn.py and regexinverter examples. I believe I need to use infixNotation, but I don't quite understand how to use it.

This is what I have so far:

# NOTE(review): `pp` and `data` are not defined in this snippet; presumably
# `import pyparsing as pp` and `data` holds the text of the file shown above.

# Punctuation that delimits optional ([...]) and repeating ({...}) items.
lbrack = pp.Literal("[")
rbrack = pp.Literal("]")
lbrace = pp.Literal("{")
rbrace = pp.Literal("}")
# A segment name: a word of exactly three alphanumeric characters.
segment = pp.Word(pp.alphanums,exact=3)
# The three decorated forms of a single segment: [SEG], {SEG} and [{SEG}].
optsegment = lbrack + segment + rbrack
repsegment = lbrace + segment + rbrace
optrepsegment = lbrack + lbrace + segment + rbrace + rbrack


# Any one segment form, tagged with a results name. `|` tries the
# alternatives left to right and takes the first that matches.
segments = (segment.setResultsName("RawSegment") |
           optsegment.setResultsName("OptionalSegment") |
           repsegment.setResultsName("RepeatingSegment") |
           optrepsegment.setResultsName("OptionalRepeatingSegment"))

# A group wraps exactly ONE `segments` match in brackets/braces. Nothing here
# refers back to the group expressions themselves, so the grammar is not
# recursive and cannot match groups that contain several items or other groups.
opt_group = pp.Group(lbrack + segments + rbrack)
rep_group = pp.Group(lbrace + segments + rbrace)

message = pp.Group(segments | opt_group | rep_group)

# NOTE(review): `expr` is built but never used below (the search runs on
# `message`). infixNotation is aimed at operator/operand expressions; the
# brackets here act as delimiters, so a recursive grammar (pp.Forward) is
# probably the better fit -- confirm against the pyparsing documentation.
expr = pp.infixNotation(message,
            [
            ('[', 2, pp.opAssoc.LEFT),
            ('{', 2, pp.opAssoc.LEFT),
            ('}', 1, pp.opAssoc.RIGHT),
            (']', 1, pp.opAssoc.RIGHT),
            ])

# Scan `data` for every non-overlapping match of `message`.
msg = message.searchString(data)

# Print each match found.
for item in msg:
    print(item)

I haven't hammered out the output format yet; I'm just trying to get the input parsed correctly at this point.


Solution

  • Here is the code with lark:

    import json
    import lark
    
    # Grammar for the message definition:
    #   * SIMPLE_SEGMENT is exactly three characters, each an underscore,
    #     a letter or a digit.
    #   * o_segment is a "[ ... ]" wrapper and r_segment a "{ ... }" wrapper;
    #     both contain `_segment`, so they can nest to any depth.
    #   * `_segment` is one or more segments/wrappers in sequence. Its name
    #     starts with an underscore, which makes lark inline its children into
    #     the parent rule instead of creating a tree node for it.
    #   * Whitespace (including newlines) is ignored between tokens.
    l = lark.Lark("""
    start: _segment
    SIMPLE_SEGMENT: ("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)
    o_segment: "["_segment"]"
    r_segment: "{"_segment"}"
    _segment: (SIMPLE_SEGMENT|o_segment|r_segment)+
    %import common.LETTER
    %import common.DIGIT
    %import common.WS
    %ignore WS
    """, parser='lalr') # using lalr as parser is better than the default parser
    
    
    class TreeTransformer(lark.Transformer):
        """Convert the parse tree into a list of JSON-serialisable dicts.

        ``o_segment`` and ``r_segment`` reduce every bracketed node to a
        ``(kind, payload)`` tuple, where ``kind`` is ``"opt"``, ``"rep"`` or
        ``"rep_opt"`` and ``payload`` is either a single segment token (a
        ``str``) or a tuple of child items (a group).  ``start`` then turns
        those items into dictionaries, recursing into groups.
        """

        @staticmethod
        def o_segment(content):
            """Reduce the children of a ``[ ... ]`` node to ``(kind, payload)``."""
            # "[{ ... }]" collapses into one optional-and-repeating item.
            if len(content) == 1 and isinstance(content[0], tuple) and content[0][0] == 'rep':
                return "rep_opt", content[0][1]
            # Only a lone plain segment is unwrapped.  A lone nested
            # (kind, payload) tuple stays inside a tuple of children; otherwise
            # ``start`` would iterate over the nested tuple itself and report
            # its kind string as if it were a segment.
            single_segment = len(content) == 1 and isinstance(content[0], str)
            return "opt", content[0] if single_segment else tuple(content)

        @staticmethod
        def r_segment(content):
            """Reduce the children of a ``{ ... }`` node to ``(kind, payload)``."""
            # Same unwrapping rule as in ``o_segment``.
            single_segment = len(content) == 1 and isinstance(content[0], str)
            return "rep", content[0] if single_segment else tuple(content)

        def start(self, content):
            """Return a list of dicts describing every item in ``content``.

            ``content`` is an iterable whose elements are either segment
            tokens (``str``) or ``(kind, payload)`` tuples produced by
            ``o_segment`` / ``r_segment``.  Groups are described recursively
            under the ``"segments"`` key.
            """
            out = []
            for item in content:
                if isinstance(item, str):
                    out.append({"name": "placeholder",
                                "opt": False,
                                "rep": False,
                                "description": "Plain Segment",
                                "token_name": item})
                    continue

                kind, payload = item
                opt = 'opt' in kind
                rep = 'rep' in kind
                # "Repeating" must depend on ``rep``; testing ``opt`` here
                # mislabels "[ X ]" as repeating and "{ X }" as not repeating.
                prefix = ("Optional " if opt else '') + ("Repeating " if rep else '')
                entry = {"name": "placeholder", "opt": opt, "rep": rep}
                if isinstance(payload, str):
                    entry["description"] = prefix + "Segment"
                    entry["token_name"] = payload
                else:
                    entry["description"] = prefix + "Group"
                    entry["segments"] = self.start(payload)
                out.append(entry)
            return out
    
    
    transformer = TreeTransformer()
    
    # Parse the sample message definition from the question.
    tree = l.parse("""
    MSH
    [  PD1  ]
    [{ ROL }]
    [
      { ROL }
    ]
    [
        {
            PR1
            [{ ROL }]
        }
    ]
    [
        {
            IN1
            [  IN2  ]
            [{ IN3 }]
        }
    ]
    [ ACC ]
    """)
    
    # Transform the tree (the result is whatever `start` returns) and
    # pretty-print it as JSON.
    print(json.dumps(transformer.transform(tree), indent=4))