Search code examples
regexnestedregex-group

Parsing multiple json like objects in a file


I am trying to parse an F5 configuration object. It has the structure below.

Some Garbage text
ltm virtual The_Name_51244_sdfasfdasd {
    address-status yes
    enabled
    fallback-persistence none
    profiles {
        /Common/GLOBAL_PROFILE {
            context all
        }
        /Common/http {
            context all
        }
    }
    rate-class none
    rules {
        /Common/X-F-F
    }
    log-profiles none
    source-address-translation {
        pool SOME-SNAT-POOL
        type snat
    }
    source-port preserve
    vlans {
        Vlan1111
    }
    service-down-immediate-action none
    service-policy none
    source 0.0.0.0/0
}
barbage text
ltm virtual The_Object_51244 {
    address-status yes
    enabled
    fallback-persistence none
    profiles {
        /Common/GLOBAL_PROFILE {
            context all
        }
        /Common/http {
            context all
        }
    }
    rate-class none
    rules {
        /Common/X-F-F
    }
    log-profiles none
    source-address-translation {
        pool SOME-SNAT-POOL
        type snat
    }
    source-port preserve
    vlans {
        Vlan2222
    }
    service-down-immediate-action none
    service-policy none
    source 0.0.0.0/0
}
Trailing garbage text

My regex is ((ltm virtual) ([a-zA-Z0-9_-]*) {(.|\n)*?}) https://regex101.com/r/ATJZys/1

What I am trying to capture is the name after ltm virtual and everything between the curly brackets. However, the above regex stops at the first closing curly bracket it finds.

How do I extend it to match until the next parent group, or until the curly brackets are balanced?


Solution

  • You can use

    ((ltm virtual) ([a-zA-Z0-9_-]*) ({(?:[^{}]++|\g<4>)*}))
    

    See the regex demo.

    Details:

    • ( - Group 1 start
      • (ltm virtual) - Group 2: ltm virtual string
      • (a single space character) - matches a literal space
      • ([a-zA-Z0-9_-]*) - Group 3: zero or more letters, digits, underscores or hyphens (can be re-written as [\w-]*)
      • (a single space character) - matches a literal space
      • ({(?:[^{}]++|\g<4>)*}) - Group 4: {, then zero or more repetitions of either one or more chars other than { and } or Group 4 recursed, and then a } char
    • ) - end of Group 1.

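    Note that the regex above relies on recursion (\g<4>), which is supported by PCRE-style engines but not by java.util.regex. In Java, to match nested curly braces, parentheses, or any other single-character delimiters that follow a certain regex pattern (more precisely, that come at the end of the pattern), you can match the prefix with a regex and then scan for the balanced closing delimiter manually: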

    import java.util.*;
    import java.util.regex.*;
    
    /**
     * Demo: extracts every {@code ltm virtual NAME { ... }} object from a text blob.
     *
     * <p>The header ({@code ltm virtual NAME {}) is located with a regular expression;
     * the matching closing delimiter is then found by counting nesting depth, because
     * {@code java.util.regex} cannot express balanced/recursive constructs.
     */
    class Test
    {
        /**
         * Runs the demo against a hard-coded sample and prints, for every header match,
         * the two header groups followed by the balanced body (or a failure notice when
         * the body is never closed).
         *
         * @param args unused
         */
        public static void main (String[] args)
        {
            final char markStart = '{';
            final char markEnd = '}';
            final boolean includeMarkers = true;
            final String s = "Some Garbage text\r\nltm virtual The_Name_51244_sdfasfdasd {\r\n    address-status yes\r\n    enabled\r\n    fallback-persistence none\r\n    profiles {\r\n        /Common/GLOBAL_PROFILE {\r\n            context all\r\n        }\r\n        /Common/http {\r\n            context all\r\n        }\r\n    }\r\n    rate-class none\r\n    rules {\r\n        /Common/X-F-F\r\n    }\r\n    log-profiles none\r\n    source-address-translation {\r\n        pool SOME-SNAT-POOL\r\n        type snat\r\n    }\r\n    source-port preserve\r\n    vlans {\r\n        Vlan1111\r\n    }\r\n    service-down-immediate-action none\r\n    service-policy none\r\n    source 0.0.0.0/0\r\n}\r\nbarbage text\r\nltm virtual The_Object_51244 {\r\n    address-status yes\r\n    enabled\r\n    fallback-persistence none\r\n    profiles {\r\n        /Common/GLOBAL_PROFILE {\r\n            context all\r\n        }\r\n        /Common/http {\r\n            context all\r\n        }\r\n    }\r\n    rate-class none\r\n    rules {\r\n        /Common/X-F-F\r\n    }\r\n    log-profiles none\r\n    source-address-translation {\r\n        pool SOME-SNAT-POOL\r\n        type snat\r\n    }\r\n    source-port preserve\r\n    vlans {\r\n        Vlan2222\r\n    }\r\n    service-down-immediate-action none\r\n    service-policy none\r\n    source 0.0.0.0/0\r\n}\r\nTrailing garbage text";
            // The header pattern ends with the (quoted) opening marker, so m.end() is the
            // index of the first character *inside* the block and m.end() - 1 is the marker.
            final Pattern patternBefore = Pattern.compile("(ltm\\s+virtual)\\s+([\\w-]+)\\s+" + Pattern.quote(String.valueOf(markStart)));

            final Matcher m = patternBefore.matcher(s);
            // Scan by index instead of repeatedly copying the tail of the input and
            // re-seating the matcher on the copy.
            int pos = 0;
            while (m.find(pos)) {
                System.out.println("Group 1: " + m.group(1));
                System.out.println("Group 2: " + m.group(2));

                final int bodyStart = m.end();
                final int closeIdx = findClosingIndex(s, bodyStart, markStart, markEnd);
                if (closeIdx < 0) {
                    // Unbalanced block: report it (without a misleading "Found nested: null"
                    // line) and keep looking for further headers after this one.
                    System.out.println("No nested parens match found, this match must be failed.");
                    pos = bodyStart;
                    continue;
                }

                final String res = includeMarkers
                        ? s.substring(bodyStart - 1, closeIdx + 1)
                        : s.substring(bodyStart, closeIdx);
                System.out.println("Found nested: " + res + "\n----");
                // Resume right after the closing marker so nested headers are not re-matched.
                pos = closeIdx + 1;
            }
        }

        /**
         * Returns the leading part of {@code s} up to the closing marker that balances an
         * opening marker assumed to sit immediately <em>before</em> {@code s}.
         *
         * @param s              text that starts right after an already-consumed opening marker
         * @param markStart      opening delimiter character (must not be {@code null})
         * @param markEnd        closing delimiter character (must not be {@code null})
         * @param includeMarkers when {@code true} the result is wrapped in the opening and
         *                       closing markers; when {@code false} only the inner text is returned
         * @return the balanced substring, or {@code null} when no balancing closing marker exists
         */
        public static String getBalancedSubstring(String s, Character markStart, Character markEnd, Boolean includeMarkers)
        {
            // Unbox once instead of on every character comparison.
            final char open = markStart;
            final char close = markEnd;

            final int closeIdx = findClosingIndex(s, 0, open, close);
            if (closeIdx < 0) {
                return null;
            }
            return includeMarkers
                    ? open + s.substring(0, closeIdx + 1)
                    : s.substring(0, closeIdx);
        }

        /**
         * Finds the index of the closing marker that balances one opening marker that was
         * consumed just before {@code from}.
         *
         * @return index of the balancing {@code close} character in {@code s}, or -1 if none
         */
        private static int findClosingIndex(String s, int from, char open, char close)
        {
            // Depth starts at 1: the opening marker preceding 'from' is already open.
            int depth = 1;
            for (int i = from; i < s.length(); i++) {
                final char c = s.charAt(i);
                if (c == open) {
                    depth++;
                }
                else if (c == close) {
                    depth--;
                    if (depth == 0) {
                        return i;
                    }
                }
            }
            return -1;
        }
    }
    

    See the Java demo online.