Search code examples
pythondictionary

How to parse a .blk text file into a dictionary?


I have a readable block file (.blk) that is converted to a .txt file (from War Thunder). I'd like to parse the contents so that they are easy to access in my Python script.

Here's a snippet from such a block file:

areas{
  spawn_zone{
    type:t="Sphere"
    tm:m=[[9.70537, 0, 0] [0, 9.70537, 0] [0, 0, 9.70537] [2881.52, 75.8896, 182.321]]
    objLayer:i=0

    props{}
  }
}

How can I parse it so that I can access the different parts in my script? The goal is to be able to type something along the lines of areas.spawn_zone.type and it would return "Sphere".

A block file has a lot of clauses but they are identifiable by name (areas{...}, units{...} etc.), so it needs to account for that.


Solution

  • Based on the documentation available and the example you gave, a plain Python parser for .blk files from the War Thunder game might look like this:

    # Sample .blk text used to demonstrate parse_blk below. It contains nested
    # blocks plus one value of each kind the parser converts: text (t),
    # matrix (m), integer (i), real (r), 4-component point (p4) and boolean (b).
    EXAMPLE = """
    areas{
      spawn_zone{
        type:t="Sphere"
        tm:m=[[9.7053, 0, 0] [0, 9.7053, 0] [0, 0, 9.7053] [2881.52, 75.8896, 182.321]]
        objLayer:i=0
        height:r=0.25
        line{ line:p4=115, +10000, 117, 0; move:b=no; thousandth:b=yes; }
      }
    }
    """
    
    
    def parse_blk(data: str, start: int = 0) -> (dict, int):
        from enum import Enum
        from itertools import islice
        from re import findall
    
        class States(Enum):
            ID_NEXT = 1
            ID = 2
            BLOCK_NEXT = 3
            TYPE_NEXT = 4
            TYPE = 5
            EQUALS_NEXT = 6
            VALUE_NEXT = 7
            VALUE = 8
    
        def unexpected():
            raise SyntaxError(f'Unexpected character #{i}: {ch}')
    
        def matrix(m: str) -> list | float:
            m = m.strip()
    
            if not m.startswith('[') or not m.endswith(']'):
                xs = m.split(',')
                if len(xs) > 1:
                    return [matrix(v) for v in xs]
                try:
                    v = float(m)
                    return v
                except ValueError:
                    raise SyntaxError(f'Invalid matrix format {s}')
    
            m = m[1:-1]
            return [matrix(v) for v in findall(r'\[([^]]+)]', m)]
    
        state = States.ID_NEXT
        s = ''
        _id = ''
        _type = ''
        result = {}
        enum_data = iter(enumerate(data))
        next(islice(enum_data, start, start), None)
        for i, ch in enum_data:
            match state:
                case States.ID_NEXT:
                    if ch.isalpha() or ch == '_':
                        s = ch
                        state = States.ID
                    elif ch.isspace():
                        pass
                    elif ch == '}':
                        return result, i
                    else:
                        unexpected()
                case States.ID:
                    if ch == ':':
                        _id = s
                        state = States.TYPE_NEXT
                    elif ch.isspace():
                        _id = s
                        state = States.BLOCK_NEXT
                    elif ch.isalpha() or ch == '_':
                        s += ch
                    elif ch == '{':
                        _id = s
                        result[_id], n = parse_blk(data, i + 1)
                        next(islice(enum_data, n, n), None)
                        state = States.ID_NEXT
                    else:
                        unexpected()
                case States.BLOCK_NEXT:
                    if ch == '{':
                        result[_id], n = parse_blk(data, i + 1)
                        next(islice(enum_data, n, n), None)
                        state = States.ID_NEXT
                    elif ch.isspace():
                        pass
                    else:
                        unexpected()
                case States.TYPE_NEXT:
                    if ch.isalpha():
                        s = ch
                        state = States.TYPE
                    elif ch.isspace():
                        pass
                    else:
                        unexpected()
                case States.TYPE:
                    if ch.isalnum():
                        s += ch
                    elif ch == '=':
                        _type = s
                        if _type not in ['i', 'r', 't', 'b', 'm', 'p2', 'p3', 'p4']:
                            raise ValueError(f'Unknown type {_type}')
                        state = States.VALUE_NEXT
                    elif ch.isspace():
                        _type = s
                        state = States.EQUALS_NEXT
                    else:
                        unexpected()
                case States.EQUALS_NEXT:
                    if ch == '=':
                        state = States.VALUE_NEXT
                    elif ch.isspace():
                        pass
                    else:
                        unexpected()
                case States.VALUE_NEXT:
                    if ch.isalnum() or ch in '"[+-':
                        s = ch
                        state = States.VALUE
                    elif ch.isspace():
                        pass
                    else:
                        unexpected()
                case States.VALUE:
                    if ch in [';', '\n']:
                        state = States.ID_NEXT
                        result[_id] = s
                        match _type:
                            case 'i':
                                result[_id] = int(s)
                            case 'r':
                                result[_id] = float(s)
                            case 't':
                                result[_id] = s
                            case 'b':
                                if s not in ['yes', 'true', 'no', 'false']:
                                    raise ValueError(f'Unknown boolean value {s}')
                                result[_id] = s in ['yes', 'true']
                            case 'm':
                                result[_id] = matrix(s)
                            case 'p2' | 'p3' | 'p4':
                                result[_id] = tuple(float(v) for v in s.split(','))
                                if (r := len(result[_id])) != (e := int(_type[1])):
                                    raise ValueError(
                                        f'Expected {e} values, got {r}')
                            case '_':
                                raise SyntaxError(f'Unknown type {_type}')
                    elif ch.isalnum() or ch.isspace() or ch in '_"[].,+-':
                        s += ch
                    elif ch == '}':
                        result[_id] = s
                        return result, i
                    else:
                        unexpected()
                case _:
                    raise SyntaxError(f'Unknown state {state}')
        return result, len(data)
    
    
    # parse_blk hands back the parsed dictionary together with the position
    # at which parsing stopped; only the dictionary is needed here
    parsed, _ = parse_blk(EXAMPLE)
    print(parsed)
    spawn_zone = parsed['areas']['spawn_zone']
    print(spawn_zone['type'])
    

    The output:

    {'areas': {'spawn_zone': {'type': '"Sphere"', 'tm': [[9.7053, 0.0, 0.0], [0.0, 9.7053, 0.0], [0.0, 0.0, 9.7053], [2881.52, 75.8896, 182.321]], 'objLayer': 0, 'height': 0.25, 'line': {'line': (115.0, 10000.0, 117.0, 0.0), 'move': False, 'thousandth': True}}}}
    "Sphere"
    

    Note that I added some data to the example to show the other types documented for the format - I know you don't need all of them, but someone else may be looking to read .blk files from War Thunder with Python as well.

    Note that I named _id and _type with an underscore because using id and type would shadow the built-in functions of the same name (they are built-ins, not keywords), but I feel those are the appropriate names to use here, so I used the underscore versions. You could name them key and t, if you don't like that.

    And in case you're wondering - an LLM like ChatGPT doesn't do great on parsing a file like this, although it would allow you to solve the specific problem you wanted solved quite well. But I do use GitHub Copilot in my editor, so writing a parser like this is actually not a lot of work - it pretty much writes itself if you guide it in the right direction.