Search code examples
Tags: python, json, parsing, text

I want to extract all JSON objects from this text file and build a dictionary from each one. As you can see, the text contains nested objects: some keys have another JSON object as their value.


text = Autotune exists! Hoorah! You can use microbolus-related features. {"iob":0.121,
"activity":0.0079,
"basaliob":-1.447,
"bolusiob":1.568,
"netbasalinsulin":-1.9,
"bolusinsulin":6.5,
"time":"2022-12-25T21:17:45.000Z",

"iobWithZeroTemp":
{"iob":0.121,
"activity":0.0079,
"basaliob":-1.447,
"bolusiob":1.568,
"netbasalinsulin":-1.9,
"bolusinsulin":6.5,
"time":"2022-12-25T21:17:45.000Z"},
"lastBolusTime":1671999216000,

"lastTemp":
{"rate":0,
"timestamp":"2022-12-25T23:56:14+03:00",
"started_at":"2022-12-25T20:56:14.000Z",
"date":1672001774000,
"duration":22.52}}
# Regular expression pattern intended to match nested JSON objects.
# NOTE(review): the pattern begins right after a "{" (lookbehind), lazily
# consumes characters other than "{", and stops before the first "," or "}"
# (lookahead). Each match is therefore a fragment such as '"iob":0.121',
# not a complete, balanced JSON object.
pattern = r'(?<=\{)\s*[^{]*?(?=[\},])'


# Collect every non-overlapping match of the pattern in the text.
matches = re.findall(pattern, text)


# json.loads() expects a complete JSON document. Given a fragment like
# '"iob":0.121' it parses the 5-character string "iob" and then rejects the
# remainder, which is the "Extra data ... (char 5)" error.
parsed_objects = [json.loads(match) for match in matches]


# Print each successfully parsed object.
for obj in parsed_objects:
    print(obj)

Running this code raises the following error:

JSONDecodeError: Extra data: line 1 column 6 (char 5)


Solution

  • Here is an attempt to get all valid JSON dicts from text using JSONDecoder.raw_decode():

    text = """\
    text = Autotune exists! Hoorah! You can use microbolus-related features. {"iob":0.121,
    "activity":0.0079,
    "basaliob":-1.447,
    "bolusiob":1.568,
    "netbasalinsulin":-1.9,
    "bolusinsulin":6.5,
    "time":"2022-12-25T21:17:45.000Z",
    
    "iobWithZeroTemp":
    {"iob":0.121,
    "activity":0.0079,
    "basaliob":-1.447,
    "bolusiob":1.568,
    "netbasalinsulin":-1.9,
    "bolusinsulin":6.5,
    "time":"2022-12-25T21:17:45.000Z"},
    "lastBolusTime":1671999216000,
    
    "lastTemp":
    {"rate":0,
    "timestamp":"2022-12-25T23:56:14+03:00",
    "started_at":"2022-12-25T20:56:14.000Z",
    "date":1672001774000,
    "duration":22.52}}
    
    This is some other text with { not valid JSON }
    
    {"another valid JSON object": [1, 2, 3]}
    """
    
    import json
    
    
    def extract_json_objects(source, decoder=None):
        """Return every top-level JSON object embedded in *source*, in order.
    
        Each "{" is tried as the start of an object. After a successful
        decode, scanning resumes behind that object, so nested objects are
        returned only as part of their parent. A "{" that does not start
        valid JSON is skipped.
    
        Args:
            source: Arbitrary text that may contain JSON objects.
            decoder: Optional json.JSONDecoder to reuse; a default one is
                created when omitted.
    
        Returns:
            A list with the decoded objects; empty if none are found.
        """
        if decoder is None:
            decoder = json.JSONDecoder()
    
        found, pos = [], 0
        while True:
            # Always re-anchor on the next "{". Decoding from wherever the
            # previous object ended would also accept bare scalars or arrays
            # (e.g. the 2 in '{"a": 1}2'), which are not JSON objects.
            pos = source.find("{", pos)
            if pos == -1:
                return found
            try:
                # Passing the offset avoids copying the tail of the text on
                # every attempt; the returned end index is absolute.
                obj, pos = decoder.raw_decode(source, pos)
            except json.JSONDecodeError:
                # Not valid JSON at this brace: move past it and keep looking.
                pos += 1
            else:
                found.append(obj)
    
    
    decoder = json.JSONDecoder()
    decoded_objs = extract_json_objects(text, decoder)
    
    print(decoded_objs)
    

    Prints (reformatted here for readability; the actual output is a single line):

    [
        {
            "iob": 0.121,
            "activity": 0.0079,
            "basaliob": -1.447,
            "bolusiob": 1.568,
            "netbasalinsulin": -1.9,
            "bolusinsulin": 6.5,
            "time": "2022-12-25T21:17:45.000Z",
            "iobWithZeroTemp": {
                "iob": 0.121,
                "activity": 0.0079,
                "basaliob": -1.447,
                "bolusiob": 1.568,
                "netbasalinsulin": -1.9,
                "bolusinsulin": 6.5,
                "time": "2022-12-25T21:17:45.000Z",
            },
            "lastBolusTime": 1671999216000,
            "lastTemp": {
                "rate": 0,
                "timestamp": "2022-12-25T23:56:14+03:00",
                "started_at": "2022-12-25T20:56:14.000Z",
                "date": 1672001774000,
                "duration": 22.52,
            },
        },
        {"another valid JSON object": [1, 2, 3]},
    ]