Search code examples
java python python-3.x dataformat

java map string data to dictionary in python


I am getting the Java map string below from a data source.

{0={_shards={total=1, failed=0, successful=1, skipped=0}, hits={hits=[{_index=filebeat-7.10.0-2021.02.02-000001, _type=_doc, _source={input={type=log}, agent={hostname=ubuntu_fresh, name=ubuntu_fresh, id=879f36f2-4ade-47b6-a7b9-7972634c7b8c, type=filebeat, ephemeral_id=5676523f-bc61-4c12-b319-8b463348ba63, version=7.10.0}, @timestamp=2021-02-04T12:36:33.475Z, ecs={version=1.6.0}, log={file={path=/var/log/auth.log}, offset=46607}, service={type=system}, host={hostname=ubuntu_fresh, os={kernel=4.15.0-135-generic, codename=bionic, name=Ubuntu, family=debian, version=18.04.1 LTS (Bionic Beaver), platform=ubuntu}, containerized=false, ip=[10.0.2.15, fe80::a00:27ff:fe82:f598, 192.168.56.22, fe80::a00:27ff:fe32:fab0], name=ubuntu_fresh, id=cdfcdf6a39d44b98b2aa51700134f415, mac=[08:00:27:82:f5:98, 08:00:27:32:fa:b0], architecture=x86_64}, fileset={name=auth}, message=Feb 4 12:36:28 ubuntu_fresh sshd[2662]: Failed password for root from 192.168.56.1 port 35830 ssh2, error={message=Provided Grok expressions do not match field value: [Feb 4 12:36:28 ubuntu_fresh sshd[2662]: Failed password for root from 192.168.56.1 port 35830 ssh2]}, event={ingested=2021-02-04T12:36:39.482598548Z, timezone=+00:00, module=system, dataset=system.auth}}, _id=nNALbXcBbfKg8Fh6Zci7, _score=25.188179}], total={value=1, relation=eq}, max_score=25.188179}, took=1, timed_out=false}}

I don't have the privileges to convert it on the Java side. I have a Python application in which I want to access that data, so I want to convert the string into a Python dictionary.


Solution

  • The .toString() for Java Collections (Map, List, etc.) is lossy because it does not disambiguate delimiters. As such, there is no way to 100% reliably reconstruct the data-structure from the output of Map.toString(). However, if there are some constraints applied to the problem:

    1. the keys and values do not contain certain characters (approximately {}=[],")
    2. arrays do not contain a mixture of primitive values and objects/arrays

    then we can somewhat reliably transform the output of toString() to JSON, and then parse the JSON into a Python data-structure. I wouldn't use this code in production, but as long as you know it can break, it could be useful in certain cases:

    # Sample input: the Java Map.toString() text from the question, used below to exercise the converter.
    TEST_VALUE = "{0={_shards={total=1, failed=0, successful=1, skipped=0}, hits={hits=[{_index=filebeat-7.10.0-2021.02.02-000001, _type=_doc, _source={input={type=log}, agent={hostname=ubuntu_fresh, name=ubuntu_fresh, id=879f36f2-4ade-47b6-a7b9-7972634c7b8c, type=filebeat, ephemeral_id=5676523f-bc61-4c12-b319-8b463348ba63, version=7.10.0}, @timestamp=2021-02-04T12:36:33.475Z, ecs={version=1.6.0}, log={file={path=/var/log/auth.log}, offset=46607}, service={type=system}, host={hostname=ubuntu_fresh, os={kernel=4.15.0-135-generic, codename=bionic, name=Ubuntu, family=debian, version=18.04.1 LTS (Bionic Beaver), platform=ubuntu}, containerized=false, ip=[10.0.2.15, fe80::a00:27ff:fe82:f598, 192.168.56.22, fe80::a00:27ff:fe32:fab0], name=ubuntu_fresh, id=cdfcdf6a39d44b98b2aa51700134f415, mac=[08:00:27:82:f5:98, 08:00:27:32:fa:b0], architecture=x86_64}, fileset={name=auth}, message=Feb 4 12:36:28 ubuntu_fresh sshd[2662]: Failed password for root from 192.168.56.1 port 35830 ssh2, error={message=Provided Grok expressions do not match field value: [Feb 4 12:36:28 ubuntu_fresh sshd[2662]: Failed password for root from 192.168.56.1 port 35830 ssh2]}, event={ingested=2021-02-04T12:36:39.482598548Z, timezone=+00:00, module=system, dataset=system.auth}}, _id=nNALbXcBbfKg8Fh6Zci7, _score=25.188179}], total={value=1, relation=eq}, max_score=25.188179}, took=1, timed_out=false}}"
    
    def quote_value_array_values(match):
        """Wrap each element of a matched array body in double quotes.

        ``match`` is a regex match object whose full matched text is a list of
        elements separated by ", ". The return value is the same elements, each
        surrounded by double quotes and joined again with ", ". Intended for use
        as the replacement callback of ``re.sub``.
        """
        elements = match.group(0).split(", ")
        return ", ".join('"' + element + '"' for element in elements)
    
    def javastr_to_jsonstr(s):
        """Rewrite the text of a Java ``Map.toString()`` rendering as JSON text.

        The conversion is a fixed sequence of regex substitutions applied to
        ``s`` in order; the transformed string is returned. It is heuristic:
        Java's ``toString()`` output does not escape its delimiters, so keys or
        values containing characters such as ``{}=[],"`` can yield invalid or
        wrong JSON. Every scalar value is emitted as a JSON string.

        Relies on ``quote_value_array_values`` as the callback for the array
        pass.
        """
        import re

        # Pass 1: quote the elements of arrays that directly follow "=[" and
        # whose body contains no "{", "[" or "]" (i.e. arrays of plain values).
        s = re.sub(r"(?<==\[)[^{\[\]]+(?=\])", quote_value_array_values, s)

        quoted_pair = r'"\1":"\2"'
        quoted_key = r'"\1":'

        # Remaining passes, order matters: first key/value pairs whose value is
        # not an object or array (after "{" and then after ", "), then the keys
        # that are still unquoted (again after "{" and then after ", ").
        passes = (
            (r'(?<={)([^"=]+)[=:](?!{|\[)([^,}]+)', quoted_pair),
            (r'(?<=, )([^"=]+)[=:](?!{|\[)([^,}]+)', quoted_pair),
            (r'(?<={)([^"=]+)=(?!")', quoted_key),
            (r'(?<=, )([^"=]+)=(?!")', quoted_key),
        )
        for pattern, replacement in passes:
            s = re.sub(pattern, replacement, s)
        return s
    
    # Demo: convert the sample Java string to JSON text, parse it, and pretty-print the result.
    import json
    json_str = javastr_to_jsonstr(TEST_VALUE)
    json_obj = json.loads(json_str)  # raises json.JSONDecodeError if the conversion produced invalid JSON
    print(json.dumps(json_obj, indent=1))
    

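    Output (note that every scalar value, including numbers and booleans, comes back as a string, e.g. "1" and "false"):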

    {
     "0": {
      "_shards": {
       "total": "1",
       "failed": "0",
       "successful": "1",
       "skipped": "0"
      },
      "hits": {
       "hits": [
        {
         "_index": "filebeat-7.10.0-2021.02.02-000001",
         "_type": "_doc",
         "_source": {
          "input": {
           "type": "log"
          },
          "agent": {
           "hostname": "ubuntu_fresh",
           "name": "ubuntu_fresh",
           "id": "879f36f2-4ade-47b6-a7b9-7972634c7b8c",
           "type": "filebeat",
           "ephemeral_id": "5676523f-bc61-4c12-b319-8b463348ba63",
           "version": "7.10.0"
          },
          "@timestamp": "2021-02-04T12:36:33.475Z",
          "ecs": {
           "version": "1.6.0"
          },
          "log": {
           "file": {
            "path": "/var/log/auth.log"
           },
           "offset": "46607"
          },
          "service": {
           "type": "system"
          },
          "host": {
           "hostname": "ubuntu_fresh",
           "os": {
            "kernel": "4.15.0-135-generic",
            "codename": "bionic",
            "name": "Ubuntu",
            "family": "debian",
            "version": "18.04.1 LTS (Bionic Beaver)",
            "platform": "ubuntu"
           },
           "containerized": "false",
           "ip": [
            "10.0.2.15",
            "fe80::a00:27ff:fe82:f598",
            "192.168.56.22",
            "fe80::a00:27ff:fe32:fab0"
           ],
           "name": "ubuntu_fresh",
           "id": "cdfcdf6a39d44b98b2aa51700134f415",
           "mac": [
            "08:00:27:82:f5:98",
            "08:00:27:32:fa:b0"
           ],
           "architecture": "x86_64"
          },
          "fileset": {
           "name": "auth"
          },
          "message": "Feb 4 12:36:28 ubuntu_fresh sshd[2662]: Failed password for root from 192.168.56.1 port 35830 ssh2",
          "error": {
           "message": "Provided Grok expressions do not match field value: [Feb 4 12:36:28 ubuntu_fresh sshd[2662]: Failed password for root from 192.168.56.1 port 35830 ssh2]"
          },
          "event": {
           "ingested": "2021-02-04T12:36:39.482598548Z",
           "timezone": "+00:00",
           "module": "system",
           "dataset": "system.auth"
          }
         },
         "_id": "nNALbXcBbfKg8Fh6Zci7",
         "_score": "25.188179"
        }
       ],
       "total": {
        "value": "1",
        "relation": "eq"
       },
       "max_score": "25.188179"
      },
      "took": "1",
      "timed_out": "false"
     }
    }