Search code examples
python arrays json dictionary txt

From text file to JSON file with python


Suppose I have a txt file that looks like this (indentation is 4 spaces):

key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3

I want to convert it into any VALID json, like this one:

{
'key1':'value1',
'key2': {
    'key2_1':'value2_1',
    'key2_2':{
        'key2_2_1':'value2_2_1'
        },
    'key2_3':['value2_3_1','value2_3_2','value2_3_3']
    },
'key3':['value3_1','value3_2','value3_3']
}

I have tried this (which I got from another post):

# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line plus a deeper-indented value line.

    Each line containing ``=`` is split at the first ``=``. The text before it
    is emitted as its own line (keeping its leading whitespace), and the text
    after it is emitted on the following line, indented four spaces deeper
    than the key. Lines without ``=`` are passed through unchanged.

    Args:
        inputString: Iterable of text lines, e.g. the list returned by
            ``readlines()``. It is not modified.

    Returns:
        One string made of all resulting lines concatenated together.
    """
    indentVal = "    "
    outputLines = []
    for eachLine in inputString:
        key, separator, value = eachLine.partition("=")
        if not separator:
            outputLines.append(eachLine)
            continue
        # Use only the leading whitespace of the key as the current indent, so
        # runs of spaces inside the key or value cannot inflate the depth.
        leading = key[:len(key) - len(key.lstrip())]
        outputLines.append(key + '\n')
        # The value line is appended to a separate list and never re-scanned,
        # so a value that itself contains "=" stays on a single line.
        outputLines.append(leading + indentVal + value)
    return "".join(outputLines)

# helper class for node usage
class Node:
    """One line of indented text together with the lines nested beneath it.

    Attributes:
        children: Child ``Node`` objects in input order.
        level: Number of leading whitespace characters on the source line.
        text: The source line with surrounding whitespace removed.
    """

    def __init__(self, indented_line):
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach the leading run of ``nodes`` that belongs beneath this node.

        Nodes are consumed from the front of ``nodes`` in place. Processing
        stops at the first node that is not deeper than this node; that node
        and everything after it stay in the list for the caller.

        Args:
            nodes: List of ``Node`` objects in document order. May be empty.
        """
        if not nodes:
            return
        childlevel = nodes[0].level

        while nodes:
            node = nodes[0]
            if node.level == childlevel:  # add node as a child
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:  # grandchildren of the last child
                self.children[-1].add_children(nodes)
            elif node.level > self.level:
                # Dedented, but still deeper than this node: keep it as a
                # child at the shallower level instead of discarding it.
                childlevel = node.level
            else:  # this node is a sibling or ancestor's child, no more children
                return

    def as_dict(self):
        """Convert this node and its descendants into plain Python containers.

        Returns:
            ``{text: [child.as_dict(), ...]}`` when there are several children,
            ``{text: child.as_dict()}`` when there is exactly one child, and
            the bare ``text`` string for a node without children.
        """
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text

# process our file here
with open(filename, 'r') as fh:
    fileContent = fh.readlines()
    # convert equals signs to indentation
    fileParse = convertIndentation(fileContent)
    # one Node per non-blank line, gathered under a synthetic 'root' node
    lineNodes = [Node(line) for line in fileParse.splitlines() if line.strip()]
    root = Node('root')
    root.add_children(lineNodes)
    d = root.as_dict()['root']
    # this variable is storing the json output
    jsonOutput = json.dumps(d, sort_keys=False, indent=4)
    print(jsonOutput)

which yields the following:

[
    {
        "key1": "value1"
    },
    {
        "key2": [
            {
                "key2_1": "value2_1"
            },
            {
                "key2_2": {
                    "key2_2_1": "value2_2_1"
                }
            },
            {
                "key2_3": "value2_3_1,value2_3_2,value2_3_3"
            },
        ]
    },
    {
        "key3": "value3_1,value3_2,value3_3"
    }
]

Yet this is still not a valid JSON file.

When I try to open the output file using 'json' module, I get this predictable message: "JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)".

# Read the JSON file back from disk; json.load raises JSONDecodeError when the
# file content is not valid JSON.
with open(r'C:\Users\nigel\OneDrive\Documents\LAB\lean\sample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
    data = json.load(read_file)

output:

JSONDecodeError                           Traceback (most recent call last)
Input In [2], in <cell line: 1>()
      1 with open(r'C:\Users\nigel\OneDrive\Documents\LAB\lean\sample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
----> 2     data = json.load(read_file)

File ~\Anaconda3\lib\json\__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
    275         parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
    276     """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
    277     a JSON document) to a Python object.
    278 
   (...)
    291     kwarg; otherwise ``JSONDecoder`` is used.
    292     """
--> 293     return loads(fp.read(),
    294         cls=cls, object_hook=object_hook,
    295         parse_float=parse_float, parse_int=parse_int,
    296         parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)

File ~\Anaconda3\lib\json\__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    341     s = s.decode(detect_encoding(s), 'surrogatepass')
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:
    348     cls = JSONDecoder

File ~\Anaconda3\lib\json\decoder.py:337, in JSONDecoder.decode(self, s, _w)
    332 def decode(self, s, _w=WHITESPACE.match):
    333     """Return the Python representation of ``s`` (a ``str`` instance
    334     containing a JSON document).
    335 
    336     """
--> 337     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338     end = _w(s, end).end()
    339     if end != len(s):

File ~\Anaconda3\lib\json\decoder.py:353, in JSONDecoder.raw_decode(self, s, idx)
    344 """Decode a JSON document from ``s`` (a ``str`` beginning with
    345 a JSON document) and return a 2-tuple of the Python
    346 representation and the index in ``s`` where the document ended.
   (...)
    350 
    351 """
    352 try:
--> 353     obj, end = self.scan_once(s, idx)
    354 except StopIteration as err:
    355     raise JSONDecodeError("Expecting value", s, err.value) from None

JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)

The reason is that JSON expects to find keys (strings enclosed in double quotes) when it actually finds json objects (nested dictionaries) in their places. That is it!

I truly appreciate any comments. Best,

Nigel


Solution

  • An aside for readers who land on this page: I could not reproduce the error that the OP posted. It is very unlikely that json.dumps() would output invalid JSON.

    Splitting The Strings Into Lists

    I am assuming per your comment that you mean that you want to take your strings, for example, this line key2_3=value2_3_1,value2_3_2,value2_3_3 and break these values up into "key2_3": ["value2_3_1", "value2_3_2", "value2_3_3"].

    To do so, you'd have to make the following adjustment to the code provided to you:

    def as_dict(self):
        """Convert this node and its descendants into plain dicts, lists and strings."""
        if len(self.children) > 1:
            # several children: this node's text maps to a list of their conversions
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            # exactly one child: this node's text maps directly to that child's conversion
            return {self.text: self.children[0].as_dict()}
        else:
            # leaf: split on commas, so the result is always a list (even for a single value)
            return self.text.split(",") # was self.text
    

    Dictionaries of Dictionaries Instead of Lists

    To make the output a dictionary of dictionaries whose leaf values may be lists, i.e. {k1: {k2: [1, 2, 3]}} and the like, we have to make two changes.

    1. Update the as_dict method to use {} instead of [].
    2. Include a function to compress keys.

    When I was doing this, I had a hard time producing the correct data structure: without the key compression, the output looks like {k1: {k1: {k2: {k2: value}}}}. You can see this by replacing d = compress(root.as_dict()['root']) with d = root.as_dict()['root'] in the code. So the as_dict method went from

    def as_dict(self):
        """Convert this node and its descendants into plain dicts, lists and strings."""
        if len(self.children) > 1:
            # several children: this node's text maps to a list of their conversions
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            # exactly one child: this node's text maps directly to that child's conversion
            return {self.text: self.children[0].as_dict()}
        else:
            # leaf: a comma-separated text becomes a list, anything else stays a string
            return self.text.split(",") if "," in self.text else self.text
    

    to:

    def as_dict(self):
        """Convert this node and its descendants into plain dicts, lists and strings."""
        if len(self.children) > 1:
            # several children: this node's text maps to a dict keyed by each child's text.
            # A child that has children of its own returns a dict keyed by its text again,
            # so that key appears twice in a row in the result.
            return {self.text: {node.text: node.as_dict() for node in self.children}}
        elif len(self.children) == 1:
            # exactly one child: this node's text maps directly to that child's conversion
            return {self.text: self.children[0].as_dict()}
        else:
            # leaf: a comma-separated text becomes a list, anything else stays a string
            return self.text.split(",") if "," in self.text else self.text
    

    Then I included the compress function:

    # for merging like sub keys and values
    def compress(dictionary):
        """Collapse repeated ``{key: {key: value}}`` nesting, in place.

        Wherever a dict value is itself a dict whose only key equals the key it
        is stored under, the inner level is removed so ``{k: {k: v}}`` becomes
        ``{k: v}``. The function then recurses into the resulting value.

        A nested dict that holds other keys besides the repeated one is left
        as it is, so its sibling entries are never discarded.

        Args:
            dictionary: The structure to compress. Non-dict values are
                returned unchanged.

        Returns:
            The same ``dictionary`` object, modified in place.
        """
        if not isinstance(dictionary, dict):
            return dictionary
        # Iterate over a snapshot of the keys because values are reassigned below.
        for k in list(dictionary):
            v = dictionary[k]
            if isinstance(v, dict):
                if len(v) == 1 and k in v:
                    v = dictionary[k] = v[k]
                compress(v)
        return dictionary
    

    Full Code

    If you put the below in a file and run it from the command line, it should work 100%. Otherwise it's probably a problem with Anaconda or version of Python (though that doesn't really seem likely).

    from io import StringIO
    import json
    
    # for merging like sub keys and values
    def compress(dictionary):
        """Collapse repeated ``{key: {key: value}}`` nesting, in place.

        Wherever a dict value is itself a dict whose only key equals the key it
        is stored under, the inner level is removed so ``{k: {k: v}}`` becomes
        ``{k: v}``. The function then recurses into the resulting value.

        A nested dict that holds other keys besides the repeated one is left
        as it is, so its sibling entries are never discarded.

        Args:
            dictionary: The structure to compress. Non-dict values are
                returned unchanged.

        Returns:
            The same ``dictionary`` object, modified in place.
        """
        if not isinstance(dictionary, dict):
            return dictionary
        # Iterate over a snapshot of the keys because values are reassigned below.
        for k in list(dictionary):
            v = dictionary[k]
            if isinstance(v, dict):
                if len(v) == 1 and k in v:
                    v = dictionary[k] = v[k]
                compress(v)
        return dictionary
    
    # helper method to convert equals sign to indentation for easier parsing
    def convertIndentation(inputString):
        """Rewrite ``key=value`` lines as a key line plus a deeper-indented value line.

        Each line containing ``=`` is split at the first ``=``. The text before
        it is emitted as its own line (keeping its leading whitespace), and the
        text after it is emitted on the following line, indented four spaces
        deeper than the key. Lines without ``=`` are passed through unchanged.

        Args:
            inputString: Iterable of text lines, e.g. the list returned by
                ``readlines()``. It is not modified.

        Returns:
            One string made of all resulting lines concatenated together.
        """
        indentVal = "    "
        outputLines = []
        for eachLine in inputString:
            key, separator, value = eachLine.partition("=")
            if not separator:
                outputLines.append(eachLine)
                continue
            # Use only the leading whitespace of the key as the current indent,
            # so runs of spaces inside the key or value cannot inflate the depth.
            leading = key[:len(key) - len(key.lstrip())]
            outputLines.append(key + '\n')
            # The value line is appended to a separate list and never
            # re-scanned, so a value containing "=" stays on a single line.
            outputLines.append(leading + indentVal + value)
        return "".join(outputLines)
    
    
    
    # helper class for node usage
    class Node:
        """One line of indented text together with the lines nested beneath it.

        Attributes:
            children: Child ``Node`` objects in input order.
            level: Number of leading whitespace characters on the source line.
            text: The source line with surrounding whitespace removed.
        """

        def __init__(self, indented_line):
            self.children = []
            self.level = len(indented_line) - len(indented_line.lstrip())
            self.text = indented_line.strip()

        def add_children(self, nodes):
            """Attach the leading run of ``nodes`` that belongs beneath this node.

            Nodes are consumed from the front of ``nodes`` in place. Processing
            stops at the first node that is not deeper than this node; that
            node and everything after it stay in the list for the caller.

            Args:
                nodes: List of ``Node`` objects in document order. May be empty.
            """
            if not nodes:
                return
            childlevel = nodes[0].level
            while nodes:
                node = nodes[0]
                if node.level == childlevel:  # add node as a child
                    self.children.append(nodes.pop(0))
                elif node.level > childlevel:  # grandchildren of the last child
                    self.children[-1].add_children(nodes)
                elif node.level > self.level:
                    # Dedented, but still deeper than this node: keep it as a
                    # child at the shallower level instead of discarding it.
                    childlevel = node.level
                else:  # this node is a sibling or ancestor's child, no more children
                    return

        def as_dict(self):
            """Convert this node and its descendants into plain Python containers.

            Returns:
                For several children, ``{text: {child.text: child.as_dict()}}``
                with one entry per child. For exactly one child,
                ``{text: child.as_dict()}``. For a node without children, the
                text split on commas into a list when it contains a comma,
                otherwise the bare text string.
            """
            if len(self.children) > 1:
                return {self.text: {node.text: node.as_dict() for node in self.children}}
            elif len(self.children) == 1:
                return {self.text: self.children[0].as_dict()}
            else:
                return self.text.split(",") if "," in self.text else self.text
    
    if __name__ == "__main__":

        s = """
            key1=value1
            key2
                key2_1=value2_1
                key2_2
                    key2_2_1
                        key2_2_1_1=value2_2_1_1
                key2_3=value2_3_1,value2_3_2,value2_3_3
            key3=value3_1,value3_2,value3_3
        """

        # treat the sample text as a file and read it line by line
        fh = StringIO(s)
        fileContent = fh.readlines()
        # convert equals signs to indentation
        fileParse = convertIndentation(fileContent)
        # one Node per non-blank line, gathered under a synthetic 'root' node
        lineNodes = [Node(line) for line in fileParse.splitlines() if line.strip()]
        root = Node('root')
        root.add_children(lineNodes)
        d = compress(root.as_dict()['root'])
        # this variable is storing the json output
        jsonOutput = json.dumps(d, sort_keys=False, indent=4)

        # load the "file"
        f = StringIO(jsonOutput)
        loaded = json.load(f)

        # show the input text, the JSON text, and the object parsed back from it
        for shown in (s, jsonOutput, loaded):
            print(shown)