Search code examples
Tags: python-3.x, yaml, numpy-ndarray, ruamel.yaml

Implicit resolvers and robust representers for human-friendly tuple and np.array support in ruamel.yaml


I have a project where the user is expected to manually write a yaml file. This yaml file might have some of its entries formatted as tuples or numpy arrays. We distinguish tuples and lists internally in python to provide a convenient interface to the user, e.g. (1, 2, 3) is different from [1, 2, 3].

For convenience, I'd like the user to be able to enter a tuple directly using parentheses, like so: name: (1,2,3). I'd also like the user to be able to provide numpy arrays by entering something like other_name: np.array([1,2,3]). I know this won't preserve the exact numerical accuracy of the numpy arrays, but we determined that this is a fair compromise for the improved user experience.

I'm using ruamel.yaml, mainly because it preserves comments.

I managed to do something that works, but it does not feel "right" to me, especially the resolving part. There's basically no implicit resolver and I'm using a dirty eval instead. I did manage to find some information about implicit resolvers in ruamel.yaml online, on SO, and by rummaging through the source, but I could not really make sense of it.

Here's a minimal working example, with comments pointing out where I feel like the implementation is not robust or unclean.

import sys
import numpy as np
import ruamel.yaml


def _tupleRepresenter(dumper, data):
    """Represent a tuple as a string scalar holding ``str(data)``.

    The scalar is emitted with the standard YAML string tag, so nothing
    in the output marks the value as a tuple.

    Args:
        dumper: representer object providing ``represent_scalar``.
        data: the tuple to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the text.
    """
    # TODO: Make this more robust
    text = str(data)
    return dumper.represent_scalar('tag:yaml.org,2002:str', text)


def _numpyRepresenter(dumper, data):
    """Represent a numpy array as the string ``np.array([...])``.

    The bracketed part comes from ``np.array2string`` with an unlimited
    line width, 16 digits of precision and ``', '`` as separator; it is
    wrapped in ``np.array(`` ... ``)`` and emitted as a plain string
    scalar.

    Args:
        dumper: representer object providing ``represent_scalar``.
        data: the ``np.ndarray`` to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the text.
    """
    # TODO: Make this more robust
    opening, closing = 'np.array(', ')'
    body = np.array2string(
        data,
        max_line_width=np.inf,
        precision=16,
        prefix=opening,
        separator=', ',
        suffix=closing,
    )
    return dumper.represent_scalar('tag:yaml.org,2002:str', opening + body + closing)


def load_yaml(file):
    """Load a YAML document with a freshly created ``ruamel.yaml.YAML()``.

    The tuple and ``np.ndarray`` representers are registered before
    loading, exactly as in ``dump_yaml``.

    Args:
        file: the document to load; passed straight to ``YAML.load``.

    Returns:
        The object produced by ``YAML.load``.
    """
    # TODO: Resolve tuples and arrays properly when loading
    parser = ruamel.yaml.YAML()
    registrations = (
        (tuple, _tupleRepresenter),
        (np.ndarray, _numpyRepresenter),
    )
    for py_type, representer in registrations:
        parser.Representer.add_representer(py_type, representer)
    return parser.load(file)


def dump_yaml(data, file):
    """Dump ``data`` to ``file`` with a freshly created ``ruamel.yaml.YAML()``.

    The tuple and ``np.ndarray`` representers are registered first so
    that values of those types can be written.

    Args:
        data: the object to serialise.
        file: the output stream; passed straight to ``YAML.dump``.

    Returns:
        Whatever ``YAML.dump`` returns.
    """
    emitter = ruamel.yaml.YAML()
    registrations = (
        (tuple, _tupleRepresenter),
        (np.ndarray, _numpyRepresenter),
    )
    for py_type, representer in registrations:
        emitter.Representer.add_representer(py_type, representer)
    return emitter.dump(data, file)


# Sample document: one tuple-like entry and one numpy-array-like entry,
# both written as plain scalars.
yaml_file = """
test_tuple: (1, 2, 3)
test_array: np.array([4,5,6])
"""

data = load_yaml(yaml_file)

# The loaded values are passed through eval() to turn them into Python
# objects.  NOTE: eval() executes whatever expression the YAML contains.
data['test_tuple'] = eval(data['test_tuple']) # This feels dirty
data['test_array'] = eval(data['test_array']) # This feels dirty

# Write the converted data back out; the output is recorded below.
dump_yaml(data, sys.stdout)
# test_tuple: (1, 2, 3)
# test_array: np.array([4, 5, 6])

I welcome any help on improving this implementation with a proper implicit resolver, with robust representers, and generally on using ruamel.yaml the way it is intended to be used.


Update:

With help from the comments, I managed to do something that works almost completely. Let's ignore that I'd need to write a proper non-eval parser for now.

The only issue left is that the new tags are now exported as strings, so they are not properly interpreted when reloading. They become strings instead and they won't survive many roundtrips.

How can I avoid that?

Here's a minimal working example:

import sys
import numpy as np
import ruamel.yaml

# TODO: Replace evals by actual parsing
# TODO: Represent custom types without the string quotes

# Patterns used to recognise the custom scalar types.  They are raw
# strings so that regex escapes such as ``\(``, ``\.`` and ``\d`` reach the
# ``re`` engine unchanged instead of being invalid Python string escapes
# (which trigger a warning on current Python versions).  ``\n`` and ``\r``
# are now interpreted by the regex engine rather than by Python; they match
# the same characters as before.

# "(" ... "," ... ")" optionally followed by spaces/newlines: a tuple with
# at least one comma.
_tuple_re = r"^(?:\((?:.|\n|\r)*,(?:.|\n|\r)*\){1}(?: |\n|\r)*$)"
# "array([" ... "," ... "])" with an optional "np." prefix.
_array_re = r"^(?:(np\.|)array\(\[(?:.|\n|\r)*,(?:.|\n|\r)*\]\){1}(?: |\n|\r)*$)"
# Optional real part and sign, optional imaginary magnitude, then "j"/"J".
_complex_re = r"^(?:(?:\d+(?:(?:\.\d+)?(?:e[+\-]\d+)?)?)?(?: *[+\-] *))?(?:\d+(?:(?:\.\d+)?(?:e[+\-]\d+)?)?)?[jJ]$"


def _tuple_constructor(self, node):
    """Construct a Python value from a scalar resolved as ``!tuple``.

    The scalar text (e.g. ``(1, 2, 3)``) is parsed with
    ``ast.literal_eval``.

    NOTE: this replaces the former ``eval`` call.  The text comes from a
    hand-written YAML file and ``eval`` would execute any expression found
    in it.  As a consequence only Python literals (numbers, strings,
    nested tuples/lists/dicts, booleans, ``None``) are accepted; entries
    such as ``(np.pi, 1)`` are now rejected.

    Args:
        self: the constructor object; its ``construct_scalar`` is used to
            obtain the scalar text.
        node: the scalar node to convert.

    Returns:
        The parsed literal.

    Raises:
        ValueError, SyntaxError: if the text is not a valid Python literal.
    """
    import ast  # local import: keeps this block independent of the file header

    text = self.construct_scalar(node)
    # Strip surrounding whitespace: older Python versions reject a
    # literal with leading blanks.
    return ast.literal_eval(text.strip())


def _array_constructor(self, node):
    """Construct a numpy array from a scalar resolved as ``!nparray``.

    Accepts ``np.array(<literal>)`` or ``array(<literal>)``.  The part
    between the outer parentheses is parsed with ``ast.literal_eval`` and
    handed to ``np.array``.

    NOTE: this replaces the former ``eval`` call.  The text comes from a
    hand-written YAML file and ``eval`` would execute any expression found
    in it.  As a consequence only literal contents are accepted: names
    such as ``np.nan`` and extra arguments such as ``dtype=...`` are
    rejected.

    Args:
        self: the constructor object (unused).
        node: the scalar node; its ``value`` attribute holds the text.

    Returns:
        The ``np.ndarray`` built from the parsed literal.

    Raises:
        ValueError, SyntaxError: if the text is not of the expected form
            or its contents are not a valid Python literal.
    """
    import ast  # local import: keeps this block independent of the file header

    text = node.value.strip()
    # Longest prefix first, so "np.array(" is not mistaken for "array(".
    for prefix in ('np.array(', 'array('):
        if text.startswith(prefix) and text.endswith(')'):
            inner = text[len(prefix):-1].strip()
            return np.array(ast.literal_eval(inner))
    raise ValueError('not an array literal: ' + repr(text))


def _complex_constructor(self, node):
    """Construct a ``complex`` from a scalar resolved as ``!complex``.

    All whitespace is removed from the text (``3 + 2j`` -> ``3+2j``) and
    the result is passed to the ``complex`` constructor.

    NOTE: this replaces the former ``eval`` call.  The text comes from a
    hand-written YAML file and ``eval`` would execute any expression found
    in it; ``complex()`` only ever parses a number.

    Args:
        self: the constructor object (unused).
        node: the scalar node; its ``value`` attribute holds the text.

    Returns:
        The parsed complex number.

    Raises:
        ValueError: if the text is not a valid complex number.
    """
    # complex() does not accept blanks around the sign, so drop them.
    compact = ''.join(node.value.split())
    return complex(compact)


def _tuple_representer(dumper, data):
    """Represent a tuple as a string scalar holding ``str(data)``.

    The standard YAML string tag is used, so the value is written out as
    an ordinary string.

    Args:
        dumper: representer object providing ``represent_scalar``.
        data: the tuple to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the text.
    """
    text = str(data)
    return dumper.represent_scalar('tag:yaml.org,2002:str', text)


def _array_representer(dumper, data):
    """Represent a numpy array as the string ``np.array([...])``.

    ``np.array2string`` produces the bracketed part.  Every space is then
    removed and a single space is re-inserted after each comma, which
    drops the column padding numpy adds.  The result is emitted with the
    standard YAML string tag.

    Args:
        dumper: representer object providing ``represent_scalar``.
        data: the ``np.ndarray`` to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the text.
    """
    opening, closing = 'np.array(', ')'
    body = np.array2string(
        data,
        max_line_width=np.inf,
        precision=16,
        prefix=opening,
        separator=', ',
        suffix=closing,
    )
    unpadded = (opening + body + closing).replace(' ', '')
    # Splitting on "," and joining with ", " equals replace(',', ', ').
    text = ', '.join(unpadded.split(','))
    return dumper.represent_scalar('tag:yaml.org,2002:str', text)


def _complex_representer(dumper, data):
    """Represent a complex number as a string scalar without parentheses.

    ``str(data)`` is taken and every ``(`` and ``)`` is removed (for
    example ``(3+2j)`` becomes ``3+2j``); the text is emitted with the
    standard YAML string tag.

    Args:
        dumper: representer object providing ``represent_scalar``.
        data: the complex number to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the text.
    """
    drop_parens = str.maketrans('', '', '()')
    text = str(data).translate(drop_parens)
    return dumper.represent_scalar('tag:yaml.org,2002:str', text)


# Registry of the custom scalar types, keyed by YAML tag.  Each entry holds:
#   're'          - pattern handed to the implicit resolver
#   'constructor' - function building the Python object when loading
#   'representer' - function producing the scalar when dumping
#   'type'        - Python type the representer is registered for
#   'first'       - list of characters passed as the resolver's third argument
custom_types = {
    '!tuple': {
        're': _tuple_re,
        'constructor': _tuple_constructor,
        'representer': _tuple_representer,
        'type': tuple,
        'first': ['('],
    },
    '!nparray': {
        're': _array_re,
        'constructor': _array_constructor,
        'representer': _array_representer,
        'type': np.ndarray,
        'first': ['a', 'n'],
    },
    '!complex': {
        're': _complex_re,
        'constructor': _complex_constructor,
        'representer': _complex_representer,
        'type': complex,
        'first': list('0123456789+-jJ'),
    },
}


# Set to True once the entries of ``custom_types`` have been registered.
_custom_types_registered = False


def _configured_yaml():
    """Return a new ``ruamel.yaml.YAML()`` with ``custom_types`` registered.

    The constructor, implicit resolver and representer of every entry in
    ``custom_types`` are registered the first time this is called.

    NOTE(review): the registration calls are made on ``yaml.Constructor``,
    ``yaml.Resolver`` and ``yaml.Representer``, i.e. on the class objects
    rather than on the instance, so they are expected to stay in effect
    for later ``YAML()`` objects.  Re-running them on every load/dump (as
    the previous code did) would then add the same implicit resolver
    again each time.  Confirm against the installed ruamel.yaml version.
    """
    global _custom_types_registered
    yaml = ruamel.yaml.YAML()
    if not _custom_types_registered:
        for tag, ct in custom_types.items():
            yaml.Constructor.add_constructor(tag, ct['constructor'])
            yaml.Resolver.add_implicit_resolver(tag, ruamel.yaml.util.RegExp(ct['re']), ct['first'])
            yaml.Representer.add_representer(ct['type'], ct['representer'])
        _custom_types_registered = True
    return yaml


def load_yaml(file):
    """Load a YAML document, resolving the custom scalar types.

    Args:
        file: the document to load; passed straight to ``YAML.load``.

    Returns:
        The object produced by ``YAML.load``.
    """
    return _configured_yaml().load(file)


def dump_yaml(data, file):
    """Dump ``data`` to ``file`` using the custom representers.

    Args:
        data: the object to serialise.
        file: the output stream; passed straight to ``YAML.dump``.

    Returns:
        Whatever ``YAML.dump`` returns.
    """
    return _configured_yaml().dump(data, file)

# Sample document: a tuple, a numpy array (written without the "np."
# prefix) and a complex number, all as plain scalars.
yaml_file = """
test_tuple: (1, 2, 3)
test_array: array([4.0,5+0j,6.0j])
test_complex: 3 + 2j
"""

data = load_yaml(yaml_file)

# Round trip: the output recorded below shows the three values written
# back as quoted strings.
dump_yaml(data, sys.stdout)
# test_tuple: '(1, 2, 3)'
# test_array: 'np.array([4.+0.j, 5.+0.j, 0.+6.j])'
# test_complex: '3+2j'

Thank you!


Solution

  • With help from Anthon in the comments, and reading through his ruamel.yaml source, I managed to answer my question.

    I'm putting a minimum viable solution here for reference. It'd probably be a good idea to replace the evals with an actual parser to avoid exploits if this is ever to be executed on a yaml file from a source you don't trust.

    import sys
    import numpy as np
    import ruamel.yaml
    
    from ruamel.yaml.comments import TaggedScalar
    
    # TODO: Replace evals by actual parsing
    
    # Patterns used to recognise the custom scalar types.  They are raw
    # strings so that regex escapes such as ``\(`` and ``\.`` reach the
    # ``re`` engine unchanged instead of being invalid Python string
    # escapes (which trigger a warning on current Python versions).
    # ``\n`` and ``\r`` are now interpreted by the regex engine rather
    # than by Python; they match the same characters as before.

    # "(" ... "," ... ")" optionally followed by spaces/newlines.
    _tuple_re = r"^(?:\((?:.|\n|\r)*,(?:.|\n|\r)*\){1}(?: |\n|\r)*$)"
    # "array([" ... "," ... "])" with an optional "np." prefix.
    _array_re = r"^(?:(np\.|)array\(\[(?:.|\n|\r)*,(?:.|\n|\r)*\]\){1}(?: |\n|\r)*$)"
    
    
    def _tuple_constructor(self, node):
        """Construct a Python value from a scalar resolved as ``!tuple``.

        The scalar text (e.g. ``(1, 2, 3)``) is parsed with
        ``ast.literal_eval``.

        NOTE: this replaces the former ``eval`` call.  The text comes from
        a hand-written YAML file and ``eval`` would execute any expression
        found in it.  As a consequence only Python literals (numbers,
        strings, nested tuples/lists/dicts, booleans, ``None``) are
        accepted; entries such as ``(np.pi, 1)`` are now rejected.

        Args:
            self: the constructor object; its ``construct_scalar`` is used
                to obtain the scalar text.
            node: the scalar node to convert.

        Returns:
            The parsed literal.

        Raises:
            ValueError, SyntaxError: if the text is not a valid Python
                literal.
        """
        import ast  # local import: keeps this block independent of the file header

        text = self.construct_scalar(node)
        # Strip surrounding whitespace: older Python versions reject a
        # literal with leading blanks.
        return ast.literal_eval(text.strip())
    
    
    def _array_constructor(self, node):
        """Construct a numpy array from a scalar resolved as ``!nparray``.

        Accepts ``np.array(<literal>)`` or ``array(<literal>)``.  The part
        between the outer parentheses is parsed with ``ast.literal_eval``
        and handed to ``np.array``.

        NOTE: this replaces the former ``eval`` call.  The text comes from
        a hand-written YAML file and ``eval`` would execute any expression
        found in it.  As a consequence only literal contents are accepted:
        names such as ``np.nan`` and extra arguments such as ``dtype=...``
        are rejected.

        Args:
            self: the constructor object (unused).
            node: the scalar node; its ``value`` attribute holds the text.

        Returns:
            The ``np.ndarray`` built from the parsed literal.

        Raises:
            ValueError, SyntaxError: if the text is not of the expected
                form or its contents are not a valid Python literal.
        """
        import ast  # local import: keeps this block independent of the file header

        text = node.value.strip()
        # Longest prefix first, so "np.array(" is not mistaken for "array(".
        for prefix in ('np.array(', 'array('):
            if text.startswith(prefix) and text.endswith(')'):
                inner = text[len(prefix):-1].strip()
                return np.array(ast.literal_eval(inner))
        raise ValueError('not an array literal: ' + repr(text))
    
    
    def _tuple_representer(dumper, data):
        """Represent a tuple as a scalar tagged ``!tuple``.

        ``str(data)`` is wrapped in a ``TaggedScalar`` (``style=None``,
        ``tag='!tuple'``) and passed to ``dumper.represent_tagged_scalar``.

        Args:
            dumper: representer object providing ``represent_tagged_scalar``.
            data: the tuple to represent.

        Returns:
            Whatever ``dumper.represent_tagged_scalar`` returns.
        """
        scalar = TaggedScalar(str(data), style=None, tag='!tuple')
        return dumper.represent_tagged_scalar(scalar)
    
    
    def _array_representer(dumper, data):
        """Represent a numpy array as a scalar tagged ``!nparray``.

        ``np.array2string`` produces the bracketed part.  Every space is
        then removed and a single space is re-inserted after each comma,
        which drops the column padding numpy adds.  The text is wrapped in
        a ``TaggedScalar`` (``style=None``, ``tag='!nparray'``) and passed
        to ``dumper.represent_tagged_scalar``.

        Args:
            dumper: representer object providing ``represent_tagged_scalar``.
            data: the ``np.ndarray`` to represent.

        Returns:
            Whatever ``dumper.represent_tagged_scalar`` returns.
        """
        opening, closing = 'np.array(', ')'
        body = np.array2string(
            data,
            max_line_width=np.inf,
            precision=16,
            prefix=opening,
            separator=', ',
            suffix=closing,
        )
        unpadded = (opening + body + closing).replace(' ', '')
        # Splitting on "," and joining with ", " equals replace(',', ', ').
        text = ', '.join(unpadded.split(','))
        scalar = TaggedScalar(text, style=None, tag='!nparray')
        return dumper.represent_tagged_scalar(scalar)
    
    
    # Registry of the custom scalar types, keyed by YAML tag.  Each entry holds:
    #   're'          - pattern handed to the implicit resolver
    #   'constructor' - function building the Python object when loading
    #   'representer' - function producing the scalar when dumping
    #   'type'        - Python type the representer is registered for
    #   'first'       - list of characters passed as the resolver's third argument
    custom_types = {
        '!tuple': {
            're': _tuple_re,
            'constructor': _tuple_constructor,
            'representer': _tuple_representer,
            'type': tuple,
            'first': ['('],
        },
        '!nparray': {
            're': _array_re,
            'constructor': _array_constructor,
            'representer': _array_representer,
            'type': np.ndarray,
            'first': ['a', 'n'],
        },
    }
    
    
    # Set to True once the entries of ``custom_types`` have been registered.
    _custom_types_registered = False


    def _configured_yaml():
        """Return a new ``ruamel.yaml.YAML()`` with ``custom_types`` registered.

        The constructor, implicit resolver and representer of every entry
        in ``custom_types`` are registered the first time this is called.

        NOTE(review): the registration calls are made on
        ``yaml.Constructor``, ``yaml.Resolver`` and ``yaml.Representer``,
        i.e. on the class objects rather than on the instance, so they are
        expected to stay in effect for later ``YAML()`` objects.
        Re-running them on every load/dump (as the previous code did)
        would then add the same implicit resolver again each time.
        Confirm against the installed ruamel.yaml version.
        """
        global _custom_types_registered
        yaml = ruamel.yaml.YAML()
        if not _custom_types_registered:
            for tag, ct in custom_types.items():
                yaml.Constructor.add_constructor(tag, ct['constructor'])
                yaml.Resolver.add_implicit_resolver(tag, ruamel.yaml.util.RegExp(ct['re']), ct['first'])
                yaml.Representer.add_representer(ct['type'], ct['representer'])
            _custom_types_registered = True
        return yaml


    def load_yaml(file):
        """Load a YAML document, resolving the custom scalar types.

        Args:
            file: the document to load; passed straight to ``YAML.load``.

        Returns:
            The object produced by ``YAML.load``.
        """
        return _configured_yaml().load(file)


    def dump_yaml(data, file):
        """Dump ``data`` to ``file`` using the custom representers.

        Args:
            data: the object to serialise.
            file: the output stream; passed straight to ``YAML.dump``.

        Returns:
            Whatever ``YAML.dump`` returns.
        """
        return _configured_yaml().dump(data, file)
    
    # Sample document: a tuple and a numpy array (written without the
    # "np." prefix), both as plain scalars.
    yaml_file = """
    test_tuple: (1, 2, 3)
    test_array: array([4.0,5+0j,6.0j])
    """
    
    data = load_yaml(yaml_file)
    
    # Round trip: the output recorded below shows both values written
    # back without quotes.
    dump_yaml(data, sys.stdout)
    # test_tuple: (1, 2, 3)
    # test_array: np.array([4.+0.j, 5.+0.j, 0.+6.j])