Search code examples
python · yaml · ruamel.yaml · multi-document

How can I maintain formatting while updating a yaml file using ruamel?


I have a multi-document YAML file. I am interested in modifying the third document only (this modification will be later made using other code and conditions). After some research, I selected ruamel since it was reported to preserve order and format.

My YAML looks like this (not including the whole thing since it is more than 3000 lines long):

---
"SOURCE": "mmmmm"
"VERSION": "5.4.2"
"DATE_WRITTEN": "Tue Oct 25 06:09:34 2022"
"CONFIG_CHECKSUM": "0XCD44F064"
"CONFIG_SIZE": "231212"
...
---
"moduleVersion": ["5.4.2   (AUG 2022)", "20:FIO w/2070-2A"]
"moduleModel": ["mmmmm", "mmmmm Linux Actuated Controller Unit"]
"maxPhases": 16
"maxVehicleDetectors": 72
"maxPedestrianDetectors": 8
"etcsAscPhsBanksMax": 4
"maxOverlaps": 16
"maxRings": 4
"etcsAscPriorityBanksMax": 4
"etcsAscMaxPriorityQueues": 6
"maxPatterns": 253
"etcsAscSFMapsMaskSize": 16
"etcsAscPFMapsMaskSize": 16
"etcsMaxSpcFuncMaps": 47
"etcsMaxPhsFuncMaps": 192
"maxTimebaseAscActions": 255
"maxTimebaseScheduleEntries": 255
"maxDayPlanEvents": 15
"maxDayPlans": 255
"maxDaylightSavingEntries": 2
"rs232Number": 3
"maxSequences": 16
"etcsAscMaxSerialPorts": 2
"maxChannels": 32
"ipAdEntAddr": [[192, 168, 1, 100], [192, 168, 0, 77]]
"etcsAscMaxSpatDestinations": 16
"etcsUnitBankMax": 4
"etcsMaxOutputLoadswitches": 32
"etcsPeerFunctionMax": 64
"etcsAscMaxPriorities": 12
"maxSplits": 253
"maxPreempts": 12
...
---
"phaseWalk": [0, 7, 0, 7, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7]
"phasePedestrianClear": [0, 28, 0, 32, 0, 28, 0, 32, 0, 0, 0, 0, 0, 0, 0, 32]
"phaseMinimumGreen": [5, 7, 5, 7, 5, 7, 5, 7, 0, 0, 0, 0, 0, 0, 0, 7]
"phasePassage": [20, 10, 20, 25, 20, 10, 20, 25, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseMaximum1": [5, 25, 5, 15, 5, 25, 5, 15, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseMaximum2": [20, 0, 20, 55, 20, 0, 20, 65, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseYellowChange": [44, 44, 40, 40, 44, 44, 40, 40, 0, 0, 0, 0, 0, 0, 0, 30]
"phaseRedClear": [20, 20, 26, 26, 20, 20, 26, 26, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseAddedInitial": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseMaximumInitial": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseTimeBeforeReduction": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseCarsBeforeReduction": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseTimeToReduce": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseMinimumGap": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseDynamicMaxLimit": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseDynamicMaxStep": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"phaseStartup": [2, 3, 2, 2, 2, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2]
"phaseOptions": [33, 165, 33, 1059, 33, 165, 33, 1059, 0, 0, 0, 0, 0, 0, 0, 1]
"phaseConcurrency": [[5, 6], [5, 6], [7, 8], [7, 8], [1, 2], [1, 2], [3, 4], [3, 4], [], [], [], [], [], [], [], []]
"etcsAscPhaseFlashWalk": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
"etcsAscPhaseExtPedClear": [0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3]

This is the code I have:

# Locate the YAML file to update.
# NOTE(review): `directory` is read here but not used again in this snippet;
# the glob pattern below is relative to the current working directory.
directory = input("Please enter the directory path: ")
# NOTE(review): no import of `glob` is visible in this snippet.
yml_file = glob.glob('*.yaml')
import ruamel.yaml

try:
    # Create a new YAML object (no attributes are set on it, so its defaults apply)
    yaml = ruamel.yaml.YAML()
    # Open the first matching YAML file in read-write mode
    with open(yml_file[0], 'r+') as file:
        # Load every document of the stream with YAML.load_all()
        data = list(yaml.load_all(file))
        # Get the third document
        doc3 = data[2]
        # Intended change to the third document.
        # NOTE(review): `==` is a comparison whose result is discarded;
        # nothing is assigned on this line.
        doc3["phaseWalk"][0] == 5
        # Seek to the beginning of the file
        file.seek(0)
        # Overwrite the file with the (re-serialised) documents
        yaml.dump_all(data, file)
        # Manually append an end-of-document marker after the last document
        file.write("...\n")
        # Drop any leftover bytes of the old content beyond the current
        # position; the file itself is closed when the `with` block exits
        file.truncate()

except FileNotFoundError:
    print("The file 'file.yaml' was not found.")

except PermissionError:
    print("You do not have permission to write to the file 'file.yaml'.")

except Exception as e:
    print(f"An unexpected error occurred: {e}")

It yields the following YAML:

SOURCE: mmmmm
VERSION: 5.4.2
DATE_WRITTEN: Tue Oct 25 06:09:34 2022
CONFIG_CHECKSUM: 0XCD44F064
CONFIG_SIZE: '231212'
---
moduleVersion: [5.4.2   (AUG 2022), 20:FIO w/2070-2A]
moduleModel: [mmmmm, mmmmm Linux Actuated Controller Unit]
maxPhases: 16
maxVehicleDetectors: 72
maxPedestrianDetectors: 8
etcsAscPhsBanksMax: 4
maxOverlaps: 16
maxRings: 4
etcsAscPriorityBanksMax: 4
etcsAscMaxPriorityQueues: 6
maxPatterns: 253
etcsAscSFMapsMaskSize: 16
etcsAscPFMapsMaskSize: 16
etcsMaxSpcFuncMaps: 47
etcsMaxPhsFuncMaps: 192
maxTimebaseAscActions: 255
maxTimebaseScheduleEntries: 255
maxDayPlanEvents: 15
maxDayPlans: 255
maxDaylightSavingEntries: 2
rs232Number: 3
maxSequences: 16
etcsAscMaxSerialPorts: 2
maxChannels: 32
ipAdEntAddr: [[192, 168, 1, 100], [192, 168, 0, 77]]
etcsAscMaxSpatDestinations: 16
etcsUnitBankMax: 4
etcsMaxOutputLoadswitches: 32
etcsPeerFunctionMax: 64
etcsAscMaxPriorities: 12
maxSplits: 253
maxPreempts: 12
---
phaseWalk: [0, 7, 0, 7, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7]
phasePedestrianClear: [0, 28, 0, 32, 0, 28, 0, 32, 0, 0, 0, 0, 0, 0, 0, 32]
phaseMinimumGreen: [5, 7, 5, 7, 5, 7, 5, 7, 0, 0, 0, 0, 0, 0, 0, 7]
phasePassage: [20, 10, 20, 25, 20, 10, 20, 25, 0, 0, 0, 0, 0, 0, 0, 0]
phaseMaximum1: [5, 25, 5, 15, 5, 25, 5, 15, 0, 0, 0, 0, 0, 0, 0, 0]
phaseMaximum2: [20, 0, 20, 55, 20, 0, 20, 65, 0, 0, 0, 0, 0, 0, 0, 0]
phaseYellowChange: [44, 44, 40, 40, 44, 44, 40, 40, 0, 0, 0, 0, 0, 0, 0, 30]
phaseRedClear: [20, 20, 26, 26, 20, 20, 26, 26, 0, 0, 0, 0, 0, 0, 0, 0]
phaseAddedInitial: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseMaximumInitial: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseTimeBeforeReduction: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseCarsBeforeReduction: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseTimeToReduce: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseMinimumGap: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseDynamicMaxLimit: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseDynamicMaxStep: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
phaseStartup: [2, 3, 2, 2, 2, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2]
phaseOptions: [33, 165, 33, 1059, 33, 165, 33, 1059, 0, 0, 0, 0, 0, 0, 0, 1]
phaseConcurrency: [[5, 6], [5, 6], [7, 8], [7, 8], [1, 2], [1, 2], [3, 4], [3, 4],
  [], [], [], [], [], [], [], []]
etcsAscPhaseFlashWalk: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
etcsAscPhaseExtPedClear: [0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3]

The issues I have with this:

  1. Where are the first three dashes for the first document?
  2. Why have the data types been changed? Most of my data types were defined as strings. They're no longer that way.
  3. The modification to the third document did not take effect? Am I doing anything wrong?

I am running ruamel v 0.17.21


Solution

  • TL;DR; skip to text after horizontal line

    Various parts of a YAML document are optional depending on the circumstances, and likewise the separation of documents in a multi-document file has optional parts.

    Historically ruamel.yaml started out with (only) dealing with preserving end-of-line comments that were dropped by PyYAML, by combining PyYAML's separate but largely overlapping sources for Python 2 and 3 (so changes would only have to be made in one place) and then adding the code that preserved the comments. (The source combination changes and those for YAML 1.2 support were first offered as pull requests to PyYAML, but were ignored, forcing me to fork.)

    Other things, like indentation, were "normalised", i.e. made the same everywhere. Indentation is still normalised, although you now have separate indentation control for mappings and sequences.

    Normalisation often does away with superfluous elements, such as extra spaces between elements in a sequence. Most of these normalisations, including removing comments, were in the original PyYAML code. Given that parsing YAML is a multi-step process (scanning, tokenizing, composing the structure, generating Python objects), if something is discarded during scanning, you can imagine the complexity of adding changes that preserve the discarded information. Additionally, although PyYAML works internally with instances of various classes, adding one extra parameter to a load or dump function, e.g. to optionally preserve quotes around scalars, required changes in multiple files, in multiple locations each. So that is why ruamel.yaml switched to using a YAML() instance on which you can set attributes (which the underlying code can query as necessary).

    Apart from the addition of such code largely depending on the laziness of ruamel.yaml's main developer, there is also the question, for some aspects of YAML round-tripping, of whether to use PyYAML's original normalisations, make normalisations optional, or always preserve. Apart from ease of implementation, the answer might depend on personal preference, and decisions either way were not always made consistently.

    Things that were added later to ruamel.yaml are preservation of integer/float formats; of literal scalars (initially) and quoted/folded scalars; space after colon for root level mappings. Some of these preservations are always supplied, some depend on setting attributes on the YAML() instance.


    Given that context, the short answer is that superfluous quotes around scalars are dropped unless you set .preserve_quotes (otherwise they are normalised away), that the start-of-document marker (---) before the first document is dropped unless you set .explicit_start, and that the end-of-document marker (...) doesn't get preserved when not necessary (it is only necessary when there are directives like %YAML 1.2), unless you set .explicit_end. So you will have to tell your YAML() instance what you want explicitly.

    (You still have quotes around the value for CONFIG_SIZE because the value would otherwise be interpreted as a number.)

    I normally don't overwrite an input file until I know the changes are correct (it is a pain when they are partially what you want and you have to restore the input before the next test run)

    The line doc3["phaseWalk"][0] == 5 evaluates to False and has no further side-effects, so of course there is no modification of the first element of the value for key phaseWalk, and nothing gets updated.

    If you run:

    import sys
    from pathlib import Path  # `Path` is used unqualified below, so import the name itself

    import ruamel.yaml

    # First matching path in the current directory; next() raises StopIteration
    # when no *.yaml file is present.
    path = next(Path('.').glob('*.yaml'))

    yaml = ruamel.yaml.YAML()
    # yaml.indent(mapping=4, sequence=4, offset=2)  # optional indentation control, not needed for this input
    yaml.preserve_quotes = True  # added in ruamel.yaml; keeps the superfluous quotes around scalars
    yaml.explicit_start = True   # control also available in PyYAML; writes '---' before each document
    yaml.explicit_end = True     # control also available in PyYAML; writes '...' after each document

    # Materialise all documents so a single one can be changed and everything dumped again.
    data = list(yaml.load_all(path))
    doc3 = data[2]              # the third document of the file
    doc3['phaseWalk'][0] = 5    # single '=' for assignment
    # Write to stdout first to inspect the result before touching the input file.
    yaml.dump_all(data, sys.stdout)   # yaml.dump_all(data, path) -> overwrite original file
    

    which gives:

    ---
    "SOURCE": "mmmmm"
    "VERSION": "5.4.2"
    "DATE_WRITTEN": "Tue Oct 25 06:09:34 2022"
    "CONFIG_CHECKSUM": "0XCD44F064"
    "CONFIG_SIZE": "231212"
    ...
    ---
    "moduleVersion": ["5.4.2   (AUG 2022)", "20:FIO w/2070-2A"]
    "moduleModel": ["mmmmm", "mmmmm Linux Actuated Controller Unit"]
    "maxPhases": 16
    "maxVehicleDetectors": 72
    "maxPedestrianDetectors": 8
    "etcsAscPhsBanksMax": 4
    "maxOverlaps": 16
    "maxRings": 4
    "etcsAscPriorityBanksMax": 4
    "etcsAscMaxPriorityQueues": 6
    "maxPatterns": 253
    "etcsAscSFMapsMaskSize": 16
    "etcsAscPFMapsMaskSize": 16
    "etcsMaxSpcFuncMaps": 47
    "etcsMaxPhsFuncMaps": 192
    "maxTimebaseAscActions": 255
    "maxTimebaseScheduleEntries": 255
    "maxDayPlanEvents": 15
    "maxDayPlans": 255
    "maxDaylightSavingEntries": 2
    "rs232Number": 3
    "maxSequences": 16
    "etcsAscMaxSerialPorts": 2
    "maxChannels": 32
    "ipAdEntAddr": [[192, 168, 1, 100], [192, 168, 0, 77]]
    "etcsAscMaxSpatDestinations": 16
    "etcsUnitBankMax": 4
    "etcsMaxOutputLoadswitches": 32
    "etcsPeerFunctionMax": 64
    "etcsAscMaxPriorities": 12
    "maxSplits": 253
    "maxPreempts": 12
    ...
    ---
    "phaseWalk": [5, 7, 0, 7, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7]
    "phasePedestrianClear": [0, 28, 0, 32, 0, 28, 0, 32, 0, 0, 0, 0, 0, 0, 0, 32]
    "phaseMinimumGreen": [5, 7, 5, 7, 5, 7, 5, 7, 0, 0, 0, 0, 0, 0, 0, 7]
    "phasePassage": [20, 10, 20, 25, 20, 10, 20, 25, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseMaximum1": [5, 25, 5, 15, 5, 25, 5, 15, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseMaximum2": [20, 0, 20, 55, 20, 0, 20, 65, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseYellowChange": [44, 44, 40, 40, 44, 44, 40, 40, 0, 0, 0, 0, 0, 0, 0, 30]
    "phaseRedClear": [20, 20, 26, 26, 20, 20, 26, 26, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseAddedInitial": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseMaximumInitial": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseTimeBeforeReduction": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseCarsBeforeReduction": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseTimeToReduce": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseMinimumGap": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseDynamicMaxLimit": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseDynamicMaxStep": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "phaseStartup": [2, 3, 2, 2, 2, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2]
    "phaseOptions": [33, 165, 33, 1059, 33, 165, 33, 1059, 0, 0, 0, 0, 0, 0, 0, 1]
    "phaseConcurrency": [[5, 6], [5, 6], [7, 8], [7, 8], [1, 2], [1, 2], [3, 4], [3, 4],
      [], [], [], [], [], [], [], []]
    "etcsAscPhaseFlashWalk": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    "etcsAscPhaseExtPedClear": [0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3]
    ...
    

    Please realise that both your YAML output and your input load to the same internal data structure in normal circumstances (the round-trip parser that ruamel.yaml implements is an exception). So for practical purposes you should not care about dropped quotes unless you have to deal with a parser that is non-compliant wrt the YAML specification.

    If you don't want such changes because of comparison difficulties, or you don't want such extra changes in a repository, you should consider biting (byting?) the bullet once, just as you would have to do if you ran a code formatter (such as oitnb) on your source code.