how to retain emoji using ruamel-yaml dump

I have a YAML file that contains emoji, and I want to use ruamel-yaml to load it, edit it, and then dump it back. The problem I am facing is that the dumped output does not retain the emoji; it is written as a Unicode escape sequence instead.

Here is my yaml file

# filename: t.yml
text: "Docs 📚"

And a snippet of the code

import sys
from ruamel.yaml import YAML

# Configure the YAML instance used for both loading and dumping.
yaml = YAML()
yaml.encoding = "utf-8"
yaml.width = 1024
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes = True  # keep the double quotes around the value

# Load the document ...
with open("t.yml") as stream:
    data = yaml.load(stream)

# ... and write it straight back to stdout.
yaml.dump(data, sys.stdout)

And here is the output:

❯ python3 t.py
text: "Docs \U0001F4DA"

If I remove the preserve_quotes then the output is fine:

python3 t.py
text: Docs 📚

Solution

This is caused by the emitter code for double-quoted scalars, the behaviour of which came under scrutiny recently.

Building on the answer there, a small additional change to write_double_quoted makes it deal with Unicode characters above \uFFFD:

import sys
from pathlib import Path

import ruamel.yaml

class MyEmitter(ruamel.yaml.emitter.Emitter):
    """Emitter whose double-quoted writer keeps emoji and similar characters.

    Only ``write_double_quoted`` is overridden.  With ``allow_unicode`` set,
    code points U+10000..U+10FFFF are treated as printable and written
    verbatim instead of as an eight-digit backslash-U escape.
    """

    def write_double_quoted(self, text, split=True):
        """Write *text* to the output stream as a double-quoted scalar.

        Args:
            text: the string value of the scalar.
            split: when true, the scalar may be wrapped onto a new line once
                the current column exceeds ``self.best_width``.
        """
        if self.root_context:
            if self.requested_indent is not None:
                self.write_line_break()
                if self.requested_indent != 0:
                    self.write_indent()
        self.write_indicator(u'"', True)
        # text[start:end] is the pending run of characters that can be
        # written verbatim.  `end` runs one position past the last character
        # (ch is None there) so that the final run gets flushed as well.
        start = end = 0
        while end <= len(text):
            ch = None
            if end < len(text):
                ch = text[end]
            if (
                ch is None
                or ch in u'"\\\x85\u2028\u2029\uFEFF'
                or not (
                    u'\x20' <= ch <= u'\x7E'
                    or (
                        self.allow_unicode
                        and (
                            (u'\xA0' <= ch <= u'\uD7FF') or
                            (u'\uE000' <= ch <= u'\uFFFD') or
                            # supplementary planes (emoji etc.) stay literal
                            (u'\U00010000' <= ch <= u'\U0010FFFF')
                        )
                    )
                )
            ):
                # End of text or a character that must be escaped: first
                # flush the verbatim run collected so far ...
                if start < end:
                    data = text[start:end]
                    self.column += len(data)
                    if bool(self.encoding):
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                    start = end
                # ... then write the escape, using the shortest form that
                # can represent the character.
                if ch is not None:
                    if ch in self.ESCAPE_REPLACEMENTS:
                        data = u'\\' + self.ESCAPE_REPLACEMENTS[ch]
                    elif ch <= u'\xFF':
                        data = u'\\x%02X' % ord(ch)
                    elif ch <= u'\uFFFF':
                        data = u'\\u%04X' % ord(ch)
                    else:
                        data = u'\\U%08X' % ord(ch)
                    self.column += len(data)
                    if bool(self.encoding):
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                    start = end + 1
            if (
                0 < end < len(text) - 1
                and (ch == u' ' or start >= end)
                and self.column + (end - start) > self.best_width
                and split
            ):
                # Wrap the scalar here.  The guard above guarantees that
                # text[end - 1] and text[end + 1] both exist.
                #
                # A bare line break inside a double-quoted scalar is read
                # back as one space, and white space next to it is dropped.
                # It can therefore only stand in for a lone space.  In every
                # other case (break right after an escaped character, or
                # next to another space) the break must be escaped with a
                # backslash, otherwise loading would add or lose a space.
                fold_at_single_space = (
                    ch == u' '
                    and text[end - 1] != u' '
                    and text[end + 1] != u' '
                )
                need_backquote = not fold_at_single_space
                data = text[start:end] + (u'\\' if need_backquote else u'')
                if start < end:
                    start = end
                self.column += len(data)
                if bool(self.encoding):
                    data = data.encode(self.encoding)
                self.stream.write(data)
                self.write_indent()
                self.whitespace = False
                self.indention = False
                if text[start] == u' ':
                    if not need_backquote:
                        # remove leading space it will load from the newline
                        start += 1
                    # After an escaped break, a leading space has to be
                    # escaped too, or it would be taken for indentation.
                    data = u'\\' if need_backquote else u''
                    self.column += len(data)
                    if bool(self.encoding):
                        data = data.encode(self.encoding)
                    self.stream.write(data)
            end += 1
        self.write_indicator(u'"', False)


# Load t.yaml and dump it back to stdout through the customised emitter.
file_in = Path('t.yaml')

yaml = ruamel.yaml.YAML()
yaml.Emitter = MyEmitter  # dump with the overridden write_double_quoted
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes = True
yaml.width = 1024
data = yaml.load(file_in)
yaml.dump(data, sys.stdout)

which gives:

# filename: t.yaml
text: "Docs 📚"

The added range test under self.allow_unicode:

                        (u'\U00010000' <= ch <= u'\U0010FFFF')

is what makes the difference.

Please note:

- yaml.encoding = 'utf-8' is the default, so you don't need to set it.
- If you are working with UTF-8 files, you should open them as binary (open(..., 'rb')).
- The recommended extension for files containing YAML documents has been .yaml since at least September 2006. The YML format is XML-based, and at least as old as YAML.