I have a YAML file that contains emojis, and I want to use ruamel.yaml to load it, edit it, and then dump it back. The problem I am facing is that the dump does not retain the emoji itself, but writes its Unicode escape sequence instead.
Here is my yaml file
# filename: t.yml
text: "Docs 📚"
And a snippet of the code
import sys
from ruamel.yaml import YAML
# Round-trip t.yml through ruamel.yaml and print the result to stdout.
yaml = YAML()
yaml.preserve_quotes = True
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.width = 1024
yaml.encoding = "utf-8"
# Decode the file as UTF-8 explicitly: without an encoding argument open()
# falls back to the locale's preferred encoding, which on non-UTF-8
# platforms can mis-decode (or fail on) the emoji before YAML parsing starts.
with open("t.yml", encoding="utf-8") as f:
    data = yaml.load(f)
yaml.dump(data, sys.stdout)
And here is the output:
❯ python3 t.py
text: "Docs \U0001F4DA"
If I remove the `preserve_quotes` setting, then the output is fine:
python3 t.py
text: Docs 📚
This is caused by the emitter code for double-quoted scalars, the behaviour of which came under scrutiny recently.
Building on the answer there, a small additional change to `write_double_quoted` makes it deal with Unicode characters above `\uFFFD`:
import sys
import ruamel.yaml
class MyEmitter(ruamel.yaml.emitter.Emitter):
    """Emitter that keeps characters above U+FFFF literal in double-quoted scalars.

    Only ``write_double_quoted`` is overridden. With ``allow_unicode`` set,
    characters in the range U+10000..U+10FFFF (emoji, for example) are
    written as-is instead of as ``\\UXXXXXXXX`` escapes.
    """

    def write_double_quoted(self, text, split=True):
        """Write ``text`` to the stream as a double-quoted YAML scalar.

        Args:
            text: the scalar's string value.
            split: when true, the scalar may be continued on a new line once
                the current column would pass ``self.best_width``.
        """
        if self.root_context:
            if self.requested_indent is not None:
                self.write_line_break()
                if self.requested_indent != 0:
                    self.write_indent()
        self.write_indicator(u'"', True)
        # text[start:end] is the pending run of characters that can be written
        # verbatim; `end` scans one position past the last character so the
        # final run is flushed when ch is None.
        start = end = 0
        while end <= len(text):
            ch = None
            if end < len(text):
                ch = text[end]
            if (
                ch is None
                or ch in u'"\\\x85\u2028\u2029\uFEFF'
                or not (
                    u'\x20' <= ch <= u'\x7E'
                    or (
                        self.allow_unicode
                        and (
                            (u'\xA0' <= ch <= u'\uD7FF') or
                            (u'\uE000' <= ch <= u'\uFFFD') or
                            # characters outside the BMP (e.g. emoji) stay literal
                            (u'\U00010000' <= ch <= u'\U0010FFFF')
                        )
                    )
                )
            ):
                # ch has to be escaped (or the text is exhausted): first flush
                # the verbatim run collected so far.
                if start < end:
                    data = text[start:end]
                    self.column += len(data)
                    if bool(self.encoding):
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                    start = end
                if ch is not None:
                    # Use the shortest escape form that can represent ch.
                    if ch in self.ESCAPE_REPLACEMENTS:
                        data = u'\\' + self.ESCAPE_REPLACEMENTS[ch]
                    elif ch <= u'\xFF':
                        data = u'\\x%02X' % ord(ch)
                    elif ch <= u'\uFFFF':
                        data = u'\\u%04X' % ord(ch)
                    else:
                        data = u'\\U%08X' % ord(ch)
                    self.column += len(data)
                    if bool(self.encoding):
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                    start = end + 1
            if (
                0 < end < len(text) - 1
                and (ch == u' ' or start >= end)
                and self.column + (end - start) > self.best_width
                and split
            ):
                # An unescaped line break inside a double-quoted scalar is
                # folded into a single space on load, and literal spaces
                # directly around the break are dropped. The break may
                # therefore replace the space at `end` only when that space is
                # isolated (neither neighbour is a space). In every other case
                # (a split after an escaped character, or next to a run of
                # spaces) the break must be escaped with a backslash or the
                # text would not round-trip. The guard above guarantees that
                # both text[end - 1] and text[end + 1] exist.
                need_backquote = not (
                    ch == u' '
                    and text[end - 1] != u' '
                    and text[end + 1] != u' '
                )
                data = text[start:end] + (u'\\' if need_backquote else u'')
                if start < end:
                    start = end
                self.column += len(data)
                if bool(self.encoding):
                    data = data.encode(self.encoding)
                self.stream.write(data)
                self.write_indent()
                self.whitespace = False
                self.indention = False
                if text[start] == u' ':
                    if not need_backquote:
                        # drop the space: the folded line break reproduces it
                        # when the scalar is loaded
                        start += 1
                    # With an escaped break the continuation line would begin
                    # with a literal space; prefix it with a backslash so it
                    # is written as an escaped space and kept on load.
                    data = u'\\' if need_backquote else u''
                    self.column += len(data)
                    if bool(self.encoding):
                        data = data.encode(self.encoding)
                    self.stream.write(data)
            end += 1
        self.write_indicator(u'"', False)
# Path is used for file_in below; without this import the script stops with
# a NameError before any YAML is loaded.
from pathlib import Path

# Round-trip t.yaml through the customised emitter and print it to stdout.
file_in = Path('t.yaml')
yaml = ruamel.yaml.YAML()
yaml.Emitter = MyEmitter  # dump through the emitter subclass defined above
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.preserve_quotes = True
yaml.width = 1024
data = yaml.load(file_in)
yaml.dump(data, sys.stdout)
which gives:
# filename: t.yaml
text: "Docs 📚"
The added range test under `self.allow_unicode`:
(u'\U00010000' <= ch <= u'\U0010FFFF')
is what makes the difference.
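Please note:
- `yaml.encoding = 'utf-8'` is the default, so you don't need to set that.
- you should open YAML files for reading in binary mode (`open(..., 'rb')`).
- the recommended extension for files containing YAML documents has been `.yaml` since at least September 2006. The YML format is XML based, and at least as old as YAML.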