Search code examples
java xml unicode stax

Java XMLStreamWriter: Outputting Unicode extended characters (non-BMP)


Does anyone know how to correctly output extended characters (non-BMP, more than 1 char) using Java's XMLStreamWriter? For example, trying to output Unicode U+10480 𐒀:

import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

/**
 * Minimal reproduction: writes text containing the non-BMP code point U+10480 through
 * {@link XMLStreamWriter#writeCharacters(String)} to standard output so the serialized
 * form can be inspected.
 */
public class XmlStreamWriterExtendedCharactersFail {
    /**
     * Prints the test string, then serializes it as the content of a single {@code <el>} element.
     *
     * @param args unused
     * @throws XMLStreamException if the stream writer fails
     */
    public static void main(String[] args) throws XMLStreamException {
        // U+10480 spelled as its UTF-16 surrogate pair, so the literal does not depend on the
        // encoding used to compile this file. The prefix must match sbStr for the check below.
        String inlineStr = "sbStr = \uD801\uDC80";
        // Build the same text from the code point itself.
        String sbStr = new StringBuilder("sbStr = ").appendCodePoint(0x10480).toString();
        // Both spellings must yield the same string (only evaluated when run with -ea).
        assert sbStr.equals(inlineStr);
        System.out.println(sbStr);

        OutputStreamWriter outWriter = new OutputStreamWriter(System.out,
                StandardCharsets.UTF_8.newEncoder());
        XMLStreamWriter writer = XMLOutputFactory.newFactory()
                .createXMLStreamWriter(outWriter);
        writer.writeStartDocument("UTF-8", "1.1");
        writer.writeStartElement("el");
        writer.writeCharacters(sbStr);
        writer.writeEndElement();
        writer.writeEndDocument();
        writer.close();
    }
}

results in:

sbStr = 𐒀
<?xml version="1.1" encoding="UTF-8"?><el>sbStr = &#xd801;&#xdc80;</el>

Note that &#xd801;&#xdc80; are references to surrogate code points, which are not legal XML characters, and will lead to an error when parsing with SAX.

Expected output:

sbStr = 𐒀
<?xml version="1.1" encoding="UTF-8"?><el>sbStr = 𐒀</el>

sbStr = &#x10480; would also do in a pinch, but the first is preferable.


Solution

  • Note that Apache Commons Lang's StringEscapeUtils.escapeXml also has this problem prior to version 3.0.

    I created the following wrapper around XMLStreamWriter as a workaround to my problem. It escapes character data using Apache Commons and writes it directly to the underlying Writer.

    import java.io.IOException;
    import java.io.Writer;
    import java.nio.CharBuffer;
    
    import javax.xml.namespace.NamespaceContext;
    import javax.xml.stream.FactoryConfigurationError;
    import javax.xml.stream.XMLOutputFactory;
    import javax.xml.stream.XMLStreamException;
    import javax.xml.stream.XMLStreamWriter;
    
    import org.apache.commons.lang3.StringEscapeUtils;
    import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
    
    /**
     * {@link XMLStreamWriter} wrapper that handles character data itself: the text is escaped
     * with {@link StringEscapeUtils#ESCAPE_XML11} and written straight to the underlying
     * {@link Writer}. Every other operation is forwarded unchanged to the wrapped stream writer.
     *
     * <p>NOTE(review): attribute values, comments and CDATA still go through the wrapped
     * writer's own output path; confirm whether they need the same treatment.
     */
    public class UnicodeXMLStreamWriter implements XMLStreamWriter {
        /** Stream writer that receives every call except character data. */
        private final XMLStreamWriter xmlSW;
        /** Writer that escaped character data is written to directly. */
        private final Writer writer;
        /** Translator used to escape character data before it is written. */
        private final CharSequenceTranslator characterEscaper = StringEscapeUtils.ESCAPE_XML11;

        /**
         * Creates a wrapper around a stream writer obtained from the default
         * {@link XMLOutputFactory}.
         *
         * @param writer destination for the XML output
         * @return a new wrapper writing to {@code writer}
         * @throws XMLStreamException if the stream writer cannot be created
         * @throws FactoryConfigurationError if the default factory cannot be obtained
         */
        public static UnicodeXMLStreamWriter newInstance(Writer writer) throws XMLStreamException, FactoryConfigurationError {
            return newInstance(writer, XMLOutputFactory.newFactory());
        }

        /**
         * Creates a wrapper around a stream writer obtained from the given factory.
         *
         * @param writer destination for the XML output
         * @param factory factory used to create the wrapped stream writer on {@code writer}
         * @return a new wrapper writing to {@code writer}
         * @throws XMLStreamException if the stream writer cannot be created
         */
        public static UnicodeXMLStreamWriter newInstance(Writer writer, XMLOutputFactory factory) throws XMLStreamException {
            XMLStreamWriter xmlSW = factory.createXMLStreamWriter(writer);
            return new UnicodeXMLStreamWriter(writer, xmlSW);
        }

        /**
         * Wraps an existing stream writer.
         *
         * @param writer writer that escaped character data is written to; expected to be the
         *        writer {@code xmlSW} was created on, as done by {@code newInstance}
         * @param xmlSW stream writer that all other calls are forwarded to
         */
        public UnicodeXMLStreamWriter(Writer writer, XMLStreamWriter xmlSW) {
            this.writer = writer;
            this.xmlSW = xmlSW;
        }

        /**
         * Escapes {@code text} and writes it directly to the underlying writer.
         *
         * @param text character data to write
         * @throws XMLStreamException if the wrapped writer fails or the underlying writer
         *         throws an {@link IOException}
         */
        @Override
        public void writeCharacters(String text) throws XMLStreamException {
            writeEscaped(text);
        }

        /**
         * Escapes {@code len} characters of {@code text} starting at {@code start} and writes
         * them directly to the underlying writer.
         *
         * @param text array holding the character data
         * @param start index of the first character to write
         * @param len number of characters to write
         * @throws XMLStreamException if the wrapped writer fails or the underlying writer
         *         throws an {@link IOException}
         */
        @Override
        public void writeCharacters(char[] text, int start, int len)
                throws XMLStreamException {
            writeEscaped(CharBuffer.wrap(text, start, len));
        }

        /** Shared implementation of both {@code writeCharacters} overloads. */
        private void writeEscaped(CharSequence text) throws XMLStreamException {
            // finish writing start element, and push everything the wrapped writer has
            // produced so far to the underlying writer before bypassing it
            xmlSW.writeCharacters("");
            xmlSW.flush();
            try {
                characterEscaper.translate(text, writer);
            } catch (IOException e) {
                throw new XMLStreamException(e);
            }
        }

        //////////////// REMAINING METHODS ARE DELEGATES to xmlSW ////////////////

        @Override
        public void writeStartElement(String localName) throws XMLStreamException {
            xmlSW.writeStartElement(localName);
        }

        @Override
        public void writeStartElement(String namespaceURI, String localName)
                throws XMLStreamException {
            xmlSW.writeStartElement(namespaceURI, localName);
        }

        @Override
        public void writeStartElement(String prefix, String localName,
                String namespaceURI) throws XMLStreamException {
            xmlSW.writeStartElement(prefix, localName, namespaceURI);
        }

        @Override
        public void writeEmptyElement(String namespaceURI, String localName)
                throws XMLStreamException {
            xmlSW.writeEmptyElement(namespaceURI, localName);
        }

        @Override
        public void writeEmptyElement(String prefix, String localName,
                String namespaceURI) throws XMLStreamException {
            xmlSW.writeEmptyElement(prefix, localName, namespaceURI);
        }

        @Override
        public void writeEmptyElement(String localName) throws XMLStreamException {
            xmlSW.writeEmptyElement(localName);
        }

        @Override
        public void writeEndElement() throws XMLStreamException {
            xmlSW.writeEndElement();
        }

        @Override
        public void writeEndDocument() throws XMLStreamException {
            xmlSW.writeEndDocument();
        }

        @Override
        public void close() throws XMLStreamException {
            xmlSW.close();
        }

        @Override
        public void flush() throws XMLStreamException {
            xmlSW.flush();
        }

        @Override
        public void writeAttribute(String localName, String value)
                throws XMLStreamException {
            xmlSW.writeAttribute(localName, value);
        }

        @Override
        public void writeAttribute(String prefix, String namespaceURI,
                String localName, String value) throws XMLStreamException {
            xmlSW.writeAttribute(prefix, namespaceURI, localName, value);
        }

        @Override
        public void writeAttribute(String namespaceURI, String localName,
                String value) throws XMLStreamException {
            xmlSW.writeAttribute(namespaceURI, localName, value);
        }

        @Override
        public void writeNamespace(String prefix, String namespaceURI)
                throws XMLStreamException {
            xmlSW.writeNamespace(prefix, namespaceURI);
        }

        @Override
        public void writeDefaultNamespace(String namespaceURI)
                throws XMLStreamException {
            xmlSW.writeDefaultNamespace(namespaceURI);
        }

        @Override
        public void writeComment(String data) throws XMLStreamException {
            xmlSW.writeComment(data);
        }

        @Override
        public void writeProcessingInstruction(String target)
                throws XMLStreamException {
            xmlSW.writeProcessingInstruction(target);
        }

        @Override
        public void writeProcessingInstruction(String target, String data)
                throws XMLStreamException {
            xmlSW.writeProcessingInstruction(target, data);
        }

        @Override
        public void writeCData(String data) throws XMLStreamException {
            xmlSW.writeCData(data);
        }

        @Override
        public void writeDTD(String dtd) throws XMLStreamException {
            xmlSW.writeDTD(dtd);
        }

        @Override
        public void writeEntityRef(String name) throws XMLStreamException {
            xmlSW.writeEntityRef(name);
        }

        @Override
        public void writeStartDocument() throws XMLStreamException {
            xmlSW.writeStartDocument();
        }

        @Override
        public void writeStartDocument(String version) throws XMLStreamException {
            xmlSW.writeStartDocument(version);
        }

        @Override
        public void writeStartDocument(String encoding, String version)
                throws XMLStreamException {
            xmlSW.writeStartDocument(encoding, version);
        }

        @Override
        public String getPrefix(String uri) throws XMLStreamException {
            return xmlSW.getPrefix(uri);
        }

        @Override
        public void setPrefix(String prefix, String uri) throws XMLStreamException {
            xmlSW.setPrefix(prefix, uri);
        }

        @Override
        public void setDefaultNamespace(String uri) throws XMLStreamException {
            xmlSW.setDefaultNamespace(uri);
        }

        @Override
        public void setNamespaceContext(NamespaceContext context)
                throws XMLStreamException {
            xmlSW.setNamespaceContext(context);
        }

        @Override
        public NamespaceContext getNamespaceContext() {
            return xmlSW.getNamespaceContext();
        }

        @Override
        public Object getProperty(String name) throws IllegalArgumentException {
            return xmlSW.getProperty(name);
        }
    }
    

    Using this results in the desired output. Now to file a bug report...