Search code examples
pythonapache-nifi

Generate XML files with Japanese characters using NiFi


I am converting a JSON payload to an XML file using a Python ExecuteScript processor in NiFi. The JSON looks like this:

{
  "Header": {
    "Att1": 1,
    "Att2": "value2",
    "Att3": "1",
    "Att4": "경기00자123"
  }
}

The Python script that converts this JSON to XML is shown below:

import json
import xml.etree.ElementTree as ET
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback

class ModJSON(StreamCallback):
    """Stream callback that rewrites JSON flow-file content as XML.

    This is the version from the question, kept as posted. It copies
    Header.Att1 .. Header.Att4 from the incoming JSON into child elements
    of <headerinfo><headerfile>.
    """

    def __init__(self):
        pass

    def process(self, inputStream, outputStream):
        """Read JSON from inputStream and write the generated XML to outputStream."""
        # Read the whole input as UTF-8 text, then parse it as JSON.
        text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
        data = json.loads(text)
        # Build <headerinfo><headerfile>...</headerfile></headerinfo>.
        root = ET.Element("headerinfo")
        entity = ET.SubElement(root, "headerfile")
        # Att1..Att3 are passed through str() before being stored as element text.
        ET.SubElement(entity, "Att1").text = str(data["Header"]["Att1"])
        ET.SubElement(entity, "Att2").text = str(data["Header"]["Att2"])
        ET.SubElement(entity, "Att3").text = str(data["Header"]["Att3"])
        # Att4 is instead encoded to UTF-8 bytes before being stored.
        ET.SubElement(entity, "Att4").text = data["Header"]["Att4"].encode("utf8")
        # Serialize with no explicit encoding argument, then write the result
        # as a bytearray.
        # NOTE(review): the garbled Att4 output reported in the question
        # presumably comes from this encode -> tostring -> bytearray path
        # under Jython -- confirm.
        xmlNew = ET.tostring(root)
        outputStream.write(bytearray(xmlNew))

# Script entry point: take one flow file from the session and replace its
# content with whatever ModJSON writes.
flowFile = session.get()
if flowFile != None:
    try:
        converter = ModJSON()
        flowFile = session.write(flowFile, converter)
        # Rename the result, then route it to the success relationship.
        flowFile = session.putAttribute(flowFile, "filename", "headerfile.xml")
        session.transfer(flowFile, REL_SUCCESS)
        session.commit()
    except Exception as err:
        # Record the error text on the flow file and route it to failure.
        reason = str(err)
        flowFile = session.putAttribute(flowFile, "python_error", reason)
        session.transfer(flowFile, REL_FAILURE)

No matter how I try to encode Att4, which contains Japanese characters, it looks like this in the resulting XML:

京都111を3

How can I change the code to fix this? I have tried a lot of different things, but nothing seems to work.


Solution

  • There seems to be an issue with byte strings in Jython: they are automatically converted to str objects with an incorrect encoding.

    However, ElementTree has a write function that can write to a file-like object, and OutputStream (a Java object) implements a write method, so we can make ElementTree write directly to the OutputStream.

    import json
    import xml.etree.ElementTree as ET
    from org.apache.commons.io import IOUtils
    from java.nio.charset import StandardCharsets
    from org.apache.nifi.processor.io import StreamCallback
    
    class ModJSON(StreamCallback):
        """Stream callback that rewrites JSON flow-file content as XML.

        Copies Header.Att1 .. Header.Att4 from the incoming JSON into child
        elements of <headerinfo><headerfile> and lets ElementTree write the
        UTF-8 encoded document straight to the output stream.
        """

        def process(self, inputStream, outputStream):
            """Convert the JSON read from inputStream to XML on outputStream.

            Raises KeyError if "Header" or one of the Att1..Att4 keys is
            missing from the payload.
            """
            text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
            header = json.loads(text)["Header"]
            root = ET.Element("headerinfo")
            entity = ET.SubElement(root, "headerfile")
            # type(u"") is the text type of the running interpreter
            # (unicode on Python 2 / Jython), with no version-specific name.
            text_type = type(u"")
            for name in ("Att1", "Att2", "Att3", "Att4"):
                value = header[name]
                # Only stringify non-text values such as numbers. Calling
                # str() on text that contains non-ASCII characters raises
                # UnicodeEncodeError under Python 2 semantics, so text is
                # handed to ElementTree unchanged.
                if not isinstance(value, text_type):
                    value = str(value)
                ET.SubElement(entity, name).text = value
            # ElementTree does the UTF-8 encoding itself and calls
            # outputStream.write(), so no intermediate byte string is built.
            ET.ElementTree(root).write(outputStream, encoding='utf-8')
    
    # Script entry point: take one flow file from the session and replace its
    # content with the XML produced by ModJSON.
    flowFile = session.get()
    # Nothing to do when no flow file was returned; compare to None by identity.
    if flowFile is not None:
        try:
            flowFile = session.write(flowFile, ModJSON())
            # Rename the result, then route it to the success relationship.
            flowFile = session.putAttribute(flowFile, "filename", 'headerfile.xml')
            session.transfer(flowFile, REL_SUCCESS)
            session.commit()
        except Exception as e:
            # Record the error text on the flow file and route it to failure.
            flowFile = session.putAttribute(flowFile, 'python_error', str(e))
            session.transfer(flowFile, REL_FAILURE)