Search code examples
c++ cdata xerces

Xerces parser CDATA section not parsed correctly


I have a problem parsing a CDATA section when using the Xerces parser. It manages to parse some of the XML correctly, but for some reason part of one CDATA section contains weird characters.

This is my xml string:

// Sample document used to reproduce the problem. Both <Status> (inner) and
// <Test> wrap their payload in a CDATA section, so the markup-looking content
// between "<![CDATA[" and "]]>" is character data, not child elements.
const std::string XML = R"(<?xml version="1.0" encoding="UTF-8"?>
<AAA version="5.2.0">
    <Status>
        <CurrentStatus>
            <Status><![CDATA[<LghjL><AAAA><BBBCCCCCDD><EEEEE>AAAAccf123</EEEEE></BBBCCCCCDD></AAAA></FF><AAAA><BBBCCCCCDD><EEEEE></BBBCCCCCDD>]]></Status>
            <Test><![CDATA[<TEST>data</TEST>]]></Test>
        </CurrentStatus>
    </Status>
</AAA>)";

This is how my parser is set up:


#include <iostream>
#include <list>
#include <sstream>
#include <stdexcept>
#include <string>

#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/dom/DOM.hpp>
#include <xercesc/sax/HandlerBase.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax/ErrorHandler.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XMLUni.hpp>
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/validators/common/Grammar.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/MemBufFormatTarget.hpp>
#include <xercesc/dom/DOMImplementationRegistry.hpp>
#include <xercesc/dom/DOMErrorHandler.hpp>
#include <xercesc/dom/DOMImplementationLS.hpp>
#include <xercesc/dom/DOMConfiguration.hpp>
#include <xercesc/dom/DOMCDATASection.hpp>
#include <xercesc/util/XMLException.hpp>
#include <xercesc/sax/SAXException.hpp>
#include <xercesc/dom/DOMDocument.hpp>

#include <xercesc/dom/DOMImplementation.hpp>
#include <xercesc/dom/DOMLSSerializer.hpp>
#include <xercesc/dom/DOMLSOutput.hpp>
#include <xercesc/framework/LocalFileFormatTarget.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>

namespace xmlx
{
class XMLNode
{
private:
xercesc::DOMElement* _element;
bool    _first_in_loop;

public:
XMLNode() : _element(nullptr), _first_in_loop(true)
{
}

XMLNode(xercesc::DOMElement* element) : _element(element), _first_in_loop(true) {}

~XMLNode()
{
}

    xercesc::DOMDocument* parseXml(const std::string& XML = "")
    {
        xercesc::MemBufInputSource memBuf(reinterpret_cast<const XMLByte*>(XML.c_str()),             XML.size(), "xmlBuffer", false);
    
    
        xercesc::XercesDOMParser* parser = new xercesc::XercesDOMParser();
        parser->setValidationScheme(xercesc::XercesDOMParser::Val_Never);
        parser->setDoNamespaces(false);
        parser->setDoSchema(false);
        parser->setLoadExternalDTD(false);
    
        xercesc::DOMDocument* doc;
        try
        {
            parser->parse(memBuf);
            doc = parser->getDocument();
            delete parser;
        }
        catch (...)
        {
            std::cerr << "Unexpected exception during parsing" << std::endl;
        }

        _element = doc->getDocumentElement();
        return doc;
    }
    XMLNode operator [](const std::string& tagName){
        if(_element == nullptr)
        {
            std::string error = "------------------- OBS *************** Current element is null element: " +tagName; 
            throw(std::runtime_error(error));
        }
        for (DOMNode* child = _element->getFirstChild(); child != nullptr; child = child->getNextSibling())
        {
            if (child->getNodeType() == xercesc::DOMNode::ELEMENT_NODE)
            {

                xercesc::DOMElement* currentElement = dynamic_cast<xercesc::DOMElement*>(child);
                std::string myStr = xercesc::XMLString::transcode(currentElement->getTagName());
                if (myStr == tagName)
                {
                    return XMLNode(currentElement);
                }
            }
        }
        std::string error = "No next element found";
        throw(std::runtime_error(error));
    }

    std::string getText()
    {
        if (_element == nullptr) return "";

        for (DOMNode* child = _element->getFirstChild(); child != nullptr; child = child->getNextSibling())
        {
            if (child->getNodeType() == xercesc::DOMNode::TEXT_NODE)
            {
                xercesc::DOMText* tempText = dynamic_cast<xercesc::DOMText*>(child);
                if (tempText == nullptr)
                {
                    return "";
                }
                char* textContent = xercesc::XMLString::transcode(tempText->getData());
                std::string text(textContent);
                xercesc::XMLString::release(&textContent);
                return text;
            }
            if (child->getNodeType() == xercesc::DOMNode::CDATA_SECTION_NODE)
            {
                xercesc::DOMCDATASection* tempCdata1 = dynamic_cast<xercesc::DOMCDATASection*>(child);
                if (tempCdata1 == nullptr)
                {
                    return "";
                }
                char* textContent1 = xercesc::XMLString::transcode(tempCdata1->getData());
                std::string text1(textContent1);
                xercesc::XMLString::release(&textContent1);
                return text1;
            }
        }
        return "";
    }
};
}

Printing out to the console:

xercesc::XMLPlatformUtils::Initialize();

xmlx::XMLNode myXmlx;
myXmlx.parseXml(XML);
std::cout << myXmlx["Status"]["CurrentStatus"]["Test"].getText() << "\n"; 
//I get: <TEST>data</TEST> 
std::cout << myXmlx["Status"]["CurrentStatus"]["Status"].getText() << "\n"; 
//I get: ????jL><AAAA><BBBCCCCCDD><EEEEE>AAAAccf123</EEEEE></BBBCCCCCDD></AAAA></FF><AAAA><BBBCCCCCDD><EEEEE></BBBCCCCCDD>
xercesc::XMLPlatformUtils::Terminate();

I get some very weird characters (replaced with ? here) at the beginning of the longer CDATA section, but I do not know why the two sections would behave differently: I am not using any unusual characters inside the longer CDATA section, and all my other test cases work.

Also, if I print the string before parsing it into the Xerces DOM tree, it shows up fine, so the problem should not be that my terminal cannot display some of the characters.


Solution

  • It is an error to call getDocument() and then delete parser, because the parser owns the Document:

    getDocument(): The returned DOMDocument object is owned by the parser.

    Instead, call parser->adoptDocument() to transfer ownership to your XMLNode:

    adoptDocument(): The caller will adopt the DOMDocument and thus is responsible to call DOMDocument::release() to release the associated memory. The parser will not delete it. The ownership is transferred from the parser to the caller.