I have a problem parsing a CDATA section with the Xerces parser. It manages to parse some of the XML correctly, but for some reason part of one CDATA section comes back containing weird characters.
This is my xml string:
// Sample document for the reproduction: two CDATA sections under
// <CurrentStatus> - a long one in <Status> and a short one in <Test>.
// The custom raw-string delimiter keeps the literal unambiguous even though
// the payload itself contains ")" followed by markup.
const std::string XML{R"xml(<?xml version="1.0" encoding="UTF-8"?>
<AAA version="5.2.0">
<Status>
<CurrentStatus>
<Status><![CDATA[<LghjL><AAAA><BBBCCCCCDD><EEEEE>AAAAccf123</EEEEE></BBBCCCCCDD></AAAA></FF><AAAA><BBBCCCCCDD><EEEEE></BBBCCCCCDD>]]></Status>
<Test><![CDATA[<TEST>data</TEST>]]></Test>
</CurrentStatus>
</Status>
</AAA>)xml"};
This is how my parser is set up:
#include <iostream>
#include <list>
#include <sstream>
#include <stdexcept>
#include <string>

#include <xercesc/dom/DOM.hpp>
#include <xercesc/dom/DOMCDATASection.hpp>
#include <xercesc/dom/DOMConfiguration.hpp>
#include <xercesc/dom/DOMDocument.hpp>
#include <xercesc/dom/DOMErrorHandler.hpp>
#include <xercesc/dom/DOMImplementation.hpp>
#include <xercesc/dom/DOMImplementationLS.hpp>
#include <xercesc/dom/DOMImplementationRegistry.hpp>
#include <xercesc/dom/DOMLSOutput.hpp>
#include <xercesc/dom/DOMLSSerializer.hpp>
#include <xercesc/framework/LocalFileFormatTarget.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/MemBufFormatTarget.hpp>
#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/sax/ErrorHandler.hpp>
#include <xercesc/sax/HandlerBase.hpp>
#include <xercesc/sax/SAXException.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XMLException.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/XMLUni.hpp>
#include <xercesc/validators/common/Grammar.hpp>
namespace xmlx
{
class XMLNode
{
private:
xercesc::DOMElement* _element;
bool _first_in_loop;
public:
XMLNode() : _element(nullptr), _first_in_loop(true)
{
}
XMLNode(xercesc::DOMElement* element) : _element(element), _first_in_loop(true) {}
~XMLNode()
{
}
xercesc::DOMDocument* parseXml(const std::string& XML = "")
{
xercesc::MemBufInputSource memBuf(reinterpret_cast<const XMLByte*>(XML.c_str()), XML.size(), "xmlBuffer", false);
xercesc::XercesDOMParser* parser = new xercesc::XercesDOMParser();
parser->setValidationScheme(xercesc::XercesDOMParser::Val_Never);
parser->setDoNamespaces(false);
parser->setDoSchema(false);
parser->setLoadExternalDTD(false);
xercesc::DOMDocument* doc;
try
{
parser->parse(memBuf);
doc = parser->getDocument();
delete parser;
}
catch (...)
{
std::cerr << "Unexpected exception during parsing" << std::endl;
}
_element = doc->getDocumentElement();
return doc;
}
XMLNode operator [](const std::string& tagName){
if(_element == nullptr)
{
std::string error = "------------------- OBS *************** Current element is null element: " +tagName;
throw(std::runtime_error(error));
}
for (DOMNode* child = _element->getFirstChild(); child != nullptr; child = child->getNextSibling())
{
if (child->getNodeType() == xercesc::DOMNode::ELEMENT_NODE)
{
xercesc::DOMElement* currentElement = dynamic_cast<xercesc::DOMElement*>(child);
std::string myStr = xercesc::XMLString::transcode(currentElement->getTagName());
if (myStr == tagName)
{
return XMLNode(currentElement);
}
}
}
std::string error = "No next element found";
throw(std::runtime_error(error));
}
std::string getText()
{
if (_element == nullptr) return "";
for (DOMNode* child = _element->getFirstChild(); child != nullptr; child = child->getNextSibling())
{
if (child->getNodeType() == xercesc::DOMNode::TEXT_NODE)
{
xercesc::DOMText* tempText = dynamic_cast<xercesc::DOMText*>(child);
if (tempText == nullptr)
{
return "";
}
char* textContent = xercesc::XMLString::transcode(tempText->getData());
std::string text(textContent);
xercesc::XMLString::release(&textContent);
return text;
}
if (child->getNodeType() == xercesc::DOMNode::CDATA_SECTION_NODE)
{
xercesc::DOMCDATASection* tempCdata1 = dynamic_cast<xercesc::DOMCDATASection*>(child);
if (tempCdata1 == nullptr)
{
return "";
}
char* textContent1 = xercesc::XMLString::transcode(tempCdata1->getData());
std::string text1(textContent1);
xercesc::XMLString::release(&textContent1);
return text1;
}
}
return "";
}
};
}
Printing out to the console:
xercesc::XMLPlatformUtils::Initialize();
xmlx::XMLNode myXmlx;
myXmlx.parseXml(XML);
std::cout << myXmlx["Status"]["CurrentStatus"]["Test"].getText() << "\n";
//I get: <TEST>data</TEST>
std::cout << myXmlx["Status"]["CurrentStatus"]["Status"].getText() << "\n";
//I get: ????jL><AAAA><BBBCCCCCDD><EEEEE>AAAAccf123</EEEEE></BBBCCCCCDD></AAAA></FF><AAAA><BBBCCCCCDD><EEEEE></BBBCCCCCDD>
xercesc::XMLPlatformUtils::Terminate();
I get some very weird characters (replaced here with ? signs) at the beginning of the longer CDATA section, but I do not know why the two sections behave differently: I am not using any unusual characters inside the longer CDATA section, and all my other test cases work.
Also, if I print the string before parsing it into the Xerces DOM tree, it shows up fine, so the problem should not be that my terminal cannot display some of the characters.
It is an error to call getDocument() and then delete parser, because the parser owns the document:
getDocument(): "The returned DOMDocument object is owned by the parser."
Instead, call parser->adoptDocument() to transfer ownership to your XMLNode:
adoptDocument(): "The caller will adopt the DOMDocument and thus is responsible to call DOMDocument::release() to release the associated memory. The parser will not delete it. The ownership is transferred from the parser to the caller."