Search code examples
c++visual-c++msxmlmsxml6

Getting the name of child node in xml returns #text c++


I am trying to retrieve the name of the child node tag in an xml document. My xml document looks something like this:

<?xml version="1.0" encoding="utf-8"?>
<Parent>
  <child1>
    <grandchild1>someinfo1</grandchild1>
    <grandchild2>someinfo2</grandchild2>
  </child1>
  <child2>
    <grandchild3>someinfo3</grandchild3>
    <grandchild4>someinfo4</grandchild4>
  </child2>
</Parent>

I need to loop and find the tag names like child1 grandchild1 etc.

My code for doing this is as follows:

// Excerpt from the question. The code that creates pXMLDom and loads the XML
// is not shown (the pointer is only initialized to NULL here), and `length`
// is used below without a declaration appearing in this excerpt.
IXMLDOMDocument *pXMLDom = NULL;
IXMLDOMNodeList *pNodes = NULL;
IXMLDOMNode *pNode = NULL;

// Parser options. Note that whitespace preservation is switched ON here.
pXMLDom->put_async(VARIANT_FALSE);
pXMLDom->put_validateOnParse(VARIANT_TRUE);
pXMLDom->put_resolveExternals(VARIANT_FALSE);
pXMLDom->put_preserveWhiteSpace(VARIANT_TRUE);

// XPath query string; it is not released with SysFreeString in this excerpt.
BSTR parentNode = SysAllocString(L"//Parent/*");

// Run the query and read how many nodes it matched (return codes are ignored).
pXMLDom->selectNodes(parentNode, &pNodes); 
pNodes->get_length(&length);

for (int i = 0; i < length; i++)
{
    // Fetch the i-th matched node and print its serialized XML.
    pNodes->get_item(i, &pNode);
    BSTR temp = NULL;
    pNode->get_xml(&temp);
    printf("Node (%d), <%S>:\n", i, temp); // works fine until this point

    // First child of the matched node, whatever its node type is.
    IXMLDOMNode *firstChild;
    pNode->get_firstChild(&firstChild);

    // Full child list of the matched node; not used further in this excerpt.
    IXMLDOMNodeList *childNodes;
    pNode->get_childNodes(&childNodes);

    // Both calls write into `temp` again without freeing the previous BSTR.
    // The question title reports "#text" coming back instead of a tag name.
    firstChild->get_nodeName(&temp); // Does not work
    firstChild->get_baseName(&temp); // Does not work
}

Please note that I have only provided a very minimal version of my code for the sake of simplicity. If any additional clarification or code is required, I will be happy to provide it. Any pointers in the right direction will be helpful. Most of the code was written with help from MSDN.


Solution

  • XML is comprised of nodes, and there are many different kinds of nodes (elements, attributes, text, namespaces, processing instructions, comments, documents, etc).

    An XML element node that contains text content will have a child node named #text. This is dictated by the DOM specification. So, in your example, grandchild1, grandchild2, grandchild3, and grandchild4 all have a child #text node, e.g.:

    Document
    |
    |_ PI: <?xml version="1.0" encoding="utf-8"?>
    |
    |_ Element: "Parent"
        |
        |_ Element: "child1"
        |   |
        |   |_ Element: "grandchild1"
        |   |   |
        |   |   |_ #text "someinfo1"
        |   |
        |   |_ Element: "grandchild2"
        |       |
        |       |_ #text "someinfo2"
        |
        |_ Element: "child2"
            |
            |_ Element: "grandchild3"
            |    |
            |    |_ #text: "someinfo3"
            |
            |_ Element: "grandchild4"
                |
                |_ #text: "someinfo4"
    

    Even whitespace between elements, even if it is just line breaks, gets stored as extra text nodes (because you are setting the preserveWhiteSpace option to true), e.g.:

    Document
    |
    |_ PI: <?xml version="1.0" encoding="utf-8"?>
    |
    |_ #text "\r\n"
    |
    |_ Element: "Parent"
        |
        |_ #text "\r\n  "
        |
        |_ Element: "child1"
        |   |
        |   |_ #text "\r\n    "
        |   |
        |   |_ Element: "grandchild1"
        |   |   |
        |   |   |_ #text "someinfo1"
        |   |
        |   |_ #text "\r\n    "
        |   |
        |   |_ Element: "grandchild2"
        |   |   |
        |   |   |_ #text "someinfo2"
        |   |
        |   |_ #text "\r\n  "
        |
        |_ #text "\r\n  "
        |
        |_ Element: "child2"
        |   |
        |   |_ #text "\r\n    "
        |   |
        |   |_ Element: "grandchild3"
        |   |    |
        |   |    |_ #text: "someinfo3"
        |   |
        |   |_ #text "\r\n    "
        |   |
        |   |_ Element: "grandchild4"
        |   |   |
        |   |   |_ #text: "someinfo4"
        |   |
        |   |_ #text "\r\n  "
        |
        |_ #text "\r\n"
    

    XPath searches all nodes, but the * wildcard only matches element nodes. But you are manually drilling into the children of found elements, so you are going to encounter the #text nodes. For what you are attempting to do, turn OFF whitespace preservation to remove unwanted whitespace text nodes, and then focus only on element child nodes, eg:

    // Prints the name of every element directly under <Parent> and, indented
    // beneath it, the names of that element's own element children.
    // Whitespace preservation is turned off so that whitespace-only text
    // nodes are not created; any remaining non-element children (such as
    // #text content) are skipped by checking the node type.
    IXMLDOMDocument *pXMLDom = NULL;
    IXMLDOMNodeList *pNodes = NULL;
    IXMLDOMNode *pNode = NULL;
    long length = 0;
    
    // create pXMLDom as needed ...
    pXMLDom->put_async(VARIANT_FALSE);
    pXMLDom->put_validateOnParse(VARIANT_TRUE);
    pXMLDom->put_resolveExternals(VARIANT_FALSE);
    pXMLDom->put_preserveWhiteSpace(VARIANT_FALSE); // <--
    
    // The query string is only needed for the call itself, so free it
    // immediately regardless of whether the query succeeded.
    BSTR parentNode = SysAllocString(L"//Parent/*");
    HRESULT hRes = pXMLDom->selectNodes(parentNode, &pNodes); 
    SysFreeString(parentNode);
    
    if (SUCCEEDED(hRes))
    {
        pNodes->get_length(&length);
    
        for (int i = 0; i < length; ++i)
        {
            hRes = pNodes->get_item(i, &pNode);
            if (SUCCEEDED(hRes))
            {
                BSTR name = NULL;
                hRes = pNode->get_nodeName(&name);
                if (SUCCEEDED(hRes))
                {
                    printf("Node (%d), <%S>:\n", i, name);
                    SysFreeString(name);
                }
    
                // Walk the sibling chain starting at the first child. The
                // explicit S_OK comparison (rather than SUCCEEDED) stops the
                // walk on any other return code, so pChild is only
                // dereferenced after a call that returned S_OK.
                IXMLDOMNode *pChild = NULL;
                hRes = pNode->get_firstChild(&pChild);
                if (hRes == S_OK)
                {
                    do
                    {
                        DOMNodeType type;
                        hRes = pChild->get_nodeType(&type);
                        if (SUCCEEDED(hRes) && (type == NODE_ELEMENT))
                        {
                            // Query the CHILD here, not the parent, so that
                            // each grandchild's own tag name is printed.
                            BSTR childName = NULL;
                            hRes = pChild->get_nodeName(&childName);
                            if (SUCCEEDED(hRes))
                            {
                                printf("  %S\n", childName);
                                SysFreeString(childName);
                            }
                        }
    
                        IXMLDOMNode *pSibling = NULL;
                        hRes = pChild->get_nextSibling(&pSibling);
                        if (hRes != S_OK) break;
    
                        // Drop the reference to the current node before
                        // advancing to the sibling we now own.
                        pChild->Release();
                        pChild = pSibling;
                    }
                    while (true);
    
                    // Release the last node visited (the one with no sibling).
                    pChild->Release();
                }
    
                pNode->Release();
            }
        }
    
        pNodes->Release();
    }
    
    ...
    
    pXMLDom->Release();
    

    If you need to go more than 2 levels deep, you should set up a recursive function instead, e.g.:

    // Prints the name of pNode on its own line, then recurses into every
    // child of pNode whose node type is NODE_ELEMENT. Children of any other
    // type (for example #text nodes) are skipped.
    //
    // pNode must be a valid node pointer; the caller keeps ownership of it
    // (this function never releases pNode itself). Every child or sibling
    // reference obtained here is released before returning.
    void processNode(IXMLDOMNode *pNode)
    {
        BSTR name = NULL;
        HRESULT hRes = pNode->get_nodeName(&name);
        if (SUCCEEDED(hRes))
        {
            printf("%S\n", name);
            SysFreeString(name);
        }
    
        // The explicit S_OK comparison (rather than SUCCEEDED) stops the walk
        // on any other return code, so pChild is only dereferenced after a
        // call that returned S_OK.
        IXMLDOMNode *pChild = NULL;
        hRes = pNode->get_firstChild(&pChild);
        if (hRes == S_OK)
        {
            do
            {
                DOMNodeType type;
                hRes = pChild->get_nodeType(&type);
                if (SUCCEEDED(hRes) && (type == NODE_ELEMENT))
                    processNode(pChild);
    
                IXMLDOMNode *pSibling = NULL;
                hRes = pChild->get_nextSibling(&pSibling);
                if (hRes != S_OK) break;
    
                // Drop the current node before advancing to its sibling.
                pChild->Release();
                pChild = pSibling;
            }
            while (true);
    
            // Release the last node visited (the one with no next sibling).
            pChild->Release();
        }
    }
    
    ...
    
    // Driver for processNode(): selects every element directly under
    // <Parent> and hands each one to processNode(), releasing each node and
    // the node list afterwards.
    IXMLDOMDocument *pXMLDom = NULL;
    IXMLDOMNodeList *pNodes = NULL;
    IXMLDOMNode *pNode = NULL;
    long length = 0;
    
    // create pXMLDom as needed ...
    pXMLDom->put_async(VARIANT_FALSE);
    pXMLDom->put_validateOnParse(VARIANT_TRUE);
    pXMLDom->put_resolveExternals(VARIANT_FALSE);
    pXMLDom->put_preserveWhiteSpace(VARIANT_FALSE); // <--
    
    // The query string is freed right after the call, whether or not the
    // query succeeded.
    BSTR xpathQuery = SysAllocString(L"//Parent/*");
    HRESULT hRes = pXMLDom->selectNodes(xpathQuery, &pNodes);
    SysFreeString(xpathQuery);
    
    if (!FAILED(hRes))
    {
        pNodes->get_length(&length);
    
        int idx = 0;
        while (idx < length)
        {
            // Fetch the current entry and advance; entries that cannot be
            // fetched are skipped.
            hRes = pNodes->get_item(idx++, &pNode);
            if (FAILED(hRes))
                continue;
    
            processNode(pNode);
            pNode->Release();
        }
    
        pNodes->Release();
    }
    
    ...
    
    pXMLDom->Release();