Search code examples
html, c, libxml2

How do I get the text inside a tag with libxml2?


I have htmlDocPtr htmlfile = htmlParseFile(localfileurl, NULL).

Local Html file

 <!DOCTYPE html>
<html>
<head>
<meta></meta>
<title>Page Title</title>
</head>
<body>

<h1>This is a Heading</h1>
<p>This is a paragraph.</p>

</body>
</html> 

I want to end up storing the title text ("Page Title") in a char * string variable.

I have tried

// Start from the root element of the parsed document.
htmlNodePtr node = xmlDocGetRootElement(htmlfile);
// Hard-coded walk: first child of the root, its next sibling, that node's
// first child, then three siblings further along.
// NOTE(review): presumably meant to land on <title>; the exact hops depend on
// which whitespace text nodes the parser keeps, and any NULL link in the
// chain would be dereferenced -- confirm against the real tree.
node = node->children->next->children->next->next->next;

How do I now get the value of the title element?


Solution

  • Traverse the document, looking for the element named "title", and get its contents. (An htmlDocPtr returned by htmlParseFile is an ordinary xmlDocPtr, so the same traversal works on it.)

    /*
     * Visits a_node, each of its following siblings and all of their
     * descendants, printing the name and text content of every element
     * named "title".
     *
     * doc:    document the nodes belong to; handed to xmlNodeListGetString.
     * a_node: first node to visit. NULL is accepted and prints nothing.
     */
    static void printTitle(xmlDoc *doc, xmlNode * a_node)
    {
        for (xmlNode *cur_node = a_node; cur_node != NULL; cur_node = cur_node->next) {
            if (cur_node->type == XML_ELEMENT_NODE &&
                xmlStrcmp(cur_node->name, (const xmlChar *)"title") == 0) {
                /* Newly allocated copy of the text, or NULL for an empty element. */
                xmlChar *content = xmlNodeListGetString(doc, cur_node->children, 1);

                /* Passing NULL to %s is undefined behaviour; print "" instead. */
                printf("node type: Element, name: %s, content: %s\n",
                       (const char *)cur_node->name,
                       content != NULL ? (const char *)content : "");

                if (content != NULL) {
                    xmlFree(content);
                }
            }

            /* Descend so that elements nested at any depth are examined too. */
            printTitle(doc, cur_node->children);
        }
    }
    
    
    int main(int argc, char **argv)
    {
        xmlDoc *doc = NULL;
        xmlNode *root_element = NULL;
    
        if (argc != 2)
            return(1);
    
        LIBXML_TEST_VERSION
    
        doc = xmlReadFile(argv[1], NULL, 0);
    
        if (doc == NULL) {
            printf("error: could not parse file %s\n", argv[1]);
        }
    
        root_element = xmlDocGetRootElement(doc);
    
        printTitle(doc, root_element);
    
        xmlFreeDoc(doc);
    
        xmlCleanupParser();
    
        return 0;
    }
    

    (See the "Retrieving Element Content" section of the libxml2 tutorial.)