I'm trying to parse an HTML string, and write it out again (for the purposes of brevity, I've left out the transforms I wish to perform).
#include <iostream>
#include <libxml/HTMLparser.h>
/* Minimal HTML5 document used as the parser input below. */
static const char *html =
    "<!DOCTYPE html>"
    "<html>"
    "<head></head>"
    "<body><div></div></body>"
    "</html>";
int main(int argc, const char * argv[]) {
xmlChar *buff;
int buffersize;
htmlDocPtr doc = htmlReadMemory(html, (unsigned)strlen(html), "noname.html", NULL, 0);
xmlDocDumpFormatMemory(doc, &buff, &buffersize, 1);
printf("%s", (char *) buff);
xmlFree(buff);
xmlFreeDoc(doc);
return 0;
}
(I am almost completely new to C++ - please excuse any glaring errors.)
This all works (in that it doesn't error out), but `xmlDocDumpFormatMemory` treats the tree as XML and outputs accordingly:
<?xml version="1.0" standalone="yes"?>
<!DOCTYPE html>
<html>
<head/>
<body>
<div/>
</body>
</html>
As you can see, there's an XML declaration added above the doctype, and empty tags have been self-closed. As far as I can tell, there is no `htmlDocDumpFormatMemory` - is there an option or alternative function I can use?
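Try the following code. It makes use of the `xmlSaveTo...` family of functions (`xmlSaveToBuffer`, in this case), which allow you to pass save options such as `XML_SAVE_NO_DECL`. When the document was produced by the HTML parser, `xmlSaveDoc` serializes it using HTML rules rather than XML rules, so empty elements are not self-closed: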
#include <iostream>
#include <libxml/HTMLparser.h>
#include <libxml/xmlsave.h>
/* Sample HTML5 document that is parsed and then re-serialized. */
static const char *html =
    "<!DOCTYPE html>"
    "<html>"
    "<head></head>"
    "<body><div></div></body>"
    "</html>";
int main(int argc, const char * argv[]) {
htmlDocPtr doc = htmlReadMemory(html, (unsigned)strlen(html), "noname.html", NULL, 0);
xmlBufferPtr buffer = xmlBufferCreate();
if (buffer == NULL)
return 1; // Add error handling...
xmlSaveCtxtPtr saveCtxtPtr = xmlSaveToBuffer(buffer,NULL, XML_SAVE_NO_DECL);
if (xmlSaveDoc(saveCtxtPtr, doc) < 0)
return 1; // Add error handling
xmlSaveClose(saveCtxtPtr);
const xmlChar *xmlCharBuffer = xmlBufferContent(buffer);
printf("%s", xmlCharBuffer);
xmlBufferFree(buffer);
xmlFreeDoc(doc);
return 0;
}
It produces the following output:
<!DOCTYPE html>
<html><head></head><body><div></div></body></html>