Why does the following use of `DOMParser` result in HTML that differs from the HTML used as input? It removes the whitespace between the DOCTYPE and the top-level element, removes the whitespace between the document element and `head`, and adds a newline before `</body>`.
I have tested this in Google Chrome, Firefox, and Safari; I also ran the analogous code with JSoup and got exactly the same results. So I'm pretty sure it's not a bug. My current theory is that this is caused by some sort of esoteric parsing rule from a specification somewhere. But there could be other things I misunderstand.
// Sample document used as the parser input. Every tag sits on its own line,
// so the only whitespace in the string is the "\n" between consecutive lines.
const html = [
  "<!DOCTYPE html>",
  "<html>",
  "<head>",
  "<title>1</title>",
  "</head>",
  "<body>",
  "<div>",
  "Hello, World!",
  "</div>",
  "</body>",
  "</html>",
].join("\n");
/**
 * Appends `string`, wrapped in a new text node, to the element whose id is `id`.
 * Using a text node means any markup inside `string` is shown as-is, not parsed.
 *
 * @param {string} id - id of the element that receives the text.
 * @param {string} string - text to append.
 */
const setText = function(id, string) {
  const target = document.getElementById(id);
  target.appendChild(document.createTextNode(string));
};
/**
 * Serializes a document by concatenating the markup of each direct child.
 *
 * Elements contribute their `outerHTML`; doctype, comment and
 * processing-instruction nodes are serialized with `XMLSerializer`. Comments
 * and processing instructions are legal direct children of a document, so
 * they are serialized instead of being rejected.
 *
 * @param {Document} d - document (or any object exposing `childNodes`) to serialize.
 * @returns {string} the concatenated markup; "" when there are no children.
 * @throws {TypeError} if a child is of any other node type.
 */
const documentToString = function(d) {
  return Array.from(d.childNodes, function(node) {
    switch (node.nodeType) {
      case node.ELEMENT_NODE:
        return node.outerHTML;
      case node.DOCUMENT_TYPE_NODE:
      case node.COMMENT_NODE:
      case node.PROCESSING_INSTRUCTION_NODE:
        // outerHTML only exists on elements, so these go through XMLSerializer.
        return new XMLSerializer().serializeToString(node);
      default:
        throw new TypeError(String(node));
    }
  }).join("");
};
// Show the untouched input first so each serialization can be compared with it.
setText("raw", html);

// Parse the string with the HTML parser. `const` because the binding is never
// reassigned, matching the other declarations in this snippet.
const parsed = new DOMParser().parseFromString(html, "text/html");

// Three ways of turning the parsed document back into a string.
setText("parsed", parsed.documentElement.outerHTML);
setText("converted", documentToString(parsed));
setText("xmlserializer", new XMLSerializer().serializeToString(parsed));
/* Output panes: preserve whitespace exactly and use a fixed-width font. */
#raw,
#parsed,
#converted,
#xmlserializer {
  white-space: pre;
  font-family: monospace;
}

/* Headings that label each output pane. */
h1 {
  font-size: 110%;
  font-weight: bold;
  font-family: sans-serif;
}
<body>
<h1>Raw string</h1>
<div id="raw"></div>
<h1>Parsed top-level element</h1>
<div id="parsed"></div>
<h1>Using a document-to-string converter</h1>
<div id="converted"></div>
<h1>From XMLSerializer</h1>
<div id="xmlserializer"></div>
</body>
Because that's what the specs require.
HTML is not XML, and the HTML parser applies a lot of transformations. For instance, you may not have realized it, but your very StackSnippet contains a duplicate `<body>` tag, because the HTML section is already wrapped in such a tag by the snippet's script. The duplicate is ignored during document parsing.
// Count the <body> elements present in the current document and log the total.
const bodyCount = document.querySelectorAll('body').length;
console.log('how many bodies?', bodyCount);
<body><body><body></body></body></body>
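Similar transformations happen to Text nodes: the HTML parsing algorithm ignores whitespace that appears before the `<html>` and `<head>` start tags, and whitespace that follows `</body>` is appended to the end of the body element.
And yes, it's not a `DOMParser` thing; it's really an HTML parsing one. You get the same behavior when a document is parsed normally: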
// Look the iframe up explicitly rather than relying on the implicit `frame`
// global that browsers expose for elements with an id.
const iframe = document.getElementById("frame");

// Register the listener before navigating the iframe, and only log messages
// that were posted by the iframe's own document.
window.addEventListener("message", (e) => {
  if (e.source === iframe.contentWindow) {
    console.log(e.data);
  }
});

// Load the markup as a real document. The inline script posts the serialized
// document element back to this window once the parser reaches it.
iframe.src = URL.createObjectURL(new Blob([
`<!DOCTYPE html>
<html>
<head>
<title>1</title>
</head>
<body>
<div>
Hello, World!
</div>
<script> parent.postMessage(document.documentElement.outerHTML, "*");
<\/script>
</body>
</html>`], {type: 'text/html'}));
<iframe id="frame"></iframe>
Now, if you wish to retrieve the exact same string, then parse it as XML instead:
// Sample document used as the parser input. Every tag sits on its own line,
// so the only whitespace in the string is the "\n" between consecutive lines.
const html = [
  "<!DOCTYPE html>",
  "<html>",
  "<head>",
  "<title>1</title>",
  "</head>",
  "<body>",
  "<div>",
  "Hello, World!",
  "</div>",
  "</body>",
  "</html>",
].join("\n");
/**
 * Appends `string`, wrapped in a new text node, to the element whose id is `id`.
 * Using a text node means any markup inside `string` is shown as-is, not parsed.
 *
 * @param {string} id - id of the element that receives the text.
 * @param {string} string - text to append.
 */
const setText = function(id, string) {
  const target = document.getElementById(id);
  target.appendChild(document.createTextNode(string));
};
/**
 * Serializes a document by concatenating the markup of each direct child.
 *
 * Elements contribute their `outerHTML`; doctype, comment and
 * processing-instruction nodes are serialized with `XMLSerializer`. Comments
 * and processing instructions are legal direct children of a document, so
 * they are serialized instead of being rejected.
 *
 * @param {Document} d - document (or any object exposing `childNodes`) to serialize.
 * @returns {string} the concatenated markup; "" when there are no children.
 * @throws {TypeError} if a child is of any other node type.
 */
const documentToString = function(d) {
  return Array.from(d.childNodes, function(node) {
    switch (node.nodeType) {
      case node.ELEMENT_NODE:
        return node.outerHTML;
      case node.DOCUMENT_TYPE_NODE:
      case node.COMMENT_NODE:
      case node.PROCESSING_INSTRUCTION_NODE:
        // outerHTML only exists on elements, so these go through XMLSerializer.
        return new XMLSerializer().serializeToString(node);
      default:
        throw new TypeError(String(node));
    }
  }).join("");
};
// Show the untouched input first so each serialization can be compared with it.
setText("raw", html);

// Parse the string with the XML parser this time. `const` because the binding
// is never reassigned, matching the other declarations in this snippet.
const parsed = new DOMParser().parseFromString(html, "text/xml");

// Three ways of turning the parsed document back into a string.
setText("parsed", parsed.documentElement.outerHTML);
setText("converted", documentToString(parsed));
setText("xmlserializer", new XMLSerializer().serializeToString(parsed));
/* Output panes: preserve whitespace exactly and use a fixed-width font. */
#raw,
#parsed,
#converted,
#xmlserializer {
  white-space: pre;
  font-family: monospace;
}

/* Headings that label each output pane. */
h1 {
  font-size: 110%;
  font-weight: bold;
  font-family: sans-serif;
}
<h1>Raw string</h1>
<div id="raw"></div>
<h1>Parsed top-level element</h1>
<div id="parsed"></div>
<h1>Using a document-to-string converter</h1>
<div id="converted"></div>
<h1>From XMLSerializer</h1>
<div id="xmlserializer"></div>