I want to strip some tags from an HTML document, like so:
const { JSDOM } = require('jsdom');
/**
 * Replaces every `<em>`, `<b>` and `<u>` element that is a direct child of an
 * anchor tag with a plain text node holding that element's text content.
 *
 * Only direct element children of each `<a>` are inspected; formatting tags
 * nested deeper inside other elements are left untouched.
 *
 * @param pDom - Either an existing JSDOM instance or an HTML string, which is
 *   parsed into a new JSDOM instance. `JSDOM` is obtained through `require`,
 *   so it is only available as a value; `InstanceType<typeof JSDOM>` is used
 *   to refer to its instance type.
 * @returns The result of `serialize()` on the document, i.e. the whole
 *   document as a string, after the final `replace` call.
 */
function clearAnchorTags(pDom: InstanceType<typeof JSDOM> | string): string {
  // Work on a local binding instead of reassigning the parameter.
  const dom = typeof pDom === 'string' ? new JSDOM(pDom) : pDom;
  const doc = dom.window.document;
  const flattenedTags = new Set(['EM', 'B', 'U']);

  // Get all anchor tags from dom
  for (const anchorTag of doc.querySelectorAll('a')) {
    // `children` is a live collection: replacing a child while iterating it
    // directly shrinks the collection under the iterator and the following
    // sibling gets skipped. Iterate over a snapshot instead.
    for (const child of Array.from(anchorTag.children)) {
      if (flattenedTags.has(child.nodeName)) {
        const node = doc.createTextNode(child.textContent ?? '');
        anchorTag.replaceChild(node, child);
      }
    }
  }

  // return as string just as we received it -- also replace with space
  // NOTE(review): as written, this pattern swaps a space for a space, which is
  // a no-op; it was presumably meant to match "&nbsp;" -- confirm and adjust.
  return dom.serialize().replace(/ /g, ' ');
}
The issue seems to be the serialize() call at the end, as it returns a complete HTML document.
I need the output without the <html>, <head> and <body> tags; I only need an HTML fragment.
I fixed my problem by returning the body's inner HTML instead of the serialized document:
return pDom.window.document.body.innerHTML.replace(/ /g, ' ');