There is a web page that I would like to scrape some information from.
I start off with gathering a bunch of HTML Elements.
// Collect every element that carries the class name 'theID'.
var theSearch = document.getElementsByClassName('theID');
I then take that HTML Collection and turn it into an array.
// Snapshot the HTMLCollection into a real array so array methods are available.
var arr = Array.from(theSearch);
Now comes the tricky part.
I'd like to scroll down the page, and grab new items that have appeared on the page.
// Scroll to the bottom of the document (y = full scroll height of the body).
window.scrollTo(0, document.body.scrollHeight);
How does one access the newly inserted DOM nodes? Something like ...
// Run the same class-name query a second time, after the scroll.
var theSearch2 = document.getElementsByClassName('theID');
... and casting it into a new array ...
// Convert the second query result (not the first one) into its own array.
var arr2 = Array.from(theSearch2);
... and pushing the items from `arr2` into `arr`, like `arr.push(...arr2);`?
And how would one achieve an ongoing process which keeps scraping until no new items are appended to the page's DOM?
The OP might have a look into `MutationObserver`. Whenever new items are rendered into the DOM (triggered by the scrolling), the observer's callback receives a list of `MutationRecord` instances which the OP can act upon.
// Accumulates every node that has been added to the observed target so far.
const scrapedContentNodes = [];

/**
 * MutationObserver callback which collects newly inserted nodes.
 *
 * @param {Iterable<{type: string, addedNodes: Iterable<*>}>} mutationList
 *   The mutation records delivered by the observer. Only records of type
 *   'childList' are considered; all others are ignored.
 * @returns {void} Nothing; the added nodes are appended to `scrapedContentNodes`.
 */
function handleChildlistChanges(mutationList/*, observer*/) {
  mutationList.forEach(({ type, addedNodes }) => {
    if (type !== 'childList') {
      return;
    }
    // One or more children have been added to and/or removed from the tree.
    // Only the additions are of interest for scraping.
    for (const node of addedNodes) {
      scrapedContentNodes.push(node);
    }
    console.log({ scrapedContentNodes });
  });
}
// Observer configuration: with only `childList` enabled, just the direct
// children of the target are watched for insertions/removals.
const options = {
  // attributes: true, // enable to also receive attribute-change records
  childList: true,
  // subtree: true, // enable when items are inserted deeper than one level
};
// Container element whose newly inserted children are to be collected.
const target = document.querySelector('#items');

// Each batch of mutation records for `target` is passed to `handleChildlistChanges`.
const observer = new MutationObserver(handleChildlistChanges);
observer.observe(target, options);
// Test case: create content over time, the way lazy loading on scroll would.
// Each paragraph is scheduled 600 ms after the previous one, so the insertions
// happen in separate tasks rather than as a single batch.
['the quick', 'brown fox', 'jumped over', 'the lazy dog.']
  .forEach((content, idx) => {
    const contentNode = document.createElement('p');
    // textContent (not innerHTML) so the sample text is never parsed as markup.
    contentNode.textContent = content;

    setTimeout(
      () => target.appendChild(contentNode),
      600 * idx,
    );
  });
/* Overrides left/width/min-height of `.as-console-wrapper` (presumably the snippet console overlay; confirm) so it takes 70% of the width at full height. */
.as-console-wrapper { left: auto!important; width: 70%; min-height: 100%; }
<div id="items"></div>