I've been trying to use puppeteer to scrape Twitch.
The idea for this program is to get the icon, username, & thumbnail of every stream in (for example) the category 'Just Chatting' in the 1st page. I think my main code is working, but the object I'm trying to return (properties) is being returned as undefined.
I tried adding await behind my console.log in the function log(), and I also searched it up on here and read that the values returned from evaluate function have to be json serializeable, which I believe do include the strings which the object would have. Any help would be appreciated, thanks!
let properties = { icon: [], user: [], img: [], link: [] };
const puppeteer = require('puppeteer');
let elements = {
'https://www.twitch.tv/directory/game/Just%20Chatting': [
'img[class="InjectLayout-sc-588ddc-0.iyfkau.tw-image.tw-image-avatar"]',
'a[class="ScCoreLink-udwpw5-0.cxXSPs.tw-link"]',
'img[class="tw-image"]',
],
};
async function scrapeStreams() {
console.log('scrape started');
try {
console.log('try started');
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await page.goto(Object.keys(elements)[0], { waitUntil: 'networkidle2' });
await page.evaluate(
(properties, elements) => {
for ([key, value] of Object.entries(elements)) {
if ((key = Object.keys(elements)[0])) {
value.forEach((element) => {
if ((element = Object.values(elements)[0])) {
el = document.querySelector(element);
for (let val in el) {
datatype = val.src;
Object.values(properties)[0].push(datatype);
}
} else if ((element = Object.values(elements)[1])) {
el = document.querySelector(element);
for (let val in el) {
datatype = val.innerHTML;
Object.values(properties)[1].push(datatype);
}
} else if ((element = Object.values(elements)[2])) {
el = document.querySelector(element);
for (let val in el) {
datatype = val.src;
Object.values(properties)[2].push(datatype);
}
}
});
}
}
return properties;
},
properties,
elements
);
} catch (error) {
console.log('THIS IS THE ERROR: ' + error);
}
}
async function log() {
let properties = await scrapeStreams();
console.log(properties);
}
log();
Variables inside and outside of the function argument of page.evaluate()
are not the same: they are copied while transferred between Node.js and browser contexts. So while you change properties
inside page.evaluate()
, the properties
outside remains unchanged. While you use return properties;
inside page.evaluate()
, you are not save the returned value.
You forget to return value in scrapeStreams()
.
However, it seems there are some other issues in your code (many null
are returned), but you may use another question for them.
// ...
// FIXED:
properties = await page.evaluate(
// ...
// FIXED:
return properties;
} catch (error) {
console.log('THIS IS THE ERROR: ' + error);
}
}
// ...