I want to get the src of multiple images but their selectors don't seem to work, like the elements are fake and aren't actually in the page.
https://imgflip.com/memegenerator/Drake-Hotline-Bling
This is one of the pages, every page has one of my desired elements (the main image), each with the same selector.
I have tried multiple selectors like:
'#mm-preview-outer > div.mm-preview > img'
and 'img[alt="meme generator image preview"]'
but they don't work.
I tested my code by scraping other elements and everything works but when I change the selector in the .$eval()
to my desired element it doesn't work (no errors).
This is my code working correctly with a different selector:
// Script from the question: opens the imgflip template list in one tab and,
// for every template box, navigates a second tab to that template's page and
// logs an image URL.
const puppeteer = require('puppeteer');
(async()=>{
const browser = await puppeteer.launch({
headless:false,
defaultViewport:false,
userDataDir:'./tmp'
});
// `page` holds the template list; `page2` is the tab navigated to each
// individual template page inside the loop.
const page = await browser.newPage();
const page2 = await browser.newPage();
await page.goto('https://imgflip.com/memetemplates');
const boxes = await page.$$('.mt-boxes > .mt-box');
for(const box of boxes){
try {
// Title text and relative link are read from inside the current box element.
const title = await page.evaluate((el) => el.querySelector('h3 > a').textContent, box);
const pageurl = await page.evaluate((el) => el.querySelector('a.mt-caption').getAttribute('href'), box);
await page2.goto(`https://imgflip.com${pageurl}`);
// NOTE(review): this query runs on `page` (still showing the template list),
// not on `page2`, which was navigated just above. Presumably unintended: a
// selector that only exists on the template's own page cannot match here.
const imageurl = await page.$eval('img[alt="Imgflip Logo"]', el => el.src);
console.log('The source of',title,'is')
console.log(imageurl);
// NOTE(review): the empty catch below discards any error thrown in the try
// body, so a failed lookup produces no output at all.
} catch(error){}
}
await browser.close();
})();
Technically all I need to do is to change 'img[alt="Imgflip Logo"]'
to 'img[alt="meme generator image preview"]'
but this doesn't work.
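To get img[alt="meme generator image preview"]
you can use the
'img.mm-img'
or 'img[class^=mm-img]'
selectors (note: ^= means "begins with").
Also replace el.src
with el.getAttribute('src'),
and run the query on page2 (the tab that was actually navigated to the meme generator page) instead of page.
So change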
const imageurl = await page.$eval('img[alt="Imgflip Logo"]', el => el.src);
to
const imageurl = await page2.$eval('img[class^=mm-img]', el => el.getAttribute('src'));
in your code.
You can also rewrite some of the other parts as shown below, although this is not strictly necessary for what you want.
const puppeteer = require('puppeteer');

/**
 * Opens the imgflip template list, visits every template's generator page in
 * a temporary tab and logs the `src` attribute of the template's main image.
 *
 * A failure on a single template is logged and does not stop the loop. The
 * temporary tab and the browser are always closed, even when an error occurs.
 */
(async () => {
  const baseUrl = 'https://imgflip.com';
  // Main template image on a generator page (class name begins with "mm-img").
  const previewImageSelector = 'img[class^=mm-img]';
  const navigationOptions = { waitUntil: 'load', timeout: 15000 };

  const browser = await puppeteer.launch({
    headless: false,
    // `null` is the documented way to disable the default viewport.
    defaultViewport: null,
  });

  try {
    const page = await browser.newPage();
    await page.goto(`${baseUrl}/memetemplates`, navigationOptions);
    await page.waitForSelector('#page');

    const boxes = await page.$$('.mt-box');
    for (const box of boxes) {
      // Declared outside the try so the finally clause can close the tab.
      let page2;
      try {
        const title = await box.$eval('h3 > a', (el) => el.textContent);
        const link = await box.$eval('a.mt-caption', (el) => el.getAttribute('href'));

        page2 = await browser.newPage();
        await page2.goto(new URL(link, baseUrl).href, navigationOptions);
        // Wait for the element that is actually read, not just for <body>.
        await page2.waitForSelector(previewImageSelector, { timeout: navigationOptions.timeout });
        const imageurl = await page2.$eval(previewImageSelector, (el) => el.getAttribute('src'));

        console.log('The source of', title, 'is');
        console.log(imageurl);
      } catch (error) {
        console.log(error);
      } finally {
        // Close the per-template tab on success and on failure alike so tabs
        // do not pile up when a template page cannot be scraped.
        if (page2) {
          await page2.close().catch((closeError) => console.log(closeError));
        }
      }
    }
  } finally {
    await browser.close();
  }
})().catch((error) => {
  // Report launch/navigation failures instead of leaving a floating rejection.
  console.error(error);
  process.exitCode = 1;
});
If you need to get the already existing memes for each template, you can do it this way.
Code:
const puppeteer = require('puppeteer');

/**
 * Opens the imgflip template list and, for every template, visits its page in
 * a temporary tab and collects the memes listed there.
 *
 * The result printed at the end is an array of
 * `{ link, text, relative: [{ link, text, image }] }` objects, one per
 * template. A failure on a single template is logged and that template is
 * skipped. The temporary tab and the browser are always closed.
 */
(async () => {
  const baseUrl = 'https://imgflip.com';
  const navigationOptions = { waitUntil: 'load', timeout: 15000 };

  const browser = await puppeteer.launch({
    headless: false,
    // `null` is the documented way to disable the default viewport.
    defaultViewport: null,
  });

  const allMemes = [];
  try {
    const page = await browser.newPage();
    await page.goto(`${baseUrl}/memetemplates`, navigationOptions);
    await page.waitForSelector('#page');

    const boxes = await page.$$('.mt-box');
    for (const box of boxes) {
      // Declared outside the try so the finally clause can close the tab.
      let page2;
      try {
        const data = await box.$eval('.mt-title > a', (el) => ({
          link: el.getAttribute('href'),
          text: el.textContent,
        }));

        page2 = await browser.newPage();
        await page2.goto(new URL(data.link, baseUrl).href, navigationOptions);
        await page2.waitForSelector('body');

        // Some pages have empty or ad-only .base-unit blocks without an h2,
        // so :has(h2) keeps only the blocks that describe a meme.
        const memes = await page2.$$('.base-unit:has(h2)');
        const relative = [];
        for (const m of memes) {
          const title = await m.$eval('h2 > a', (el) => ({
            link: el.getAttribute('href'),
            text: el.textContent,
          }));

          // Some pages hold the image in a div (URL in data-src), others in a
          // non-div element (URL in src). Read data-src from the div that was
          // actually found rather than from the first .base-img match.
          const lazyImage = await m.$('div.base-img');
          const image = lazyImage
            ? await lazyImage.evaluate((el) => el.getAttribute('data-src'))
            : await m.$eval('.base-img', (el) => el.getAttribute('src'));

          relative.push({ link: title.link, text: title.text, image });
        }

        allMemes.push({ link: data.link, text: data.text, relative });
      } catch (error) {
        console.log(error);
      } finally {
        // Close the per-template tab on success and on failure alike so tabs
        // do not pile up when a template page cannot be scraped.
        if (page2) {
          await page2.close().catch((closeError) => console.log(closeError));
        }
      }
    }
  } finally {
    await browser.close();
  }

  console.dir(allMemes, { depth: null }); // same as log, but shows all nested objects
})().catch((error) => {
  // Report launch/navigation failures instead of leaving a floating rejection.
  console.error(error);
  process.exitCode = 1;
});