Search code examples
javascript html web-scraping puppeteer screen-scraping

Puppeteer element can't be reached with selector


I want to get the src of multiple images, but their selectors don't seem to work, as if the elements were fake and not actually in the page.

https://imgflip.com/memegenerator/Drake-Hotline-Bling

This is one of the pages, every page has one of my desired elements (the main image), each with the same selector.

I have tried multiple selectors like:

'#mm-preview-outer > div.mm-preview > img' and 'img[alt="meme generator image preview"]', but they don't work.

I tested my code by scraping other elements and everything works, but when I change the selector in .$eval() to my desired element, it doesn't work (and no errors are shown).

This is my code working correctly with a different selector:

// Question script: lists the template boxes on the imgflip templates page, reads each
// box's title and link, navigates a second tab to that link, and logs an image src.
const puppeteer = require('puppeteer');

(async()=>{
    // Launch options as given: non-headless, no default viewport, profile stored in ./tmp.
    const browser = await puppeteer.launch({
        headless:false,
        defaultViewport:false,
        userDataDir:'./tmp'
    });


    // `page` stays on the template listing; `page2` is the tab navigated to each template.
    const page = await browser.newPage();
    const page2 = await browser.newPage();
    await page.goto('https://imgflip.com/memetemplates');

    // One element handle per template box on the listing.
    const boxes = await page.$$('.mt-boxes > .mt-box');
    for(const box of boxes){
        try {
            // Both reads run inside the listing page, scoped to the current box handle.
            const title = await page.evaluate((el) => el.querySelector('h3 > a').textContent, box);
            const pageurl = await page.evaluate((el) => el.querySelector('a.mt-caption').getAttribute('href'), box);

            await page2.goto(`https://imgflip.com${pageurl}`);
            // NOTE(review): this query is issued on `page` (the listing tab), not on `page2`,
            // which is the tab that just navigated to the template URL.
            const imageurl = await page.$eval('img[alt="Imgflip Logo"]', el => el.src);
            console.log('The source of',title,'is')
            console.log(imageurl);
        // NOTE(review): the empty catch discards every error thrown in the loop body,
        // so a failing query produces no output at all.
        } catch(error){}
    }
    await browser.close();
})();

Technically all I need to do is to change 'img[alt="Imgflip Logo"]' to 'img[alt="meme generator image preview"]' but this doesn't work.


Solution

  • To get img[alt="meme generator image preview"] you can use

    • 'img.mm-img' or 'img[class^=mm-img]' selectors (note: ^= matches an attribute value that begins with the given string)
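    • and you need to replace el.src with el.getAttribute('src'), and call $eval on page2 (the tab that navigated to the meme page) rather than on page (which is still on the template list)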

    so change

    // Line as written in the question: the query is issued on `page`, not on `page2`.
    const imageurl = await page.$eval('img[alt="Imgflip Logo"]', el => el.src);
    

    to

     // Replacement: the query is issued on `page2` and reads the `src` attribute via getAttribute.
     const imageurl = await page2.$eval('img[class^=mm-img]', el => el.getAttribute('src'));
    

    in your code.

    You can also rewrite some of the other parts as shown below, although this is not necessary for what you want.

    const puppeteer = require('puppeteer');

    /*
     * For every template box on the imgflip listing page, open the template's
     * generator page in a temporary tab and log the preview image's `src`.
     *
     * A failure on a single template is logged and the loop continues; the
     * temporary tab and the browser are always closed, even when a step throws.
     */
    (async () => {
        const browser = await puppeteer.launch({
            headless: false,
            // `null` is the documented way to disable the fixed default viewport.
            defaultViewport: null,
        });

        try {
            const page = await browser.newPage();
            await page.goto('https://imgflip.com/memetemplates', {waitUntil: "load", timeout: 15000 });
            await page.waitForSelector('#page');
            const boxes = await page.$$('.mt-box');

            for (const box of boxes) {
                // Declared outside the try so `finally` can close the tab on any outcome.
                let page2;
                try {
                    const title = await box.$eval('h3 > a', el => el.textContent);
                    const link = await box.$eval('a.mt-caption', el => el.getAttribute('href'));

                    page2 = await browser.newPage();
                    await page2.goto(`https://imgflip.com${link}`, {waitUntil: "load", timeout: 15000 });
                    await page2.waitForSelector('body');

                    // Query the tab that actually navigated to the template page.
                    const imageurl = await page2.$eval('img[class^=mm-img]', el => el.getAttribute('src'));

                    console.log("The source of", title, "is");
                    console.log(imageurl);
                } catch (error) {
                    // Best effort per template: report the failure and move on.
                    console.log(error);
                } finally {
                    if (page2) {
                        // A failed close is reported but must not abort the remaining templates.
                        await page2.close().catch((closeError) => console.log(closeError));
                    }
                }
            }
        } finally {
            await browser.close();
        }
    })().catch((error) => {
        // Anything that escapes the loop (launch or listing navigation) ends up here.
        console.error(error);
        process.exitCode = 1;
    });
    
    

    If you need to get the already existing memes, you can do it this way:

    Code :

    const puppeteer = require('puppeteer');

    /*
     * For every template box on the imgflip listing page, open the template's
     * page in a temporary tab and collect the memes shown there (link, text,
     * image). The collected structure is printed once the browser is closed.
     *
     * A failure on a single template is logged and the loop continues; the
     * temporary tab and the browser are always closed, even when a step throws.
     */
    (async () => {
        const browser = await puppeteer.launch({
            headless: false,
            // `null` is the documented way to disable the fixed default viewport.
            defaultViewport: null,
        });

        const allMemes = [];
        try {
            const page = await browser.newPage();
            await page.goto('https://imgflip.com/memetemplates', {waitUntil: "load", timeout: 15000 });
            await page.waitForSelector('#page');
            const boxes = await page.$$('.mt-box');

            for (const box of boxes) {
                // Declared outside the try so `finally` can close the tab on any outcome.
                let page2;
                try {
                    const data = await box.$eval('.mt-title > a', el => ({ link: el.getAttribute('href'), text: el.textContent }));

                    page2 = await browser.newPage();
                    await page2.goto(`https://imgflip.com${data.link}`, {waitUntil: "load", timeout: 15000 });
                    await page2.waitForSelector('body');

                    // some pages have empty or ads in .base-unit without any h2, so the :has(h2) selector checks if the div has a h2 or not.
                    const memes = await page2.$$(".base-unit:has(h2)");
                    const relative = [];
                    for (const m of memes) { // get all relative memes on the page
                        const title = await m.$eval('h2 > a', el => ({ link: el.getAttribute("href"), text: el.textContent }));
                        // some pages have the image in a div tag (read data-src), others in another tag (read src).
                        // Decide from the matched element itself in one round trip, so no extra handle is created.
                        const image = await m.$eval('.base-img', el => el.getAttribute(el.matches('div') ? 'data-src' : 'src'));
                        relative.push({ link: title.link, text: title.text, image });
                    }

                    allMemes.push({
                        link: data.link,
                        text: data.text,
                        relative,
                    });
                } catch (error) {
                    // Best effort per template: report the failure and move on.
                    console.log(error);
                } finally {
                    if (page2) {
                        // A failed close is reported but must not abort the remaining templates.
                        await page2.close().catch((closeError) => console.log(closeError));
                    }
                }
            }
        } finally {
            await browser.close();
        }

        console.dir(allMemes, { depth: null }); // same as log, but show all objects
    })().catch((error) => {
        // Anything that escapes the loop (launch or listing navigation) ends up here.
        console.error(error);
        process.exitCode = 1;
    });