Search code examples
web-scraping, puppeteer, apify

How do I add input with Puppeteer


I keep getting errors saying that the node was not found. I switched to trying jQuery, with no luck, so now I'm here. page.text didn't work either; I got the same node error. I'm trying to scrape https://web6.seattle.gov/courts/ECFPortal/Default.aspx for case information and documents by supplying a case number.

const Apify = require('apify');
const {
    puppeteer
} = Apify.utils;
/**
 * Takes a full-page screenshot of the given page and stores it in the
 * key-value store as a PNG.
 *
 * @param {object} page - Puppeteer page to capture.
 * @param {string} [key='debug-screen'] - Key under which the screenshot is stored.
 * @returns {Promise<void>} Resolves once the screenshot has been saved.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const screenshotBuffer = await page.screenshot({
        fullPage: true
    });
    await Apify.setValue(key, screenshotBuffer, {
        contentType: 'image/png'
    });
};
// Asker's original attempt: open the court portal, inject jQuery, click the
// "Case Information" tab, fill in a case number, click search and store the
// resulting HTML. Only the arrow-function tokens were repaired so the script
// parses; the logic is left as posted.
Apify.main(async () => {
    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    const page = await browser.newPage();
    await page.goto('https://web6.seattle.gov/courts/ECFPortal/Default.aspx');
    // Inject jQuery into the page and wait until window.jQuery is available.
    await page.addScriptTag({
        url: 'https://code.jquery.com/jquery-3.2.1.min.js'
    });
    await page.waitForFunction(() => window.jQuery);
    // NOTE: this evaluate call is not awaited, so its completion (or failure)
    // is not observed before the script moves on.
    page.evaluate(() => $('span:contains("Case Information")').click());
    //await page.waitForNavigation();
    // Fixed 4 second pause instead of waiting for a specific element.
    await page.waitFor(4000);
    const input = await Apify.getInput()
    console.log('json stringify input: ' + JSON.stringify(input))
    const caseNumber = input['court_case'];
    console.log('CASE NUMBER: ' + caseNumber)
    // Snapshot of the page body before the search, stored for debugging.
    var html = await page.$eval('body', e => e.outerHTML);
    const output2 = {
        html,
        crawledAt: new Date(),
    };
    await Apify.setValue('HTMltestOUTPUT', output2);
    console.log('html to test.');
    // NOTE: a hard-coded case number is entered here; caseNumber read from the
    // input above is only logged. The call is not awaited.
    page.evaluate(() => $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_txtCaseNumber').val("585344"));
    await saveScreen(page, 'test-screen');
    await page.waitFor(1000);
    console.log('Attempted to enter case number');
    // NOTE: not awaited, same as the evaluate calls above.
    page.evaluate(() => $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_btnSearch').click());
    console.log('Attempted to click button');
    // Times-out here
    //await page.waitForNavigation();
    console.log('Attempted to wait for navigation');
    // Get cookies
    const cookies = await page.cookies();
    console.log('Attempted to wait for cookies');
    // Second snapshot of the page body, taken after the search click was issued.
    var html = await page.$eval('body', e => e.outerHTML);
    // And then save output
    const output = {
        html,
        crawledAt: new Date(),
    };
    console.log('My output:');
    console.dir(output);
    await Apify.setValue('OUTPUT', output);
    await browser.close();
    console.log('Done.');
});

Solution

  • The main issue with your code is that the website is a single-page ASPX app that does not perform any navigation; it loads all content through XHR requests. As a result, every page.waitForNavigation call will always time out.

    You can work around this either by waiting for an element on the page to become visible or by tracking the network requests. I have rewritten your code with this in mind and made a functional version that uses both approaches. I hope this helps:

    const Apify = require('apify');
    const { puppeteer } = Apify.utils;
    
    /**
     * Captures the whole page as an image and persists it with Apify.setValue
     * using the 'image/png' content type.
     *
     * @param {object} page - Puppeteer page to take the screenshot of.
     * @param {string} [key='debug-screen'] - Storage key for the screenshot.
     * @returns {Promise<void>} Settles after the screenshot has been stored.
     */
    const saveScreen = async (page, key = 'debug-screen') => {
        const screenshotOptions = { fullPage: true };
        const storeOptions = { contentType: 'image/png' };

        const png = await page.screenshot(screenshotOptions);
        await Apify.setValue(key, png, storeOptions);
    };
    
    /**
     * Reads the outer HTML of the page's <body>, pairs it with the current
     * timestamp and stores the record with Apify.setValue.
     *
     * @param {object} page - Puppeteer page whose body HTML is read.
     * @param {string} [key='output'] - Storage key for the record.
     * @param {boolean} [logOutput=false] - When truthy, the record is also printed to the console.
     * @returns {Promise<*>} Whatever Apify.setValue resolves to.
     */
    const saveHtml = async (page, key = 'output', logOutput = false) => {
        // The body is read first; the timestamp is taken right after it resolves.
        const record = {
            html: await page.$eval('body', (body) => body.outerHTML),
            crawledAt: new Date(),
        };

        if (!logOutput) {
            return Apify.setValue(key, record);
        }

        console.log('My output:');
        console.dir(record);
        return Apify.setValue(key, record);
    };
    
    /**
     * Actor entry point. Opens the court portal, switches to the case
     * information tab, searches for a case number and stores the resulting
     * page HTML and screenshots.
     *
     * The case number is read from the actor input field `court_case`; when it
     * is missing, a default test case number is used.
     */
    Apify.main(async () => {
        const portalUrl = 'https://web6.seattle.gov/courts/ECFPortal/Default.aspx';
        const caseInfoTabSelector = '#ctl00_ContentPlaceHolder1_rtsECFPortal li:nth-child(4) a span';
        const caseNumberInputSelector = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_txtCaseNumber';
        const searchButtonSelector = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_btnSearch';

        const input = await Apify.getInput();
        console.log('json stringify input: ' + JSON.stringify(input));
        // Get case number from input or use default (for testing).
        // Coerced to a string because JSON input may carry the case number as a
        // numeric value, while page.type is documented to take text.
        const caseNumber = String((input && input.court_case) || '585344');
        console.log('CASE NUMBER: ' + caseNumber);

        // Launch Puppeteer
        const browser = await Apify.launchPuppeteer();
        try {
            const page = await browser.newPage();
            await page.goto(portalUrl);

            console.log('Page opened');

            // Wait for the link in menu to appear and then click on it
            await page.waitForSelector(caseInfoTabSelector);
            await page.click(caseInfoTabSelector);

            console.log('Redirecting to case information');

            // Wait for the new page to load and input to appear
            await page.waitForSelector(caseNumberInputSelector, { visible: true });

            console.log('Inputing case number');

            // Input the case number
            await page.type(caseNumberInputSelector, caseNumber, { delay: 20 });

            // Save current html and screenshot for debugging
            await saveScreen(page, 'search-screen');
            await saveHtml(page, 'search-html');

            console.log('clicking on search');

            // The response listener has to be registered before the click,
            // because after clicking it might be too late. Pairing both in
            // Promise.all also means a failed click does not leave the
            // response promise dangling without a rejection handler.
            await Promise.all([
                page.waitForResponse((response) => {
                    return response.url().includes('courts/ECFPortal/Default.aspx');
                }),
                page.click(searchButtonSelector),
            ]);

            // The xhr request has finished, this means that the case information
            // should be loaded; a short extra pause follows before capturing.
            await page.waitFor(500);

            console.log('Case information loaded');

            // Save current html and screenshot for debugging
            await saveScreen(page, 'output-screen');
            await saveHtml(page, 'output', true);
        } finally {
            // Always release the browser, even when one of the steps above throws.
            await browser.close();
        }
        console.log('Done.');
    });