Search code examples
apify

Apify dynamic pagination based on date from results


I wrote a half-working scraping script for a website:

/**
 * Page function that clicks the "Show more" button up to five times and then
 * collects the title and date of every `.thing-card` element on the page.
 *
 * @param {object} context - Page function context; `log`, `jQuery` and `waitFor` are used.
 * @returns {Promise<Array<{title: string, date: string}>>} One entry per card.
 */
async function pageFunction(context) {
    // `request` and `skipLinks` are pulled from the context but not used below.
    const { jQuery: $, log, request, skipLinks, waitFor } = context;

    const MAX_CLICKS = 5;
    const showMoreSelector = 'div.pagination-view-more';
    const whitespace = /\s/g;

    log.info('Pagination');

    // The first wait passes an undefined timeout (so waitFor presumably falls
    // back to its own default); every later wait passes 5000 ms.
    let nextTimeout;
    let clicksDone = 0;
    while (clicksDone < MAX_CLICKS) {
        log.info('Waiting for the "Show more" button.');
        try {
            await waitFor(showMoreSelector, { timeoutMillis: nextTimeout });
        } catch (err) {
            // A failed wait is treated as "no more pages", not as an error.
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }
        nextTimeout = 5000;

        log.info('Clicking the "Show more" button.');
        $(showMoreSelector).click();
        clicksDone += 1;
    }

    // Export whatever cards are in the DOM right now.
    const items = [];
    $('.thing-card').each((index, element) => {
        const card = $(element);
        items.push({
            title: card.attr('title'),
            // Source text looks like "Dec 15, 2019"; all whitespace is removed.
            date: card.find('.item-header .item-date').text().replace(whitespace, ''),
        });
    });
    return items;
}

In the example above I click the "Show more" button 5 times and then try to export the title and date of each item as the result. The problem is that I don't get all the results; I think the script finishes earlier than it should.

In the final script I would like to remove the fixed for loop and instead keep loading results until they reach back 7 days (1 week) from today. Is this somehow possible with Apify?


Solution

  • I guess you almost have this. There are no limitations with Apify, as you can write any code you want :) So this is more of a general JS question than one specific to Apify.

    Instead of the fixed loop, you can check the date of the last item (I am assuming the items are sorted from the most recent to the oldest).

    Something like this should do it, although you can fine-tune it.

    /**
     * Page function that keeps clicking the "Show more" button until the last
     * listed item is older than one week, then returns the title and date of
     * every `.thing-card` element on the page.
     *
     * Assumes the cards are sorted newest first, so the last card is the oldest
     * one currently loaded.
     *
     * Pagination stops when any of the following happens:
     *   - the "Show more" button cannot be found,
     *   - a click does not add any new cards,
     *   - the last card's date is older than a week,
     *   - the last card's date cannot be parsed (logged as a warning, so an
     *     unreadable date never leads to paging through the whole history).
     *
     * @param {object} context - Page function context; `log`, `jQuery` and `waitFor` are used.
     * @returns {Promise<Array<{title: string, date: string}>>} One entry per card;
     *     `date` is the card's date text with all whitespace removed.
     */
    async function pageFunction(context) {
        const {
            log,
            jQuery: $,
            waitFor
        } = context;

        const buttonSelector = 'div.pagination-view-more';
        const itemSelector = '.thing-card';
        const dateSelector = '.item-header .item-date';
        const weekAgo = new Date(Date.now() - 1000 * 3600 * 24 * 7);

        // Reads the date of the last card. Whitespace is collapsed to single
        // spaces (not removed) so the parser sees "Dec 15, 2019" rather than
        // "Dec15,2019"; parsing of non-ISO strings is implementation-dependent.
        const readLastItemDate = () => {
            const rawDate = $(itemSelector).last().find(dateSelector).text();
            return new Date(rawDate.replace(/\s+/g, ' ').trim());
        };
        const isValidDate = (date) => !Number.isNaN(date.getTime());

        log.info('Pagination');

        // The last item's date has to be checked before the loop as well.
        let lastItemsDate = readLastItemDate();

        // Track the item count so a click that loads nothing ends the loop.
        let itemCount = $(itemSelector).length;

        if (!isValidDate(lastItemsDate)) {
            log.warning('Could not parse the date of the last item, skipping pagination.');
        } else if (lastItemsDate >= weekAgo) {
            // Deliberately "infinite" so every exit path can log its reason.
            while (true) {
                log.info('Waiting for the "Show more" button.');
                try {
                    await waitFor(buttonSelector);
                } catch (err) {
                    // A failed wait is treated as "no more pages", not as an error.
                    log.info('Could not find the "Show more button", we\'ve reached the end.');
                    break;
                }
                log.info('Clicking the "Show more" button.');
                $(buttonSelector).click();
                // Wait a bit so items can load.
                await waitFor(5000);

                const itemCountAfterClick = $(itemSelector).length;
                if (itemCountAfterClick === itemCount) {
                    log.info('No new items, exiting the loop...');
                    break;
                }
                itemCount = itemCountAfterClick;

                lastItemsDate = readLastItemDate();
                if (!isValidDate(lastItemsDate)) {
                    // Comparisons against an Invalid Date are always false, so
                    // without this check the date condition would never fire.
                    log.warning('Could not parse the date of the last item, exiting the loop...');
                    break;
                }
                if (lastItemsDate < weekAgo) {
                    log.info(`Last item date is older than a week, exiting the loop: ${lastItemsDate}`);
                    break;
                }
            }
        }

        // Export the results.
        const result = [];
        $(itemSelector).each(function() {
            result.push({
                title: $(this).attr('title'),
                // Source text looks like "Dec 15, 2019"; all whitespace is removed.
                date: $(this).find(dateSelector).text().replace(/\s/g, ''),
            });
        });
        return result;

    }