Search code examples
javascript, web-scraping, screen-scraping, puppeteer, apify

Scraping URLs using Apify and Puppeteer


I'm trying to scrape URLs from https://en.wikipedia.org/wiki/List_of_hedge_funds by using an Apify actor called "web-scraper" (https://apify.com/apify/web-scraper).

Specifically, I'm trying to use the following Apify pageFunction to scrape that target page and return a list of URLs from anchor tags present in the HTML.

pageFunction
/**
 * Page function from the question: collects the request URL, the page
 * title, and the anchors matched by a CSS selector on the loaded page.
 *
 * @param {object} context - Object supplied by the caller; this function
 *   reads `context.jQuery` and `context.request.url` from it.
 * @returns {Promise<object>} Resolves to `{ url, pageTitle, anchorTag }`.
 *   `anchorTag` is the jQuery selection itself, not a list of href strings.
 */
async function pageFunction( context ) {
    // Declared but never read below; the returned URL comes from context.request.url.
    const url = 'https://en.wikipedia.org/wiki/List_of_hedge_funds';
    // Anchors that are direct children of table cells.
    const cssSelector = 'tr > td > a';

    const $ = context.jQuery;
    // Text of the first <title> element on the page.
    const pageTitle = $('title').first().text();
    // A jQuery collection of matched elements; no href attribute is read here.
    const anchorTag = $( cssSelector );

    return {
      url: context.request.url,
      pageTitle, anchorTag,
    };
}

In my console, I expect to see the value of the href attribute of one or more anchor tags that exist on the target page in a property called anchorTag. I also expect to see the page title in a property called pageTitle, along with the url property, as follows:

What I expect to see:
{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
    "1": "http://example1.com",
    "2": "http://example2.com",
    "3": "http://example3.com",
    ...
    "39": "http://example39.com",
}}

But instead of the list of URLs, the actor returns the following dataset:

What I actually see:
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": {},
    "1": {},
    "2": {},
    "3": {},
    "4": {},
    "5": {},
    "6": {},
    "7": {},
    "8": {},
    "9": {},
    "10": {},
    "11": {},
    "12": {},
    "13": {},
    "14": {},
    "15": {},
    "16": {},
    "17": {},
    "18": {},
    "19": {},
    "20": {},
    "21": {},
    "22": {},
    "23": {},
    "24": {},
    "25": {},
    "26": {},
    "27": {},
    "28": {},
    "29": {},
    "30": {},
    "31": {},
    "32": {},
    "33": {},
    "34": {},
    "35": {},
    "36": {},
    "37": {},
    "38": {},
    "39": {},
    "length": 40,
    "prevObject": {
      "0": {
        "location": {
          "href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
          "ancestorOrigins": {},
          "origin": "https://en.wikipedia.org",
          "protocol": "https:",
          "host": "en.wikipedia.org",
          "hostname": "en.wikipedia.org",
          "port": "",
          "pathname": "/wiki/List_of_hedge_funds",
          "search": "",
          "hash": "",
          "assign": {},
          "reload": {},
          "toString": {},
          "replace": {}
        },
        "write": {},
        "writeln": {},
        "jQuery3410461525655351679551": {
          "events": {
            "mmv-setup-overlay": [
              {
                "type": "mmv-setup-overlay",
                "origType": "mmv-setup-overlay",
                "handler": {
                  "guid": 21
                },
                "guid": 21,
                "namespace": ""
              }
            ],
            "mmv-cleanup-overlay": [
              {
                "type": "mmv-cleanup-overlay",
                "origType": "mmv-cleanup-overlay",
                "handler": {
                  "guid": 22
                },
                "guid": 22,
                "namespace": ""
              }
            ],
            "keyup": [
              {
                "type": "keyup",
                "origType": "keyup",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseover": [
              {
                "type": "mouseover",
                "origType": "mouseover",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "focusout": [
              {
                "type": "focusout",
                "origType": "blur",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseout": [
              {
                "type": "mouseout",
                "origType": "mouseout",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "click": [
              {
                "type": "click",
                "origType": "click",
                "handler": {
                  "guid": 26
                },
                "guid": 26,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ]
          },
          "handle": {},
          "focusin": 1,
          "focusout": 1
        }
      },
      "length": 1
    }
  }
}]

What am I doing wrong?


Solution

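  • You have to access the href attribute of each a tag to get its URL. $( cssSelector ) returns a jQuery collection of elements rather than strings, which is why every entry in your output is an empty object. You also need to loop over all the a tags to collect their URLs into one array.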

    // ... (the earlier part of pageFunction is unchanged)
    const anchorTag = $( cssSelector );
    // Plain array that will hold the href string of each matched anchor.
    const links = [];
    
    // anchorTag is a jQuery handle, not a normal JavaScript value, so it has special jQuery methods
    anchorTag.each((index, el) => {
        // Wrap the raw element so its href attribute can be read with .attr().
        const link = $(el).attr('href');
        // Skip anchors whose href is missing or empty.
        if (link) {
             links.push(link);
        }
    })
    
    // Return the collected href strings instead of the jQuery collection.
    return {
       url: context.request.url,
       pageTitle,
       links,
    };