Search code examples
javascript html web-scraping puppeteer apify

How to structure the results of a successful web scrape using Apify and Puppeteer?


Using Apify and Puppeteer, I want to scrape the table of data at the following URL:

https://en.wikipedia.org/wiki/List_of_hedge_funds

I want the result to be an array of objects. Each element of the array should represent one <tr> row of the original data source table and be a JS object with the following properties.

{ firmName, firmUrl, hq, hqUrl, aum, }

Where:

  • firmName is the .innerText of the first <td> element of each row.
  • firmUrl is the href attribute of the link inside the first <td> element of each row.
  • hq is the .innerText of the second <td> element of each row.
  • hqUrl is the href attribute of the link inside the second <td> element of each row.
  • aum is the .innerText of the third <td> element of each row.

Specifically, for example, I would like to see the following object returned to me.

What I want to see, alternative A:
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "links": [
    {
      firmName: "Bridgewater Associates",
      firmUrl: "/wiki/Bridgewater_Associates",
      hq: "Westport, Connecticut",
      hqUrl: "/wiki/Westport,_Connecticut",
      aum: "$132,050",
    }
    // ...x39 more times
  ]
}]

Or, alternatively, the object could be as follows (I don't know which is possible, that's part of my confusion)

What I want to see, alternative B:
[
  {
    "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
    "pageTitle": "List of hedge funds - Wikipedia",
    "links": {
      firmName: "Bridgewater Associates",
      firmUrl: "/wiki/Bridgewater_Associates",
      hq: "Westport, Connecticut",
      hqUrl: "/wiki/Westport,_Connecticut",
      aum: "$132,050",
    },  
  },
  // ...x39 more times
]

But instead, I actually see the following result.

What I actually see:
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "links": [
    "/wiki/Bridgewater_Associates",
    "/wiki/Westport,_Connecticut",
    "/wiki/Renaissance_Technologies",
    "/wiki/East_Setauket,_New_York",
    "/wiki/Man_Group",
    "/wiki/London",
    "/wiki/AQR_Capital_Management",
    "/wiki/Greenwich,_Connecticut",
    "/wiki/Two_Sigma_Investments",
    "/wiki/New_York_City,_New_York",
    "/wiki/Millennium_Management,_LLC",
    "/wiki/New_York_City,_New_York",
    "/wiki/Elliott_Management",
    "/wiki/New_York_City,_New_York",
    "/wiki/BlackRock",
    "/wiki/New_York_City,_New_York",
    "/wiki/Citadel_LLC",
    "/wiki/Chicago,_IL",
    "/wiki/Davidson_Kempner_Capital_Management",
    "/wiki/New_York_City,_New_York",
    "/wiki/Viking_Global_Investors",
    "/wiki/Greenwich,_Connecticut",
    "/wiki/Baupost_Group",
    "/wiki/Boston,_MA",
    "/wiki/D.E._Shaw_%26_Co.",
    "/wiki/New_York_City,_New_York",
    "/wiki/Farallon_Capital",
    "/wiki/San_Francisco,_CA",
    "/wiki/Marshall_Wace",
    "/wiki/London",
    "/wiki/The_Children%27s_Investment_Fund_Management",
    "/wiki/London",
    "/wiki/Wellington_Management_Company",
    "/wiki/Boston,_MA",
    "/wiki/Winton_Group",
    "/wiki/London",
    "/wiki/Capula_Investment_Management",
    "/wiki/London",
    "/wiki/York_Capital_Management",
    "/wiki/New_York_City,_NY"
  ]
}]

I am using the following code as my pageFunction.

pageFunction
/**
 * Page function for the Apify Web Scraper.
 *
 * Reads the page title and collects the href of every anchor that is a direct
 * child of a table cell, returning them as one flat list in document order.
 * The "Inject jQuery" option must be enabled, because the function relies on
 * `context.jQuery`.
 *
 * For the full list of `context` properties and functions, see
 * https://apify.com/apify/web-scraper#page-function
 *
 * @param {object} context - Scraper-provided context; `jQuery` and
 *   `request.url` are the only members read here.
 * @returns {Promise<object>} Resolves to `{ url, pageTitle, links }`, where
 *   `links` is an array of the non-empty href attribute values.
 */
async function pageFunction( context ) {
    const $ = context.jQuery;

    // Only the first <title> element contributes to the reported page title.
    const pageTitle = $( 'title' ).first().text();

    // Anchors with a missing or empty href are dropped by the truthiness filter.
    const hrefs = $( 'tr > td > a' )
        .toArray()
        .map( ( anchor ) => $( anchor ).attr( 'href' ) )
        .filter( Boolean );

    return {
      url: context.request.url,
      pageTitle,
      links: hrefs,
    };
}

How do I need to change my code?


Solution

  • Your code looks good overall; you only need to change how the data is parsed from the table. Here is an example of a pageFunction that works.

    /**
     * Page function for the Apify Web Scraper that turns each row of the
     * `.wikitable` table into a structured object.
     *
     * The "Inject jQuery" option must be enabled, because the function relies
     * on `context.jQuery`. For the full list of `context` properties and
     * functions, see https://apify.com/apify/web-scraper#page-function
     *
     * @param {object} context - Scraper-provided context; `jQuery` and
     *   `request.url` are the only members read here.
     * @returns {Promise<object>} Resolves to `{ url, pageTitle, links }`, where
     *   `links` holds one `{ firmName, firmUrl, hq, hqUrl, aum }` object per
     *   table row that has a firm link.
     */
    async function pageFunction( context ) {
        const TITLE_SELECTOR = 'title';
        const LINE_SELECTOR = '.wikitable tr';
        const HREF_SELECTOR = 'href';

        // jQuery is handy for finding DOM elements and extracting data from them.
        //  To use it, make sure to enable the "Inject jQuery" option.
        const $ = context.jQuery;
        const pageTitle = $( TITLE_SELECTOR ).first().text();
        const links = [];

        $( LINE_SELECTOR ).each((index, item) => {
            const columns = $(item).find('td');

            // .eq() is zero-based, so index 1 is the SECOND cell of the row.
            // NOTE(review): presumably cell 0 holds the rank and cell 3 the AUM
            // figure -- confirm against the live table layout.
            const firmCell = columns.eq(1);
            const hqCell = columns.eq(2);
            const aumCell = columns.eq(3);

            const link = {
                firmName: firmCell.text().trim(),
                firmUrl: firmCell.find('a').eq(0).attr( HREF_SELECTOR ),
                hq: hqCell.text().trim(),
                hqUrl: hqCell.find('a').eq(0).attr( HREF_SELECTOR ),
                aum: aumCell.text().trim(),
            };

            // Rows without a firm link (e.g. header rows that contain only
            // <th> cells) yield an undefined firmUrl and are skipped.
            if (link.firmUrl) {
                links.push(link);
            }
        });

        return {
          url: context.request.url,
          pageTitle,
          links,
        };
    }