I'm trying to scrape URLs from https://en.wikipedia.org/wiki/List_of_hedge_funds by using an Apify actor called "web-scraper" (https://apify.com/apify/web-scraper)
Specifically, I'm trying to use the following Apify `pageFunction` to scrape that target page and return a list of URLs from the anchor tags present in the HTML:
/**
 * Page function for the Apify web-scraper actor.
 *
 * Reads the page title and selects every element matching `tr > td > a`
 * through the jQuery instance exposed on the context.
 *
 * @param {object} context - Object supplied by the actor; this function reads
 *   `context.jQuery` and `context.request.url`.
 * @returns {Promise<object>} An object with three keys: `url` (taken from
 *   `context.request.url`), `pageTitle` (text of the first `<title>` match)
 *   and `anchorTag` (whatever `$(selector)` returned, i.e. the jQuery
 *   collection itself rather than a list of href strings).
 */
async function pageFunction(context) {
  // Kept from the original snippet; nothing below reads this constant.
  const url = 'https://en.wikipedia.org/wiki/List_of_hedge_funds';

  const { jQuery: $ } = context;
  const titleText = $('title').first().text();
  const matchedAnchors = $('tr > td > a');

  return {
    url: context.request.url,
    pageTitle: titleText,
    anchorTag: matchedAnchors,
  };
}
In my console, I expect to see the value of the `href` attribute of one or more anchor tags that exist on the target page in a property called `anchorTag`. I also expect to see the page title in a property called `pageTitle` and the URL in a property called `url`, as follows:
{
"url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"pageTitle": "List of hedge funds - Wikipedia",
"anchorTag": {
"0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
"1": "http://example1.com",
"2": "http://example2.com",
"3": "http://example3.com",
...
"39": "http://example39.com",
}}
But instead of the list of URLs, the actor returns the following dataset:
What I actually see:[{
"url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"pageTitle": "List of hedge funds - Wikipedia",
"anchorTag": {
"0": {},
"1": {},
"2": {},
"3": {},
"4": {},
"5": {},
"6": {},
"7": {},
"8": {},
"9": {},
"10": {},
"11": {},
"12": {},
"13": {},
"14": {},
"15": {},
"16": {},
"17": {},
"18": {},
"19": {},
"20": {},
"21": {},
"22": {},
"23": {},
"24": {},
"25": {},
"26": {},
"27": {},
"28": {},
"29": {},
"30": {},
"31": {},
"32": {},
"33": {},
"34": {},
"35": {},
"36": {},
"37": {},
"38": {},
"39": {},
"length": 40,
"prevObject": {
"0": {
"location": {
"href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"ancestorOrigins": {},
"origin": "https://en.wikipedia.org",
"protocol": "https:",
"host": "en.wikipedia.org",
"hostname": "en.wikipedia.org",
"port": "",
"pathname": "/wiki/List_of_hedge_funds",
"search": "",
"hash": "",
"assign": {},
"reload": {},
"toString": {},
"replace": {}
},
"write": {},
"writeln": {},
"jQuery3410461525655351679551": {
"events": {
"mmv-setup-overlay": [
{
"type": "mmv-setup-overlay",
"origType": "mmv-setup-overlay",
"handler": {
"guid": 21
},
"guid": 21,
"namespace": ""
}
],
"mmv-cleanup-overlay": [
{
"type": "mmv-cleanup-overlay",
"origType": "mmv-cleanup-overlay",
"handler": {
"guid": 22
},
"guid": 22,
"namespace": ""
}
],
"keyup": [
{
"type": "keyup",
"origType": "keyup",
"handler": {
"guid": 24
},
"guid": 24,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"mouseover": [
{
"type": "mouseover",
"origType": "mouseover",
"handler": {
"guid": 24
},
"guid": 24,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"focusout": [
{
"type": "focusout",
"origType": "blur",
"handler": {
"guid": 25
},
"guid": 25,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"mouseout": [
{
"type": "mouseout",
"origType": "mouseout",
"handler": {
"guid": 25
},
"guid": 25,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"click": [
{
"type": "click",
"origType": "click",
"handler": {
"guid": 26
},
"guid": 26,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
]
},
"handle": {},
"focusin": 1,
"focusout": 1
}
},
"length": 1
}
}
}]
What am I doing wrong?
You have to access the `href` attribute of each `a` tag to get its URL. Also, you need to loop over all the `a` tags to collect the URLs into one array.
// ...
const anchorTag = $( cssSelector );
const links = [];
// anchorTag in a JQuery handle, not a normal JavaScript value so it has special JQuery methods
anchorTag.each((index, el) => {
const link = $(el).attr('href');
if (link) {
links.push(link);
}
})
return {
url: context.request.url,
pageTitle,
links,
};