I am working on scraping websites and have tried many technologies for the job.
First I used PHP cURL as a scraping tool and got reasonably far with it, but then I ran into a problem: PHP cURL could not scrape websites that use Ajax to load their content/data. That is what stopped me from scraping with PHP.
After some research I found another solution for scraping websites that went beyond the limitation of Ajax-loaded pages, and it was powerful and pleasant to use: PhantomJS and CasperJS. I have scraped a lot of sites with them.
The problem I faced with these tools is that they are controlled through the command-line interface: when you want to run PhantomJS/CasperJS code, you have to launch it from the command line. That is my basic problem. What I need is to write the code in PhantomJS/CasperJS and have a webpage with an admin panel where I can control these scripts. Currently I am scraping career/job-listing websites, and I want to automate these tools to scrape those sites on a schedule, so I stay up to date with the employer sites that post new jobs.
For instance, I have separate code for each website; I manually execute each file through the command line, wait for it to finish scraping, and then continue with the next one, and so on. What I want instead is to write a script in JavaScript (preferably Node.js, but that is not compulsory) that executes the scraper code at a specific interval and then scrapes all of the websites in the background.
I can do the automation; that is not the problem. The problem is that I am unable to connect PhantomJS/CasperJS with the website. I even tried SpookyJS, which connects PhantomJS/CasperJS with Node.js, but unfortunately it does not work for me, and it is rather messy.
Is there any other tool as powerful as these two that I can easily interact with through a webpage?
Continuing my own research on scraping sites, I was unable to find a perfect solution, but the most capable approach I came up with is to use the PhantomJS module with Node.js. You can find this module here.
For an installation guide, follow that module's documentation. PhantomJS is used asynchronously from Node.js, which makes it much easier to get the results, and it is easy to interact with — using Express on the server side and Ajax or Socket.io on the client side to enhance the functionality.
Below is the code I came up with:
// Node bridge to the PhantomJS headless browser.
const phantom = require('phantom');
const ev = require('events');
// NOTE(review): `event` is never used in this excerpt — either wire it up
// (e.g. for progress notifications) or remove it. TODO confirm against the
// rest of the file.
const event = new ev.EventEmitter();
// Shared scraper state (module scope). These are read and written by the
// InitScrap methods and the ScrapData driver below.
let MAIN_URL;               // URL passed to init(), opened by loadPage()
let TOTAL_PAGES;            // pagination count scraped out of the page DOM
let TOTAL_JOBS;             // total job count scraped out of the page DOM
let PAGE_DATA_COUNTER = 0;
let PAGE_COUNTER = 0;
let PAGE_JOBS_DETAILS = []; // accumulated per-page job summaries
let IND_JOB_DETAILS = [];   // accumulated individual job details
let JOB_NUMBER = 1;
let CURRENT_PAGE = 1;
let PAGE_WEIGHT_TIME;       // presumably a page "wait" time (ms) — TODO confirm
let CLICK_NEXT_TIME;
let CURRENT_WEBSITE;
let CURR_WEBSITE_LINK;
let CURR_WEBSITE_NAME;
let CURR_WEBSITE_INDEX;
let PH_INSTANCE;            // PhantomJS process handle from phantom.create()
let PH_PAGE;                // PhantomJS page from PH_INSTANCE.createPage()
function InitScrap() {
// Initiate the Data
this.init = async function(url) {
MAIN_URL = url;
PH_INSTANCE = await phantom.create(),
PH_PAGE = await PH_INSTANCE.createPage();
console.log("Scrapper Initiated, Please wait...")
return "success";
}
// Load the Basic Page First
this.loadPage = async function(pageLoadWait) {
var status = await PH_PAGE.open(MAIN_URL),
w;
if (status == "success") {
console.log("Page Loaded . . .");
if (pageLoadWait !== undefined && pageLoadWait !== null && pageLoadWait !== false) {
let p = new Promise(function(res, rej) {
setTimeout(async function() {
console.log("Page After 5 Seconds");
PH_PAGE.render("new.png");
TOTAL_PAGES = await PH_PAGE.evaluate(function() {
return document.getElementsByClassName("flatten pagination useIconFonts")[0].textContent.match(/\d+/g)[1];
});
TOTAL_JOBS = await PH_PAGE.evaluate(function() {
return document.getElementsByClassName("jobCount")[0].textContent.match(/\d+/g)[0];
});
res({
p: TOTAL_PAGES,
j: TOTAL_JOBS,
s: true
});
}, pageLoadWait);
})
return await p;
}
}
}
function ScrapData(opts) {
var scrap = new InitScrap();
scrap.init("https://www.google.com/").then(function(init_res) {
if (init_res == "success") {
scrap.loadPage(opts.pageLoadWait).then(function(load_res) {
console.log(load_res);
if (load_res.s === true) {
scrap.evaluatePage().then(function(ev_page_res) {
console.log("Page Title : " + ev_page_res);
scrap.evaluateJobsDetails().then(function(ev_jobs_res) {
console.log(ev_jobs_res);
})
})
}
return
})
}
});
return scrap;
}
module.exports = {
ScrapData
};
}