Search code examples
javascriptnode.jscasperjsbrowserify

Browserify a scraper written in casperJS


I am trying to make this code work in the browser.

scrape.html

<!doctype html>

<html>
    <head>
        <title> </title>
        <style>
            label {
                margin-bottom: 2%;
            }

            div {
                margin-bottom: 2%;
            }
        </style>
        <script src = "../../AppData/Roaming/npm/node_modules/phantomjs/lib/phantomjs.js"></script>
        <script src = "../../AppData/Roaming/npm/node_modules/casperjs/modules/casper.js"></script>
    </head>

    <body>
        <!-- Page-range form: its two number inputs are read by myFunction() when the button is clicked. -->
        <form action="#" id = "form" method="get">
            <!-- A label is only associated with an input whose id equals the label's "for" value. -->
            <label for="start">Start Page</label>
            <div>
                <!-- No default value: "start" is not a valid number, so browsers discarded it anyway. -->
                <input type = "number" id = "start" name = "number1">
            </div>
            <label for="end">End Page</label>
            <div>
                <input type = "number" id = "end" name = "number2">
            </div>
            <button onclick="myFunction()"> Submit  </button>
        </form>
        <script> 
            // Click handler for the Submit button: collects the values of the form's
            // inputs, logs them, and then tries to queue a CasperJS step.
            function myFunction() {
                var x = document.getElementById("form");
                var number = [];
                var i;
                // x.length counts every control in the form (two inputs plus the button);
                // stopping at length-1 skips the last control, which is the button.
                for (i = 0; i < x.length-1 ;i++) {
                    number.push(x.elements[i].value);
                    }
                console.log(number);
                // NOTE(review): with the line below commented out, `casper` is never
                // defined in this function, so the casper.then(...) call cannot succeed.
                //var casper = require('casper').create();
                casper.then(function(){
                    console.log(this.fetchText('div.info-list-text'));

                    // NOTE(review): "number1*3" / "number2*3" sit inside the string literals,
                    // so they are plain text, not the submitted page numbers. Both variables
                    // are also never used after being assigned.
                    var startUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number1*3';
                    var endUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number2*3'
                });
            }
        </script>
    </body>
</html>

It produces the following error:

casper.js:32 Uncaught ReferenceError: patchRequire is not defined

I think the error is caused because we can't import modules in the browser using require the way we can in Node.js. In order to make this functionality available in the browser, I installed browserify in my project folder and created the following JS file.

browserReq.js

// Create the CasperJS instance used by every step in this script.
var casper = require('casper').create();

// Path segment appended to the listing URL below.
var url = 'ok,-MI'
var baseUrl = 'http://www.bedandbeyond.com/comm/c/'+url;
console.log(baseUrl);

// Selector for the pagination "next" link; processPage checks for it and clicks it.
var nextBtn = "a.navigation-button.next";

// Accumulates the links returned by getPageData for each processed page.
var allLinks = [];

casper.start(baseUrl);

// Hand control to processPage once the "next" link selector is found.
casper.waitForSelector(nextBtn, processPage);

// NOTE(review): run() is called here, before the casper.then(...) step further
// down the file has been registered — confirm this ordering is intended.
casper.run();

/**
 * Casper step: harvests the links on the current page into `allLinks`, then,
 * while a "next" link is still present, clicks it and schedules itself again.
 * Must be invoked with the Casper instance as `this`.
 */
function processPage() {
  var harvested = this.evaluate(getPageData);
  allLinks = allLinks.concat(harvested);

  if (this.exists(nextBtn)) {
    var afterClick = this.thenClick(nextBtn);
    // Intentionally empty intermediate step (placeholder for logging or waiting).
    var afterPause = afterClick.then(function () {});
    afterPause.then(processPage);
  }
}

/**
 * Returns the `href` attribute of every element carrying the "pro-title"
 * class, in document order. It is handed to `evaluate` by processPage, so it
 * relies only on `document` and is kept in ES5 syntax.
 */
function getPageData() {
  var nodes = document.getElementsByClassName('pro-title');
  var hrefs = [];
  for (var idx = 0; idx < nodes.length; idx++) {
    hrefs.push(nodes[idx].getAttribute('href'));
  }
  return hrefs;
}

// Final step: open every collected profile link and dump the fields scraped from it.
casper.then(function(){
  this.each(allLinks, function(self, link){
    this.thenOpen(link, function(){
      // Declared locally. The original assigned to an undeclared `jsonObj`,
      // which creates an implicit global (and throws in strict mode).
      var jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');

      // Replace the HTML-encoded ampersand in the services markup with the word "and".
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g,"and");

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

      require('utils').dump(jsonObj);
    });
  });
});

I am running this file using browserify browserReq.js -o browserReqOut.js -d.

It gives me the following error: "Cannot find module 'casper'" (reported from the project folder location). I have installed CasperJS both in the project folder and globally.

UPDATE 1:

I am posting values of the form elements in scrape.html to the following code,

scrape.php

<?php
// Read the submitted fields. The isset() guards fall back to '' so a missing
// field does not raise an "undefined index" notice.
$url = isset($_POST["urlToScrape"]) ? $_POST["urlToScrape"] : ''; ?><br>
<?php $page1 = isset($_POST["number1"]) ? $_POST["number1"] : ''; ?> <br>
<?php $page2 = isset($_POST["number2"]) ? $_POST["number2"] : ''; ?><br>
<?php
// One proxy per line (presumably a multi-line field — confirm against the form).
// Browsers submit line breaks as CRLF, so split on any newline style rather
// than on the server's PHP_EOL, which would leave a trailing "\r" on each entry.
$proxyText = isset($_POST['proxy']) ? $_POST['proxy'] : '';
$newProxyList = preg_split('/\r\n|\r|\n/', $proxyText); ?> <br>

<?php
// Everything below comes straight from the request, so it is HTML-escaped
// before being written into the page.
echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?> <br>
<?php echo htmlspecialchars($page1, ENT_QUOTES, 'UTF-8') ?> <br>
<?php echo htmlspecialchars($page2, ENT_QUOTES, 'UTF-8') ?> <br>
<?php echo htmlspecialchars($newProxyList[0], ENT_QUOTES, 'UTF-8') ?> <br>

<?php
/*
 * Writes the CasperJS scraping script into the response as an inline <script>
 * element. PHP only outputs this text; the JavaScript inside it is executed by
 * whatever loads the resulting page, not by PHP.
 * NOTE(review): none of the request values read earlier in this file ($url,
 * $page1, $page2, $newProxyList) are interpolated into the script; baseUrl is
 * hard-coded.
 */
echo "<script> 

    var casper = require('casper').create();

var baseUrl = 'http://www.houzz.com/professionals/c/Nashville,-TN';
console.log(baseUrl);

var nextBtn = 'a.navigation-button.next';

var allLinks = [];

casper.start(baseUrl);

casper.waitForSelector(nextBtn, processPage);

casper.run();

function processPage() {
  var pageData = this.evaluate(getPageData);
  allLinks = allLinks.concat(pageData);

  if (!this.exists(nextBtn)) {
    return;
  }

  this.thenClick(nextBtn).then(function() {
    this.echo(this.getCurrentUrl());
    //this.wait(1000);
  }).then(processPage);
}

function getPageData(){
  //return document.title;
  var links = document.getElementsByClassName('pro-title');
  links = Array.prototype.map.call(links,function(link){
    return link.getAttribute('href');
  });
  return links;
}

casper.then(function(){
  //require('utils').dump(allLinks);
  this.each(allLinks,function(self,link){
    this.thenOpen(link,function(a){
      jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');

      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g,'and');  

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      //jsonObj.contact = this.fetchText('span.pro-contact-text');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');  
      //jsonObj.description.replace(/\s/g, '');   

      //require('utils').dump(jsonObj);   

      //jsonObj = JSON.stringify(jsonObj, null, '\t');
      require('utils').dump(jsonObj);
    });
  });
});

 </script>"
 ?>

</body>
</html>

It still gives me an error: "Uncaught ReferenceError: require is not defined". Why am I getting this error when PHP is executed on the server and the require module is also available on the server?


Solution

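  • PhantomJS is a full browser, which has its own API. CasperJS uses that API to do stuff. Unless you implement the full PhantomJS API in plain JavaScript in the browser, you're not going to be able to browserify CasperJS. Echoing the script from PHP does not change this: PHP only writes the <script> text into the response, and it is the visitor's browser that executes it, where require does not exist.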