Search code examples
phpmysqlsimple-html-domphpcrawl

PHPCrawl with simplehtmldom to parse data


I'm trying to use PHPCrawl to crawl and collect URL(s), then feed to simplehtmldom to pull the required data from the html and store in a mysql database. Right now I am getting the error

**

Fatal error: Call to undefined method simple_html_dom::find() in /home/content/54/11109254/html/PHPCrawl_081/skunktest.php on line 44

**

Can anyone help out with what I've done wrong, and maybe glance over and see if I have any other hurdles ahead of me?

    <?php
set_time_limit(1000000);

// Inculde the phpcrawl-mainclass
include("libs/PHPCrawler.class.php");

// Include Simplehtmldom
include("../simple_html_dom.php");

// Extend the class and override the handleDocumentInfo()-method 
class MyCrawler extends PHPCrawler 
{
  function handleDocumentInfo($DocInfo) 
  {

    // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
    if (PHP_SAPI == "cli") $lb = "\n";
    else $lb = "<br />";

    // Print the URL and the HTTP-status-Code
    echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;

    // Print the refering URL
    echo "Referer-page: ".$DocInfo->referer_url.$lb;

    // Print if the content of the document was be recieved or not
    if ($DocInfo->received == true)
      echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
    else
      echo "Content not received".$lb; 

    // Now you should do something with the content of the actual
    // received page or file ($DocInfo->source), we skip it in this example 

    $result = $DocInfo->url;



    $html = file_get_html($result);


    if($html && is_object($html) && isset($html->nodes)){

    $partnumber = $html->find('div[class=product-sku')->plaintext;

    $title = $html->find('.product-name')->plaintext;

    $productnote = $html->find('.product-note')->plaintext;

    $description = $html->find('.product-description')->innertext;


    foreach($html->find('.MagicZoomBigImageCont') as $img)
        {
            foreach($img->find('img') as $e)
                {
                    $image = $e;
                    $imagehref = $e->href;

                }
        }

    foreach($html->find('.p-related-image') as $rel)
        {
            foreach($rel->find('a') as $element)
                {
                    $rel1 = $element[0]->href;
                    $rel2 = $element[1]->href;
                    $rel3 = $element[2]->href;
                    $rel4 = $element[3]->href;
                    $rel5 = $element[4]->href;
                }
        }


    foreach($html->find('.p-related-name') as $name)
        {
            foreach($name->find('a') as $el)
                {
                    $rel1n = $el[0]->plaintext;
                    $rel2n = $el[1]->plaintext;
                    $rel3n = $el[2]->plaintext;
                    $rel4n = $el[3]->plaintext;
                    $rel5n = $el[4]->plaintext;
                }
        }

    $vehfitment = $html->find('div#appanel_1')->outertext;
    }else{echo "htmldom issue";}

    $manufacturer = "Skunk2";




//Make your connection to database
$con = mysql_connect($host,$username,$password);

//Check your connection
if (!$con) {
die("Could not connect: " . mysql_error());
}

//Select your database
$db_selected = mysql_select_db($database, $con);

//Check to make sure the database is there
if (!$db_selected) {
    die ('Can\'t use the db : ' . mysql_error());
//}

//Run query
$result = mysql_query("INSERT INTO $table(manufacturer, partnumber, title, productnote, description, rel1img, rel2img, rel3img, rel4img, rel5img, rel1name, rel2name, rel3name, rel4name, rel5name, image, vehfitment) VALUES('".$manufacturer."','".$partnumber."','".$title."','".$productnote."','".$description."','".$rel1."','".$rel2."','".$rel3."','".$rel4."','".$rel5."','".$rel1n."','".$rel2n."','".$rel3n."','".$rel4n."','".$rel5n."','".$imagehref."','".$vehfitment."')");

echo '.$manufacturer.<br>.$partnumber.<br>.$title.<br>.$productnote.<br>.$description.<br>.$rel1.<br>.$rel1n.<br>.$image.<br>.$imagehref.<br>.$vehfitment.';

for($k=0;$k<count($image);$k++){

echo '<img src="'.$image[$k].'"><br/>';

$isok=copy($image[$k] , dirname(__FILE__).'/desktop/skunk2'.($k+1).'.jpg');

if(isok==true){
   echo' success!';  
}
else{
  echo ' Fail';  
}
}

    echo $lb;

    flush();
  }
} 


// Now, create a instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process. 

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("store.skunk2.com");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Tell the crawler to stream everything but "text/html"-documents to a tmp-file
$crawler->addStreamToFileContentType("#^((?!text/html).)*$#");

//User Agent String

$crawler->setUserAgentString("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

//0 - The crawler will follow EVERY link, even if the link leads to a different host or domain.
//If you choose this mode, you really should set a limit to the crawling-process (see limit-options),
//otherwise the crawler maybe will crawl the whole WWW!

//1 - The crawler only follow links that lead to the same domain like the one in the root-url.
//E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will follow links to "http://www.foo.com/..."
//and "http://bar.foo.com/...", but not to "http://www.another-domain.com/...".

//2 - The crawler will only follow links that lead to the same host like the one in the root-url.
//E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will ONLY follow links to "http://www.foo.com/...", but not
//to "http://bar.foo.com/..." and "http://www.another-domain.com/...". This is the default mode.

//3 - The crawler only follows links to pages or files located in or under the same path like the one of the root-url.
//E.g. if the root-url is "http://www.foo.com/bar/index.html", the crawler will follow links to "http://www.foo.com/bar/page.html" and
//"http://www.foo.com/bar/path/index.html", but not links to "http://www.foo.com/page.html"
$crawler->setFollowMode(1);

// Thats enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";

echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb; 
?>

Solution

  • Try this

    $html = new simple_html_dom();
    $html->load_file($DocInfo->url;);
    
    if($html && is_object($html) && isset($html->nodes)){
    ...
    }