Search code examples
php, html, simple-html-dom, scraper

Why can I not scrape the title off this site?


I'm using simple-html-dom to scrape the title off of a specified site.

<?php

// Pull in the simple-html-dom library this script relies on.
include('simple_html_dom.php');

// Fetch the target page and hand it to the parser.
$html = file_get_html('http://www.pottermore.com/');

// Print the inner text of every <title> element, each followed by a line break.
foreach ($html->find('title') as $element) {
    echo $element->innertext, '<br>';
}

?>

Any other site I've tried works, apple.com for example.

But if I input pottermore.com, it doesn't output anything. Pottermore has flash elements on it, but the home screen I'm trying to scrape the title off of has no flash, just html.


Solution

  • This works for me :)

    // Page whose <title> we want to read.
    $url = 'http://www.pottermore.com/';

    // Download the raw markup and keep a copy on disk so the download can be inspected.
    $html = get_html($url);
    file_put_contents('page.htm', $html); //just to test what you have downloaded

    // Cut out the text between the <title> tags and report it.
    $title = get_snip($html, '<title>', '</title>');
    echo 'The title from: ', $url, ' is: ', $title;
    
    /**
     * Download a URL with cURL using browser-like request settings.
     *
     * The request follows redirects, accepts gzip/deflate bodies, sends a
     * browser User-Agent plus a set of browser-style headers, and keeps
     * cookies between calls in a cookie file.
     *
     * Cookie file: if a COOKIE constant is defined its value is used as the
     * path; otherwise a file in the system temp directory is used, so the
     * function no longer fails on an undefined constant.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false when the handle cannot be
     *                      created or the transfer fails.
     */
    function get_html($url)
    {
        // Browser-style request headers; these are handed to cURL below.
        $header = array(
            'Accept: text/xml,application/xml,application/xhtml+xml,'
                . 'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Cache-Control: max-age=0',
            'Connection: keep-alive',
            'Keep-Alive: 300',
            'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Accept-Language: en-us,en;q=0.5',
            'Pragma: ', // browsers keep this blank.
        );

        // constant() avoids evaluating a bare, possibly undefined COOKIE name.
        $cookie_file = defined('COOKIE')
            ? constant('COOKIE')
            : sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'get_html_cookies.txt';

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows;U;Windows NT 5.0;en-US;rv:1.4) Gecko/20030624 Netscape/7.1 (ax)');
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
        curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);

        // With CURLOPT_RETURNTRANSFER set, curl_exec() returns the body or false.
        $result = curl_exec($ch);
        curl_close($ch);

        return $result;
    }
    
    /**
     * Extract the part of a string that lies between two markers.
     *
     * The first occurrence of $start is located, then the first occurrence of
     * $end that begins after that start marker. Searching after the start
     * marker (rather than at it) keeps identical or overlapping markers from
     * matching the same position twice.
     *
     * @param string $string     Text to search.
     * @param string $start      Opening marker.
     * @param string $end        Closing marker.
     * @param mixed  $trim_start Anything loosely equal to '' keeps the opening
     *                           marker in the result; any other value drops it.
     * @param mixed  $trim_end   Anything loosely equal to '' keeps the closing
     *                           marker in the result; any other value drops it.
     * @return string The extracted text, or '' when either marker is missing.
     */
    function get_snip($string,$start,$end,$trim_start='1',$trim_end='1')
    {
        $startpos = strpos($string, $start);
        if ($startpos === false) {
            // No opening marker: nothing to extract.
            return '';
        }

        // Look for the closing marker only past the end of the opening marker.
        $content_start = $startpos + strlen($start);
        $endpos = strpos($string, $end, $content_start);
        if ($endpos === false) {
            // Opening marker without a matching closing marker.
            return '';
        }

        // Loose comparisons are kept on purpose so existing callers passing
        // '', '1', true/false, etc. get the same trimming decisions as before.
        if ($trim_start != '') {
            $startpos = $content_start;
        }
        if ($trim_end == '') {
            $endpos += strlen($end);
        }

        return substr($string, $startpos, $endpos - $startpos);
    }