Search code examples
phpfile-get-contentsdata-extraction

PHP Data Extraction From External Website, Then Write to Database


Just wondering how this would be done. Let's say there's a simple HTML table on an external website, and you have a database with the same structure as that HTML table. I understand that you can use file_get_contents to grab that entire web page.

From there, I would assume that you would discard everything in the string returned by file_get_contents except for the content between the <table></table> tags, thus isolating the table containing the data you wish to write.

What is the next step? Assuming your database table structure matches the structure of the HTML table, what would be the easiest way to write the table data into your database?


Solution

  • Perhaps this will be of interest (hope so, lol): a super simple class to parse HTML.

    Using only DOMDocument and cURL

    <?php 
    // Build the scraper, point it at a page and download that page's HTML.
    // (Example URL only: the author could not think of a site with a sample table.)
    $scraper = new DOMScraper();
    $scraper->setSite('http://cherone.co.uk/forum');
    $scraper->setSource();

    // Contents of every table on the page.
    $allTables = $scraper->getInnerHTML('table');
    echo sprintf('<table>%s</table>', $allTables);

    // Only tables with id="some_table_id"; any attribute works, e.g. 'class=something'.
    $tablesById = $scraper->getInnerHTML('table', 'id=some_table_id');
    echo sprintf('<table>%s</table>', $tablesById);

    // Same id filter, but return only the text content (nodeValue) instead of markup.
    $tableText = $scraper->getInnerHTML('table', 'id=some_table_id', true);
    echo sprintf('<table>%s</table>', $tableText);
    
    
    /**
     * Generic DOM scraper built on DOMDocument and cURL.
     *
     * Typical use: setSite($url)->setSource() downloads the page, then
     * getInnerHTML() extracts the contents of the matching elements.
     */
    class DOMScraper extends DOMDocument{
        /** URL of the page to scrape; assigned by setSite(). */
        public $site;
        /** Raw response body stored by setSource(); false or empty when the download failed. */
        private $source;

        /**
         * Initialises the underlying document and relaxes parser error handling.
         *
         * Note: libxml_use_internal_errors(true) is a process-wide setting; it
         * keeps markup warnings from real-world HTML out of the output.
         */
        public function __construct(){
            // The parent constructor must run before DOMDocument properties are used.
            parent::__construct();
            libxml_use_internal_errors(true);
            $this->preserveWhiteSpace = false;
            $this->strictErrorChecking = false;
        }

        /**
         * Sets the URL to scrape.
         *
         * @param string $site URL of the page.
         * @return $this For method chaining.
         */
        public function setSite($site){
            $this->site = $site;
            return $this;
        }

        /**
         * Downloads the page at $this->site and stores its body for later parsing.
         *
         * @return $this|string $this for chaining, or an error message string
         *                      when no site has been set.
         */
        public function setSource(){
            if(empty($this->site))return 'Error: Missing $this->site, use setSite() first';
            $this->source = $this->get_data($this->site);
            return $this;
        }

        /**
         * Returns the concatenated contents of every <$tag> element in the
         * downloaded page, optionally restricted by an attribute filter.
         *
         * @param string      $tag       Tag name to look for, e.g. 'table'.
         * @param string|null $id        Optional 'attribute=value' filter, e.g.
         *                               'id=some_table_id'. Only the first '='
         *                               separates name from value. A filter
         *                               without '=' is compared against the
         *                               empty string, so it matches elements
         *                               whose attribute is empty or absent.
         * @param bool        $nodeValue When truthy, return trimmed text content
         *                               instead of inner HTML.
         * @return string|null Concatenated result, null when nothing matched, or
         *                     an error message string when no source is loaded.
         */
        public function getInnerHTML($tag, $id=null, $nodeValue = false){
            // Guard on the downloaded source (not the site): it is what loadHTML()
            // needs, and it is false/empty when setSource() was skipped or failed.
            if(!is_string($this->source) || $this->source === ''){
                return 'Error: Missing $this->source, use setSource() first';
            }
            $this->loadHTML($this->source);

            $attrName = null;
            $attrValue = null;
            if($id !== null){
                // Limit of 2 keeps any '=' inside the value; array_pad supplies
                // an empty value when the filter contains no '=' at all.
                list($attrName, $attrValue) = array_pad(explode('=', $id, 2), 2, '');
            }

            $ret = null;
            foreach ($this->getElementsByTagName($tag) as $node){
                // Strict comparison: attribute values are strings, and loose ==
                // would treat numeric-looking values such as "10" and "1e1" as equal.
                if($id !== null && $node->getAttribute($attrName) !== $attrValue){
                    continue;
                }
                $ret .= $nodeValue ? trim($node->nodeValue) : $this->innerHTML($node);
            }
            return $ret;
        }

        /**
         * Serialises the children of a node to an HTML string.
         *
         * Each child is imported into a scratch document and saved on its own,
         * with surrounding whitespace trimmed from every fragment.
         *
         * @param DOMNode $dom Node whose children are serialised.
         * @return string Concatenated HTML of the child nodes ('' when there are none).
         */
        public function innerHTML($dom){
            $ret = "";
            foreach($dom->childNodes as $child){
                $fragment = new DOMDocument();
                $fragment->appendChild($fragment->importNode($child, true));
                $ret .= trim($fragment->saveHTML());
            }
            return $ret;
        }

        /**
         * Fetches a URL, using cURL when available and file_get_contents otherwise.
         *
         * @param string $url URL to download.
         * @return string|false Response body, or false when the request failed.
         */
        public function get_data($url){
            if(!function_exists('curl_init')){
                return file_get_contents($url);
            }
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_TIMEOUT, 5);
            // Return the body from curl_exec() instead of printing it.
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            $data = curl_exec($ch);
            curl_close($ch);
            return $data;
        }
    }
    ?>