How Can I Run a Local PHP Simple HTML DOM Parser with a Proxy?


I have PHP Simple HTML DOM Parser running locally in MAMP. It pulls information from a website and works well, but since I'm located in Japan it fetches the Japanese version of the site. I would like to pull information from the UK version instead. What is the simplest way to do this?

I tried the following, based on the documentation, but it didn't work:

$context = array(
    'http' => array(
        'proxy'           => '212.82.126.32:80',
        'request_fulluri' => true,
    ),
);
$stream = stream_context_create($context);

$html = file_get_html('http://www.supremenewyork.com/shop/new', false, $stream);
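
For what it's worth, a minimal sketch of the same stream-context approach, assuming the http wrapper wants the proxy prefixed with the tcp:// scheme (and that this free proxy is still reachable), would be:

include_once 'simple_html_dom.php'; // filename assumed

$context = stream_context_create(array(
    'http' => array(
        // same proxy as above, with the tcp:// scheme the http wrapper expects
        'proxy'           => 'tcp://212.82.126.32:80',
        'request_fulluri' => true,
    ),
));

// file_get_html() forwards this context to file_get_contents()
$html = file_get_html('http://www.supremenewyork.com/shop/new', false, $context);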

I also tried the cURL version, with modifications because the server has safe mode enabled. That didn't work either.

function curl_exec_follow(/*resource*/ $ch, /*int*/ &$maxredirect = null) { 
    $mr = $maxredirect === null ? 5 : intval($maxredirect); 
    if (ini_get('open_basedir') == '' && ini_get('safe_mode') == 'Off') {
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $mr > 0); 
        curl_setopt($ch, CURLOPT_MAXREDIRS, $mr); 
    } else { 
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); 
        if ($mr > 0) { 
            $newurl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); 

            $rch = curl_copy_handle($ch); 
            curl_setopt($rch, CURLOPT_HEADER, true); 
            curl_setopt($rch, CURLOPT_NOBODY, true); 
            curl_setopt($rch, CURLOPT_FORBID_REUSE, false); 
            curl_setopt($rch, CURLOPT_RETURNTRANSFER, true); 
            do { 
                curl_setopt($rch, CURLOPT_URL, $newurl); 
                $header = curl_exec($rch); 
                if (curl_errno($rch)) { 
                    $code = 0; 
                } else { 
                    $code = curl_getinfo($rch, CURLINFO_HTTP_CODE); 
                    if ($code == 301 || $code == 302) { 
                        preg_match('/Location:(.*?)\n/', $header, $matches); 
                        $newurl = trim(array_pop($matches)); 
                    } else { 
                        $code = 0; 
                    } 
                } 
            } while ($code && --$mr); 
            curl_close($rch); 
            if (!$mr) { 
                if ($maxredirect === null) { 
                    trigger_error('Too many redirects. When following redirects, libcurl hit the maximum amount.', E_USER_WARNING); 
                } else { 
                    $maxredirect = 0; 
                } 
                return false; 
            } 
            curl_setopt($ch, CURLOPT_URL, $newurl); 
        } 
    } 
    return curl_exec($ch); 
} 



$url = 'http://www.supremenewyork.com/shop/new';
$proxy = '212.82.126.32:80';

$options = array( 
    CURLOPT_PROXY          => $proxy,
    CURLOPT_HTTPPROXYTUNNEL => 0,
    CURLOPT_REFERER        => "http://www.google.com",
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1", 
    CURLOPT_CONNECTTIMEOUT => 20,
    CURLOPT_TIMEOUT        => 20,
    CURLOPT_MAXREDIRS      => 10,
    CURLOPT_HEADER         => true,

); 

$ch = curl_init( $url ); 
curl_setopt_array( $ch, $options );
$content = curl_exec_follow( $ch ); 

$html = new simple_html_dom();
$html->load($content,true,false);

I also tried uploading the script to US and UK servers, but that didn't work either; it just pulls US data. Can anyone help?


Solution

  • cURL works whether safe mode is enabled or disabled. Your cURL script is too complex; simplify it and try again.

    $content = curl_exec_follow('http://www.supremenewyork.com/shop/new'); 
    
    $html = new simple_html_dom();
    $html->load($content,true,false);
    

    I modified your code; you can try it:

    // define cookie file path here
    define('CRAWLER_COOKIE_FILENAME', 'cookie.txt');
    
    function curl_exec_follow($url) {
    
        $proxy = '212.82.126.32:80';
        $agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1';
    
        // Some websites check referrer
    
        $host = parse_url($url, PHP_URL_HOST);
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $referrer = $scheme . '://' . $host; 
    
        $ch = curl_init();
    
        $curl_defaults = array(
            CURLOPT_HEADER => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_RETURNTRANSFER => 1,
        );
    
        curl_setopt_array($ch, $curl_defaults);
    
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);
        curl_setopt($ch, CURLOPT_REFERER, $referrer);
    
        if ( !file_exists(CRAWLER_COOKIE_FILENAME) || !is_writable(CRAWLER_COOKIE_FILENAME) ) {
            echo 'Cookie file is missing or not writable.';
            exit;
        }
        curl_setopt($ch, CURLOPT_COOKIESESSION, 0);
        curl_setopt($ch, CURLOPT_COOKIEFILE, CRAWLER_COOKIE_FILENAME);
        curl_setopt($ch, CURLOPT_COOKIEJAR, CRAWLER_COOKIE_FILENAME);
    
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    
        // allow to crawl https webpages
        curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,0);
        curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,0);
    
        // the download speed must be at least 1 byte per second
        curl_setopt($ch,CURLOPT_LOW_SPEED_LIMIT, 1);
    
        // if the download speed is below 1 byte per second for more than 30 seconds curl will give up
        curl_setopt($ch,CURLOPT_LOW_SPEED_TIME, 30);
    
        $content = curl_exec($ch);
    
    
        if ($content === FALSE) {
            echo curl_error($ch);
        }
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    
        if ( $code != '200' ) echo 'http error code: ' . $code;
    
        curl_close($ch);
    
        return $content;
    }
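
    A minimal way to call it, for example (assuming simple_html_dom.php is on the include path and cookie.txt can be created next to the script; the anchor selector at the end is only an illustration):

    include_once 'simple_html_dom.php';

    // the function above requires the cookie file to exist and be writable
    if (!file_exists(CRAWLER_COOKIE_FILENAME)) {
        touch(CRAWLER_COOKIE_FILENAME);
    }

    $content = curl_exec_follow('http://www.supremenewyork.com/shop/new');

    $html = new simple_html_dom();
    $html->load($content, true, false);

    // example: print every link found on the page
    foreach ($html->find('a') as $a) {
        echo $a->href . "\n";
    }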