I'm not a coder; all I need to accomplish is to get the fully loaded source code of a web page. I found this code a while back and it has been serving me well, but it doesn't work for some websites because of an advertisement layer with a timer.
import urllib2,cookielib
# Fetch a single page while presenting browser-like request headers, then
# print whatever body the server sent back (Python 2.7, urllib2).
site = "http://example.com"  # real url edited out
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
req = urllib2.Request(site, headers=hdr)
try:
    page = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    # For 4xx/5xx replies urlopen raises HTTPError, which still carries the
    # response body. Keep that body as the content; previously the code fell
    # through to page.read() with `page` never assigned and died with a
    # NameError.
    content = e.read()
else:
    try:
        content = page.read()
    finally:
        # Release the connection even if reading the body fails.
        page.close()
# Single-argument print(...) behaves the same as the print statement on 2.7.
print(content)
But I got this printout in the Python 2.7 console:
<html>
<head>
<script type="text/javascript">
//<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok3v=1613a3a185/"},atok:"469b082f74e88d5de78deda9ca22d249",petok:"704cf398eb73eb73e891bfef183856ace9cb873c-1500869038-1800",zone:"example.com",rocket:"a",apps:{}}];
document.write('<script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok3v=85b614c0f6/cloudflare.min.js"><'+'\/script>');}}catch(e){};
//]]></script>
<script type="text/rocketscript">
function set_cookie(){
var now = new Date();
var time = now.getTime();
time += 19360000 * 1000;
now.setTime(time);
document.cookie='beget=begetok'+'; expires='+now.toGMTString()+'; path=/';
}
set_cookie();
location.reload();
</script> </head><body></body></html>
What I did was convert it into a function, and it works!
def getHtml(url):
    """Download *url* with browser-like request headers and return its body.

    The request is a plain HTTP fetch through urllib2: no JavaScript is
    executed, so pages that build their content in the browser are returned
    as the raw markup the server sent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as returned by ``read()``. When the server answers
        with an HTTP error status, the body of that error response is
        printed and returned instead.

    Raises:
        Errors other than ``urllib2.HTTPError`` (for example
        ``urllib2.URLError``) are not caught here and reach the caller.
    """
    import urllib2

    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    req = urllib2.Request(url, headers=hdr)
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # HTTPError still carries the server's response body. Return it
        # rather than falling through to page.read() with `page` unbound,
        # which used to raise UnboundLocalError.
        html = e.read()
        print(html)
        return html
    try:
        return page.read()
    finally:
        # Release the connection even if reading the body fails.
        page.close()
Alternative (slower): while looking all over the internet, I found out that you can use the Python Selenium WebDriver with Firefox, Chrome, or headless PhantomJS to get the HTML source code. You need to place GeckoDriver.exe, ChromeDriver.exe, or PhantomJS.exe in C:\Python27\Scripts\
def getHtmlViaWebDriver(url, executable_path=r'C:\Python27\Scripts\phantomJS.exe'):
    """Load *url* in a PhantomJS browser via Selenium and return its HTML.

    Args:
        url: Address of the page to open in the browser.
        executable_path: Location of the PhantomJS executable handed to
            Selenium. Defaults to the path this function previously had
            hard-coded.

    Returns:
        The browser's ``page_source`` for the loaded page, encoded as UTF-8.
    """
    from selenium import webdriver

    # Other drivers can be swapped in here, e.g.:
    #   webdriver.Firefox(executable_path=r'C:\Python27\Scripts\geckodriver.exe')
    #   webdriver.Chrome(executable_path=r'C:\Python27\Scripts\chromedriver.exe')
    driver = webdriver.PhantomJS(executable_path=executable_path)
    try:
        # Navigate first: without this call the browser never leaves its
        # blank start page and `url` is ignored entirely.
        driver.get(url)
        return driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser process down, even when loading fails.
        driver.quit()