Using Python, Selenium, Sublime and Firefox: I am scraping the links off of this website and would like to save the scraped pages (as html files) into a folder. However, I have been working for days on trying to get the body of these html files to dump into a dropbox folder. The problem is 1) saving the html files and 2) saving them to a dropbox folder (or any folder).
I have successfully written code that will perform a search, then scrape the links off of a series of webpages. The following code works well for that.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
import csv
import pickle
import signal
import time
def handler(signum, frame):
    """Signal handler that aborts whatever is running by raising an exception.

    Args:
        signum: Number of the signal being delivered (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        Exception: Always, with the message ``'Last Resort!'``.
    """
    raise Exception('Last Resort!')
# Route SIGALRM to handler() so that an expired signal.alarm() surfaces as an exception.
signal.signal(signal.SIGALRM,handler)
def isReady(browser):
    """Return True when the browser's current document has finished loading.

    Args:
        browser: Object exposing ``execute_script`` (a Selenium WebDriver here).

    Returns:
        True if ``document.readyState`` evaluates to ``"complete"``, else False.
    """
    return browser.execute_script("return document.readyState") == "complete"
def waitUntilReady(browser, poll_interval=0.1):
    """Block until the page loaded in ``browser`` reports that it is complete.

    Polls ``isReady`` in a loop rather than recursing, so a slow page cannot
    exhaust the interpreter's recursion limit, and sleeps between polls
    instead of spinning on the driver.

    Args:
        browser: WebDriver instance to poll.
        poll_interval: Seconds to sleep between two polls.

    Note:
        This function has no timeout of its own; ``waitUntilReadyBreak``
        bounds it from the outside with ``signal.alarm``.
    """
    while not isReady(browser):
        time.sleep(poll_interval)
def waitUntilReadyBreak(browser_b, url, counter):
    """Wait for ``browser_b`` to finish loading, restarting the browser on failure.

    A ``counter``-second alarm is armed around ``waitUntilReady``. If the wait
    raises (which includes the exception raised by the SIGALRM handler), the
    error is printed, the browser is shut down, a fresh Firefox is opened on
    ``url`` and the wait starts over.

    Args:
        browser_b: WebDriver instance whose current page is being waited on.
        url: Address to load in the replacement browser after a failure.
        counter: Alarm delay in whole seconds, passed to ``signal.alarm``.

    Returns:
        The browser that finished loading. This may be a replacement
        instance, so callers must keep using the returned object rather than
        the one they passed in.
    """
    # A loop replaces the original self-recursion, whose return value was
    # dropped: after a second restart the caller got back a closed browser.
    # NOTE: retries are unbounded, exactly as before.
    while True:
        signal.alarm(counter)
        try:
            waitUntilReady(browser_b)
        except Exception as e:  # the alarm handler raises a bare Exception
            print(e)
        else:
            return browser_b
        finally:
            # Always disarm, so a pending alarm cannot fire during cleanup.
            signal.alarm(0)

        # quit() also ends the driver session; close() only closes the window.
        browser_b.quit()
        browser_b = webdriver.Firefox()
        browser_b.get(url)
# Open the search page and wait for the document to finish loading.
browser = webdriver.Firefox()
thisurl = 'http://www.usprwire.com/cgi-bin/news/search.cgi'
browser.get(thisurl)
waitUntilReady(browser)

# Running count of saved articles; it becomes part of each output file name.
numarticles = 0

# until() hands back the located element, so it is not looked up a second time.
elem = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
elem.send_keys('"test"')

# Find the submit button by its value instead of an absolute path through the
# page's nested layout tables, which breaks on any markup change.
browser.find_element_by_xpath("//input[@value='Search']").click()

nextpage = False
all_newproduct_links = []
npages = 200  # number of result pages to walk
# Walk the paginated search results, print the article links found on each page,
# then try to save every linked page as an HTML file.
# NOTE(review): this paste has lost its indentation, so the intended nesting of
# the statements below cannot be recovered with certainty; restore it before running.
for page in range(1,npages+1):
if page == 1:
# NOTE(review): the comprehension below reuses `elems` as its loop variable. In
# Python 2 that variable leaks out of the comprehension, so afterwards `elems`
# is a single element, not the list of <a> tags assigned on the next line.
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
for elems in browser.find_elements_by_class_name('category_links')]
print page
print article_url
print "END_A_PAGE"
# Click the '[>>]' link (presumably "next page") and wait for the document to load.
elem = browser.find_element_by_link_text('[>>]').click()
waitUntilReady(browser)
# NOTE(review): this chained comparison means `page >= 2 and 2 <= 200`, i.e. just
# `page >= 2`; the upper bound never constrains `page`.
if page >=2 <= 200:
# click the dots
print page
print page
print "B4 LastLoop"
# Wait (60 is passed as the WebDriverWait timeout) for a 'category_links' element.
# The result is overwritten by the very next assignment to `elems`.
elems = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "category_links")))
# NOTE(review): same `elems` shadowing problem as in the page-1 branch above.
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
for elems in browser.find_elements_by_class_name('category_links')]
print page
print article_url
print "END_C_PAGE"
# This is the part that will not work :(
# NOTE(review): at this point `elems` is whatever the comprehension left behind,
# not a list of links, so this iteration does not visit the scraped URLs.
for e in elems:
numarticles = numarticles+1
# `numpages` is reset and then set to 1000 on every pass, so it is always 1000.
numpages = 0
numpages = numpages+1000
article_url = e.get_attribute('href')
print 'waiting'
# NOTE(review): `bodyelem` is never assigned in the visible code, so this line
# raises NameError as written. COMMAND+"2" presumably tries to switch tabs; confirm.
bodyelem.send_keys(Keys.COMMAND + "2")
# Navigates the same browser object away from the results page.
# NOTE(review): elements located before this call likely go stale; verify.
browser.get(article_url)
waitUntilReady(browser)
# The file name is page + numpages + numarticles joined without separators.
fw = open('/Users/My/Dropbox/MainFile/articlesdata/'+str(page)+str(numpages)+str(numarticles)+'.html','w')
fw.write(browser.page_source.encode('utf-8'))
fw.close()
bodyelem2 = browser.find_elements_by_xpath("//body")[0]
bodyelem2.send_keys(Keys.COMMAND + "1")
The loop above (`for e in elems:`) is meant to open each scraped link and create an HTML file containing the body of that page. I seem to be missing something fundamental.
Any guidance at all would be most appreciated.
I think you are overcomplicating it.
There is at least one problem in this block:
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
for elems in browser.find_elements_by_class_name('category_links')]
`elems` would contain a list of elements found by `find_elements_by_tag_name()`, but then you are using the same `elems` variable as the loop variable in the list comprehension. As a result, when you iterate over `elems` later, you get an error, since `elems` now refers to a single element and not a list.
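Anyway, here is the approach I would take: make a search, grab the article URLs from the result pages, then iterate over the URLs and save the HTML source of each page. The last segment of each article URL is used as the file name; for example, `_Iran_Shipping_Report_Q4_2014_is_now_available_at_Fast_Market_Research_326303.shtml` would be the article filename.

The code: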
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def isReady(browser):
    """Tell whether the page currently loaded in ``browser`` is fully loaded.

    Args:
        browser: Object exposing ``execute_script`` (a Selenium WebDriver here).

    Returns:
        True if ``document.readyState`` evaluates to ``"complete"``, else False.
    """
    state = browser.execute_script("return document.readyState")
    return state == "complete"
def waitUntilReady(browser, poll_interval=0.1):
    """Block until the page loaded in ``browser`` reports that it is complete.

    Polls ``isReady`` in a loop rather than recursing, so a slow page cannot
    exhaust the interpreter's recursion limit, and sleeps between polls
    instead of spinning on the driver.

    Args:
        browser: WebDriver instance to poll.
        poll_interval: Seconds to sleep between two polls.

    Note:
        There is no timeout: the call only returns once the page is ready.
    """
    # Imported locally because this snippet's import block does not bring in `time`.
    import time

    while not isReady(browser):
        time.sleep(poll_interval)
browser = webdriver.Firefox()
browser.get('http://www.usprwire.com/cgi-bin/news/search.cgi')

# make a search
query = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
query.send_keys('"test"')
submit = browser.find_element_by_xpath("//input[@value='Search']")
submit.click()
# The old submit button going stale signals that the results page has replaced
# the search form; only then is it safe to read links from the document.
WebDriverWait(browser, 60).until(EC.staleness_of(submit))
waitUntilReady(browser)

# grab article urls
npages = 4
article_urls = []
for page in range(1, npages + 1):
    hrefs = [elm.get_attribute("href") for elm in browser.find_elements_by_class_name('category_links')]
    # Drop anchors without an href so browser.get() is never handed None.
    article_urls.extend(href for href in hrefs if href)

    if page == npages:
        break  # enough pages collected; no need to load one more
    # The plural lookup returns an empty list instead of raising when the link
    # is missing, so a short result set ends the loop rather than the script.
    next_links = browser.find_elements_by_link_text('[>>]')
    if not next_links:
        break
    next_links[0].click()
    # Wait for the navigation to actually happen before scraping again;
    # otherwise the same page's links could be collected twice.
    WebDriverWait(browser, 60).until(EC.staleness_of(next_links[0]))
    waitUntilReady(browser)

# iterate over urls and save the HTML source
output_dir = '/Users/My/Dropbox/MainFile/articlesdata/'
for index, url in enumerate(article_urls):
    browser.get(url)
    waitUntilReady(browser)
    # The last URL segment is the file name; fall back to a numbered name when
    # the URL ends in "/" and that segment is empty.
    title = browser.current_url.split("/")[-1] or 'article_%d.html' % index
    # The source is encoded to bytes, so the file is opened in binary mode.
    with open(output_dir + title, 'wb') as fw:
        fw.write(browser.page_source.encode('utf-8'))