I'm trying to scrape Amazon prices with PhantomJS and Python, and parse the result with BeautifulSoup to get the new and used prices for books. The problem is that when I pass the page source from the PhantomJS request, the prices come back as just 0,00. The code is this simple test.
I don't understand whether Amazon has measures in place to prevent price scraping, or whether I'm doing something wrong, because when I tried other, simpler pages I was able to get the data I wanted.
PS: I'm in a country not supported by the Amazon API, which is why the scraper is necessary.
import re
import urlparse  # Python 2 stdlib; unused here along with `re`
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
# Offer-listing page for one ISBN, filtered to "new" condition; the plain
# product page is kept as a commented-out alternative.
link = 'http://www.amazon.com/gp/offer-listing/1119998956/ref=dp_olp_new?ie=UTF8&condition=new'#'http://www.amazon.com/gp/product/1119998956'
class AmzonScraper(object):
    """Fetch the Amazon offer-listing page with headless PhantomJS and
    parse the rendered HTML with BeautifulSoup."""

    def __init__(self):
        # Headless PhantomJS driver; a fixed window size so the site
        # serves a desktop layout.
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

    def scrape_prices(self):
        """Load the offer-listing page and return it as a parsed soup.

        Returns:
            BeautifulSoup tree of the fully rendered page source.
        """
        self.driver.get(link)
        # Name the parser explicitly: BeautifulSoup(markup) with no parser
        # argument emits a warning and can build different trees depending
        # on which parsers happen to be installed.
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def scrape(self):
        """Scrape the page, print the soup, and always release the driver."""
        try:
            # print(...) is valid in both Python 2 and 3 for a single
            # argument; the original bare `print source` is Python-2-only.
            print(self.scrape_prices())
        finally:
            # Fix a resource leak: quit the browser even if the page load
            # or parse raises, so no orphan PhantomJS process is left.
            self.driver.quit()
if __name__ == '__main__':
    # Bug fix: the original instantiated TaleoJobScraper, a class that does
    # not exist in this file (a copy-paste leftover from another scraper),
    # which raises NameError at runtime. The class defined above is
    # AmzonScraper.
    scraper = AmzonScraper()
    scraper.scrape()
First of all, to follow @Nick Bailey's comment, study the Terms of Use and make sure there are no violations on your side.
To solve it, you need to tweak PhantomJS
desired capabilities:
# Override PhantomJS's default user-agent string via desired capabilities.
# NOTE(review): the premise is that the site serves stripped/zeroed prices
# to the default PhantomJS UA — presumably bot detection; confirm against
# the live page. (This fragment belongs inside __init__: it assigns
# self.driver.)
caps = webdriver.DesiredCapabilities.PHANTOMJS
caps["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87"
self.driver = webdriver.PhantomJS(desired_capabilities=caps)
self.driver.maximize_window()
And, to make it bullet-proof, you can make a Custom Expected Condition and wait for the price to become non-zero:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class wait_for_price(object):
    """Custom Selenium expected condition: true once the located element's
    stripped text is something other than the placeholder price "0,00".

    Selenium's built-in conditions cannot express "text became non-zero",
    so this implements the expected-condition protocol: a callable taking
    the driver and returning a truthy value to stop the wait.
    """

    def __init__(self, locator):
        # locator is a (By.<strategy>, value) pair,
        # e.g. (By.CLASS_NAME, "olpOfferPrice").
        self.locator = locator

    def __call__(self, driver):
        try:
            # Bug fix: the original called EC._find_element(), a private
            # helper that was removed in Selenium 4. Use the public
            # driver.find_element() API instead.
            element_text = driver.find_element(*self.locator).text.strip()
            return element_text != "0,00"
        except StaleElementReferenceException:
            # The element was re-rendered between lookup and read; return
            # False so WebDriverWait retries instead of blowing up.
            return False
Usage:
def scrape_prices(self):
    """Open the offer-listing page, block until a real (non-zero) price
    has rendered, then hand the page source to BeautifulSoup."""
    self.driver.get(link)
    # Wait (up to 200s) for the offer price element to show a non-"0,00"
    # value before trusting the page source.
    price_rendered = wait_for_price((By.CLASS_NAME, "olpOfferPrice"))
    WebDriverWait(self.driver, 200).until(price_rendered)
    return BeautifulSoup(self.driver.page_source)