Hi, I'm trying to extract information from Macy's website, specifically from this category: https://www.macys.com/shop/featured/women-handbags. But when I access a particular item page I get a blank page with the following message:
Access Denied You don't have permission to access "any of the item links listed on the above category link" on this server. Reference #18.14d6f7bd.1526927300.12232a22
I've also tried changing the user agent with Chrome options, but it didn't work.
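For reference, the user-agent attempt looked roughly like this (the user-agent string itself is just an example, not a value I think is special):

from selenium import webdriver

options = webdriver.ChromeOptions()
# spoof a regular desktop browser user agent (example string)
options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
driver = webdriver.Chrome("/Users/rodrigopeniche/Downloads/chromedriver", chrome_options=options)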
This is my code:
import sys
reload(sys)  # Python 2 only; forces UTF-8 as the default encoding
sys.setdefaultencoding('utf8')

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

url = 'https://www.macys.com/shop/featured/women-handbags'

def init_selenium():
    global driver
    driver = webdriver.Chrome("/Users/rodrigopeniche/Downloads/chromedriver")
    driver.get(url)

def find_page_items():
    items_elements = driver.find_elements_by_css_selector('li.productThumbnailItem')
    for index, element in enumerate(items_elements):
        # re-query on every pass; the old references go stale after driver.back()
        items_elements = driver.find_elements_by_css_selector('li.productThumbnailItem')
        item_link = items_elements[index].find_element_by_tag_name('a').get_attribute('href')
        driver.get(item_link)
        driver.back()

init_selenium()
find_page_items()
Any idea what's going on and what I can do to fix it?
It's not a Selenium-only solution all the way through, but it works. You can give it a try.
from selenium import webdriver
import requests
from bs4 import BeautifulSoup

url = 'https://www.macys.com/shop/featured/women-handbags'

def find_page_items(driver, link):
    # use Selenium only to load the category page and collect the item links
    driver.get(link)
    item_links = [item.find_element_by_tag_name('a').get_attribute('href') for item in driver.find_elements_by_css_selector('li.productThumbnailItem')]
    # fetch each item page with requests instead of navigating the browser to it
    for newlink in item_links:
        res = requests.get(newlink, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        name = soup.select_one("h1[itemprop='name']").text.strip()
        print(name)

if __name__ == '__main__':
    driver = webdriver.Chrome()
    try:
        find_page_items(driver, url)
    finally:
        driver.quit()
Output:
Mercer Medium Bonded-Leather Crossbody
Mercer Large Tote
Nolita Medium Satchel
Voyager Medium Multifunction Top-Zip Tote
Mercer Medium Crossbody
Kelsey Large Crossbody
Medium Mercer Gallery
Mercer Large Center Tote
Signature Raven Large Tote
However, if you stick to Selenium, you need to create a new instance of it every time you browse to a new URL, or perhaps a better option is to clear the cache.
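In case it helps, here is a minimal sketch of the new-instance-per-URL idea, reusing the same selectors as above. Whether it actually gets past the block depends on how the site tracks sessions, so treat it as an experiment rather than a guaranteed fix:

from selenium import webdriver

url = 'https://www.macys.com/shop/featured/women-handbags'

# collect the item links once, as in the snippet above
driver = webdriver.Chrome()
driver.get(url)
item_links = [item.find_element_by_tag_name('a').get_attribute('href')
              for item in driver.find_elements_by_css_selector('li.productThumbnailItem')]
driver.quit()

for link in item_links:
    # fresh browser instance per page, so no blocked session state carries over;
    # alternatively, keep one driver and call driver.delete_all_cookies() between pages
    driver = webdriver.Chrome()
    driver.get(link)
    print(driver.find_element_by_css_selector("h1[itemprop='name']").text.strip())
    driver.quit()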