Tags: python, selenium, web-scraping, access-denied, data-extraction

Access denied while scraping a website with selenium in Python


Hi, I'm trying to extract information from Macy's website, specifically from this category: https://www.macys.com/shop/featured/women-handbags. But when I access a particular item's page, I get a blank page with the following message:

Access Denied

You don't have permission to access "any of the items links listed on the above category link" on this server. Reference #18.14d6f7bd.1526927300.12232a22

I've also tried changing the user agent with Chrome options, but it didn't work.
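
For reference, the kind of user-agent override I tried looks roughly like this (the exact user-agent string is just an example; any realistic one can be substituted):

from selenium import webdriver

options = webdriver.ChromeOptions()
# spoof a common desktop Chrome user agent via a command-line argument
options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
                     'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
driver = webdriver.Chrome("/Users/rodrigopeniche/Downloads/chromedriver",
                          chrome_options=options)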

This is my code:

import sys
reload(sys)
sys.setdefaultencoding('utf8')
from selenium import webdriver 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

url = 'https://www.macys.com/shop/featured/women-handbags'

def init_selenium():
    global driver
    driver = webdriver.Chrome("/Users/rodrigopeniche/Downloads/chromedriver")
    driver.get(url)

def find_page_items():
    items_elements = driver.find_elements_by_css_selector('li.productThumbnailItem')
    for index, element in enumerate(items_elements):
        # re-locate the elements on each iteration; navigating away and back
        # makes the previously found references stale
        items_elements = driver.find_elements_by_css_selector('li.productThumbnailItem')
        item_link = items_elements[index].find_element_by_tag_name('a').get_attribute('href')
        driver.get(item_link)
        driver.back()


init_selenium()
find_page_items()

Any idea what's going on and what I can do to fix it?


Solution

  • It's not a Selenium-only solution all the way through, but it works. You can give it a try.

    from selenium import webdriver 
    import requests
    from bs4 import BeautifulSoup
    
    url = 'https://www.macys.com/shop/featured/women-handbags'
    
    def find_page_items(driver, link):
        driver.get(link)
        # gather the product links from the category page with selenium
        item_links = [item.find_element_by_tag_name('a').get_attribute('href') for item in driver.find_elements_by_css_selector('li.productThumbnailItem')]
        for newlink in item_links:
            # fetch each item page with requests, sending a browser-like User-Agent
            res = requests.get(newlink, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(res.text, "lxml")
            name = soup.select_one("h1[itemprop='name']").text.strip()
            print(name)
    
    if __name__ == '__main__':
        driver = webdriver.Chrome()
        try:
            find_page_items(driver,url)
        finally:
            driver.quit()
    

    Output:

    Mercer Medium Bonded-Leather Crossbody
    Mercer Large Tote
    Nolita Medium Satchel
    Voyager Medium Multifunction Top-Zip Tote
    Mercer Medium Crossbody
    Kelsey Large Crossbody
    Medium Mercer Gallery
    Mercer Large Center Tote
    Signature Raven Large Tote
    

    However, if you stick to Selenium, you either need to create a new instance of it every time you browse to a new URL, or, as a perhaps better option, clear the cache; see the sketch below.
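
    A minimal sketch of that Selenium-only route, assuming the same category URL and selector as above. Note that Selenium exposes no direct cache-clearing call, so delete_all_cookies() is the closest built-in, and whether clearing cookies alone defeats the block is untested; spinning up a fresh driver per item is the heavier fallback.

    from selenium import webdriver

    url = 'https://www.macys.com/shop/featured/women-handbags'

    driver = webdriver.Chrome()
    driver.get(url)
    # collect all product links up front so navigation doesn't invalidate them
    links = [item.find_element_by_tag_name('a').get_attribute('href')
             for item in driver.find_elements_by_css_selector('li.productThumbnailItem')]
    for link in links:
        driver.delete_all_cookies()  # drop session state before each item page
        driver.get(link)
        print(driver.title)
    driver.quit()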