Search code examples
pythonselenium-webdriverautomationselenium-chromedriver

Getting Index out of range error while trying to scrape product code from a website (Using Python & Selenium)


Issue Description:

I am trying to automate a process where I can visit a website and scrape product details of top 100 products on that page and put it in an excel file.

Code Explanation:

I have a class Webscraper inside which I am calling two functions. First I am calling scroll_and_click_view_more function which is simply scrolling down the webpage that I am visiting. Then I am calling prod_vitals function which is extracting product code and product names from that webpage.

Error Description:

Whenever I am running below code upto a certain maximum no. of products, the code gets stuck after a point and throws Index out of range error. If I set max_count_of_products=50, code got stuck at line, If I set max_count_of_products=100, code got stuck at 93. There is no fixed index where I am getting stuck, if I change the value of max_count_of_products, the point at which the code gets stuck is also changing.

I am attaching screenshots of the error below.

max_count_of_products=50

image1

max_count_of_products=100

image2

Please find my code below:

products_summary = []
max_count_of_products=100

def scroll_and_click_view_more(driver,href):
    flag=False
    last_height = driver.execute_script("return window.pageYOffset + window.innerHeight")               
    while True:
        try:                                                                   
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(4)
            new_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.product-tile')))
            except Exception as e:
                if new_height == last_height and flag==False:
                    print("Reached the end of the page and no product tiles were found: ",href)
                    return "No product tiles found" 
                else:
                    last_height = new_height
                    continue                                                                      
            div_count = 0
            flag=True                                                                                                                            
            response = driver.page_source                                             
            soup = BeautifulSoup(response, 'html.parser')                                
            div_elements = soup.find_all('div', class_ = 'product-tile')                              
            div_count = len(div_elements)                                                 
            if(div_count > max_count_of_products):                                                       
                return(driver.page_source)
            else:
                driver.execute_script("window.scrollBy(0, 300);")
                time.sleep(3) 
                new_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
                #print(new_height)
                if new_height == last_height:
                    return(driver.page_source)
                else:
                    last_height = new_height
        except Exception as e:                                                               
            print(e)
            break

def prod_vitals(soup,title,url):                                                                                                                                  
    count_of_items=1
    products_data = []                                                        # Array to store all product data for our excel sheet
    for div in soup.find_all('div', class_ = 'product-tile'):                            # Iterate over each individual product-tile div tag
        if count_of_items<=max_count_of_products:                                                              
            count_of_items = count_of_items+1;                                                                                                             
            pro_code = div.select('div.css-1fg6eq7 img')[0]['id']
            pro_name = div.select('div.product-name a.css-avqw6d p.css-1d5mpur')[0].get_text()
            products_data.append({'Product Code': pro_code, 'Product Name': pro_name})      # Append the extracted data to the list     
            print("Count: ", count_of_items)
            print("Product Code: ",pro_code)
            print("Product Name: ",pro_name)
            print("\n")
        else:
            break
    time.sleep(5)

class WebScraper:
    def __init__(self):
        self.url = "https://staging1-japan.coach.com/shop/new/women/?auto=true"
        options = Options()
        options.add_argument("--remote-debugging-port=9222")
        self.driver = webdriver.Chrome(service=Service(r"c:\Users\DELL\Documents\Self_Project\chromedriver.exe"), options=options)

    def scrape(self):
        self.driver.get(self.url)
        time.sleep(5)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')            # Refresh the page source and parse it
        response = scroll_and_click_view_more(self.driver, 'Link')
        time.sleep(3)
        if response != "No product tiles found" and response != "Reached the end of the page.":
            soup = BeautifulSoup(response, 'html.parser')
            prod_vitals(soup,'TITLE', self.url)
            time.sleep(2)
        else:
            self.driver.execute_script("window.scrollTo(0,0);")
            time.sleep(3)
            self.driver.close()
scraper = WebScraper()
scraper.scrape()                       
time.sleep(5)                         
scraper.driver.quit()

> Attaching the product div structure below:

<div class="css-0">
   <div class="css-1fg6eq7">
      <div tabindex="-1" style="padding-top: 125%;"></div>
         <img width="237.01" height="296" class="chakra-image css-14ql1gk" src="https://coach.scene7.com/is/image/Coach/cn731_b4ous_a0?$desktopProductTile$" fetchpriority="high" id="CN731 B4OUS" name="タビー 12" data-qa="cm_tile_link_pt_img" contain="none" alt="COACH®,タビー 12,ボディバッグ&amp;斜めがけバッグ,トゥルー ピンク">
   </div>
</div>
**Please find my code below with API method:**

def prod_vitals(title,url):                                                                                                                                  
count_of_items=1
products_data = []                                                        # Array to store all product data for our excel sheet
page = 0
list_price = 0                                                                   # Variable to store list price
sale_price = 0                                                                   # Variable to store sale price
discount1 = 0                                                                    # Variable to store discount% that is displayed on the site
discount2 = 0                                                                  # Variable to store discount% calculated manually
res = "Incorrect"
if '/shop' not in url:
    print("Not in url",url)
    return "No product tiles found"
try:
    while True: 
        page = page + 1 #page result start from 2 and each page contains 16 result
        ct = count_of_items + 15    #count to adjust loop as each page contains 16 results
        if(page==1):
            full_url = f"{url}"
        else:
            full_url = f"{url}?page={page}"
        if 'api/get-shop' not in full_url:
            full_url = full_url.replace('/shop', '/api/get-shop')
        session = requests.Session()
        response = session.get(full_url, headers=headers, verify=False)
        products = response.json().get('pageData', {}).get('products', [])
        #products = len(response.json()['pageData']['products'])
        print(full_url,"\n", len(products),"\n")
        if '/get-shop' in full_url and not products:
            print("No product tiles found",full_url)
            return "No product tiles found"
        #print(response.json()['pageData']['products'],"\n")
        for i in products:
            print(i['defaultVariant']['prices'],"\n")
            pro_code = i['defaultColor']['vgId']
            #print(pro_code)
            pro_name = i['name']
            #print(pro_name)
            pdpurl = i['defaultColor']['url']
            #print(pdpurl,"\n")
            sale_price = i['defaultVariant']['prices']['currentPrice'] if i['defaultVariant']['prices']['currentPrice'] else 0
            list_price = i['defaultVariant']['prices']['regularPrice'] if 'regularPrice' in i['defaultVariant']['prices'] and i['defaultVariant']['prices']['regularPrice'] else 0
            discount1 = i['defaultVariant']['prices']['discount'] if i['defaultVariant']['prices']['discount'] and i['defaultVariant']['prices']['discount'] is not None else 0  
            if list_price > 0 and sale_price > 0:
                discount2 = round(((list_price - sale_price) / list_price) * 100)
                res = "Correct" if discount1 == discount2 else "Incorrect"
            elif(list_price ==0 and discount1 == 0):
                discount2 = 0
                res = "Correct"
            translator = Translator()
            translated_pro_name = translator.translate(pro_name, dest='en').text
            if count_of_items <= max_count_of_products: # condition to check if the count reach 100 or not, if count is less than 100 print, else stop printing
                #print(f'Product Count: {count_of_items}\nProduct Code: {pro_code}\nProduct Name: {translated_pro_name}\nProduct URL: {url+pdpurl}\nSale Price: {sale_price}\nList Price: {list_price}\nDiscount1: {discount1}\nDiscount2: {discount2}\nResult: {res}\n\n')
                products_data.append({'Product Code': pro_code, 'Product Name': translated_pro_name,'Product URL': url+pdpurl, 'Sale Price': '¥'+format(sale_price, '.2f'), 'List Price': '¥'+format(list_price, '.2f'), 'Discount on site': str(discount1)+'%', 'Actual Discount': str(discount2)+'%', 'Result': res})
                count_of_items = count_of_items + 1
                #continue
            else:
                break
        if ct > max_count_of_products: #if ct > count thats mean we reach our goal so break the loop
            break
except Exception as e:
    print(e)
    pass

error


Solution

  • this was an exception made by your Python script because it didn't find the id value for some products (maybe a different variant of product ID/code was shown that didn't belong to div.css-1fg6eq7 well this will fix if you add exception handler, here is the modified version of code in prod_vitals function:

    def prod_vitals(soup,title,url):                                                                                                                                  
        count_of_items=0
        products_data = []
        try:  #added error handler to avoid any disruption while the function is running                                                      # Array to store all product data for our excel sheet
            for div in soup.find_all('div', class_ = 'product-tile'):                            # Iterate over each individual product-tile div tag
                if count_of_items<=max_count_of_products:
                    #print(title)
                    list_price = 0                                                                   # Variable to store list price
                    sale_price = 0                                                                   # Variable to store sale price
                    discount1 = 0                                                                    # Variable to store discount% that is displayed on the site
                    discount2 = 0
                    count_of_items = count_of_items+1;                                                                    # Variable to store discount% calculated manually
                    res = "Incorrect"                                                                # Variable to store result of discount1==discount2; initialized with Incorrect
                    pro_code = div.select('div.css-1fg6eq7 img')[0]['id']
                    pro_name = div.select('div.product-name a.css-avqw6d p.css-1d5mpur')[0].get_text()
                    products_data.append({'Product Code': pro_code, 'Product Name': pro_name})      # Append the extracted data to the list     
                    print("Count: ", count_of_items)
                    print("Product Code: ",pro_code)
                    print("Product Name: ",pro_name)
                    print("\n")
                else:
                    break
        except Exception:
            pass
        time.sleep(3)
    

    Second method:

    All of the info that you're targeting is fetched from this API endpoint https://staging1-japan.coach.com/api/get-shop/new/women?page=2 with JSON response, so we can gather this information faster than selenium, with a couple of lines of code using requests module.

    Here is the code:

    import requests
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning) #disable urllib warning
    
    count = 0
    headers = {
        "Cookie":"auth-bypass=true;" #this cookie value is mandatory for this apps otherwise its throwing '401 Unauthorized' error
    }
    page = 1
    while True: 
        page = page + 1 #page result start from 2 and each page contains 16 result
        ct = count + 15 #count to adjust loop as each page contains 16 results
        url = f"https://staging1-japan.coach.com/api/get-shop/new/women?page={page}"
        session = requests.Session()
        response = session.get(url, headers=headers, verify=False)
        for i in response.json()['pageData']['products']:
            code = i['productId']
            name = i['name']
            count = count + 1
            if count < 101: # condition to check if the count reach 100 or not, if count is less than 100 print, else stop printing
                print(f'Product Count: {count}\nProduct Code: {code}\nProduct Name: {name}\n\n')
            
        if ct > 100: #if ct > count thats mean we reach our goal so break the loop
            break