Search code examples
pythonselenium-webdriverweb-scrapingbeautifulsoup

How to scrape options combination prices in homedepot


I'm trying to scrape product info from homedepot, such as price, product details, specifications, images, and so on. I was able to scrape all of this information, but I don't know how to scrape the price when different combinations of options are selected, since the price changes based on these options. Is there any way to scrape the price and image for each possible combination of the product's options?

For a clearer explanation of my problem, see this product URL:

https://custom.homedepot.com/custom-doors/p/Steves-Sons-Regency-Modern-Customizable-Fiberglass-Door/314599913/45272-Pre-Hung/57533-Single-w-Two-Sidelites-Transom/57526-36-x-93/40069-12/57523-64-1-2-x-95-1-4/55572-Autumn-Wheat/45147-Left-Hand-Inswing/55578-Glass-Panels/45143-Black-Bronze/35734-6-9-16

You can see that there are some options to the right of the image. Each parameter has multiple options, and selecting a different combination of these options changes the image and the price. If possible, how can I scrape this info?

Note: I'm using selenium and BeautifulSoup

Update:

Here's my code so far for scraping the options part in the product page

def scrape_price(self):
    """Read the product price from ``self.soup`` and store it in ``self.data``.

    Three containers are probed in order, and only the first one found is
    used for ``self.data['price']``:

    1. ``price-format__large price-format__main-price``: the first two
       ``<span>`` texts are concatenated, and a non-empty third span is
       appended after a ``"."``.
    2. ``price-detailed__left-price-wrapper price-detailed__left-price-row``:
       the text of the span nested in ``price-detailed__unit-price``.
    3. ``pricingReg``: currency + dollars + ``"."`` + cents; this branch also
       sets ``self.data['Availability']`` to ``"Available"``.

    When none is present, ``Availability`` is set to ``"Not Available"``.
    Independently of the above, the detailed-price wrapper (if present) fills
    ``"Detailed Price"`` and, when available, ``"Case Unit Cover"``.
    """
    main_css = "price-format__large price-format__main-price"
    detailed_css = "price-detailed__left-price-wrapper price-detailed__left-price-row"

    def find_div(css_class):
        return self.soup.find("div", attrs={"class": css_class})

    # Probe lazily: a later container is only looked up when the earlier
    # ones are absent.
    main_tag = find_div(main_css)
    unit_tag = None if main_tag else find_div(detailed_css)
    reg_tag = None if (main_tag or unit_tag) else find_div("pricingReg")

    if main_tag:
        span_texts = [span.text for span in main_tag.findAll("span")]
        amount = span_texts[0] + span_texts[1]
        cents = span_texts[2] if len(span_texts) > 2 else ""
        if cents != "":
            amount = amount + "." + cents
        self.data['price'] = amount
    elif unit_tag:
        unit_price = unit_tag.find("span", attrs={"class": "price-detailed__unit-price"})
        self.data['price'] = unit_price.find("span").text
    elif reg_tag:
        currency = reg_tag.find("span", attrs={"class": "price__currency"}).text
        dollars = reg_tag.find("span", attrs={"class": "price__dollars"}).text
        cents = reg_tag.find("span", attrs={"class": "price__cents"}).text
        self.data['price'] = currency + dollars + "." + cents
        self.data['Availability'] = "Available"
    else:
        self.data['Availability'] = "Not Available"

    # The detailed price block is recorded regardless of which branch above
    # supplied the headline price.
    detailed_tag = find_div(detailed_css)
    if detailed_tag:
        self.data["Detailed Price"] = cleanhtml(detailed_tag.text)
        unit_cover = find_div("price-detailed__unit-cover")
        if unit_cover:
            self.data["Case Unit Cover"] = unit_cover.text


def _image_sources(tags):
    """Return the ``src`` of the first ``<img>`` inside each tag, skipping tags without one."""
    sources = []
    for tag in tags:
        img = tag.find("img")
        if img:
            sources.append(img.get("src"))
    return sources


def scrape_images(self):
    """Collect product image URLs from ``self.soup`` into ``self.data["images"]``.

    The ``mediagallery__imgblock`` buttons are preferred. When none exist,
    the thumbnails inside the ``styles__ThumbnailList-sc-10zajq9-5 gyXsdF``
    container are used instead. If neither is found, ``self.data`` is left
    untouched.

    Elements that contain no ``<img>`` are skipped in both layouts. The
    gallery branch used to raise ``AttributeError`` on such a button, while
    the thumbnail branch already skipped them.
    """
    gallery_buttons = self.soup.findAll("button", attrs={'class': "mediagallery__imgblock"})
    if gallery_buttons:
        self.data["images"] = _image_sources(gallery_buttons)
        return

    thumbnail_list = self.soup.find("div", attrs={"class": "styles__ThumbnailList-sc-10zajq9-5 gyXsdF"})
    if thumbnail_list:
        thumbnails = thumbnail_list.findAll("div", attrs={"class": "styles__ThumbnailInner-sc-10zajq9-1 icLycq"})
        self.data["images"] = _image_sources(thumbnails)
def _super_sku_options(param):
    """Return the selectable options listed under one ``super-sku`` attribute."""
    tiles = param.findAll("div", attrs={"class": "super-sku__inline-tile--space"})
    if not tiles:
        # No tile wrappers: fall back to the swatch <button> elements themselves.
        tiles = param.findAll("button", attrs={"class": "super-sku__inline-swatch"})
    options = []
    for tile in tiles:
        img = tile.find("img")
        if img:
            options.append({"img": img.get("src"), "label": img.get("title")})
            continue
        # A swatch tile is already the <button>, so there is no nested one to
        # find; use the tile's own text instead of dereferencing None.
        button = tile.find("button") or tile
        options.append(button.text)
    return options


def _super_sku_parameter(param):
    """Build the ``{"Label", "Value", "Options"}`` record for one ``super-sku`` attribute."""
    label_tag = param.find("div", attrs={"class": "label"})
    # cleanhtml is only called when there is label text to clean.
    cleaned = cleanhtml(label_tag.text) if label_tag else ""
    # partition() tolerates a label without ":" (value becomes "") and keeps
    # any further colons inside the value.
    label, _, value = cleaned.partition(":")
    return {"Label": label, "Value": value, "Options": _super_sku_options(param)}


def _buybox_header(option_div):
    """Return the ``Label``/``Value`` fields found in a buybox option header, if any."""
    header = (option_div.find("div", attrs={"class": "styles__HeaderRow-fb29x6-1"})
              or option_div.find("div", attrs={"class": "styles__Header-sc-1gql1zk-0"}))
    fields = {}
    if not header:
        return fields
    for key, css_class in (("Label", "styles__Label-sc-1gql1zk-1"),
                           ("Value", "styles__Value-sc-1gql1zk-2")):
        span = header.find("span", attrs={"class": css_class})
        if span:
            fields[key] = span.text
    return fields


def _image_choices(items, label_attr):
    """Return ``{"img", "value"}`` dicts for the items that contain an ``<img>``."""
    choices = []
    for item in items:
        img = item.find("img")
        if img:
            choices.append({"img": img.get("src"), "value": img.get(label_attr)})
    return choices


def _buybox_choices(option_div):
    """Return the choices of one buybox option, or ``None`` when no known wrapper exists.

    Wrappers are tried in the same priority order as before: image choices,
    drop-down list, colour swatches, then plain tiles.
    """
    def wrapper(css_class):
        return option_div.find("div", attrs={"class": css_class})

    image_wrapper = wrapper("DefaultTemplate__FixedSizeChoiceImageWrapper-rpf825-0")
    if image_wrapper:
        items = image_wrapper.findAll("div", attrs={"class": "styles__ChoiceImage-kykx13-4"})
        return _image_choices(items, "alt")

    dropdown = wrapper("product_sku_Overlay_ListBoxes")
    if dropdown:
        entries = dropdown.findAll("span", attrs={"class": "drop-down__hover-effect"})
        links = [entry.find("a") for entry in entries]
        return [link.text for link in links if link]

    swatches = wrapper("product_sku_Overlay_ColorSwtHolder")
    if swatches:
        items = swatches.findAll("li", attrs={"class": "styles__SwatchRoot-sc-1kr5yl9-1"})
        return _image_choices(items, "title")

    tiles = wrapper("styles__TileSelectWrapper-jw86q8-1")
    if tiles:
        return [tile.text for tile in tiles.findAll("div", attrs={"class": "styles__TileDiv-jw86q8-0"})]

    return None


def scrape_options(self):
    """Scrape the configurable product options from ``self.soup`` into ``self.data``.

    Two layouts are supported:

    * ``super-sku``: stores ``self.data["Parameters"]``, a list of
      ``{"Label", "Value", "Options"}`` dicts. Each option is either an
      ``{"img", "label"}`` dict or the option's text.
    * ``buybox__super-sku`` (used only when ``super-sku`` is absent): stores
      ``self.data["options"]``, one dict per direct child ``<div>`` with the
      optional keys ``"Label"``, ``"Value"`` and ``"choices"``.

    If neither container is present, ``self.data`` is left untouched.
    """
    super_sku = self.soup.find("div", attrs={"class": "super-sku"})
    if super_sku:
        params = super_sku.findAll("div", attrs={"class": "super-sku__inline-attribute"})
        self.data["Parameters"] = [_super_sku_parameter(param) for param in params]
        return

    buybox = self.soup.find("div", attrs={"class": "buybox__super-sku"})
    if not buybox:
        return

    options = []
    for option_div in buybox.find_all("div", recursive=False):
        option = _buybox_header(option_div)
        choices = _buybox_choices(option_div)
        if choices is not None:
            option["choices"] = choices
        options.append(option)
    self.data["options"] = options

Now I want to know how I can scrape the price for each combination of these options.


Solution

  • An important facet of the target page is that whenever an item is toggled (clicked or selected), additional pricing options can emerge. This solution recursively traverses the feature listing, clicking on each one, and continues the process on the rest of price listings once they appear:

    from selenium import webdriver
    import time, re
    # Module-level Chrome driver shared by get_combos() below; replace the
    # placeholder with the path to a local chromedriver executable.
    d = webdriver.Chrome('/path/to/chromedriver')
    # Load the configurable product page whose option combinations are crawled.
    d.get('https://custom.homedepot.com/custom-doors/p/Steves-Sons-Regency-Modern-Customizable-Fiberglass-Door/314599913/45272-Pre-Hung/57533-Single-w-Two-Sidelites-Transom/57526-36-x-93/40069-12/57523-64-1-2-x-95-1-4/55572-Autumn-Wheat/45147-Left-Hand-Inswing/55578-Glass-Panels/45143-Black-Bronze/35733-4-9-16')
    def get_combos(_seen):
        """Yield one ``{'price', 'img', 'combo'}`` record per option combination.

        Operates on the module-level driver ``d``. The first child of
        ``.buybox__super-sku`` (after child 0) whose index is not yet in
        ``_seen`` is taken as the next section. Each of its options is
        clicked in turn and the traversal recurses with that section
        recorded, so sections that only appear after a click are picked up.
        When every section has been visited, the price and thumbnail
        currently shown are yielded.

        _seen: list of ``[section_index, [label_text, value_text]]`` entries
            describing the choices made so far.

        A section that matches none of the known option selectors ends that
        branch without yielding anything.
        """
        option_selectors = ['.styles__BoxChoice-kykx13-3', '.styles__TileSelectWrapper-jw86q8-1', '.styles__SwatchRoot-sc-1kr5yl9-1', '.drop-down__hover-effect a']
        sections = d.execute_script("""return document.querySelector('.buybox__super-sku').children""")
        visited = {index for index, _ in _seen}
        # Child 0 is never treated as a selectable section.
        i = next((index for index in range(1, len(sections)) if index not in visited), None)

        if i is None:
            yield {'price':d.execute_script("""return document.querySelector('span:nth-of-type(1).pReg').textContent"""),
                   'img':d.execute_script("""return document.querySelector('.styles__ThumbnailInner-sc-10zajq9-1.icLycq img').getAttribute('src')"""),
                   'combo':_seen}
            return

        def section_text(css_class):
            # querySelector() returns null (not undefined) on a miss, so the
            # guard must be `== null` for the 'n/a' fallback to ever apply.
            return d.execute_script(f"""return (x => x == null ? 'n/a' : x.textContent)(document.querySelector('.buybox__super-sku').children[{i}].querySelector('{css_class}'))""")

        for _s in option_selectors:
            p = f'.buybox__super-sku > div:nth-child({i+1}) {_s}'
            matches = d.execute_script(f"""return document.querySelectorAll('{p}')""")
            if not matches:
                continue
            for j in range(len(matches)):
                try:
                    d.execute_script(f"""document.querySelectorAll('{p}')[{j}].click()""")
                    time.sleep(1)  # give the page time to re-render price and options
                    yield from get_combos([*_seen, [i, [section_text('.styles__Label-sc-1gql1zk-1'), section_text('.styles__Value-sc-1gql1zk-2')]]])
                except Exception:
                    # Best effort: skip options that cannot be clicked or read.
                    # `Exception` (not a bare except) lets GeneratorExit and
                    # KeyboardInterrupt propagate out of the generator.
                    continue
            # Only the first selector that matches is used for a section.
            break
               
              
    def _format_price(raw):
        """Insert the decimal point before the trailing two cent digits.

        Assumes, as the original slicing did, that the scraped text ends with
        two cent digits (e.g. ``'$1,42300'`` -> ``'$1,423.00'``). Unlike the
        fixed ``[:6]`` slice, this works for any number of dollar digits.
        Text with fewer than three digits is returned unchanged.
        """
        digits = re.sub(r'\D', '', raw)
        if len(digits) < 3:
            return raw
        symbol = re.match(r'\D*', raw).group().strip()
        return f'{symbol}{int(digits[:-2]):,}.{digits[-2:]}'

    # Materialise every combination, then flatten each record into a single
    # dict keyed by option label (with the trailing ':' stripped).
    result = list(get_combos([]))
    final_result = [
        {'price': _format_price(i['price']), 'image': i['img'],
         **{re.sub(r':\s*$', '', a): b for _, [a, b] in i['combo']}}
        for i in result
    ]
    

    Output:

    [{'price': '$1,423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009218?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009266?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,264.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009242?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,346.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009290?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009219?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 
'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009267?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,264.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009243?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,346.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009291?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009220?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Outswing', 'Panel Type': 'Glass Panels', 'Hinge 
and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009268?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Outswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}]
    

    Regarding a concurrent version of the above solution for use on many thousands of input links, there are a couple immediate particulars that should be addressed:

    1. First, the hosting of your selenium instances. You can either spin up a number of selenium driver instances on your own machine or use a service like Browserless to do this for you. selenium is very resource intensive, so a hosting service that can automatically handle many separate selenium instances is probably your best approach to take.
    2. Second, the method by which to interact with the target pages. If you use a service like Browserless, you could pass a Javascript function to driver.execute_script with a specific timeout that will offload the page interactions to the service itself.

    Below is a solution that maintains a pool of selenium driver instances with an async version of get_combos. These drivers can either point to a remote selenium instance (like Browserless) or local instances on your own machine.

    First, the async implementation of get_combos:

    import asyncio, functools
    from selenium import webdriver
    async def get_page_combos(d, link, delay=1):
        """Load ``link`` in driver ``d`` and return all option combinations.

        d: driver object exposing blocking ``get(url)`` and
            ``execute_script(js)`` methods; every call is dispatched to the
            event loop's default executor so other coroutines keep running.
        link: URL of the product page to crawl.
        delay: seconds to wait after each click for the page to update
            (default 1, the previously hard-coded value).

        Returns a list of ``{'price', 'img', 'combo'}`` dicts, where ``combo``
        is a list of ``[section_index, [label_text, value_text]]`` entries.
        """
        loop = asyncio.get_running_loop()
        option_selectors = ['.styles__BoxChoice-kykx13-3', '.styles__TileSelectWrapper-jw86q8-1', '.styles__SwatchRoot-sc-1kr5yl9-1', '.drop-down__hover-effect a']

        async def run_js(script):
            return await loop.run_in_executor(None, d.execute_script, script)

        async def section_text(i, css_class):
            # querySelector() returns null (not undefined) on a miss, so the
            # guard must be `== null` for the 'n/a' fallback to ever apply.
            return await run_js(f"""return (x => x == null ? 'n/a' : x.textContent)(document.querySelector('.buybox__super-sku').children[{i}].querySelector('{css_class}'))""")

        async def get_combos(_seen):
            sections = await run_js("""return document.querySelector('.buybox__super-sku').children""")
            visited = {index for index, _ in _seen}
            # Child 0 is never treated as a selectable section.
            i = next((index for index in range(1, len(sections)) if index not in visited), None)

            if i is None:
                yield {'price':await run_js("""return document.querySelector('span:nth-of-type(1).pReg').textContent"""),
                       'img':await run_js("""return document.querySelector('.styles__ThumbnailInner-sc-10zajq9-1.icLycq img').getAttribute('src')"""),
                       'combo':_seen}
                return

            for _s in option_selectors:
                p = f'.buybox__super-sku > div:nth-child({i+1}) {_s}'
                matches = await run_js(f"""return document.querySelectorAll('{p}')""")
                if not matches:
                    continue
                for j in range(len(matches)):
                    try:
                        await run_js(f"""document.querySelectorAll('{p}')[{j}].click()""")
                        await asyncio.sleep(delay)
                        new_vals = [await section_text(i, '.styles__Label-sc-1gql1zk-1'),
                                    await section_text(i, '.styles__Value-sc-1gql1zk-2')]
                        async for pl in get_combos([*_seen, [i, new_vals]]):
                            yield pl
                    except Exception:
                        # Best effort: skip options that cannot be clicked or
                        # read. The bare except used before also hid the
                        # `run_in_exector` typo, so no option was ever clicked.
                        continue
                # Only the first selector that matches is used for a section.
                break

        # Page load is blocking too; keep it off the event loop thread.
        await loop.run_in_executor(None, d.get, link)
        return [combo async for combo in get_combos([])]
    

    Second, putting it all together:

    async def main(links, instance_num = 10): #you can adjust the number of instances depending on your needs
        """Crawl every link with a pool of ``instance_num`` drivers.

        links: iterable of page URLs; falsy entries are skipped. The caller's
            list is no longer emptied as a side effect.
        instance_num: number of driver instances, i.e. how many pages are
            processed concurrently per batch.

        Returns a list with one ``get_page_combos`` result per crawled link,
        in input order. Raises ``ValueError`` when ``instance_num`` is below 1
        (previously this looped forever).
        """
        if instance_num < 1:
            raise ValueError("instance_num must be at least 1")
        pending = [link for link in links if link]
        drivers = []
        final_results = []
        try:
            for _ in range(instance_num):
                drivers.append(webdriver.Chrome('<path>')) #<path> can be substituted for a path to a local chromedriver executable or a url to a remote instance
            # Each batch pairs every driver with at most one link, so a
            # driver is never shared by two concurrent page crawls.
            for start in range(0, len(pending), instance_num):
                batch = pending[start:start + instance_num]
                vals = await asyncio.gather(*[get_page_combos(drv, link) for drv, link in zip(drivers, batch)])
                final_results.extend(vals)
            return final_results
        finally:
            # Always release the browser instances, including on failure.
            for drv in drivers:
                drv.quit()
      
    # Placeholder: replace the Ellipsis with all the homedepot links to be crawled.
    links = [...] #all your homedepot links to be crawled
    # Run the crawl to completion; one list of combinations per link.
    all_page_vals = asyncio.run(main(links))