Search code examples
pythonselenium-webdriverweb-scrapingbeautifulsoupanalysis

How to do multi pages web scraping to extract product data(product name, price) from Lazada by using Python?


I tried to extract product data of all the products (name and price) from a Lazada shop. However, There are total 102 pages but I only able to extract the first page of the data. Could anyone recognize the problem of my code?

url: https://www.lazada.com.my/guardian/?from=wangpu&langFlag=en&page=1&pageTypeId=2&q=All-Products

Below are my coding

import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class ScrapeLazada():
    
    def scrape(self):
        url = 'https://www.lazada.com.my/guardian/?from=wangpu&langFlag=en&page=1&pageTypeId=2&q=All-Products'
        driver = webdriver.Chrome()
        driver.get(url)
        
        products=[]
        for i in range(102):
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#root")))
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            for item in soup.findAll('div', class_='Bm3ON'):
                product_name = item.find('div', class_='RfADt').text
                price = item.find('span', class_='ooOxS').text.replace('RM', '')
                products.append(
                    (product_name, price) 
                )

            time.sleep(2)
            driver.find_element(By.CSS_SELECTOR, ".ant-pagination-next > button").click()
            time.sleep(3)

            df = pd.DataFrame(products, columns=['Product Name', 'Price'])
            print(df)

            df.to_excel('Lazada_Guardian_Scrape.xlsx', index=False)
            print('Data saved in local disk')
    
    
        driver.close()
        
sl = ScrapeLazada()
sl.scrape()

Below are my outcome. Apparently it just listed the first page products but have problem to list starting from the next page.

Product Name   Price
0   UPHAMOL 250 Children Suspension Delicious Oran...    7.80
1   Darlie Double Action Fresh + Clean Toothpaste ...   20.92
2                         Dermal Therapy Lip Balm 10g   12.78
3                Nurish No Teen Anti Acne Toner 100Ml   12.82
4                   Live-Well OCCUsharp 30s Pack-of-3  100.90
5                       Oxy Anti- Blackhead Wash 100g   11.95
6                Guardian Clear Assorted Plasters 20s    1.55
7          Selsun Blue 2 in 1 Treatment Shampoo 120ml   24.66
8                    Hansaplast Disney Frozen II 20's    8.90
9              Guardian Wet Wipes 10's Fragrance Free    3.46
10          Fruiser Shower Cream Pump Rosemilk 1000ml    8.10
11  Enchanteur Wonder Woman Handbag Edt Fighter Of...    9.90
12               Guardian Plastic Plasters 100s + 20s    9.10
13                          Sensodyne Fresh Mint 100g   12.50
14        Pantene Hair Fall Control Conditioner 165ML   10.89
15                Koolfever Cooling Gel For Babies 4s    8.60
16            Hada Labo Premium Whitening Essence 30g   85.90
17           Kinohimitsu J'pan Health Pad 10's + 10's   67.03
18           Ceradan Moisturising Hand Sanitiser 50ml   26.74
19                   Sunsweet Pitted Prune 340g (USA)   23.20
20                      **21st Century Probiotics 30s   17.00
21  Kundal Honey and Macadamia Hair Treatment Pear...   27.22
22      Sunsilk Super Conditioner Damage Rescue 180ml   11.28
23                    LACTOGG probiotic capsules 30's  125.10
24                              Rosken Bio Serum 50ml   28.82
25           Simple Kind To Skin Soothing Toner 200ml   21.34
26                  L’Oreal White Perfect Toner 200ml   30.25
27                            Total Image S Tummy 60s   63.00
28        Durex Invisible Extra Lubricant Condom 10's   51.13
29          3 Legs Tolnaftate Cream Pack Of 2 (2X10g)   14.66
30          Hansaplast Universal Water Resistant 20's    4.20
31     Perfume Generics Perfume Oil Paris Hilton 10Ml    8.90
32                Aiken Shampoo - Intense Repair 350G   12.68
33                            GoodMorning VGrains 1kg   62.52
34                        Woodwards Gripe Water 148ml   14.00
35          Difflam Hextra Sore Throat Lozenges 2.4mg    8.00
36                               Okamoto 003 Cool 3's   14.50
37                 Dettol Hand Sanitizer Refresh 50ml    6.25
38  Avene Pre-Serum Hydrating Essence-In-Lotion 200Ml   87.30
39  Guardian Essential Lavender Refreshing Body Wa...   10.10
Traceback (most recent call last):
  File "Lazada_Guardian.py", line 43, in <module>
    sl.scrape()
  File "Lazada_Guardian.py", line 30, in scrape
    driver.find_element(By.CSS_SELECTOR, ".ant-pagination-next > button").click()
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webelement.py", line 94, in click
    self._execute(Command.CLICK_ELEMENT)
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webelement.py", line 403, in _execute
    return self._parent.execute(command, params)
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 440, in execute
    self.error_handler.check_response(response)
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py", line 245, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted: Element <button class="ant-pagination-item-link" type="button" tabindex="-1">...</button> is not clickable at point (1186, 693). Other element would receive the click: <html lang="en" class=" ">...</html>
  (Session info: chrome=112.0.5615.137)
Stacktrace:
0   chromedriver                        0x000000010295d670 chromedriver + 4298352
1   chromedriver                        0x0000000102955bbc chromedriver + 4266940
2   chromedriver                        0x0000000102588758 chromedriver + 280408
3   chromedriver                        0x00000001025cb444 chromedriver + 554052
4   chromedriver                        0x00000001025c8e84 chromedriver + 544388
5   chromedriver                        0x00000001025c663c chromedriver + 534076
6   chromedriver                        0x00000001025c5530 chromedriver + 529712
7   chromedriver                        0x00000001025b8428 chromedriver + 476200
8   chromedriver                        0x00000001025b7b90 chromedriver + 474000
9   chromedriver                        0x00000001025fc080 chromedriver + 753792
10  chromedriver                        0x00000001025b62d0 chromedriver + 467664
11  chromedriver                        0x00000001025b7354 chromedriver + 471892
12  chromedriver                        0x000000010291d6c4 chromedriver + 4036292
13  chromedriver                        0x0000000102921c64 chromedriver + 4054116
14  chromedriver                        0x00000001029282d8 chromedriver + 4080344
15  chromedriver                        0x0000000102922970 chromedriver + 4057456
16  chromedriver                        0x00000001028f98dc chromedriver + 3889372
17  chromedriver                        0x000000010294125c chromedriver + 4182620
18  chromedriver                        0x00000001029413b4 chromedriver + 4182964
19  chromedriver                        0x00000001029500f4 chromedriver + 4243700
20  libsystem_pthread.dylib             0x00000001a0e2e06c _pthread_start + 148
21  libsystem_pthread.dylib             0x00000001a0e28e2c thread_start + 8

Solution

  • Can solve this issue in two ways.

    1. Using Action chains to click on the next button.
    # Imports required
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    actions = ActionChains(driver)
    wait = WebDriverWait(driver,30)
    products_list = []
    for i in range(102):
        nextbutton = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,".ant-pagination-next > button")))
        actions.move_to_element(nextbutton).click().perform()
        time.sleep(2)
    
    1. Using page value in the URL and iterate over and no need to perform click action on next button.
    for i in range(1,102):
        driver.get(f"https://www.lazada.com.my/guardian/?from=wangpu&langFlag=en&page={i}&pageTypeId=2&q=All-Products")
        ...
    

    But for scraping can use Request library from Python. Refer Web scraping with API to find API and extract data.

    import requests
    import json
    
    for i in range(1,5):
        url = f"https://www.lazada.com.my/guardian/?ajax=true&from=wangpu&isFirstRequest=true&langFlag=en&page={i}&pageTypeId=2&q=All-Products"
        response = requests.get(url)
        data = response.json()["mods"]["listItems"]
        for j in range(len(data)):
            name = data[j]["name"]
            price = data[j]["priceShow"]
            print(f"{name} : {price}")
    
    Output:
    PHAMOL 250 Children Suspension Delicious Orange Flavour 60ml : RM7.80
    Darlie Double Action Fresh + Clean Toothpaste Original Strong Mint 225g x 2 (Value Pack) : RM20.92
    Dermal Therapy Lip Balm 10g : RM12.78
    Nurish No Teen Anti Acne Toner 100Ml : RM12.82
    Live-Well OCCUsharp 30s Pack-of-3 : RM100.90
    ...