Search code examples
pythonseleniumtimer

How to add a timer to measure my code's execution time


I have written a Python script which scrapes products from AliExpress.

Here is my code :

from selenium.webdriver.edge.options import Options  
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver  
from pymongo import MongoClient
from time import sleep
from lxml import html 
import pandas as pd
import cssselect
import pymongo
import json 
import csv 


# Scrape AliExpress search-result pages with a headless Edge browser, parse
# every product card with lxml, and store the rows in MongoDB.


def first_or_none(values, default='None'):
    """Return the first item of an XPath result list, or *default* if empty."""
    return values[0] if values else default


options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

# Collect rows across ALL pages. Creating this list inside the page loop
# would throw away every page but the last one.
results = []

try:
    for page_nb in range(1, 2):
        print('---', page_nb, '---')

        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll one viewport at a time until the offset stops growing, so
        # that lazily loaded products are present in the page source.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)

        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            if not title:
                # Anchors without a title are not product cards.
                continue
            title = title[0]

            # The first span holds the currency symbol, the remaining spans
            # hold the pieces of the amount. A span without text yields None,
            # which ''.join() cannot handle, hence the "or ''".
            price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
            if price_parts:
                currency = price_parts[0]
                price = ''.join(price_parts[1:])
            else:
                # No price element matched: use the same placeholder as the
                # other optional fields instead of raising IndexError.
                currency = 'None'
                price = 'None'

            stars = first_or_none(product.xpath('.//span[@class="eXPaM"]/text()'))
            nb_sold = first_or_none(product.xpath('.//span[@class="_1kNf9"]/text()'))
            supl = first_or_none(product.xpath('.//a[@class="ox0KZ"]/text()'))
            ship_cost = first_or_none(product.xpath('.//span[@class="_2jcMA"]/text()'))

            href = product.xpath('./@href')
            if href:
                product_links = str(baseurl) + str(href[0])
            else:
                # Keep the column a string even when the anchor has no href.
                product_links = 'None'

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)
            print('len(results):', len(results))
finally:
    # Shut the browser down exactly once, after the last page, and also when
    # scraping fails part-way. Closing inside the page loop would break every
    # page after the first.
    driver.quit()

df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks" ))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
data = df.to_dict(orient = 'records')
if data:
    collection.insert_many(data)
else:
    # insert_many() rejects an empty document list.
    print('No products scraped; nothing to insert.')

My question :

What I need is to add a timer that measures how long the process takes and returns that value, so I know how much time my code needs to run. I also want a way to get a new collection after each scraping run, because when I run my code a second time, my new data ends up in the old collection.

I appreciate any help from you . Thank you !


Solution

  • The code below should solve your problem:

    from selenium.webdriver.edge.options import Options  
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium import webdriver  
    from pymongo import MongoClient
    from time import sleep
    from lxml import html 
    import pandas as pd
    import cssselect
    import pymongo
    import json 
    import csv 
    import time as Time
    
    
    # Scrape AliExpress search-result pages with a headless Edge browser,
    # parse every product card with lxml, store the rows in MongoDB, and
    # report how long the whole run took.


    def first_or_none(values, default='None'):
        """Return the first item of an XPath result list, or *default* if empty."""
        return values[0] if values else default


    # Timer for the whole run (browser start-up, scraping and DB insert).
    # perf_counter() is a monotonic clock intended for measuring durations.
    run_start = Time.perf_counter()

    options = Options()
    options.headless = True
    driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
    url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
    baseurl = 'https://www.aliexpress.com'

    # Collect rows across ALL pages. Creating this list inside the page loop
    # would throw away every page but the last one.
    results = []

    try:
        for page_nb in range(1, 2):
            print('---', page_nb, '---')

            driver.get(url.format(page_nb))
            sleep(2)

            # Scroll one viewport at a time until the offset stops growing,
            # so that lazily loaded products are present in the page source.
            current_offset = 0
            while True:
                driver.execute_script("window.scrollBy(0, window.innerHeight);")
                sleep(.5)  # JavaScript has time to add elements
                new_offset = driver.execute_script("return window.pageYOffset;")
                print(new_offset, current_offset)
                if new_offset <= current_offset:
                    break
                current_offset = new_offset

            sleep(3)

            tree = html.fromstring(driver.page_source)

            for product in tree.xpath('//div[@class="JIIxO"]//a'):
                # Per-product timer: covers only the parsing of this card.
                start_time = Time.perf_counter()
                title = product.xpath('.//h1/text()')
                if not title:
                    # Anchors without a title are not product cards.
                    continue
                title = title[0]

                # The first span holds the currency symbol, the remaining
                # spans hold the pieces of the amount. A span without text
                # yields None, which ''.join() cannot handle.
                price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
                if price_parts:
                    currency = price_parts[0]
                    price = ''.join(price_parts[1:])
                else:
                    # No price element matched: use the same placeholder as
                    # the other optional fields instead of raising IndexError.
                    currency = 'None'
                    price = 'None'

                stars = first_or_none(product.xpath('.//span[@class="eXPaM"]/text()'))
                nb_sold = first_or_none(product.xpath('.//span[@class="_1kNf9"]/text()'))
                supl = first_or_none(product.xpath('.//a[@class="ox0KZ"]/text()'))
                ship_cost = first_or_none(product.xpath('.//span[@class="_2jcMA"]/text()'))

                href = product.xpath('./@href')
                if href:
                    product_links = str(baseurl) + str(href[0])
                else:
                    # Keep the column a string even when the anchor has no href.
                    product_links = 'None'
                difference_time = Time.perf_counter() - start_time  # seconds spent parsing this product

                row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links, difference_time]
                results.append(row)
                print('len(results):', len(results))
    finally:
        # Shut the browser down exactly once, after the last page, and also
        # when scraping fails part-way. Closing inside the page loop would
        # break every page after the first.
        driver.quit()

    df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks", "Time Taken"))

    ####### Insert in database #############
    client = MongoClient("mongodb://localhost:27017/")
    collection = client['db2']['aliex2']
    data = df.to_dict(orient = 'records')
    if data:
        collection.insert_many(data)
    else:
        # insert_many() rejects an empty document list.
        print('No products scraped; nothing to insert.')

    # Report the execution time of the whole script, not just per-row parsing.
    total_time = Time.perf_counter() - run_start
    print('Total execution time: {:.2f} seconds'.format(total_time))
    

    I measured the time taken to process each product inside the loop and stored that elapsed time in the DataFrame.