Tags: selenium, selenium-webdriver, web-scraping, python-requests-html

requests_html infinite scrolling on div instead of entire page


Hello, I am trying to get all the links from the web page below. The page loads new products as you scroll down, so I am trying to collect the links for every product by scrolling to the bottom. Following this post, I used the scrolldown method of requests_html, but it only fetches the links of the products that are visible without scrolling. The problem is that it scrolls the complete page instead of the products frame. As you can see in the image below, new products load only when you scroll to the bottom of the products frame.

I also tried selenium-wire (see the code below), but it does the same thing: it scrolls to the bottom of the whole page, where no new products are loaded. How can I scroll only the products div? After the code I have sketched what I think is needed.

import time

import requests
from bs4 import BeautifulSoup
from seleniumwire import webdriver  # selenium-wire wraps selenium's webdriver and adds request interception

baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36 '
}
driver = webdriver.Chrome(executable_path="/src/resources/chromedriver")
driver.implicitly_wait(30)
product_links = []
try:
    SCROLL_PAUSE_TIME = 2

    def interceptor(request):
        # Replace the User-Agent on every request selenium-wire intercepts
        del request.headers['User-Agent']  # remove the existing header first
        request.headers['User-Agent'] = header['User-Agent']

    # Set the interceptor on the driver
    driver.request_interceptor = interceptor

    # All requests will now go out with the custom User-Agent
    driver.get(baseurl)

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll the window (not the products frame) and wait for new content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # r = requests.get(driver.page_source, headers=header)
    print(driver.page_source)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # product_list = soup.find_all('div', class_='col-item productInfoDiv ')
    #
    # for itemprop in product_list:
    #     for link in itemprop.find_all('a', href=True):
    #         product_links.append("{}{}".format(baseurl, link['href']))
    #
    # product_links_uniq = set(product_links)
    #
    # print(product_links_uniq)

finally:
    driver.quit()
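
I think what I actually need is to scroll the inner products container itself rather than the window, i.e. replace the window-scroll loop above with something roughly like the sketch below. The CSS selector for the container is just my guess (I don't know the frame's actual class), so treat this as untested:

from selenium.webdriver.common.by import By

# Hypothetical selector for the scrollable products frame
container = driver.find_element(By.CSS_SELECTOR, 'div.productsContainer')

last_height = driver.execute_script("return arguments[0].scrollHeight", container)
while True:
    # Scroll the container element itself, not the window
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", container)
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return arguments[0].scrollHeight", container)
    if new_height == last_height:
        break
    last_height = new_height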

[screenshot: the products frame has its own scrollbar, and new products load only when that frame is scrolled to its bottom]

Here is my requests_html attempt:

from requests_html import HTML, HTMLSession

baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"

session = HTMLSession()
page = session.get(baseurl)
page.html.render(scrolldown=50, sleep=3)
# render() updates page.html in place; page.text is still the unrendered source
html = HTML(html=page.html.html)
#noticeName = html.find('a href')
all_links = html.links
for ln in all_links:
    print(ln)
print(len(all_links))

filtered_links = [link for link in all_links if link.startswith("/product")]
print(len(filtered_links))
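
Alternatively, requests_html's render() accepts a script argument, so the scrolling could in principle be done with custom JavaScript aimed at the container instead of scrolldown. This is an unverified sketch (the selector, loop count, and delay are all guesses):

scroll_script = """
async () => {
    // Hypothetical selector for the products frame
    const box = document.querySelector('div.productsContainer');
    for (let i = 0; i < 50; i++) {
        box.scrollTop = box.scrollHeight;
        await new Promise(resolve => setTimeout(resolve, 500));
    }
}
"""
page.html.render(script=scroll_script, sleep=3)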

Solution

  • You could just mimic the POST request the page makes and keep requesting batches of 20 results, extracting the links from each batch, until you have gathered the total number of results the page reports.

    import requests
    import math
    from bs4 import BeautifulSoup as bs
    
    
    def add_product_links(soup):
        # Collect each product card's href and prepend the site root
        product_links.extend(['https://www.medplusmart.com' + i['href']
                              for i in soup.select('.productInfoDiv > div:nth-child(1) > [href^=\/product]')])
    
    
    product_links = []
    n = 0
    results_per_page = 20
    page = 1
    
    data = {
        'sortField': '',
        'startIndex': n,
        'productCategoryId': 'MART_20002',
        'startPrice': '',
        'endPrice': '',
        'minPrice': '0',
        'maxPrice': '2650',
        'excludeNoStock': 'N',
        'pCatName': 'personal-care_10102',
        'catName': 'skin-care_20002',
        'productIdString': '',
        'Brand Search': ''
    }
    
    with requests.Session() as s:
        s.headers = {"User-Agent": "Safari/537.36"}
        # The initial GET supplies the hidden form values the POST endpoint expects
        r = s.get(
            'https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002')
        soup = bs(r.content, 'lxml')
        data['productIdString'] = soup.select_one('#productIdString')['value']
        num_results = int(soup.select_one('#totalProductFound')['value'])
        num_pages = math.ceil(num_results / results_per_page)
        add_product_links(soup)
        # Header the site itself sends with its AJAX pagination calls
        s.headers.update({'x-kl-ajax-request': 'Ajax_Request'})

        # Request the remaining batches of 20 until every page is fetched
        while page <= num_pages:
            data['startIndex'] = n
            r = s.post('https://www.medplusmart.com/loadMoreProduct.mart', data=data)
            soup = bs(r.content, 'lxml')
            add_product_links(soup)
            n += results_per_page
            page += 1
    
    print(len(product_links))
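
  • I have not verified whether the first loadMoreProduct POST (startIndex 0) overlaps the batch already scraped from the initial GET; if it does, de-duplicating with a set, as the question code does, is a cheap safeguard:

    # Assumption: batches may overlap, so count unique links only
    print(len(set(product_links)))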