Hello, I am trying to get all the product links from the web page below. This page loads new products as you scroll down, and I am trying to get the links for all the products by scrolling to the bottom of the page. I am using the scrolldown
method of requests_html after following this post; however, it only fetches the links of the products that are visible without scrolling. The problem is that it scrolls the complete page instead of the products frame. As the image below shows, new products are loaded only when you scroll to the bottom of the products frame.
I also tried seleniumwire (see the code below), but it does the same thing: it scrolls to the bottom of the page, where no products are loaded. How can I scroll only the products div?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from seleniumwire import webdriver
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
# Headers forced onto every request the browser sends (see interceptor()).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36 '
}
# Seconds to wait after each scroll so lazily loaded content can arrive.
SCROLL_PAUSE_TIME = 2


def interceptor(request):
    """Overwrite the headers listed in ``header`` on an outgoing request.

    The previous version assigned the whole ``header`` dict as the value of
    the ``Referer`` header, so the intended User-Agent was never sent. Each
    entry of ``header`` is now applied under its own name with its string
    value.

    Args:
        request: the intercepted request object handed over by selenium-wire.
    """
    for name, value in header.items():
        # Delete first so the assignment replaces the header instead of
        # adding a second one with the same name.
        del request.headers[name]
        request.headers[name] = value


driver = webdriver.Chrome(executable_path="/src/resources/chromedriver")
driver.implicitly_wait(30)
product_links = []
try:
    # Install the interceptor before navigating so the first request is covered.
    driver.request_interceptor = interceptor
    driver.get(baseurl)

    # Keep scrolling the window to the bottom of the document until the
    # document height stops growing.
    # NOTE(review): this scrolls the window/document body only; content that
    # loads when an inner scrollable container is scrolled is not triggered
    # by it.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # r = requests.get(driver.page_source, headers=header)
    print(driver.page_source)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # product_list = soup.find_all('div', class_='col-item productInfoDiv ')
    #
    # for itemprop in product_list:
    #     for link in itemprop.find_all('a', href=True):
    #         product_links.append("{}{}".format(baseurl, link['href']))
    #
    # product_links_uniq = set(product_links)
    #
    # print(product_links_uniq)
finally:
    # Always shut the browser down, even when scraping fails.
    driver.quit()
from requests_html import HTML, HTMLSession
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
session = HTMLSession()
page = session.get(baseurl)
# render() runs the page's JavaScript (scrolling down 50 times, sleeping 3
# seconds between scrolls) and replaces the content of page.html with the
# rendered document.
page.html.render(scrolldown=50, sleep=3)
# Use the rendered document. Re-parsing page.text would bring back the raw,
# un-rendered response body and discard everything render() loaded.
html = page.html
all_links = html.links
for ln in all_links:
    print(ln)
print(len(all_links))
# Keep only the site-relative product links.
filtered_links = [link for link in all_links if link.startswith("/product")]
print(len(filtered_links))
You could just mimic the POST requests the page makes and keep requesting batches of 20 results, extracting the links from each batch, until you have gathered the total number of results the page reports.
import requests
import math
from bs4 import BeautifulSoup as bs
def add_product_links(soup):
    """Append the absolute URL of each product link in *soup* to ``product_links``.

    Links are the elements whose ``href`` starts with ``/product`` and that
    are direct children of the first child ``div`` of a ``.productInfoDiv``
    element. The site origin is prefixed to every ``href``.

    Args:
        soup: a parsed document exposing ``select()`` (e.g. BeautifulSoup).

    Returns:
        None. The module-level ``product_links`` list is extended in place.
    """
    # The attribute value is quoted instead of written as ``\/product``: the
    # backslash form is an invalid escape sequence in a normal string literal.
    product_links.extend(
        'https://www.medplusmart.com' + anchor['href']
        for anchor in soup.select('.productInfoDiv > div:nth-child(1) > [href^="/product"]')
    )
# Filled in place by add_product_links().
product_links = []
n = 0                  # startIndex sent with the next POST
results_per_page = 20  # step by which startIndex advances per request
page = 1               # 1-based counter of POST requests issued so far

# Form fields sent with every "load more" POST; startIndex and
# productIdString are overwritten below before use.
data = {
    'sortField': '',
    'startIndex': n,
    'productCategoryId': 'MART_20002',
    'startPrice': '',
    'endPrice': '',
    'minPrice': '0',
    'maxPrice': '2650',
    'excludeNoStock': 'N',
    'pCatName': 'personal-care_10102',
    'catName': 'skin-care_20002',
    'productIdString': '',
    'Brand Search': '',
}

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}

    # The landing page supplies the hidden form values needed for paging,
    # along with the first visible set of products.
    r = s.get('https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002')
    soup = bs(r.content, 'lxml')
    product_id_field = soup.select_one('#productIdString')
    total_found_field = soup.select_one('#totalProductFound')
    data['productIdString'] = product_id_field['value']
    num_results = int(total_found_field['value'])
    num_pages = math.ceil(num_results / results_per_page)
    add_product_links(soup)

    # Extra header sent with the subsequent POST requests.
    s.headers.update({'x-kl-ajax-request': 'Ajax_Request'})

    # NOTE(review): the first POST uses startIndex 0 although the landing page
    # was already harvested above; confirm the endpoint does not return the
    # same first batch again.
    while page <= num_pages:
        data['startIndex'] = n
        r = s.post('https://www.medplusmart.com/loadMoreProduct.mart', data=data)
        soup = bs(r.content, 'lxml')
        add_product_links(soup)
        n += results_per_page
        page += 1

print(len(product_links))