I am developing code to scrape well-known e-commerce websites. The code works, but it cannot execute the JavaScript on the page: I only ever see up to 10 products when I know there are more than 40. I need a Python request that waits for the page to finish rendering before scraping. (I don't speak English — this was translated with Google Translate; my apologies.)
Code I have tried:
import requests, random
from django.shortcuts import render
from bs4 import BeautifulSoup
from requests_html import HTMLSession, AsyncHTMLSession
# Walmart - Create your views here.
def wlista(request):
    """Django view: fetch a Walmart search page and parse its product rows.

    Reads the search term and provider from request cookies, downloads the
    search-results page with ``requests``, and collects every element that
    carries a ``data-item-id`` attribute (one per product).

    NOTE(review): ``requests`` only returns the initial HTML — it never runs
    the page's JavaScript, which is why at most ~10 server-rendered products
    appear here.
    """
    buscarprods = request.COOKIES['buscarprod']
    url = 'https://www.walmart.com/search?q=hp+printers'
    # Make the query URL-safe (spaces are not valid in a URL).
    url = url.replace(" ", "%20")
    proveedor = request.COOKIES['proveedor']
    # Browser-like headers so the site is less likely to reject the request.
    # BUG FIX: the User-Agent line had a doubled trailing comma (',,'),
    # which is a syntax error.
    HDRS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'es-ES;es;q=0.8',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # This works but doesn't wait for javascript
    session_object = requests.Session()
    r = session_object.get(url, headers=HDRS).text
    # This works too, but doesn't wait for javascript
    #r = requests.get(url, headers=HDRS, timeout=(8.05, 35)).content
    #Here: I have error with "r.html.render(sleep=2)":
    # Error:
    # "There is no current event loop in thread 'Thread-1 (process_request_thread)'."
    #s = HTMLSession()
    #r = s.get(url, headers=HDRS)
    #r.html.render(sleep=2)
    soup = BeautifulSoup(r, "html.parser")
    rows = soup.find_all(attrs={"data-item-id": True})
Thanks for the help!
You'll need something besides requests if you want to access JavaScript-generated content. I'd suggest Selenium with ChromeDriver, available from https://chromedriver.chromium.org/downloads.
This example finds 56 printers:
import os
from bs4 import BeautifulSoup
from selenium import webdriver
# Absolute location of this script, split into directory and file name.
_abs_script = os.path.abspath(__file__)
dirname = os.path.dirname(_abs_script)
scriptname = os.path.basename(_abs_script)
# Trailing separator so the chromedriver path can be built by concatenation.
THIS_DIRECTORY = f'{dirname}{os.sep}'

# Set to True to run Chrome without a visible window.
HEADLESS = False
# Shared webdriver instance, created lazily by load_page().
DRIVER = None
def load_page(url):
    """Load *url* in a shared, automated Chrome browser.

    The driver is created on first use and cached in the module-level
    DRIVER, so subsequent calls reuse the same browser session.
    """
    global DRIVER
    if DRIVER is None:
        options = webdriver.ChromeOptions()
        options.headless = HEADLESS
        # BUG FIX: add_experimental_option stores one value per option name,
        # so two separate 'excludeSwitches' calls left only the second list
        # ('enable-logging') in effect and silently dropped
        # 'enable-automation'. Pass both switches in a single call.
        options.add_experimental_option(
            'excludeSwitches', ['enable-automation', 'enable-logging'])
        options.add_experimental_option('useAutomationExtension', False)
        DRIVER = webdriver.Chrome(
            options=options,
            executable_path=f'{THIS_DIRECTORY}chromedriver.exe')
    DRIVER.get(url)
def main(url):
    """Scrape *url* with the shared browser and report the product count."""
    load_page(url)
    page = BeautifulSoup(DRIVER.page_source, 'html.parser')
    products = page.find_all(attrs={"data-item-id": True})
    print(len(products))
    for product in products:
        # Do stuff
        pass


if __name__ == '__main__':
    main('https://www.walmart.com/search?q=hp+printers')
Output:
56