Search code examples
python · beautifulsoup · python-requests · rendering · screen-scraping

Python requests waiting for js scripts to render


I am developing code to scrape well-known e-commerce web sites. The code works, but it can't read content generated by JavaScript: I only ever see up to 10 products when I know there are more than 40. I need a way for my Python request to wait for the page to finish rendering before scraping it. I don't speak English well — this was translated with Google Translate. I apologize.

Codes I have tried:

    import requests, random
    from django.shortcuts import render
    from bs4 import BeautifulSoup
    from requests_html import HTMLSession, AsyncHTMLSession


    # Walmart - Create your views here.
    def wlista(request):
        """Django view: fetch a Walmart search page and collect product rows.

        Reads the 'buscarprod' and 'proveedor' cookies from the incoming
        request, downloads the search-results HTML with `requests`, and
        extracts every element carrying a ``data-item-id`` attribute.

        NOTE(review): this view builds `rows` but never returns an
        HttpResponse — confirm the missing `return render(...)` against the
        full source.
        """
        buscarprods = request.COOKIES['buscarprod']
        url = 'https://www.walmart.com/search?q=hp+printers'
        # Defensive: percent-encode spaces in case the URL is ever built
        # from user input.
        url = url.replace(" ", "%20")
        proveedor = request.COOKIES['proveedor']

        # Browser-like headers so the request is less likely to be blocked.
        HDRS = {
           # fixed: a stray second comma here was a SyntaxError
           'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           # fixed: language tags are separated by commas, not semicolons
           # (semicolons only introduce the q= weight of the preceding tag)
           'Accept-Language': 'es-ES,es;q=0.8',
           'DNT': '1',
           'Connection': 'keep-alive',
           'Upgrade-Insecure-Requests': '1',
        }

        # This works but doesn't wait for javascript
        session_object = requests.Session()
        r = session_object.get(url, headers=HDRS).text

        # This works too, but doesn't wait for javascript
        #r = requests.get(url, headers=HDRS, timeout=(8.05, 35)).content

        #Here: I have error with "r.html.render(sleep=2)":
        # Error:
        # "There is no current event loop in thread 'Thread-1 (process_request_thread)'."

        #s  = HTMLSession()
        #r = s.get(url, headers=HDRS)
        #r.html.render(sleep=2)

        # Only the server-rendered products appear here; JS-injected ones
        # are missing because `requests` does not execute JavaScript.
        soup = BeautifulSoup(r, "html.parser")
        rows = soup.find_all(attrs={"data-item-id": True})

Thanks for the help!


Solution

  • You'll need something besides requests if you want to access JavaScript-generated content. I'd suggest Selenium/Chromedriver from https://chromedriver.chromium.org/downloads

    This example finds 56 printers:

    import os
    from bs4 import BeautifulSoup
    from selenium import webdriver
    
    # Resolve the directory this script lives in so chromedriver.exe can be
    # found next to it regardless of the current working directory.
    dirname, scriptname = os.path.split(os.path.abspath(__file__))
    THIS_DIRECTORY = f'{dirname}{os.sep}'
    HEADLESS = False  # set True to run Chrome without a visible window
    DRIVER = None     # lazily-created, shared WebDriver instance (see load_page)
    
    def load_page(url):
        '''Load *url* in a shared automated Chrome browser.

        Creates the global DRIVER on first use and reuses it afterwards,
        so repeated calls navigate the same browser instance.
        '''
        global DRIVER
        if DRIVER is None:
            options = webdriver.ChromeOptions()
            options.headless = HEADLESS
            # fixed: add_experimental_option stores one value per key, so two
            # separate 'excludeSwitches' calls overwrite each other and only
            # the last switch survives. Pass both switches in a single list.
            options.add_experimental_option('excludeSwitches',
                                            ['enable-automation', 'enable-logging'])
            options.add_experimental_option('useAutomationExtension', False)

            DRIVER = webdriver.Chrome(options=options, executable_path=f'{THIS_DIRECTORY}chromedriver.exe')
        DRIVER.get(url)
    
    
    def main(url):
        """Open *url* in the automated browser and report the product count.

        Parses the rendered page source and prints how many elements carry
        a ``data-item-id`` attribute (one per product tile).
        """
        load_page(url)
        page = BeautifulSoup(DRIVER.page_source, 'html.parser')
        products = page.find_all(attrs={"data-item-id": True})
        print(len(products))
        for product in products:
            # Do stuff
            pass
    
    # Run the scrape only when executed as a script, not when imported.
    if __name__ == '__main__':
        main('https://www.walmart.com/search?q=hp+printers')
    
    

    Output:

    56