Search code examples
pythonpython-3.xselenium-webdriverweb-scraping

Can't scrape all the data from a lazy-loading table using Selenium


I'm trying to scrape three fields (player, logo, dkprice) from a table located in the middle of a webpage. To see all the data in that table, it is necessary to scroll down to the bottom of it.

I've created a script in selenium that can scroll the content of the table to the bottom but can scrape only the last 16 results. However, there are 240 items in the table.

My goal is to scrape all the content of the table using selenium, as I have already successfully grabbed the content using the requests module. I wish to know why, even after scrolling to the bottom, Selenium still fails to parse all the content of that table.

I found success using the requests module:

import requests

# Query the site's ownership API endpoint directly rather than
# rendering the page in a browser.
link = 'https://fantasyteamadvice.com/api/user/get-ownership'

payload = {"sport":"mlb"}
res = requests.post(link,json=payload)
for entry in res.json()['ownership']:
    print(entry['fullname'],entry['team'],entry['dkPrice'])

The script built with Selenium can only parse the last 16 items:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

link = 'https://fantasyteamadvice.com/dfs/mlb/ownership'  # page containing the lazy-loading ownership table

def get_content(driver,link):
    """Open *link*, scroll the ownership table to the bottom, then yield
    a (player, logo, dkprice) tuple for every row present in the DOM.

    NOTE(review): only the rows still attached to the DOM after
    scrolling are yielded — this is the behavior the question is about.
    """
    driver.get(link)
    scroll_to_get_more(driver)
    waiter = WebDriverWait(driver,20)
    rows = waiter.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,".ownership-table-container [class$='player-row']")))
    for row in rows:
        name = row.find_element(By.CSS_SELECTOR,"[data-testid='ownershipPlayer']").text
        team = row.find_element(By.CSS_SELECTOR,"[data-testid='ownershipPlayerTeam'] > img").get_attribute("alt")
        price = row.find_element(By.CSS_SELECTOR,"[data-testid='ownershipPlayerDkPrice']").text
        yield name, team, price


def scroll_to_get_more(driver):
    """Repeatedly scroll the table's last row into view, stopping once
    the last-row element stops changing between passes (i.e. no new
    content appears to load)."""
    previous_row = ''
    while True:
        last_row = WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,".ownership-table-container [class$='player-row']:last-child")))
        driver.execute_script("arguments[0].scrollIntoView();", last_row)
        time.sleep(3)  # give the page time to load new content
        if previous_row == last_row:
            break
        previous_row = last_row


if __name__ == '__main__':
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        rows = get_content(driver,link)
        for row in rows:
            print(row)
    finally:
        # Shut the browser down even if scraping raised.
        driver.quit()

How can I scrape all the data of that lazy-loading table using Selenium?


Solution

  • You are scraping the data from the table only after scrolling to the bottom. The issue is that the table is loaded dynamically, and the HTML contains only the rows currently being displayed, so simply scrolling to the bottom will not capture the entire table. Instead, you need to pull the data from the table (store it), then scroll, then pull the data from the table again (appending it to what is already stored), and so on.

    So, for example, here's my code:

    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    
    link = 'https://fantasyteamadvice.com/dfs/mlb/ownership'  # page containing the lazy-loading ownership table
    
    def get_content(driver):
        """Return a list of (player, logo, dkprice) tuples for the rows
        that are currently present in the ownership table's DOM."""
        locator = (By.CSS_SELECTOR,".ownership-table-container [class$='player-row']")
        rows = WebDriverWait(driver,20).until(EC.presence_of_all_elements_located(locator))
        scraped = []
        for row in rows:
            player = row.find_element(By.CSS_SELECTOR,"[data-testid='ownershipPlayer']").text
            logo = row.find_element(By.CSS_SELECTOR,"[data-testid='ownershipPlayerTeam'] > img").get_attribute("alt")
            price = row.find_element(By.CSS_SELECTOR,"[data-testid='ownershipPlayerDkPrice']").text
            scraped.append((player,logo,price))
        return scraped
                
                
        
        
    
    
    def scroll_to_get_more(driver):
        """Scroll the table's last visible row into view so the page
        lazy-loads the next batch of rows."""
        selector = ".ownership-table-container [class$='player-row']:last-child"
        last_row = WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,selector)))
        driver.execute_script("arguments[0].scrollIntoView();", last_row)
        time.sleep(3)  # give the page time to load the new rows
    
    
    # To remove duplicates and maintain the order
    def de_dup(data):
        """Return *data* with duplicates removed, keeping first-seen order.

        Uses a set of already-seen items so each membership test is O(1)
        (the scraped rows are hashable tuples), replacing the original
        ``x not in dedup_data`` list scan, which was O(n^2) overall.
        """
        seen = set()
        unique = []
        for item in data:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique
    
    
    if __name__ == '__main__':
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        data = []
        try:
            driver.get(link)
            while True:
                previous_count = len(data)
                # Scrape the rows currently in the DOM, merge with what we
                # already have, then scroll to trigger the next batch.
                data = de_dup(data + get_content(driver))
                scroll_to_get_more(driver)
                # Once a scrape-and-scroll pass adds nothing new, the
                # table has been fully consumed.
                if len(data) == previous_count:
                    break
        finally:
            # quit() (not close()) ends the whole WebDriver session so the
            # chromedriver process does not linger, even if scraping raised.
            driver.quit()

        for row in data:
            print(row)
    

    Output: 158 players

    ('Carlos Santana', 'MIN logo', '$4200')
    ('Tommy Pham', 'CWS logo', '$3800')
    ('Jose Altuve', 'HOU logo', '$5300')
    ('Jd Martinez', 'NYM logo', '$4900')
    ('Salvador Perez', 'KAN logo', '$4900')
    ('Nathan Eovaldi', 'TEX logo', '$9300')
    ('Paul Goldschmidt', 'STL logo', '$4500')
    ('Christian Vazquez', 'MIN logo', '$2900')
    ('Nolan Arenado', 'STL logo', '$4100')
    ('Andrew Mccutchen', 'PIT logo', '$4200')
    ('Randal Grichuk', 'ARI logo', '$3200')
    ('Marcell Ozuna', 'ATL logo', '$5800')
    ('Jon Singleton', 'HOU logo', '$2700')
    ('Adam Duvall', 'ATL logo', '$3200')
    ('Jose Quintana', 'NYM logo', '$7400')
    ('Willson Contreras', 'STL logo', '$5100')
    ...
    ('Masataka Yoshida', 'BOS logo', '$3700')
    ('Jake Bloss', 'HOU logo', '$6600')
    ('Wyatt Langford', 'TEX logo', '$4200')
    ('Paul Skenes', 'PIT logo', '$10500')