Search code examples
python selenium-webdriver web-scraping youtube

Scraping on YouTube with Selenium - code doesn't store my values


I'm trying to write a script to scrape information from YouTube channels. The script uses Selenium, and the individual parts worked fine when I ran them on their own. However, once I put all of those pieces together inside a function, the results no longer get stored.

So this is the code that works (i.e.: I run it and all my results are stored as expected)

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Chrome is started maximized with two extensions loaded from local .crx files.
options = Options()
options.add_extension(r"C:\Scraping YT\3.4.6_0.crx") #i dont care about cookies
options.add_extension(r"C:\Scraping YT\1.47.4_0.crx") #ublock origin
options.add_argument("--start-maximized")

# Selenium 4.10+ no longer accepts `executable_path` on webdriver.Chrome();
# the chromedriver location is handed over through a Service object instead.
service = Service(executable_path=r"C:\Users\Nick\Nextcloud\Università\Tirocinio\Scraping YT\chromedriver\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

# `url` was never defined in this snippet (NameError); use the channel
# videos page that the rest of this post scrapes.
url = "https://www.youtube.com/@confindustria/videos"
driver.get(url)

# Text of the channel-handle and subscriber-count elements in the page.
handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text


# Small routine that keeps scrolling until the page stops growing.
WAIT_IN_SECONDS = 5
HEIGHT_SCRIPT = "return document.documentElement.scrollHeight"

last_height = driver.execute_script(HEIGHT_SCRIPT)
reached_end = False
while not reached_end:
    # Jump to the current bottom of the document ...
    driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
    # ... and give newly loaded videos time to appear.
    time.sleep(WAIT_IN_SECONDS)

    # An unchanged document height means nothing more was loaded.
    new_height = driver.execute_script(HEIGHT_SCRIPT)
    reached_end = new_height == last_height
    last_height = new_height


# Four parallel element lists; entries are paired up by position below.
thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
titles = driver.find_elements(By.ID, "video-title")
links = driver.find_elements(By.ID, "video-title-link")

# One dict per video; zip() stops at the shortest of the four lists.
videos = [
    {
        'title': title.text,
        'views': view.text,
        'thumbnail': thumb.get_attribute('src'),
        'link': link.get_attribute('href'),
    }
    for title, view, thumb, link in zip(titles, views, thumbnails, links)
]

But when I do this:


import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

#CHROME DRIVER
# Selenium 4.10+ no longer accepts `executable_path` on webdriver.Chrome();
# the chromedriver location is handed over through a Service object instead.
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_extension(r"C:\Users\Nick\Nextcloud\Università\Tirocinio\Scraping YT\3.4.6_0.crx") #i dont care about cookies
options.add_extension(r"C:\Users\Nick\Nextcloud\Università\Tirocinio\Scraping YT\1.47.4_0.crx") #ublock origin
options.add_argument("--start-maximized")
service = Service(executable_path=r"C:\Users\Nick\Nextcloud\Università\Tirocinio\Scraping YT\chromedriver\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

def scrape_ytchannel(url):
    """Load a YouTube channel page and gather its handle, subscriber count and videos.

    Uses the module-level ``driver``. The page is scrolled until its height
    stops changing, then one dict per video is built with the keys
    ``title``, ``views``, ``thumbnail`` and ``link``.

    NOTE: the collected data is bound to the local name ``result`` and the
    function has no ``return`` statement, so every call evaluates to ``None``
    and the scraped values are discarded when the function exits.
    """
    driver.get(url)

    # Text of the channel-handle and subscriber-count elements in the page.
    handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
    subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text


    # Small routine to scroll the page until it ends
    WAIT_IN_SECONDS = 5
    last_height = driver.execute_script("return document.documentElement.scrollHeight")

    while True:
        # Scroll to the bottom of page
        driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
        # Wait for new videos to show up
        time.sleep(WAIT_IN_SECONDS)
        
        # Calculate new document height and compare it with last height
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            # Height did not change after the wait: stop scrolling.
            break
        last_height = new_height


    # Four parallel element lists; entries are paired up by position below.
    thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
    views = driver.find_elements(By.XPATH,'//div[@id="metadata-line"]/span[1]')
    titles = driver.find_elements(By.ID, "video-title")
    links = driver.find_elements(By.ID, "video-title-link")


    videos = []
    # zip() stops at the shortest of the four lists.
    for title, view, thumb, link in zip(titles, views, thumbnails, links):
        video_dict = {
            'title': title.text,
            'views': view.text,
            'thumbnail': thumb.get_attribute('src'),
            'link': link.get_attribute('href')
        }
        videos.append(video_dict)
    # Local only: this list is never returned to the caller.
    result = [videos, handle, subscriber_count]
    
url_conf = "https://www.youtube.com/@confindustria/videos"
# The value of this call is neither assigned nor printed.
scrape_ytchannel(url_conf)

Nothing that's been scraped is stored anywhere I can see. Can somebody explain to dumb ol' me what the heck is wrong? Sorry if this is really something dumb, but I'm studying this stuff by myself to get a job, and I can't really find something comprehensive & free to really teach me all there is to know, so I guess that my knowledge is really spotty. Thanks to everyone kind enough to help, If anything is unclear or I didn't explain it right, please feel free to ask me as many questions as necessary... I really need this job!


Solution

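  • `result` is a local variable, so it disappears as soon as `scrape_ytchannel` finishes. Return it from the function and then print (or assign) the returned value at the call site — that's all you need.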

    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.options import Options
    
    # CHROME DRIVER
    options = Options()
    options.add_argument("--start-maximized")

    # Experimental Chrome options, applied in order.
    experimental_options = (
        ("useAutomationExtension", False),
        ("excludeSwitches", ["enable-automation"]),
    )
    for option_name, option_value in experimental_options:
        options.add_experimental_option(option_name, option_value)

    driver = webdriver.Chrome(options=options)
    
    
    def scrape_ytchannel(url, *, wait_seconds=5):
        """Scrape a YouTube channel's video listing with the module-level ``driver``.

        The page at ``url`` is loaded, the channel handle and subscriber-count
        texts are read, and the page is scrolled until its height stops
        changing so that all videos are present before they are collected.

        Args:
            url: Address of the channel's videos page.
            wait_seconds: Pause after each scroll to let new videos load.
                Defaults to 5, the value previously hard-coded.

        Returns:
            A list ``[videos, handle, subscriber_count]`` where ``videos`` is a
            list of dicts with the keys ``title``, ``views``, ``thumbnail``
            and ``link``.
        """
        driver.get(url)

        handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
        subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text

        # Scroll until the document height no longer grows.
        height_script = "return document.documentElement.scrollHeight"
        last_height = driver.execute_script(height_script)
        while True:
            # Scroll to the bottom of page
            driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
            # Wait for new videos to show up
            time.sleep(wait_seconds)

            # Same height as before the scroll means nothing more was loaded.
            new_height = driver.execute_script(height_script)
            if new_height == last_height:
                break
            last_height = new_height

        thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
        views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
        titles = driver.find_elements(By.ID, "video-title")
        links = driver.find_elements(By.ID, "video-title-link")

        # The four lists are paired by position; zip() stops at the shortest.
        videos = [
            {
                'title': title.text,
                'views': view.text,
                'thumbnail': thumb.get_attribute('src'),
                'link': link.get_attribute('href'),
            }
            for title, view, thumb, link in zip(titles, views, thumbnails, links)
        ]

        # Returning the data is what makes it available to the caller.
        return [videos, handle, subscriber_count]
    
    
    url_conf = "https://www.youtube.com/@confindustria/videos"
    # Keep the returned data in a variable, then show it.
    channel_data = scrape_ytchannel(url_conf)
    print(channel_data)
    

    As you can see, the data now comes back:

    [[{'title':'Smart Working Roadmap. Imprese, competitività e benessere nella nuova era','views':'107 views','thumbnail':'https://i.ytimg.com/vi/rraHiETG4hg/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLA_1-Hkv8GB9qyj7AxZFb57v_k8Tw','link':'https://www.youtube.com/watch?v=rraHiETG4hg'},{'title':'Intervista-doppia Presidente Piccola Industria Giovanni Baroni & gli studenti del Liceo Vivona','views':'234 views','thumbnail':'https://i.ytimg.com/vi/-IhWoz1H6pg/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCKrXp_fAecXwhsAkpomXiq2bJYEQ','link':'https://www.youtube.com/watch?v=-IhWoz1H6pg'},......................,{'title':"IL CAPITALE UMANO E IL CAPITALE SOCIALE PER L'ITALIA - Intervento di Luca Paolazzi",'views':'129 views','thumbnail':'https://i.ytimg.com/vi/95UH9mTWl9g/hqdefault.jpg?sqp=-oaymwE2CNACELwBSFXyq4qpAygIARUAAIhCGAFwAcABBvABAfgB_gSAAuoCigIMCAAQARgTICMofzAP&rs=AOn4CLDOvnCNm21pvMYQ0eA6Al-59a991g','link':'https://www.youtube.com/watch?v=95UH9mTWl9g'}],'@confindustria','1.31K subscribers']