python, selenium-webdriver, twitter

Twitter parsing with Selenium python


I'm new to parsing with Selenium. I got a task: "get Elon Musk's last 10 tweets and print their text in the terminal". I tried to do it like this:


import os
import time

from selenium.webdriver.common.by import By
from selenium_stealth import stealth
from seleniumwire import webdriver
from dotenv import load_dotenv

load_dotenv()
LOGIN = os.getenv('LOGIN')
PASSWORD = os.getenv('PASSWORD')
IP_ADRESS = os.getenv('FR_IP_ADRESS')
PORT = os.getenv('FR_PORT')

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('--ignore-certificate-errors')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

proxy_options = {
    'proxy': {
        'http': f'http://{LOGIN}:{PASSWORD}@{IP_ADRESS}:{PORT}'
    }
}

try:
    link = 'https://twitter.com/elonmusk'
    browser = webdriver.Chrome(
        options=options,
        seleniumwire_options=proxy_options
    )

    stealth(
        browser,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )

    browser.get(link)

    browser.implicitly_wait(20)
    target = browser.find_elements(
            By.CSS_SELECTOR, '[data-testid="tweet"]'
        )

    browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")

    twits = browser.find_elements(
            By.CSS_SELECTOR, '[data-testid="tweet"] [data-testid="tweetText"]'
        )

    for twit in twits[::10]:
        print(twit.text)

finally:
    time.sleep(20)
    browser.quit()

The first problem is that the scroll call browser.execute_script("window.scrollTo(0, document.body.scrollHeight)") doesn't work, although it worked fine in the first version of my code. The second problem is that my tweet collector either collects nothing or collects different values on every run. I've tried different scrolling methods, but they only work some of the time, and the same goes for the tweet collector. I suspect the problem is related to how Twitter loads its content, but I can't be sure because I don't have much parsing experience. I hope you can help me. Thanks in advance.
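
The inconsistent results are typical of a page that keeps rendering tweets with JavaScript after the initial load: find_elements simply returns whatever happens to be in the DOM at that instant. The usual Selenium remedy is an explicit wait instead of implicitly_wait. The sketch below only illustrates that pattern, reusing the browser object and selectors from the script above; the 30-second timeout is an arbitrary choice, and Twitter may additionally require a logged-in session before any tweets are rendered at all.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait (up to 30 s) until at least one tweet text node has been rendered,
# then print the first ten, instead of calling find_elements immediately.
wait = WebDriverWait(browser, 30)
tweets = wait.until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, '[data-testid="tweet"] [data-testid="tweetText"]')
    )
)
for tweet in tweets[:10]:
    print(tweet.text)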


Solution

  • Selenium's find_elements is not returning stable results here, so one workaround is to pass the rendered HTML (driver.page_source) to BeautifulSoup and extract the tweet text from there.

    Here is the full script to get the text of the first 10 tweets:

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup
    import time
    
    options = Options()
    #options.add_argument("user-data-dir=C:\\Users\\yourusername\\AppData\\Local\\Google\\Chrome Beta\\User Data")
    options.add_argument("profile-directory=Default")
    
    driver_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=driver_service, options=options)
    
    try:
        link = 'https://twitter.com/elonmusk'
        driver.get(link)
    
        tweet_text_elements = []
    
        # Scroll 5 times to load more tweets
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(2) 
    
            html_content = driver.page_source
            soup = BeautifulSoup(html_content, 'html.parser')
    
            # Find all elements with 'tweetText'
            tweet_text_elements.extend(soup.find_all('div', {'data-testid': 'tweetText'}))
    
        # Print the first 10 tweets
        for i, tweet_text_element in enumerate(tweet_text_elements[:10]):
            tweet_text = tweet_text_element.text.strip().replace("\n", "")
            print(f"Tweet {i+1}: {tweet_text}")
    
    finally:
        driver.quit()
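
    One caveat: because the loop extends tweet_text_elements on every scroll, the same tweet can be collected more than once, and a fixed five scrolls may not load ten distinct tweets. A rough variant of the scrolling loop (reusing the driver, BeautifulSoup and time imports from the script above; the 15-scroll cap and 2-second pause are arbitrary assumptions) keeps scrolling until 10 unique texts have been found:

    # Keep scrolling until at least 10 unique tweet texts have been collected
    # (or the scroll limit is reached), deduplicating as we go.
    collected = []
    for _ in range(15):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for element in soup.find_all('div', {'data-testid': 'tweetText'}):
            text = element.get_text(" ", strip=True)
            if text and text not in collected:
                collected.append(text)
        if len(collected) >= 10:
            break
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)

    for i, text in enumerate(collected[:10]):
        print(f"Tweet {i+1}: {text}")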