I'm new to parsing with Selenium. I got this task: "you should get Elon Musk's last 10 tweets and show their text in the terminal". I tried to do that like this:
"""Print the text of Elon Musk's latest tweets through an authenticated proxy.

Uses selenium-wire for proxy support and selenium-stealth to reduce bot
detection. Credentials and proxy endpoint come from a .env file.
"""
import os
import time  # BUG FIX: `time` was used in the finally block without being imported

from selenium.webdriver.common.by import By
from selenium_stealth import stealth
from seleniumwire import webdriver
from dotenv import load_dotenv

# Load proxy credentials/endpoint from the .env file.
load_dotenv()
LOGIN = os.getenv('LOGIN')
PASSWORD = os.getenv('PASSWORD')
IP_ADRESS = os.getenv('FR_IP_ADRESS')
PORT = os.getenv('FR_PORT')

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('--ignore-certificate-errors')
# Hide the "controlled by automated software" banner and automation flags.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# selenium-wire routes traffic through the authenticated HTTP proxy.
proxy_options = {
    'proxy': {
        'http': f'http://{LOGIN}:{PASSWORD}@{IP_ADRESS}:{PORT}'
    }
}

browser = None  # BUG FIX: defined up front so `finally` is safe if Chrome fails to start
try:
    link = 'https://twitter.com/elonmusk'
    browser = webdriver.Chrome(
        options=options,
        seleniumwire_options=proxy_options
    )
    stealth(
        browser,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    # Set the implicit wait BEFORE navigating so the first find_elements
    # call already benefits from it.
    browser.implicitly_wait(20)
    browser.get(link)

    # Twitter renders tweets lazily; scroll once and give the page time
    # to load more content before collecting.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(5)

    twits = browser.find_elements(
        By.CSS_SELECTOR, '[data-testid="tweet"] [data-testid="tweetText"]'
    )
    # BUG FIX: twits[::10] took every 10th tweet; twits[:10] takes the
    # first 10, which is what the task asks for.
    for twit in twits[:10]:
        print(twit.text)
finally:
    # Keep the browser open briefly for inspection, then clean up.
    time.sleep(20)
    if browser is not None:
        browser.quit()
The first problem is that the scrolling call, browser.execute_script("window.scrollTo(0, document.body.scrollHeight)"),
doesn't work, although when I used it in the first version of the code it worked normally.
The second problem is that my tweet collector doesn't collect anything, or collects different values each time it runs.
I tried different scrolling methods, but they only work intermittently. The same is true of the tweet collector.
I suspect the problem is connected with how Twitter loads its pages, but I can't be sure because I don't have much experience with parsing.
I hope you can help me.
Thank you in advance.
Since Selenium's find_elements is not working reliably, we can pass the HTML content to BeautifulSoup to extract the tweet text.
Here is a full script to get the text of the first 10 tweets:
"""Scrape the first 10 tweet texts from a Twitter profile with BeautifulSoup.

Scrolls the page to trigger lazy loading, snapshots the DOM after each
scroll, and parses the HTML with BeautifulSoup instead of relying on
Selenium element lookups.
"""
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

options = Options()
# Optionally reuse an existing Chrome profile (keeps you logged in):
#options.add_argument("user-data-dir=C:\\Users\\yourusername\\AppData\\Local\\Google\\Chrome Beta\\User Data")
options.add_argument("profile-directory=Default")

# webdriver-manager downloads a chromedriver matching the installed Chrome.
driver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=driver_service, options=options)

try:
    link = 'https://twitter.com/elonmusk'
    driver.get(link)

    seen = set()
    tweet_texts = []
    # Scroll 5 times to load more tweets, snapshotting the DOM each time
    # (Twitter virtualizes the timeline, so tweets scrolled out of view
    # may be removed from the DOM again).
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for element in soup.find_all('div', {'data-testid': 'tweetText'}):
            text = element.text.strip().replace("\n", "")
            # BUG FIX: the original extend()ed every snapshot into one list,
            # so the same tweets were collected repeatedly — each page_source
            # contains everything loaded so far. Dedupe, preserving order.
            if text not in seen:
                seen.add(text)
                tweet_texts.append(text)

    # Print the first 10 distinct tweets.
    for i, tweet_text in enumerate(tweet_texts[:10]):
        print(f"Tweet {i+1}: {tweet_text}")
finally:
    # BUG FIX: `driver. Quit()` raises AttributeError — the WebDriver
    # method is driver.quit().
    driver.quit()