I am trying to scrape Reddit, and the code below works when the URL is page = 'https://www.reddit.com/r/announcements/hot/'
However, when I change the URL slightly to page = 'https://www.reddit.com/r/announcements/new/', it no longer works. I have tried changing the sleep() duration, but that does not help. I have confirmed that the scrolling works fine in both cases. What is the problem?
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Script: open a subreddit listing, scroll to the bottom a fixed number of
# times so more posts load, then collect the href of every post-body anchor.
page = 'https://www.reddit.com/r/announcements/new/'
scroll_n_times = 10
scrape_comments = False
erase_db_first = True

options = webdriver.ChromeOptions()
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object -- confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome(executable_path="core/chromedriver.exe", options=options)

# XPath for the anchors whose href attribute is collected below.
xpath = "//a[@data-click-id='body']"
# Pause (seconds) after each scroll so newly requested posts can render.
sleep_time = 0.5

try:
    # Navigate inside the try block so the browser is closed even when
    # loading the page raises.
    #page = page.lower()
    driver.get(page)

    if scroll_n_times > 0:
        print(('Opening reddit and scrolling: takes approximately {0} seconds'
               ).format(sleep_time * scroll_n_times))
    else:
        print('Opening reddit and scrolling.. done')

    # Count down to zero; the `> 0` guard keeps a negative count from
    # looping forever.
    while scroll_n_times > 0:
        # Scroll the browser to the current bottom of the page. The height is
        # handed over as a script argument rather than spliced into the JS.
        height = driver.execute_script("return document.documentElement.scrollHeight")
        driver.execute_script("window.scrollTo(0, arguments[0]);", height)
        time.sleep(sleep_time)
        scroll_n_times -= 1

    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
    # newer Selenium 4 releases no longer provide.
    elements = driver.find_elements(By.XPATH, xpath)
    print(elements)

    # Get the link from the href attribute
    links = [tag.get_attribute('href') for tag in elements]
finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()
For a user who is not logged in, the DOM structure is a bit different in your case.
Here is the correct selector:
# Selector for the post elements in the DOM served to logged-out users.
css = "shreddit-post[comment-count]"
...
# after scroll
elements = driver.find_elements(By.CSS_SELECTOR, css)
# Read the href from the full-post-link anchor nested inside each post element.
links = [
    el.find_element(By.CSS_SELECTOR, "a[slot=full-post-link]").get_attribute('href')
    for el in elements
]