I was finally able to scrape data from the website and print the headlines and dates to the terminal. Now I want to save them to a CSV file, with one column for the headlines and one column for the dates. How do I do that?
My code is attached below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# Scrape NYT search results: load the results page, keep clicking
# "Show more" until no further results arrive, then collect the headline
# and date text of every result and print them.

# Chrome start-up flags.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver = webdriver.Chrome(
    # `options=` supersedes the deprecated `chrome_options=` keyword.
    options=options,
    executable_path=r"//usr/local/Caskroom/chromedriver/81.0.4044.69/chromedriver")

# XPath of the first link following each result thumbnail; its match count
# is used to detect when a "Show more" click has loaded additional results.
RESULT_LINK_XPATH = "//figure[@class='css-tap2ym']//following::a[1]"

# try/finally guarantees the browser is shut down even if scraping fails.
try:
    # NOTE: the query string uses "&sections="; the "§ions" spelling is an
    # HTML-entity mangling of "&sect" and would corrupt the query.
    driver.get(
        "https://www.nytimes.com/search?dropmab=true&endDate=20180111&query=nyc&sections=New%20York%7Cnyt%3A%2F%2Fsection%2F39480374-66d3-5603-9ce1-58cfa12988e2&sort=best&startDate=20180107")

    # Number of results visible after the initial page load.
    myLength = len(WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, RESULT_LINK_XPATH))))

    while True:
        # Scroll to the bottom so the "Show more" button is in view.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
                (By.XPATH, "//div[@class='css-vsuiox']//button[@data-testid='search-show-more-button']"))).click()
            # Wait until the click has actually produced more results.
            WebDriverWait(driver, 20).until(
                lambda drv: len(drv.find_elements(By.XPATH, RESULT_LINK_XPATH)) > myLength)
            titles = driver.find_elements(By.XPATH, RESULT_LINK_XPATH)
            myLength = len(titles)
        except TimeoutException:
            # No clickable button or no new results within 20 s: all loaded.
            break

    headlines_element = driver.find_elements(By.XPATH, '//p[@class="css-16nhkrn"]')
    headlines = [x.text for x in headlines_element]
    print('headlines:')
    print(headlines, '\n')

    dates_element = driver.find_elements(By.XPATH, "//time[@class='css-17ubb9w']")
    dates = [x.text for x in dates_element]
    print("dates:")
    print(dates, '\n')

    # Use distinct loop names so the `headlines` and `dates` lists are not
    # overwritten and remain usable afterwards (e.g. for writing a CSV).
    for headline, date in zip(headlines, dates):
        print("Headlines : Dates")
        print(headline + ": " + date, '\n')
finally:
    driver.quit()
It's that last bit of code that gets the headline and the dates. Thanks in advance for the help!
You can use `csv.writer` from the standard-library `csv` module (add `import csv` at the top of your script)
to write the data to the CSV file.
Use:
import csv

# Write the scraped data as a two-column CSV: one header row, then one row
# per (headline, date) pair.
# `headlines` and `dates` must be the parallel lists of strings built by the
# scraper; zip() stops at the shorter list if their lengths differ.
# newline="" lets the csv module control line endings (otherwise blank rows
# appear on Windows); utf-8 keeps non-ASCII characters in headlines intact.
with open("your_csv_file", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Headlines", "Dates"])  # --> Write header
    writer.writerows(zip(headlines, dates))  # --> Write data