I built a web scraper in Selenium to scrape Redfin Estimate data from redfin.com. The problem I am having is that when I write the scraped data to a CSV file, every so often the same rows are written out multiple times, and I am not sure how to fix it.
Here is my code:
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, InvalidElementStateException
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
input_file = ".\\pa-property-value-tools\\input\\addresses.xlsx"
input_df = pd.read_excel(input_file)
input_df['Address'] = input_df['Address'].astype(str)
output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate'])
driver = webdriver.Chrome('C:\\Users\\user\\Downloads\\chromedriver_win32 (1)\\chromedriver.exe')
#driver = webdriver.Firefox(executable_path = 'C:\\Users\\Morgan.weiss\\Downloads\\geckodriver-v0.24.0-win64\\geckodriver.exe')
def append_date_timestamp(filepath, extension):
    return (
        filepath + "-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + "." + extension
    )
def get_redfin_estimate(address):
    driver.get('https://www.redfin.com/')
    print(address)
    # type the address into the search box
    driver.find_element_by_name('searchInputBox').clear()
    driver.find_element_by_name('searchInputBox').send_keys(address)
    time.sleep(3)
    try:
        # if the autocomplete dropdown appears, click the matching address row
        pop_up = driver.find_element_by_css_selector("div[data-rf-test-name='expanded-results']")
        if pop_up:
            types = pop_up.find_elements_by_class_name("expanded-row-content")
            for ele in types:
                val = ele.find_element_by_class_name("expanded-type")
                if val.text == "ADDRESSES":
                    ele.find_element_by_css_selector("div[data-rf-test-name='item-row-active']").click()
                else:
                    return ('N/A', 'N/A')
    except:
        pass
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        # try the first page layout for the estimate
        price1 = soup.find('div', {'class': 'avm'}).div.text
        print(price1)
        url = driver.current_url if driver.current_url else 'N/A'
        return (price1, url)
    except AttributeError:
        try:
            # fall back to the second page layout
            time.sleep(3)
            price2 = soup.find('span', class_='avmLabel').find_next('span', class_='value').text
            print(price2)
            url = driver.current_url if driver.current_url else 'N/A'
            return (price2, url)
        except:
            return ('N/A', 'N/A')
outputfile = append_date_timestamp(".\\pa-property-value-tools\\output\\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
    try:
        count += 1
        estimate, url_source = get_redfin_estimate(row.Address)
        output_df = output_df.append({
            'Account': row.Account,
            'Address': row.Address,
            'redfin_estimate': estimate,
            'url': url_source,
            'date_pulled': current_date
            },
            ignore_index=True,
        )
        if count % wait_after == 0:
            # if file does not exist write header
            if not os.path.isfile(outputfile):
                output_df.to_csv(outputfile, index=False)
            else:  # else it exists so append without writing the header
                output_df.to_csv(outputfile, mode='a', index=False, header=False)
            #output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
            print("Waiting 20 seconds for every " + str(wait_after) + " calls")
            time.sleep(20)
        time.sleep(1)
    except (NoSuchElementException, InvalidElementStateException) as e:
        print(e)
        exception += 1
        print(exception)
        continue
print(exception)
if count % wait_after > 0:
    output_df.to_csv(outputfile, mode='a', index=False, header=False)
driver.quit()
The part that I think is causing this issue is here:
outputfile = append_date_timestamp(".\\pa-property-value-tools\\output\\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
    try:
        count += 1
        estimate, url_source = get_redfin_estimate(row.Address)
        output_df = output_df.append({
            'Account': row.Account,
            'Address': row.Address,
            'redfin_estimate': estimate,
            'url': url_source,
            'date_pulled': current_date
            },
            ignore_index=True,
        )
        if count % wait_after == 0:
            # if file does not exist write header
            if not os.path.isfile(outputfile):
                output_df.to_csv(outputfile, index=False)
            else:  # else it exists so append without writing the header
                output_df.to_csv(outputfile, mode='a', index=False, header=False)
            #output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
            print("Waiting 20 seconds for every " + str(wait_after) + " calls")
            time.sleep(20)
        time.sleep(1)
    except (NoSuchElementException, InvalidElementStateException) as e:
        print(e)
        exception += 1
        print(exception)
        continue
print(exception)
if count % wait_after > 0:
    output_df.to_csv(outputfile, mode='a', index=False, header=False)
I am not sure what the problem is; any suggestions are greatly appreciated.
EDIT:
For the code that is marked as problematic: it counts the number of addresses we have iterated through, and once we have gone through 10 of them it writes those rows out to the CSV. We also wait between calls so that our IP address does not get blocked. The problem is within those lines of code; for some reason I get duplicates.
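To show the symptom without any of the scraping, here is a stripped-down version of the same write-every-N pattern (a minimal sketch with made-up values and a throwaway demo.csv file name):

import pandas as pd

df = pd.DataFrame(columns=['value'])
wait_after = 2

for i in range(4):
    df = df.append({'value': i}, ignore_index=True)
    if (i + 1) % wait_after == 0:
        df.to_csv('demo.csv', mode='a', index=False, header=False)

# demo.csv ends up containing the rows 0, 1, 0, 1, 2, 3:
# the first batch shows up twice, just like in my output.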
It seems that you are not resetting output_df after writing to the csv file.
You are appending to the dataframe here:
output_df = output_df.append({
    'Account': row.Account,
    'Address': row.Address,
    'redfin_estimate': estimate,
    'url': url_source,
    'date_pulled': current_date
    },
    ignore_index=True,
)
and then appending the entire accumulated content of output_df to the csv file with mode='a':
output_df.to_csv(outputfile, mode='a', index=False, header=False)
This is why the rows are written multiple times.
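Concretely, with wait_after = 10: after the first ten addresses output_df holds rows 1-10 and all ten are written out; after the next ten it holds rows 1-20, and mode='a' appends all twenty, so rows 1-10 now appear in the file twice (and three times after the next batch, and so on).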
Resetting the dataframe after writing to the csv file should fix this:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
output_df = pd.DataFrame(columns=['Account', 'Address', 'redfin_estimate', 'url', 'date_pulled'])

Recreating the frame with its original columns (as in the line you already have commented out) also keeps the column order stable across batches.
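As a side note, DataFrame.append is removed in pandas 2.0+, so a longer-term option is to buffer plain dicts and only build a frame at flush time, which avoids the duplication by construction. A sketch, reusing the names from your question:

import os
import pandas as pd

rows = []  # plain-dict buffer instead of a growing DataFrame

for row in input_df.itertuples():
    estimate, url_source = get_redfin_estimate(row.Address)
    rows.append({'Account': row.Account, 'Address': row.Address,
                 'redfin_estimate': estimate, 'url': url_source,
                 'date_pulled': current_date})
    if len(rows) == wait_after:
        # write the header only if the file does not exist yet
        pd.DataFrame(rows).to_csv(outputfile, mode='a', index=False,
                                  header=not os.path.isfile(outputfile))
        rows.clear()  # the buffer is empty again, so nothing is written twice

if rows:  # flush the final partial batch
    pd.DataFrame(rows).to_csv(outputfile, mode='a', index=False,
                              header=not os.path.isfile(outputfile))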