This code is meant to crawl stock data from a website named cafef. The input is the website link and an element from that website's HTML, and the expected output is a table of stock data including Date, Price, and Volume. However, the code doesn't work and returns an empty DataFrame. I don't understand the second try/except block, so I cannot debug the code. Could someone please explain it to me?
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.support.ui import Select
def crawl(stock):
    date=[]
    price=[]
    volume=[]
    close=[]
    stock_id=[]
    browser = webdriver.Chrome(executable_path="./chromedriver")
    web = browser.get("https://s.cafef.vn/Lich-su-giao-dich-"+stock+"-1.chn")
    sleep(5)
    for count in range (60):
        try:
            date_data=browser.find_elements("Item_DateItem")
            for row in date_data:
                date.append(row.text)
                print(row.text())
            date_data.clear()
            price_data=browser.find_elements_by_class_name("Item_Price1")
            for row in price_data:
                price.append(row.text)
            price_data.clear()
        except:
            break
        try:
            if count == 0:
                next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[21]/a")
            else:
                try:
                    next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[22]/a")
                except:
                    next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[23]/a")
            next_page.click()
            sleep(5)
        except:
            break
    for i in range (int(len(price)/10)):
        close.append(price[10*i+1].replace(",",""))
        volume.append(price[10*i+2].replace(",",""))
    for i in range (len(date)):
        stock_id.append(stock)
    d = {'Stock': stock_id,'Date': date,'Close': close,'Volume': volume}
    df = pd.DataFrame(data=d)
    df.to_csv(stock+".csv", index=False)
    return df
print(crawl('ABC'))
I tried to find the XPath element but couldn't find it.
There are a couple of minor issues; here is the list:

- By needs to be imported from selenium.webdriver.common.by, as the logic uses it.
- Item_DateItem and Item_Price1 are class attribute values, so we need to use either By.CLASS_NAME or By.CSS_SELECTOR.
- row.text(): text is a property, not a method, so calling it raises an exception. The loop breaks on that exception, resulting in an empty DataFrame. I would suggest logging the exceptions instead of catching them and silently breaking the loop, so you can get to the root cause quickly.

So, after making all the changes, the code looks like this:
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By  # By is used in the code but was not imported
def crawl(stock):
    date=[]
    price=[]
    volume=[]
    close=[]
    stock_id=[]
    browser = webdriver.Chrome(executable_path="./chromedriver")
    web = browser.get("https://s.cafef.vn/Lich-su-giao-dich-"+stock+"-1.chn")
    sleep(5)
    for count in range (60):
        try:
            date_data=browser.find_elements(By.CSS_SELECTOR, ".Item_DateItem")  # Item_DateItem is a CSS class, so we have to use "By" accordingly
            for row in date_data:
                date.append(row.text)
                print(row.text)  # text is a property
            date_data.clear()
            price_data=browser.find_elements(By.CSS_SELECTOR, ".Item_Price1")  # Item_Price1 is a CSS class, so we have to use "By" accordingly
            for row in price_data:
                price.append(row.text)
            price_data.clear()
        except:
            break
        try:
            next_page = browser.find_element(By.CSS_SELECTOR, ".CafeF_Paging td:last-child a")  # better to use a stable selector than an absolute XPath, which is more likely to change
            next_page.click()
            sleep(5)
        except:
            break
    for i in range (int(len(price)/10)):
        close.append(price[10*i+1].replace(",",""))
        volume.append(price[10*i+2].replace(",",""))
    for i in range (len(date)):
        stock_id.append(stock)
    d = {'Stock': stock_id,'Date': date,'Close': close,'Volume': volume}
    df = pd.DataFrame(data=d)
    df.to_csv(stock+".csv", index=False)
    return df
print(crawl('ABC'))
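On the point about not silently swallowing exceptions: below is a minimal sketch of how the two try/except blocks inside the paging loop could log what went wrong before stopping, using Python's standard logging module. It assumes the same browser, date, count, By, and sleep names from the crawl() function above, and the same CSS selectors; it is an illustration of the idea, not a drop-in replacement for the whole function.

import logging
from selenium.common.exceptions import NoSuchElementException

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# inside the paging loop of crawl(), instead of a bare "except: break"
try:
    date_data = browser.find_elements(By.CSS_SELECTOR, ".Item_DateItem")
    for row in date_data:
        date.append(row.text)
except Exception:
    # logger.exception records the full traceback, so the real error is visible
    logger.exception("Failed to scrape rows on page %d", count + 1)
    break

try:
    next_page = browser.find_element(By.CSS_SELECTOR, ".CafeF_Paging td:last-child a")
    next_page.click()
    sleep(5)
except NoSuchElementException:
    # no next-page link usually just means the last page was reached, not an error
    logger.info("No next-page link on page %d, stopping", count + 1)
    break

With the traceback logged, mistakes like the original find_elements("Item_DateItem") call or row.text() show up immediately in the output instead of only producing an empty DataFrame.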