Tags: python, dataframe, selenium-webdriver, web-crawler, stock

Code returns an empty DataFrame; trouble understanding the logic


This code is meant to crawl stock data from a website named cafef. The input is the website link and elements from that website's HTML, and the expected output is a table of stock data including Date, Price, and Volume. However, the code doesn't work: it returns an empty DataFrame. I don't understand the second try/except block, so I cannot debug this code. Could someone explain it to me?

from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.support.ui import Select
def crawl(stock):
 date=[]
 price=[]
 volume=[]
 close=[]
 stock_id=[]
 browser = webdriver.Chrome(executable_path="./chromedriver")
 web = browser.get("https://s.cafef.vn/Lich-su-giao-dich-"+stock+"-1.chn")
 sleep(5)
 for count in range (60):
  try:
      date_data=browser.find_elements("Item_DateItem")
      for row in date_data:
        date.append(row.text)
        print(row.text())
      date_data.clear()
      price_data=browser.find_elements_by_class_name("Item_Price1")
      for row in price_data:
        price.append(row.text)
      price_data.clear()
  except:
   break
  try:
    if count == 0:
      next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[21]/a")
    else:
       try:
          next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[22]/a")
       except:
          next_page = browser.find_element(By.XPATH, "/html/body/form/div[3]/div/div[2]/div[2]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/div/div/div/div/table/tbody/tr/td[23]/a")
    next_page.click()
    sleep(5)
  except:
    break
 for i in range (int(len(price)/10)):
  close.append(price[10*i+1].replace(",",""))
  volume.append(price[10*i+2].replace(",",""))
 for i in range (len(date)):
  stock_id.append(stock)
 d = {'Stock': stock_id,'Date': date,'Close': close,'Volume': volume}
 df = pd.DataFrame(data=d)
 df.to_csv(stock+".csv", index=False)
 return df
print(crawl('ABC'))

I tried to find the XPath element, but I couldn't find it.


Solution

  • There are a couple of minor issues; here is the list:

    • By needs to be imported from selenium.webdriver.common.by, since the code uses it.
    • Item_DateItem and Item_Price1 are class attribute values, so we need to use either By.CLASS_NAME or By.CSS_SELECTOR; passing "Item_DateItem" as the only argument to find_elements is not a valid locator.
    • There is one usage of row.text(). But text is a property, not a method, so calling it raises an exception; the loop breaks on that exception, which results in the empty DataFrame. I would suggest logging the exceptions instead of catching them and silently breaking the loop, so you can find the root cause quickly (see the sketch after this list).
    • You are using an absolute XPath to find the next-page button. It is better to find a stable selector that does not change frequently.
    • About the second try/except block you asked about: it navigates to the next results page. On the first page the "next" link sits in the pager's 21st cell (td[21]); on later pages the pager gains extra cells, so the code tries td[22] and falls back to td[23]. When no next-page link exists at all, find_element throws, and the except clause breaks out of the loop.
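
    For instance, the bare except that silently breaks the loop can log the failure instead. This is only a sketch of that suggestion, meant to replace the first try/except inside crawl(); browser, date, and count come from the surrounding function:

    import logging

    # inside crawl()'s page loop, replacing the first bare `except: break`
    try:
        date_data = browser.find_elements(By.CSS_SELECTOR, ".Item_DateItem")
        for row in date_data:
            date.append(row.text)
    except Exception:
        # logging.exception records the full traceback; with the original code it
        # would have surfaced "TypeError: 'str' object is not callable" from row.text()
        logging.exception("scraping failed on page %d", count)
        break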

    So, after making all the changes, the code looks like this:

    
    from selenium import webdriver
    from time import sleep
    from selenium.webdriver.common.keys import Keys
    import pandas as pd
    from selenium.webdriver.support.ui import Select
    from selenium.webdriver.common.by import By # By is used below but was not imported in the original code
    
    def crawl(stock):
     date=[]
     price=[]
     volume=[]
     close=[]
     stock_id=[]
     browser = webdriver.Chrome(executable_path="./chromedriver")
     web = browser.get("https://s.cafef.vn/Lich-su-giao-dich-"+stock+"-1.chn")
     sleep(5)
     for count in range (60):
      try:
          date_data=browser.find_elements(By.CSS_SELECTOR, ".Item_DateItem") # Item_DateItem is a CSS class, so we have to use "By" accordingly
          for row in date_data:
            date.append(row.text)
            print(row.text) # text is a property, not a method
          date_data.clear()
          price_data=browser.find_elements(By.CSS_SELECTOR, ".Item_Price1") # Item_Price1 is also a CSS class
          for row in price_data:
            price.append(row.text)
          price_data.clear()
      except:
       break
      try:
        next_page = browser.find_element(By.CSS_SELECTOR, ".CafeF_Paging td:last-child a") # better to use a stable selector than an absolute XPath, which is more likely to change
        next_page.click()
        sleep(5)
      except:
        break
     for i in range (int(len(price)/10)):
      close.append(price[10*i+1].replace(",",""))
      volume.append(price[10*i+2].replace(",",""))
     for i in range (len(date)):
      stock_id.append(stock)
     d = {'Stock': stock_id,'Date': date,'Close': close,'Volume': volume}
     df = pd.DataFrame(data=d)
     df.to_csv(stock+".csv", index=False)
     return df
    print(crawl('ABC'))
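
    One more note on the post-processing at the end of crawl(): it assumes each table row contributes exactly ten Item_Price1 cells, with the closing price at offset 1 and the volume at offset 2. A quick way to sanity-check that assumption is to group the flat list into rows before indexing. Here is a minimal, self-contained sketch; the sample values are made up:

    # Sketch of the slicing logic in crawl(), with made-up values.
    # Assumption (taken from the original code): every data row yields ten
    # Item_Price1 cells, where offset 1 is the close and offset 2 the volume.
    price = ["20,000", "20,100", "1,234,500", "-", "-", "-", "-", "-", "-", "-"]

    rows = [price[i:i + 10] for i in range(0, len(price), 10)]  # one chunk per table row
    for row in rows:
        close = row[1].replace(",", "")   # second cell: closing price
        volume = row[2].replace(",", "")  # third cell: traded volume
        print(close, volume)              # prints: 20100 1234500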