Search code examples
pythonweb-scrapingyahoo-financeyahoo-api

Scraping Yahoo Finance after recent change Oc 2019


import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

symbol = 'AAPL'

url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol



page = requests.get(url)

tree = html.fromstring(page.content)

tableHeaders = tree.xpath('//*[@class="D(tbhg)"]//span') 

Headers = []
for Header in tableHeaders:
    Headers.append(Header.text)

df = pd.DataFrame()

df = pd.DataFrame(columns=Headers,index=[1])


Xpath1 = "//span[contains(.,'"+item1+"')]/parent::div/parent::div/following-sibling::div"


item1 = 'Long Term Debt'
row1 = []
row1.append(item1)



rowvalues1 = tree.xpath(Xpath1) 

for value1 in rowvalues1:
    row1.append(value1.text)

Xpath1 = Xpath1+"/span"
Childvalues1 = tree.xpath(Xpath1) 
j=0
for i in range(len(row1)):
    if(row1[i]==None):
        row1[i] =Childvalues1[j].text
        j=j+1



df.loc[1] = row1



df=df.fillna(0)
df[df=='-'] ='0'

long_term_debt=float(str(df.iloc[0,4]).replace(',','')) 

When I run AAPL symbol I get an error:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-63-fe5e79eabd51> in <module>
     57 df[df=='-'] ='0'
     58 
---> 59 long_term_debt=float(str(df.iloc[0,4]).replace(',',''))
     60 
--
   2007         l = len(ax)
   2008         if key >= l or key < -l:
-> 2009             raise IndexError("single positional indexer is out-of-bounds")
   2010 
   2011     def _getitem_tuple(self, tup):

IndexError: single positional indexer is out-of-bounds

I can't run value extracted from df as in case of AAPL there is no year 2015 for the company. Yahoo finance closed that year for this company however for other companies there is year 2015.

What I can do to remove such error for this specific year and symbol. I tried using 'None' in function for long-term variable however it does not work. Any idea how to approach this case?

The formula tried to use :

def debt():
    if df.iloc[0,4]== None : return 0
    else: float(str(df.iloc[0,4]).replace(',',''))

However it does not work


Solution

  • You have to install YahooFinancials library to your python.

    import lxml
    from lxml import html
    import requests
    import numpy as np
    import pandas as pd
    
    symbol = 'ORCL'
    
    url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol
    
    
    
    page = requests.get(url)
    
    tree = html.fromstring(page.content)
    
    tableHeaders = tree.xpath('//*[@class="D(tbhg)"]//span') 
    
    Headers = []
    for Header in tableHeaders:
        Headers.append(Header.text)
    
    df = pd.DataFrame()
    #Adding columns
    df = pd.DataFrame(columns=Headers,index=[1])
    item5 = 'Inventory'
    row5 = []
    row5.append(item5)
    
    Xpath = "//span[contains(.,'"+item5+"')]/parent::div/parent::div/following-sibling::div"
    
    rowvalues5 = tree.xpath(Xpath) # identify all 4 high level nodes
    # This will store the high level node values, but store 'None' if value is not present.
    for value5 in rowvalues5:
        row5.append(value5.text)
     #if the value is None, we are going to fetch to next level node values using /span   
    Xpath = Xpath+"/span"
    Childvalues = tree.xpath(Xpath) #Fetch low level nodes
    j=0
    for i in range(len(row5)):
        if(row5[i]==None):
            row5[i] =Childvalues[j].text
            j=j+1
    
    
    df.loc[1] = row5
    
    print(df)