Search code examples
pythonyahoo-finance

How to change this formula for python? Newbie to coding, any help is appreciated


Hi, I found this code online. It currently retrieves the company name and website URL for each of the tickers listed below. Is there any way to change it so that it shows the sector and industry information instead of the URL and company name? I'm new to coding, so I would appreciate any help :)

Code Below:

import bs4 as BeautifulSoup 
from bs4 import SoupStrainer
import re
import urllib.request 
import pandas as pd
import requests

# Ticker symbols whose Yahoo Finance profile pages are scraped.
symbols = ['SBUX', 'MET', 'CAT', 'JNJ', 'ORCL']

# Sent with every request so it carries a browser-like User-agent.
headers = {'User-agent': 'Mozilla/5.0'}

# Maps ticker symbol -> {"company": <name>, "url": <external link>}.
mySymbols = {}

# Compiled once, outside the loop.
# rxTitle drops everything from the first "(" to the end of the page title.
rxTitle = re.compile(r'\(.*$')
# rxYahoo recognises links that point back at Yahoo itself.
rxYahoo = re.compile('yahoo', flags=re.IGNORECASE)

for s in symbols:
    url = "https://finance.yahoo.com/quote/{}/profile?p={}".format(s, s)
    webpage = requests.get(url, headers=headers)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup.BeautifulSoup(webpage.content, "html.parser")

    title = soup.find("title")
    if title is None:
        # Without a <title> there is no company name to record.
        continue
    coName = rxTitle.sub("", title.get_text())

    # This scan has to live inside the per-symbol loop; at top level it
    # would only ever look at the page of the last symbol.
    for link in soup.find_all('a', href=True):
        # Keep only anchors with a non-empty target and an empty title.
        # .get() returns None for a missing attribute, so such anchors are
        # skipped without needing a catch-all except clause.
        if not link.get('target') or link.get('title') != "":
            continue
        # Links back to Yahoo are not the company's own site.
        if rxYahoo.search(link['href']):
            continue

        # Only the link's URL is recorded; the linked site is not downloaded
        # because nothing from it is used.
        vals = {"company": coName, "url": link['href']}
        print(s, vals)
        mySymbols[s] = vals

Solution

  • Looking at one of those pages I see the Sector is in a span with 'class'='Fw(600)' and 'data-reactid'=21 and the industry with data-reactid=25, so you could use

    # Both values sit in bold spans that differ only in their data-reactid.
    bold_class = 'Fw(600)'
    sector = soup.find('span', attrs={'class': bold_class, 'data-reactid': '21'})
    industry = soup.find('span', attrs={'class': bold_class, 'data-reactid': '25'})
    # .next is the first node inside each span.
    print(sector.next)
    print(industry.next)
    

    Using sector.next returns the content inside the span, rather than the whole span element.

    A better approach that looks for the 'Sector' and 'Industry' spans and returns the subsequent span is fully coded below:

    import bs4 as BeautifulSoup
    import requests
    
    def get_tags(url):
        """Fetch a page and extract its title, sector and industry.

        The page is requested with the module-level ``headers`` dict. A span
        whose text is exactly 'Sector' or 'Industry' is treated as a label,
        and the text of the span that follows it in document order is taken
        as the value.

        Args:
            url: Address of the page to download.

        Returns:
            A dict that always has a 'title' key (an empty string when the
            page has no <title>) and has 'Sector' / 'Industry' keys only when
            the matching label span was found with a span after it.
        """
        webpage = requests.get(url, headers=headers)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup.BeautifulSoup(webpage.content, "html.parser")

        title = soup.find("title")
        results = {'title': title.get_text() if title is not None else ''}

        spans = soup.find_all('span')
        # Pair every span with its successor. A label that is the very last
        # span has no successor and is skipped rather than indexing past the
        # end of the list. If a label occurs more than once, the last wins.
        for label, value in zip(spans, spans[1:]):
            if label.text in ('Sector', 'Industry'):
                results[label.text] = value.text
        return results
    
    # Read by get_tags() when it issues each request.
    headers = {'User-agent': 'Mozilla/5.0'}
    symbols = ['SBUX', 'MET', 'CAT', 'JNJ', 'ORCL']
    # The ticker appears twice in the URL: in the path and in the query string.
    profile_url = "https://finance.yahoo.com/quote/{}/profile?p={}"
    for ticker in symbols:
        print(get_tags(profile_url.format(ticker, ticker)))