Search code examples
pythonweb-scrapingurllibfinancegoogle-finance

Python Efficient Web Scraping?


I'm fairly new to Python and am trying to make a web parser for a stock app. I'm essentially using urllib to open the desired webpage for each stock in the argument list and reading the full contents of the html code for that page. Then I'm slicing that in order to find the quote I'm looking for. The method I've implemented works, but I'm doubtful that this is the most efficient means of achieving this result. I've spent some time looking into other potential methods for reading files more rapidly, but none seem to pertain to web scraping. Here's my code:

from urllib.request import urlopen

def getQuotes(stocks):
    """Fetch the current quote for each ticker symbol from Google Finance.

    Args:
        stocks: Iterable of ticker symbols, e.g. ``['FB', 'GOOG']``.

    Returns:
        dict mapping each symbol to its quote as a float.

    Raises:
        IndexError: if the expected price markup is not found in a page.
        ValueError: if the extracted text is not a number.
        urllib.error.URLError: if a page cannot be fetched.
    """
    # Markers surrounding the price element; bytes because read() returns
    # bytes. Built once instead of re-encoding on every iteration.
    price_open = b'<span class="pr">\n<span id='
    span_close = b'</span>'

    quoteList = {}
    for stock in stocks:
        url = "https://finance.google.com/finance?q={}".format(stock)
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            webpageData = response.read()
        scrape1 = webpageData.split(price_open)[1].split(span_close)[0]
        scrape2 = scrape1.split(b'>')[1]
        # Drop thousands separators ("1,234.56") that float() would reject.
        quote = scrape2.decode().strip().replace(',', '')
        quoteList[stock] = float(quote)
    return quoteList

# Example run: issues one live HTTP request per symbol and prints the resulting dict.
print(getQuotes(['FB', 'GOOG', 'TSLA']))

Thank you all so much in advance!


Solution

  • I'm essentially using urllib to open the desired webpage for each stock in the argument list and reading the full contents of the html code for that page. Then I'm slicing that in order to find the quote I'm looking for.

    Here's that implementation in Beautiful Soup and requests:

    import requests
    from bs4 import BeautifulSoup
    
    def get_quotes(*stocks):
        """Return a dict mapping each ticker symbol to its Google Finance quote.

        Pages are fetched one after another with ``requests`` and parsed with
        BeautifulSoup.

        Args:
            *stocks: Ticker symbols, passed as separate positional arguments.

        Returns:
            dict of symbol -> quote (float).

        Raises:
            AttributeError: if a page has no ``<span class="pr">`` element.
            ValueError: if the element's text is not a number.
        """
        quotelist = {}
        base = 'https://finance.google.com/finance?q={}'
        for stock in stocks:
            url = base.format(stock)
            soup = BeautifulSoup(requests.get(url).text, 'html.parser')
            price_text = soup.find('span', attrs={'class': 'pr'}).get_text()
            # Strip thousands separators ("1,234.56") so float() accepts them.
            quotelist[stock] = float(price_text.strip().replace(',', ''))
        return quotelist
    
    print(get_quotes('AAPL', 'GE', 'C'))
    # Output: {'AAPL': 160.86, 'GE': 23.91, 'C': 68.79}
    # 1 loop, best of 3: 1.31 s per loop
    

    As mentioned in the comments, you may want to look into multithreading or grequests.

    Using grequests to make asynchronous HTTP requests (this needs `import grequests` in addition to the imports above):

    def get_quotes(*stocks):
        """Return a dict of symbol -> quote, fetching all pages via grequests.

        Args:
            *stocks: Ticker symbols, passed as separate positional arguments.

        Returns:
            dict of symbol -> quote (float).

        Raises:
            AttributeError: if a response is missing or its page has no
                ``<span class="pr">`` element.
            ValueError: if the element's text is not a number.
        """
        quotelist = {}
        base = 'https://finance.google.com/finance?q={}'
        # One unsent request per symbol; grequests.map() sends the batch.
        pending = (grequests.get(base.format(stock)) for stock in stocks)
        responses = grequests.map(pending)
        # Responses are paired with symbols by position.
        # NOTE(review): grequests.map() appears to yield None for a request
        # that failed, which would raise AttributeError on .text - confirm
        # and decide how failures should be reported.
        for r, stock in zip(responses, stocks):
            soup = BeautifulSoup(r.text, 'html.parser')
            price_text = soup.find('span', attrs={'class': 'pr'}).get_text()
            # Strip thousands separators ("1,234.56") so float() accepts them.
            quotelist[stock] = float(price_text.strip().replace(',', ''))
        return quotelist
    
    %%timeit 
    get_quotes('AAPL', 'BAC', 'MMM', 'ATVI',
               'PPG', 'MS', 'GOOGL', 'RRC')
    1 loop, best of 3: 2.81 s per loop
    

    Update: here's a modified version from Dusty Phillips' Python 3 Object-oriented Programming that uses the built-in threading module.

    from threading import Thread
    
    from bs4 import BeautifulSoup
    import numpy as np
    import requests
    
    
    class QuoteGetter(Thread):
        """Thread that fetches one ticker's quote from Google Finance.

        Start and join the thread, then read the result from ``self.quote``.
        The value is NaN whenever no quote could be obtained.
        """

        def __init__(self, ticker):
            """Remember the ticker symbol to look up when the thread runs."""
            super().__init__()
            self.ticker = ticker
            # Define the result up front so reading ``quote`` never raises
            # AttributeError, even if run() dies early (e.g. network error).
            self.quote = np.nan

        def run(self):
            """Download the quote page and store the parsed price."""
            base = 'https://finance.google.com/finance?q={}'
            response = requests.get(base.format(self.ticker))
            soup = BeautifulSoup(response.text, 'html.parser')
            try:
                self.quote = float(soup.find('span', attrs={'class': 'pr'})
                                   .get_text()
                                   .strip()
                                   .replace(',', ''))
            except (AttributeError, ValueError):
                # AttributeError: no price span (find() returned None).
                # ValueError: the span's text is not a number.
                self.quote = np.nan
    
    
    def get_quotes(tickers):
        """Fetch quotes for all tickers, using one QuoteGetter thread each.

        Args:
            tickers: Iterable of ticker symbols. It is consumed only once,
                so generators and other one-shot iterables are supported.

        Returns:
            dict mapping each ticker to the ``quote`` of its thread.
        """
        threads = [QuoteGetter(t) for t in tickers]
        # Start every thread before joining any, so the downloads overlap.
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        # Read the symbol back from each thread instead of re-iterating
        # ``tickers``, which would already be exhausted if it is an iterator.
        return {thread.ticker: thread.quote for thread in threads}
    
    # Sample of 30 ticker symbols used for the timing run below.
    tickers = [
        'A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACN', 'ADBE', 'ADI', 
        'ADM',  'ADP', 'ADS', 'ADSK', 'AEE', 'AEP', 'AES', 'AET', 'AFL', 'AGN', 
        'AIG', 'AIV', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE',
        ]
    
    %time get_quotes(tickers)
    # Wall time: 1.53 s