Search code examples
pythonpandastradingpandas-datareader

Why is python only printing one data set in the algorithm?


I am trying to build trading software, and I am using code from an online YouTuber. I gather the data for all of the companies in the S&P 500 in the get_data_from_yahoo() function. When I run that code it says "Already have" (followed by the given ticker), which is fine, but when I go to print the data in the following function, complied_data(), it only prints one ticker, which is ZTS. Does anyone have any ideas?

import bs4 as bs
import datetime as dt
import os
import pandas as pd
from pandas_datareader import data as pdr    
import pickle
import requests
import fix_yahoo_finance as yf


def save_sp500_tickers():
    """Scrape the S&P 500 ticker symbols from Wikipedia and cache them.

    Symbols are taken from the first cell of every body row of the page's
    ``wikitable sortable`` table, with ``.`` replaced by ``-``. The list is
    pickled to ``sp500tickers.pickle`` in the current working directory,
    printed, and returned.

    Returns:
        list[str]: ticker symbols in the order they appear in the table.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if the expected table is not present in the page.
    """
    resp = requests.get(
        'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        timeout=30,  # do not hang forever on a stalled connection
    )
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError('S&P 500 constituents table not found in page')

    tickers = []
    for row in table.find_all('tr')[1:]:  # [1:] skips the header row
        cells = row.find_all('td')
        if not cells:
            # Rows made only of <th> cells carry no ticker.
            continue
        # strip() removes the trailing newline (and any other padding)
        # without chopping a real character when no newline is present.
        ticker = cells[0].text.strip().replace('.', '-')
        tickers.append(ticker)

    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)

    print(tickers)

    return tickers


# Fetch the ticker list once so that sp500tickers.pickle exists on disk.
save_sp500_tickers()

def get_data_from_yahoo(reload_sp500=False):
    """Download daily price data for every cached ticker into ``stock_dfs/``.

    Each ticker is printed as it is processed. A ticker whose CSV file
    already exists is skipped with an "Already have ..." message; otherwise
    its data from 2019-06-08 up to now is fetched through
    ``pdr.get_data_yahoo`` and written to ``stock_dfs/<ticker>.csv``.

    Args:
        reload_sp500: when true, re-scrape the ticker list with
            ``save_sp500_tickers()``; otherwise load it from
            ``sp500tickers.pickle``.
    """
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('stock_dfs', exist_ok=True)

    start = dt.datetime(2019, 6, 8)
    end = dt.datetime.now()

    for ticker in tickers:
        print(ticker)
        csv_path = 'stock_dfs/{}.csv'.format(ticker)
        if not os.path.exists(csv_path):
            df = pdr.get_data_yahoo(ticker, start, end)
            # Ensure "Date" is the index regardless of how the reader
            # returned it, so it is written as the first CSV column.
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df.to_csv(csv_path)
        else:
            print('Already have {}'.format(ticker))


# Refresh the cached ticker list, then download a CSV for every ticker
# that does not already have one in stock_dfs/.
save_sp500_tickers()
get_data_from_yahoo()

def complied_data():
    """Combine per-ticker adjusted closes into ``sp500_joined_closes.csv``.

    Reads the ticker list from ``sp500tickers.pickle`` and each ticker's
    CSV from ``stock_dfs/``, renames the ``Adj Close`` column to the ticker
    symbol, and drops the other price/volume columns. The head of the
    resulting frame is printed and the frame is written to
    ``sp500_joined_closes.csv``.

    NOTE: as written, the merge step sits outside the ``for`` loop, so only
    the last ticker's frame reaches ``main_df``. This is the behaviour the
    question is about; see the Solution below for the corrected indentation.
    """
    with open("sp500tickers.pickle","rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)

        df.rename(columns = {'Adj Close':ticker}, inplace=True)
        # axis must be passed by keyword: pandas 2.x rejects it positionally.
        df.drop(['Open', 'High', 'Low','Close','Volume'], axis=1, inplace=True)

    # NOTE: these two blocks run once, after the loop has finished, so they
    # only ever see the final ``df`` and ``count``.
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df, how='outer')

    if count % 10 == 0:
        print(count)

    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')

# Build the combined frame, print its head, and write sp500_joined_closes.csv.
complied_data()

When I run this code this is what it says:

MMM
Already have MMM
ABT
Already have ABT
ABBV
Already have ABBV
ABMD
Already have ABMD
ACN
Already have ACN
ATVI
Already have ATVI
ADBE
Already have ADBE
AMD
Already have AMD
AAP
Already have AAP
AES
Already have AES
AMG
Already have AMG
AFL
Already have AFL
A
Already have A
APD
Already have APD
AKAM
Already have AKAM
ALK
Already have ALK
ALB
Already have ALB

It then continues to say that it already has all of the 500 companies (I did not show the whole thing because the list is very long). But when I run the complied_data() function, it only prints the data for one ticker:

ZTS
Date                 
2019-01-02  83.945038
2019-01-03  81.043526
2019-01-04  84.223267
2019-01-07  84.730026
2019-01-08  85.991997

Solution

  • The problem is in a for loop, specifically the one in complied_data.

    The if-else and if blocks should be included in the for loop:

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)

        df.rename(columns = {'Adj Close':ticker}, inplace=True)
        # Pass axis by keyword: pandas 2.x no longer accepts it positionally.
        df.drop(['Open', 'High', 'Low','Close','Volume'], axis=1, inplace=True)

        # Merge inside the loop so every ticker's column is added to main_df.
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')

        # Progress indicator: print every tenth ticker index.
        if count % 10 == 0:
            print(count)
    

    Otherwise they are evaluated only once, after the loop has finished, and therefore process only the last element.

    The following is the output when changing to the above indentation:

    (... omitted counting from 0)
    470
    480
    490
    500
                       MMM        ABT       ABBV        ABMD  ...         YUM         ZBH       ZION         ZTS
    Date                                                      ...
    2019-06-10  165.332672  80.643486  74.704918  272.429993  ...  107.794380  121.242027  43.187107  109.920105
    2019-06-11  165.941788  80.494644  75.889320  262.029999  ...  106.722885  120.016762  43.758469  109.860268
    2019-06-12  166.040024  81.318237  76.277657  254.539993  ...  108.082100  120.225945  43.512192  111.136780
    2019-06-13  165.882843  81.655624  76.646561  255.529999  ...  108.121788  119.329407  44.063854  109.730621
    2019-06-14  163.760803  81.586166  76.394157  250.960007  ...  108.925407  116.998398  44.211620  110.488556
    
    [5 rows x 505 columns]