Search code examples
pythontime-seriesplotlypycaret

Weird Time-Series Graph Using Pycaret and plotly


I am trying to visualize Air Quality Data as time-series charts using pycaret and plotly dash python libraries , but i am getting very weird graphs, below is my code:

import pandas as pd
import plotly.express as px
data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
#data.set_index('Date', inplace=True)


# combine store and item column as time_series
data['OBJECTID'] = ['Location_' + str(i) for i in data['OBJECTID']]
#data['AQI_Bins_AI'] = ['Bin_' + str(i) for i in data['AQI_Bins_AI']]
data['time_series'] = data[['OBJECTID']].apply(lambda x: '_'.join(x), axis=1)
data.drop(['OBJECTID'], axis=1, inplace=True)
# extract features from date
data['month'] = [i.month for i in data['Date']]
data['year'] = [i.year for i in data['Date']]
data['day_of_week'] = [i.dayofweek for i in data['Date']]
data['day_of_year'] = [i.dayofyear for i in data['Date']]
data.head(4000)

data['time_series'].nunique()


for i in data['time_series'].unique():
    subset = data[data['time_series'] == i]
    subset['moving_average'] = subset['CO'].rolling(window = 30).mean()
    fig = px.line(subset, x="Date", y=["CO","moving_average"], title = i, template = 'plotly_dark')
    fig.show()

here is my weird output graph

require needful help in this regard,

here is my sample data Google Drive Link


Solution

    • data has not been provided in a usable way. Sought out publicly available similar data. found: https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
    • using this data, with a couple of cleanups of your code, no issues with plots. I suspect your data has one of these issues
      1. date is not datetime64[ns] in your data frame
      2. date is not sorted, leading to lines being drawn in way you have noted
    • by refactoring way moving average is calculated, you can use animation instead of lots of separate figures

    get some data

    import kaggle.cli
    import sys, math
    import pandas as pd
    from pathlib import Path
    from zipfile import ZipFile
    import plotly.express as px
    
    # download data set
    # https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
    sys.argv = [
        sys.argv[0]
    ] + "datasets download rohanrao/air-quality-data-in-india".split(
        " "
    )
    kaggle.cli.main()
    
    zfile = ZipFile("air-quality-data-in-india.zip")
    print([f.filename for f in zfile.infolist()])
    
    

    plot using code from question

    import pandas as pd
    import plotly.express as px
    from pathlib import Path
    from distutils.version import StrictVersion
    
    # data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
    # use kaggle data
    # dfs = {f.filename:pd.read_csv(zfile.open(f)) for f in zfile.infolist() if f.filename in ['station_day.csv',"stations.csv"]}
    # data = pd.merge(dfs['station_day.csv'],dfs["stations.csv"], on="StationId")
    
    # data['Date'] = pd.to_datetime(data['Date'])
    # # kaggle data is different from question, make it compatible with questions data
    # data = data.assign(OBJECTID=lambda d: d["StationId"])
    
    # sample data from google drive link
    data2 = pd.read_csv(Path.home().joinpath("Downloads").joinpath("AQI.csv"))
    data2["Date"] = pd.to_datetime(data2["Date"])
    
    data = data2
    # as per very first commment - it's important data is ordered !
    data = data.sort_values(["Date","OBJECTID"])
    data['time_series'] = "Location_" + data["OBJECTID"].astype(str)
    # clean up data, remove rows where there is no CO value
    data = data.dropna(subset=["CO"])
    # can do moving average in one step (can also be used by animation)
    if StrictVersion(pd.__version__) < StrictVersion("1.3.0"):
        data["moving_average"] = data.groupby("time_series",as_index=False)["CO"].rolling(window=30).mean().to_frame()["CO"].values
    else:
        data["moving_average"] = data.groupby("time_series",as_index=False)["CO"].rolling(window=30).mean()["CO"]
    
    # just first two for purpose of demonstration
    for i in data['time_series'].unique()[0:3]:
        subset = data.loc[data['time_series'] == i]
        fig = px.line(subset, x="Date", y=["CO","moving_average"], title = i, template = 'plotly_dark')
        fig.show()
    

    can use animation

    px.line(
        data,
        x="Date",
        y=["CO", "moving_average"],
        animation_frame="time_series",
        template="plotly_dark",
    ).update_layout(yaxis={"range":[data["CO"].min(), data["CO"].quantile(.97)]})
    

    enter image description here