I am trying to visualize Air Quality Data as time-series charts using pycaret and plotly dash python libraries , but i am getting very weird graphs, below is my code:
import pandas as pd
import plotly.express as px
data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
#data.set_index('Date', inplace=True)
# combine store and item column as time_series
data['OBJECTID'] = ['Location_' + str(i) for i in data['OBJECTID']]
#data['AQI_Bins_AI'] = ['Bin_' + str(i) for i in data['AQI_Bins_AI']]
data['time_series'] = data[['OBJECTID']].apply(lambda x: '_'.join(x), axis=1)
data.drop(['OBJECTID'], axis=1, inplace=True)
# extract features from date
data['month'] = [i.month for i in data['Date']]
data['year'] = [i.year for i in data['Date']]
data['day_of_week'] = [i.dayofweek for i in data['Date']]
data['day_of_year'] = [i.dayofyear for i in data['Date']]
data.head(4000)
data['time_series'].nunique()
for i in data['time_series'].unique():
subset = data[data['time_series'] == i]
subset['moving_average'] = subset['CO'].rolling(window = 30).mean()
fig = px.line(subset, x="Date", y=["CO","moving_average"], title = i, template = 'plotly_dark')
fig.show()
require needful help in this regard,
here is my sample data Google Drive Link
datetime64[ns]
in your data frameimport kaggle.cli
import sys, math
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
import plotly.express as px
# download data set
# https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
sys.argv = [
sys.argv[0]
] + "datasets download rohanrao/air-quality-data-in-india".split(
" "
)
kaggle.cli.main()
zfile = ZipFile("air-quality-data-in-india.zip")
print([f.filename for f in zfile.infolist()])
import pandas as pd
import plotly.express as px
from pathlib import Path
from distutils.version import StrictVersion
# data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
# use kaggle data
# dfs = {f.filename:pd.read_csv(zfile.open(f)) for f in zfile.infolist() if f.filename in ['station_day.csv',"stations.csv"]}
# data = pd.merge(dfs['station_day.csv'],dfs["stations.csv"], on="StationId")
# data['Date'] = pd.to_datetime(data['Date'])
# # kaggle data is different from question, make it compatible with questions data
# data = data.assign(OBJECTID=lambda d: d["StationId"])
# sample data from google drive link
data2 = pd.read_csv(Path.home().joinpath("Downloads").joinpath("AQI.csv"))
data2["Date"] = pd.to_datetime(data2["Date"])
data = data2
# as per very first commment - it's important data is ordered !
data = data.sort_values(["Date","OBJECTID"])
data['time_series'] = "Location_" + data["OBJECTID"].astype(str)
# clean up data, remove rows where there is no CO value
data = data.dropna(subset=["CO"])
# can do moving average in one step (can also be used by animation)
if StrictVersion(pd.__version__) < StrictVersion("1.3.0"):
data["moving_average"] = data.groupby("time_series",as_index=False)["CO"].rolling(window=30).mean().to_frame()["CO"].values
else:
data["moving_average"] = data.groupby("time_series",as_index=False)["CO"].rolling(window=30).mean()["CO"]
# just first two for purpose of demonstration
for i in data['time_series'].unique()[0:3]:
subset = data.loc[data['time_series'] == i]
fig = px.line(subset, x="Date", y=["CO","moving_average"], title = i, template = 'plotly_dark')
fig.show()
px.line(
data,
x="Date",
y=["CO", "moving_average"],
animation_frame="time_series",
template="plotly_dark",
).update_layout(yaxis={"range":[data["CO"].min(), data["CO"].quantile(.97)]})