Let's say I have the following time series data dataframe:
import numpy as np
import pandas as pd
import random
np.random.seed(2019)
# Generate TS
#rng = pd.date_range('2019-01-01', freq='MS', periods=N)
ts = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H') #.strftime('%d-%m-%Y %H:%M:%S') # freq='MS'set the frequency of date in months and start from day 1. You can use 'T' for minutes and so on
# number of samples
N = len(ts)
# Create a random dataset
data = {
#"TS": ts,
'Appx': [random.choice(['App1', 'App2', 'App3', None]) for _ in range(N)], # generate categorical data including missing data "None"
'VM': [random.choice(['VM1' , 'VM2' ]) for _ in range(N)]
}
df = pd.DataFrame(data, index=ts)
#df.resample('M').mean().plot()
df
# Appx VM
#2000-01-01 00:00:00 App1 VM2
#2000-01-01 01:00:00 None VM1
#2000-01-01 02:00:00 None VM2
#2000-01-01 03:00:00 App3 VM2
#2000-01-01 04:00:00 App1 VM1
#... ... ...
#2000-12-31 19:00:00 App2 VM1
#2000-12-31 20:00:00 App3 VM1
#2000-12-31 21:00:00 App3 VM1
#2000-12-31 22:00:00 App1 VM1
#2000-12-31 23:00:00 App1 VM1
# 8784 rows × 2 columns
After checking other available resources:
Problem: plotting count records of categorical variables including their missing values (None
or NaN
) within pandas dataframe
My tries: I tried to plot count records using the following scripts unsuccessfully: At first, I used a simple example close to what I can depict both data and missing values inspired here by plotting App column records within missing values (dashed line) over time for each desired VM:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
day = ([ 1 , 2 , 3, 4, 5 , 6 , 7 , 8 , 9])
App1 = ([0.6 , 0.8 , np.nan, np.nan, 4 , 6 , 6.5 ,7 , 8])
App2 = ([ 1 , 2 , np.nan, np.nan, 0.5 , 7 , 8 , 9 , 10])
App3 = ([ 1.5 , 2.5 , np.nan, np.nan, 3 , 4 , 6 , 8 , 11])
cf = pd.DataFrame({'App1': App1, 'App2': App2, 'App3': App3}, index = day)
cf.index.name = 'day'
fig, ax = plt.subplots()
line, = ax.plot(cf['App1'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App1'], color='k', lw=1.5, marker = 'v', label='App1',)
line, = ax.plot(cf['App2'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App2'], color='k', lw=1.5, marker = 's', label='App2')
line, = ax.plot(cf['App3'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App3'], color='k', lw=1.5, marker = 'o', label='App3')
plt.xlabel('Time Stamp')
plt.ylabel('Record counts')
plt.title('Apps within missing values for VM1')
plt.legend()
plt.show()
My outputs so far:
But I get error when I apply it to my generated time-series data based on this answer:
import matplotlib.pyplot as plt
df['Appx'].fillna(value=np.nan, inplace=True)
df['Appx'].astype('category') # or str for string
#df = df.astype(int)
# Filter the DataFrame by a list of string values in the "App1" column
filtered_df = df[ df["Appx"].isin([np.nan])]
filtered_dff = df[~df["Appx"].isin([np.nan])]
cf = pd.DataFrame({'Appx': filtered_dff["Appx"]}, index = df.index)
#cf.index.name = df.index #'TS'
fig, ax = plt.subplots()
line, = ax.plot(cf['Appx'].fillna(method='ffill'), ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['Appx'], color=line.get_color(), lw=1.5, marker = 'o')
ax.tick_params(axis='x', labelrotation=45)
plt.xlabel('TS')
plt.ylabel('mm')
plt.legend('best')
plt.show()
TypeError: 'value' must be an instance of str or bytes, not a float
Even I digged further using groupby()
:
# reset_index() gives a column for counting, after groupby uses year and category
ctdf = (df.reset_index()
.groupby(['Appx','VM'], as_index=False)
.count()
# rename isn't strictly necessary here, it's just for readability
.rename(columns={'index':'ct'})
)
ctdf
# Appx VM ct
#0 App1 VM1 1127
#1 App1 VM2 1084
#2 App2 VM1 1066
#3 App2 VM2 1098
#4 App3 VM1 1084
#5 App3 VM2 1049
df['Appx'].fillna(value=np.nan, inplace=True)
df['Appx'].astype('category') # or str for string
#df = df.astype(int)
# Filter the DataFrame by a list of string values in the "App1" column
filtered_df = df[ df["Appx"].isin([np.nan])]
#filtered_dff = df[~df["Appx"].isin([np.nan])]
# reset_index() gives a column for counting, after groupby uses year and category
ctdff = (filtered_df
#.isna()
.reset_index()
.groupby(['VM'], as_index=False)
.count()
# rename isn't strictly necessary here, it's just for readability
.rename(columns={'index':'ct'})
)
ctdff
# VM ct Appx
#0 VM1 1153 0
#1 VM2 1123 0
Similar to this answer I might be interested such plot so called cat_horizontal_plot
:
Note: I'm not interested in removing or imputing solutions as much as possible:
In those corner cases I can't show missing values:
import seaborn as sns
import matplotlib.pyplot as plt
sns.lineplot(data = df, x = df.index, y = 'Appx', hue = 'Appx', marker='o', alpha=0.2)
plt.legend(bbox_to_anchor=[0.5, 1.02], loc='lower center')
plt.xticks(rotation=45)
plt.show()
grouped = df.groupby(['VM','Appx'])
for key, group in grouped:
data = group.groupby(lambda x: x.hour).count()
data['Appx'].plot(label=key , legend=True)
None
or NaN
, then convert them to a string.# convert the None or NaN to a string
df.Appx = df.Appx.fillna('None')
# make the column categorical
df.Appx = pd.Categorical(values=df.Appx, categories=sorted(df.Appx.unique()), ordered=True)
# separate the dataframe by uniqe VM, and aggreate
vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='M'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
# plot each VM separately
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
for ax, key in zip(axes, vm.keys()):
data = vm[key]['VM']
data.plot(kind='bar', ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0)
ax.set_xticks(ticks=range(len(data.index)), labels=data.index.strftime('%Y-%m'))
vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='M'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
for ax, key in zip(axes, vm.keys()):
data = vm[key]['VM']
percent = data.div(data.sum(axis=1), axis=0).mul(100).round(1)
percent.plot(kind='bar', stacked=True, ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0)
ax.set_xticks(ticks=range(len(data.index)), labels=data.index.strftime('%Y-%m'))
# separate the dataframe by uniqe VM, and aggreate
vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='d'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
# plot each VM separately
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
for ax, key in zip(axes, vm.keys()):
data = vm[key]['VM']
data.plot(ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0, linestyle='none', marker='.')
marker
or linestyle
.vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='d'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
for ax, key in zip(axes, vm.keys()):
data = vm[key]['VM']
cols = data.columns
linestyles = ['-', '-', '-', ':']
markers = ['d', 'v', 's', '.']
for col, ls, marker in zip(cols, linestyles, markers):
data[col].plot(ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0, linestyle=ls, marker=marker)
ax.legend()