Search code examples
pythondataframematplotlibtime-seriesmissing-data

Visualizaion of count records of categorical variables including their missing values (`None` or `NaN`)


Let's say I have the following time series data dataframe:

import numpy as np
import pandas as pd
import random
np.random.seed(2019)

# Generate TS
#rng = pd.date_range('2019-01-01', freq='MS', periods=N)
ts = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H') #.strftime('%d-%m-%Y %H:%M:%S')    # freq='MS'set the frequency of date in months and start from day 1. You can use 'T' for minutes and so on

# number of samples
N = len(ts)

# Create a random dataset
data = {
    #"TS": ts,
    'Appx':  [random.choice(['App1', 'App2', 'App3', None]) for _ in range(N)], # generate categorical data including missing data "None"
    'VM':    [random.choice(['VM1' , 'VM2' ]) for _ in range(N)]
}

df = pd.DataFrame(data, index=ts)
#df.resample('M').mean().plot()
df
#                       Appx    VM
#2000-01-01 00:00:00    App1    VM2
#2000-01-01 01:00:00    None    VM1
#2000-01-01 02:00:00    None    VM2
#2000-01-01 03:00:00    App3    VM2
#2000-01-01 04:00:00    App1    VM1
#...    ... ...
#2000-12-31 19:00:00    App2    VM1
#2000-12-31 20:00:00    App3    VM1
#2000-12-31 21:00:00    App3    VM1
#2000-12-31 22:00:00    App1    VM1
#2000-12-31 23:00:00    App1    VM1
# 8784 rows × 2 columns

After checking other available resources:

Problem: plotting count records of categorical variables including their missing values (None or NaN) within dataframe

My tries: I tried to plot count records using the following scripts unsuccessfully: At first, I used a simple example close to what I can depict both data and missing values inspired here by plotting App column records within missing values (dashed line) over time for each desired VM:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

day = ([ 1 , 2 , 3, 4, 5 , 6 , 7 , 8 , 9])
App1 = ([0.6 , 0.8 , np.nan, np.nan, 4 , 6 , 6.5 ,7 , 8])
App2 = ([ 1 , 2 , np.nan, np.nan, 0.5 , 7 , 8 , 9 , 10])
App3 = ([ 1.5 , 2.5 , np.nan, np.nan, 3 , 4 , 6 , 8 , 11])

cf = pd.DataFrame({'App1': App1, 'App2': App2, 'App3': App3}, index = day)
cf.index.name = 'day'

fig, ax = plt.subplots()
line, = ax.plot(cf['App1'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App1'], color='k', lw=1.5, marker = 'v', label='App1',)
line, = ax.plot(cf['App2'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App2'], color='k', lw=1.5, marker = 's', label='App2')
line, = ax.plot(cf['App3'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App3'], color='k', lw=1.5, marker = 'o', label='App3')

plt.xlabel('Time Stamp')
plt.ylabel('Record counts')
plt.title('Apps within missing values for VM1')
plt.legend()
plt.show()

My outputs so far:

img


But I get error when I apply it to my generated time-series data based on this answer:

import matplotlib.pyplot as plt

df['Appx'].fillna(value=np.nan, inplace=True)
df['Appx'].astype('category') # or str for string
#df = df.astype(int)


# Filter the DataFrame by a list of string values in the "App1" column
filtered_df  = df[ df["Appx"].isin([np.nan])]
filtered_dff = df[~df["Appx"].isin([np.nan])]


cf = pd.DataFrame({'Appx': filtered_dff["Appx"]}, index = df.index)
#cf.index.name = df.index #'TS'

fig, ax = plt.subplots()
line, = ax.plot(cf['Appx'].fillna(method='ffill'), ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['Appx'], color=line.get_color(), lw=1.5, marker = 'o')
ax.tick_params(axis='x', labelrotation=45)
plt.xlabel('TS')
plt.ylabel('mm')
plt.legend('best')
plt.show()

TypeError: 'value' must be an instance of str or bytes, not a float

Even I digged further using groupby():

# reset_index() gives a column for counting, after groupby uses year and category
ctdf = (df.reset_index()
          .groupby(['Appx','VM'], as_index=False)
          .count()
          # rename isn't strictly necessary here, it's just for readability
          .rename(columns={'index':'ct'})
       )
ctdf

#    Appx    VM  ct
#0  App1    VM1 1127
#1  App1    VM2 1084
#2  App2    VM1 1066
#3  App2    VM2 1098
#4  App3    VM1 1084
#5  App3    VM2 1049

df['Appx'].fillna(value=np.nan, inplace=True)
df['Appx'].astype('category') # or str for string
#df = df.astype(int)


# Filter the DataFrame by a list of string values in the "App1" column
filtered_df  = df[ df["Appx"].isin([np.nan])]
#filtered_dff = df[~df["Appx"].isin([np.nan])]

# reset_index() gives a column for counting, after groupby uses year and category
ctdff = (filtered_df
          #.isna()
          .reset_index()
          .groupby(['VM'], as_index=False)
          .count()
          # rename isn't strictly necessary here, it's just for readability
          .rename(columns={'index':'ct'})
       )
ctdff

#   VM   ct   Appx
#0  VM1 1153    0
#1  VM2 1123    0

Similar to this answer I might be interested such plot so called cat_horizontal_plot: img

Note: I'm not interested in removing or imputing solutions as much as possible:


In those corner cases I can't show missing values:

import seaborn as sns
import matplotlib.pyplot as plt
sns.lineplot(data = df, x = df.index, y = 'Appx', hue = 'Appx', marker='o', alpha=0.2)
plt.legend(bbox_to_anchor=[0.5, 1.02], loc='lower center')
plt.xticks(rotation=45)
plt.show()

img

grouped = df.groupby(['VM','Appx'])

for key, group in grouped:
    data = group.groupby(lambda x: x.hour).count()
    data['Appx'].plot(label=key , legend=True)

img


Solution

    • If the point is to always include the counts for None or NaN, then convert them to a string.
    # convert the None or NaN to a string
    df.Appx = df.Appx.fillna('None')
    
    # make the column categorical
    df.Appx = pd.Categorical(values=df.Appx, categories=sorted(df.Appx.unique()), ordered=True)
    

    Bar Chart for Monthly Count Aggregation

    # separate the dataframe by uniqe VM, and aggreate
    vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='M'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
    
    # plot each VM separately
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
    
    for ax, key in zip(axes, vm.keys()):
    
        data = vm[key]['VM']
    
        data.plot(kind='bar', ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0)
    
        ax.set_xticks(ticks=range(len(data.index)), labels=data.index.strftime('%Y-%m'))
    

    enter image description here

    Proportional Stack Bars

    vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='M'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
    
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
    
    for ax, key in zip(axes, vm.keys()):
    
        data = vm[key]['VM']
    
        percent = data.div(data.sum(axis=1), axis=0).mul(100).round(1)
    
        percent.plot(kind='bar', stacked=True, ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0)
    
        ax.set_xticks(ticks=range(len(data.index)), labels=data.index.strftime('%Y-%m'))
    

    enter image description here

    Scatter Points for Daily Count Aggregation

    # separate the dataframe by uniqe VM, and aggreate
    vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='d'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
    
    # plot each VM separately
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
    
    for ax, key in zip(axes, vm.keys()):
    
        data = vm[key]['VM']
    
        data.plot(ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0, linestyle='none', marker='.')
    

    enter image description here

    • Iterate through each column to add a different marker or linestyle.
    vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='d'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}
    
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)
    
    for ax, key in zip(axes, vm.keys()):
    
        data = vm[key]['VM']
    
        cols = data.columns
    
        linestyles = ['-', '-', '-', ':']
        markers = ['d', 'v', 's', '.']
    
        for col, ls, marker in zip(cols, linestyles, markers):
            data[col].plot(ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0, linestyle=ls, marker=marker)
    
        ax.legend()
    

    enter image description here