Search code examples
pythonpandastime-seriesbokeh

How can retrieve numbers of records above and under mean\median over time data with respect to time resolution in python?


Let's say I have the following time data for 1 month or (31 days) in January:

import os, holoviews as hv
os.environ['HV_DOC_HTML'] = 'true'
hv.extension('bokeh')

import pandas as pd
import pandas_bokeh
from pandas_bokeh import plot_bokeh
pandas_bokeh.output_notebook()

#-----------------------------------------------------------
# Libs
#-----------------------------------------------------------
#!pip install hvplot
#!pip install pandas-bokeh

#-----------------------------------------------------------
# LOAD THE DATASET
#-----------------------------------------------------------

df = pd.read_csv('azure.csv')
df['timestamp'] =  pd.to_datetime(df['timestamp'])
df = df.rename(columns={'min cpu': 'min_cpu',
                        'max cpu': 'max_cpu',
                        'avg cpu': 'avg_cpu',})
#df = df.set_index('timestamp')
df.head()


# Data preparation
# ==============================================================================
sliced_df = df[['timestamp', 'avg_cpu']]
# convert column to datetime object
#sliced_df['timestamp'] = pd.to_datetime(sliced_df['timestamp'], format='%Y-%m-%d %H:%M:%S')

# get the hour, day month
sliced_df['hour']        = sliced_df['timestamp'].dt.hour
sliced_df['day']         = sliced_df['timestamp'].dt.day
sliced_df['month']       = sliced_df['timestamp'].dt.month
sliced_df['year']        = sliced_df['timestamp'].dt.year

year_input=2017
month_input=1
day_input=21

# Retrive average CPU usage for hourly
df_avg = sliced_df.groupby('hour').agg({'avg_cpu': 'mean'}).reset_index()
df_21  = sliced_df[(sliced_df.year == year_input) & (sliced_df.month == month_input) & (sliced_df.day == day_input)]


df_21 = df_21.groupby('hour').agg({'avg_cpu': 'max'}).reset_index()

df_above = pd.merge(df_21, df_avg, on='hour', suffixes=('_hour','_avg'))
df_above['above']       = df_above.loc[df_above[f"avg_cpu_hour"] >= df_above["avg_cpu_avg"], f"avg_cpu_hour"]
df_above['above_value'] = df_above['avg_cpu_hour'] - df_above['avg_cpu_avg']

df_below = pd.merge(df_21, df_avg, on='hour', suffixes=('_hour','_avg'))
df_below['below']       = df_below.loc[df_below[f"avg_cpu_hour"] < df_below["avg_cpu_avg"], f"avg_cpu_hour"]
df_below['below_value'] = df_below['avg_cpu_hour'] - df_below['avg_cpu_avg']


above_count = df_above['above'].value_counts().sum()
below_count = df_below['below'].value_counts().sum()


dark_red = "#FF5555"
dark_blue = "#5588FF"


plot      = sliced_df.hvplot(     x="hour",       y="avg_cpu", by="day",  color="grey", alpha=0.02, legend=False, hover=False)
plot_avg  =    df_avg.hvplot(     x="hour",       y="avg_cpu",             color="grey",             legend=False)
plot_21th =    df_21.hvplot(      x="hour",       y="avg_cpu",             color="black",            legend=False)

plot_above = df_above.hvplot.area(x="hour", y="avg_cpu_avg", y2="avg_cpu_hour").opts(fill_alpha=0.2, line_alpha=0.8, line_color=dark_red, fill_color=dark_red)
plot_below = df_below.hvplot.area(x="hour", y="avg_cpu_avg", y2="avg_cpu_hour").opts(fill_alpha=0.2, line_alpha=0.8, line_color=dark_blue, fill_color=dark_blue)

text_days_above = hv.Text(5, df_21["avg_cpu"].max(), f"{above_count}", fontsize=14).opts(text_align="right", text_baseline="bottom", text_color=dark_red, text_alpha=0.8)
text_days_below = hv.Text(5, df_21["avg_cpu"].max(), f"{below_count}", fontsize=14).opts(text_align="right", text_baseline="top", text_color=dark_blue, text_alpha=0.8)

text_above      = hv.Text(5, df_21["avg_cpu"].max(), "DAYS ABOVE", fontsize=7).opts(text_align="left", text_baseline="bottom", text_color="lightgrey", text_alpha=0.8)
text_below      = hv.Text(5, df_21["avg_cpu"].max(), "DAYS BELOW", fontsize=7).opts(text_align="left", text_baseline="above", text_color="lightgrey", text_alpha=0.8)

hv.renderer("bokeh").theme = theme

final = (
    plot
    * plot_21th
    * plot_avg
    * plot_above
    * plot_below
    * text_days_above
    * text_days_below
    * text_above
    * text_below
).opts(
    xlabel="hourly",
    ylabel="CPU [Hz]",
    title=f"{day_input}th Jan data vs AVERAGE",
    gridstyle={"ygrid_line_alpha": 0},
    xticks=[
        (0, "00:00"),
        (1, "01:00"),
        (2, "02:00"),
        (3, "03:00"),
        (4, "04:00"),
        (5, "05:00"),
        (6, "06:00"),
        (7, "07:00"),
        (8, "08:00"),
        (9, "09:00"),
        (10, "10:00"),
        (11, "11:00"),
        (12, "12:00"),
        (13, "13:00"),
        (14, "14:00"),
        (15, "15:00"),
        (16, "16:00"),
        (17, "17:00"),
        (18, "18:00"),
        (19, "19:00"),
        (20, "20:00"),
        (21, "21:00"),
        (22, "22:00"),
        (23, "23:00"),
    ],
    xrotation=45,
    show_grid=True,
    fontscale=1.18,
)
hv.save(final, "final.html")

final

Update II:

I have tried the following code unsuccessfully inspired by the answer of @chitown88 using instead of package to get the desired output for output1 at least. My current output: img


Example output for years: img

As seen in the above example, I want to retrieve, reflect, and visualize my data in smaller time resolution (hourly\daily\monthly) rather than annually. I need to retrieve the number of CPU usage of column avg cpu within dataframe observations that exceeding\below:

  • Output1:

the hourly average for one certain day ( i.e., 21st Jan. 2024-01-21 00:00:00 till 2024-01-22 00:00:00)

  • x-axis: hourly timestamp of records for 21 Jan
  • y-axis: 'avg CPU' usage for 21 Jan
  • Threshold line: average CPU usage for 21 Jan
  • Output2:

The daily average for Jan.

  • x-axis: daily timestamp of Jan (1th-31th)
  • y-axis: 'avg CPU' usage for Jan
  • Threshold line: average CPU usage for Jan month

What is the elegant way to treat my dataframe to achieve this and the find threshold for interested outpu1 and output2 accordingly?

img


Solution

  • Let's try something more concrete:

    # read data
    df = pd.read_csv('https://raw.githubusercontent.com/amcs1729/Predicting-cloud-CPU-usage-on-Azure-data/master/azure.csv')
    
    # parse date time
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.normalize()
    df['hour'] = df['timestamp'].dt.hour
    
    # store the hourly average for the entire dataset
    hourly_avg = df.groupby('hour')['avg_cpu'].mean()
    
    # compute the max by hour-date
    max_hour_daily = df.groupby(['date', 'hour'])['avg_cpu'].max().unstack('hour')
    
    # compare these maxes to the hourly_avg
    # convert to DataFrame for easy manipulation
    labels = pd.DataFrame(np.where(max_hour_daily.ge(hourly_avg, axis=1), 'above', 'below'), 
                          columns=max_hour_daily.columns, 
                          index=max_hour_daily.index)
    
    # the expected output - a dataframe
    # indexed by the date, with columns being `above`, `below`
    output = labels.stack().groupby(level='date').value_counts().unstack(fill_value=0)
    

    Sample output:

    # first five days
    print(output.head())
    

    Output:

                above  below
    date                    
    2017-01-01     13     11
    2017-01-02     22      2
    2017-01-03     21      3
    2017-01-04     24      0
    2017-01-05     16      8
    

    Or you can get data for a specific day

    # yes, string date works with `loc`
    print(output.loc['2017-01-21'])
    

    Output:

    above    17
    below     7
    Name: 2017-01-21 00:00:00, dtype: int64
    

    If you want the plot, it's pretty straightforward from the DataFrames/Series above:

    day = '2017-01-21'
    day_max_hourly = max_hour_daily.loc[day]
    
    fig, ax = plt.subplots(figsize=(10, 6))
    plt.fill_between(day_max_hourly.index, day_max_hourly, hourly_avg, where=day_max_hourly >= hourly_avg, facecolor='red', interpolate=True, alpha=0.5)
    plt.fill_between(day_max_hourly.index, day_max_hourly, hourly_avg, where=day_max_hourly < hourly_avg, facecolor='blue', interpolate=True, alpha=0.5)
    plt.plot(hourly_avg, label='Hourly Average', color='black')
    plt.plot(day_max_hourly, label='Max Hourly', color='green')
    plt.xlabel('Hour')
    plt.xticks(hourly_avg.index)
    plt.legend()
    

    Output:

    enter image description here