This is in relations to a previous post from here:
I am trying to build a set of analytics that would catch students sharing answers. I need to have the format so that I can create some nice plots that would show the results. Not only that but I need to have the evidence to back up my findings. Here, I am working on an analytic that would look at the challenges that a student answered within 5 minutes from each other. I need the timedelta window of 5 minutes to ignore the times that have already been counted for. I have already scowled many pandas functions to see if there was something that would do the like of what I've been building from a modified version of a rolling list. I need the 5 minute window to be the span of one record back down to the second. My end goal is to have a dataframe looking like this:
ValCountTimeWin_df
Time 5 10
0 2022-06-22 14:00:06 11 0
1 2022-06-22 14:04:27 4 0
2 2022-06-22 14:15:08 4 0
3 2022-06-22 14:24:34 1 1
3 2022-06-22 14:26:49 0 2
I have a rolling class that should record a count for each of the various values seen within a timedelta window of 5 minutes.
import pandas as pd
import os
df = pd.DataFrame(
data={
"Value": [0,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,10,10,10],
"Time": ['2022-06-22 13:55:01',
'2022-06-22 13:55:53',
'2022-06-22 13:56:25',
'2022-06-22 13:56:32',
'2022-06-22 13:56:39',
'2022-06-22 13:56:48',
'2022-06-22 13:58:49',
'2022-06-22 13:58:57',
'2022-06-22 13:59:28',
'2022-06-22 13:59:37',
'2022-06-22 13:59:46',
'2022-06-22 13:59:57',
'2022-06-22 14:00:06',
'2022-06-22 14:01:30',
'2022-06-22 14:02:11',
'2022-06-22 14:03:42',
'2022-06-22 14:04:27',
'2022-06-22 14:10:50',
'2022-06-22 14:11:25',
'2022-06-22 14:12:40',
'2022-06-22 14:15:08',
'2022-06-22 14:19:33',
'2022-06-22 14:24:34',
'2022-06-22 14:25:13',
'2022-06-22 14:26:49',
],
}
)
df["Time"] = pd.to_datetime(df["Time"],format='%Y-%m-%d %H:%M:%S')
class ValueRollingList:
def __init__(self,T='5T'):
self.cur = pd.DataFrame(columns=['Value','Time'])
self.window = pd.Timedelta(T)
self.last_window_mask = pd.Series()
self.new_df = pd.DataFrame()
self.last_window = list()
def __add(self, row):
idx = self.cur.index.max()
new_idx = idx+1 if idx==idx else 0
self.cur.loc[new_idx] = row[['Value','Time']]
def handle_row(self, row):
self.cur = self.cur[~self.cur.Value.eq(0)]
# Add the row before anything else to make sure we process all rows
self.__add(row)
window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
print("...........")
print("(row['Time'] - self.cur['Time']).abs()")
print((row['Time'] - self.cur['Time']).abs())
print("window_mask")
print(window_mask)
print("self.cur")
print(self.cur)
print("...........")
if ~window_mask.all():
# Select all rows from 0 to one before the current row in "self.cur"
# This would mean, process the rows from 0 to the current row (Including it) from the original data
df = self.cur.iloc[:row.name]
df= df[~df.Time.isin(self.last_window)]
print("$$$$$$$$$$$$$$$$$$$$$")
if len(df)>0:
df2=df[['Value']].value_counts().reset_index(inplace=False)
df2.columns = ["Value","Count"]
if len(self.last_window_mask)>0:
print(window_mask.value_counts()[True])
print(self.last_window_mask.value_counts()[True])
if self.last_window_mask.value_counts()[True]>window_mask.value_counts()[True]:
print("WOW")
#os._exit(os.EX_OK)
# Only record the time windows that have more than one record
if len(df)>2:
print('#######################')
print("row")
print(row)
#this is what will be added to the new df
print(self.cur.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True)))
print('#######################')
# add value count to the new df
self.new_df=pd.concat([self.new_df,df.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True),how='outer')])
self.last_window = self.last_window + (df["Time"].tolist())
self.last_window_mask = window_mask.copy()
self.cur=self.cur.loc[window_mask]
return
def dump_last(self):
return self.new_df.reset_index(inplace=False).drop('index',axis=1).fillna(0)
rolling_list = ValueRollingList('5T')
df.apply(rolling_list.handle_row, axis=1)
ValCountTimeWin_df = rolling_list.dump_last()
print("ValCountTimeWin_df")
print(ValCountTimeWin_df)
I know this will be a lot but I wanted to have full visual of what I'm seeing.The following are the printouts that I'm getting:
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
Name: Time, dtype: bool
self.cur
Value Time
0 0 2022-06-22 13:55:01
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
Name: Time, dtype: bool
self.cur
Value Time
0 0 2022-06-22 13:55:53
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:00:07
1 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:00:14
1 0 days 00:00:07
2 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:00:23
1 0 days 00:00:16
2 0 days 00:00:09
3 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:02:24
1 0 days 00:02:17
2 0 days 00:02:10
3 0 days 00:02:01
4 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:02:32
1 0 days 00:02:25
2 0 days 00:02:18
3 0 days 00:02:09
4 0 days 00:00:08
5 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
5 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:03:03
1 0 days 00:02:56
2 0 days 00:02:49
3 0 days 00:02:40
4 0 days 00:00:39
5 0 days 00:00:31
6 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
5 True
6 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:03:12
1 0 days 00:03:05
2 0 days 00:02:58
3 0 days 00:02:49
4 0 days 00:00:48
5 0 days 00:00:40
6 0 days 00:00:09
7 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:03:21
1 0 days 00:03:14
2 0 days 00:03:07
3 0 days 00:02:58
4 0 days 00:00:57
5 0 days 00:00:49
6 0 days 00:00:18
7 0 days 00:00:09
8 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:03:32
1 0 days 00:03:25
2 0 days 00:03:18
3 0 days 00:03:09
4 0 days 00:01:08
5 0 days 00:01:00
6 0 days 00:00:29
7 0 days 00:00:20
8 0 days 00:00:11
9 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:03:41
1 0 days 00:03:34
2 0 days 00:03:27
3 0 days 00:03:18
4 0 days 00:01:17
5 0 days 00:01:09
6 0 days 00:00:38
7 0 days 00:00:29
8 0 days 00:00:20
9 0 days 00:00:09
10 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
...........
...........
(row['Time'] - self.cur['Time']).abs()
0 0 days 00:05:05
1 0 days 00:04:58
2 0 days 00:04:51
3 0 days 00:04:42
4 0 days 00:02:41
5 0 days 00:02:33
6 0 days 00:02:02
7 0 days 00:01:53
8 0 days 00:01:44
9 0 days 00:01:33
10 0 days 00:01:24
11 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
Name: Time, dtype: bool
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 5 2022-06-22 14:01:30
...........
$$$$$$$$$$$$$$$$$$$$$
#######################
row
Value 5
Time 2022-06-22 14:01:30
Name: 13, dtype: object
Time 5
0 2022-06-22 14:01:30 12
#######################
...........
(row['Time'] - self.cur['Time']).abs()
1 0 days 00:05:39
2 0 days 00:05:32
3 0 days 00:05:23
4 0 days 00:03:22
5 0 days 00:03:14
6 0 days 00:02:43
7 0 days 00:02:34
8 0 days 00:02:25
9 0 days 00:02:14
10 0 days 00:02:05
11 0 days 00:00:41
12 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
1 False
2 False
3 False
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
Name: Time, dtype: bool
self.cur
Value Time
1 5 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 5 2022-06-22 14:01:30
12 5 2022-06-22 14:02:11
...........
$$$$$$$$$$$$$$$$$$$$$
9
11
WOW
12
12
...........
(row['Time'] - self.cur['Time']).abs()
4 0 days 00:04:53
5 0 days 00:04:45
6 0 days 00:04:14
7 0 days 00:04:05
8 0 days 00:03:56
9 0 days 00:03:45
10 0 days 00:03:36
11 0 days 00:02:12
12 0 days 00:01:31
13 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
Name: Time, dtype: bool
self.cur
Value Time
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 5 2022-06-22 14:01:30
12 5 2022-06-22 14:02:11
13 5 2022-06-22 14:03:42
...........
...........
(row['Time'] - self.cur['Time']).abs()
4 0 days 00:05:38
5 0 days 00:05:30
6 0 days 00:04:59
7 0 days 00:04:50
8 0 days 00:04:41
9 0 days 00:04:30
10 0 days 00:04:21
11 0 days 00:02:57
12 0 days 00:02:16
13 0 days 00:00:45
14 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
4 False
5 False
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
Name: Time, dtype: bool
self.cur
Value Time
4 5 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 5 2022-06-22 14:01:30
12 5 2022-06-22 14:02:11
13 5 2022-06-22 14:03:42
14 5 2022-06-22 14:04:27
...........
$$$$$$$$$$$$$$$$$$$$$
9
9
#######################
row
Value 5
Time 2022-06-22 14:04:27
Name: 16, dtype: object
Time 5
0 2022-06-22 14:04:27 3
#######################
...........
(row['Time'] - self.cur['Time']).abs()
6 0 days 00:11:22
7 0 days 00:11:13
8 0 days 00:11:04
9 0 days 00:10:53
10 0 days 00:10:44
11 0 days 00:09:20
12 0 days 00:08:39
13 0 days 00:07:08
14 0 days 00:06:23
15 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 True
Name: Time, dtype: bool
self.cur
Value Time
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 5 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 5 2022-06-22 14:01:30
12 5 2022-06-22 14:02:11
13 5 2022-06-22 14:03:42
14 5 2022-06-22 14:04:27
15 5 2022-06-22 14:10:50
...........
$$$$$$$$$$$$$$$$$$$$$
1
9
WOW
10
11
...........
(row['Time'] - self.cur['Time']).abs()
15 0 days 00:00:35
16 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
15 True
16 True
Name: Time, dtype: bool
self.cur
Value Time
15 5 2022-06-22 14:10:50
16 5 2022-06-22 14:11:25
...........
...........
(row['Time'] - self.cur['Time']).abs()
15 0 days 00:01:50
16 0 days 00:01:15
17 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
15 True
16 True
17 True
Name: Time, dtype: bool
self.cur
Value Time
15 5 2022-06-22 14:10:50
16 5 2022-06-22 14:11:25
17 5 2022-06-22 14:12:40
...........
...........
(row['Time'] - self.cur['Time']).abs()
15 0 days 00:04:18
16 0 days 00:03:43
17 0 days 00:02:28
18 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
15 True
16 True
17 True
18 True
Name: Time, dtype: bool
self.cur
Value Time
15 5 2022-06-22 14:10:50
16 5 2022-06-22 14:11:25
17 5 2022-06-22 14:12:40
18 5 2022-06-22 14:15:08
...........
...........
(row['Time'] - self.cur['Time']).abs()
15 0 days 00:08:43
16 0 days 00:08:08
17 0 days 00:06:53
18 0 days 00:04:25
19 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
15 False
16 False
17 False
18 True
19 True
Name: Time, dtype: bool
self.cur
Value Time
15 5 2022-06-22 14:10:50
16 5 2022-06-22 14:11:25
17 5 2022-06-22 14:12:40
18 5 2022-06-22 14:15:08
19 5 2022-06-22 14:19:33
...........
$$$$$$$$$$$$$$$$$$$$$
2
1
#######################
row
Value 5
Time 2022-06-22 14:19:33
Name: 21, dtype: object
Time 5
0 2022-06-22 14:19:33 5
#######################
...........
(row['Time'] - self.cur['Time']).abs()
18 0 days 00:09:26
19 0 days 00:05:01
20 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
18 False
19 False
20 True
Name: Time, dtype: bool
self.cur
Value Time
18 5 2022-06-22 14:15:08
19 5 2022-06-22 14:19:33
20 10 2022-06-22 14:24:34
...........
$$$$$$$$$$$$$$$$$$$$$
1
2
WOW
3
5
...........
(row['Time'] - self.cur['Time']).abs()
20 0 days 00:00:39
21 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
20 True
21 True
Name: Time, dtype: bool
self.cur
Value Time
20 10 2022-06-22 14:24:34
21 10 2022-06-22 14:25:13
...........
...........
(row['Time'] - self.cur['Time']).abs()
20 0 days 00:02:15
21 0 days 00:01:36
22 0 days 00:00:00
Name: Time, dtype: timedelta64[ns]
window_mask
20 True
21 True
22 True
Name: Time, dtype: bool
self.cur
Value Time
20 10 2022-06-22 14:24:34
21 10 2022-06-22 14:25:13
22 10 2022-06-22 14:26:49
...........
ValCountTimeWin_df
Time 5
0 2022-06-22 14:01:30 12
1 2022-06-22 14:04:27 3
2 2022-06-22 14:19:33 5
I believe there are a few issues in play.
The window_mask:we can see there can be cases when the new row being added causes two or more false instances and it being fewer in true instances than the previous iteration.This would then neglect the previous records that were marked as true previously.
Also, I'm noticing that the timedelta of 5 minutes in the first value count is going beyond the 5 minute window by 5 seconds. That's why the first record in the ValCountTimeWin_df is 12 instead of 11.
Another issue is that the last few records won't get recorded because there is not false instance.
Can anyone help me?
I NEED the solution to be in the form of a class that reflects this one.
So I believe I have found a viable solution:
import pandas as pd
import os
df = pd.DataFrame(
data={
"Value": [0,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,10,10,10],
"Time": ['2022-06-22 13:55:01',
'2022-06-22 13:55:53',
'2022-06-22 13:56:25',
'2022-06-22 13:56:32',
'2022-06-22 13:56:39',
'2022-06-22 13:56:48',
'2022-06-22 13:58:49',
'2022-06-22 13:58:57',
'2022-06-22 13:59:28',
'2022-06-22 13:59:37',
'2022-06-22 13:59:46',
'2022-06-22 13:59:57',
'2022-06-22 14:00:06',
'2022-06-22 14:01:30',
'2022-06-22 14:02:11',
'2022-06-22 14:03:42',
'2022-06-22 14:04:27',
'2022-06-22 14:10:50',
'2022-06-22 14:11:25',
'2022-06-22 14:12:40',
'2022-06-22 14:15:08',
'2022-06-22 14:19:33',
'2022-06-22 14:24:34',
'2022-06-22 14:25:13',
'2022-06-22 14:26:49',
],
}
)
df["Time"] = pd.to_datetime(df["Time"],format='%Y-%m-%d %H:%M:%S')
class ValueRollingList:
def __init__(self,T='5T'):
self.cur = pd.DataFrame(columns=['Value','Time'])
self.window = pd.Timedelta(T)
self.last_window_mask = pd.Series()
self.new_df = pd.DataFrame()
self.last_df = pd.DataFrame()
self.last_window = list()
def __add(self, row):
idx = self.cur.index.max()
new_idx = idx+1 if idx==idx else 0
self.cur.loc[new_idx] = row[['Value','Time']]
def handle_row(self, row):
# This will handle each row and add it to the previous iteration
# in order to implement the timedelta window of counting
# the values per student
self.cur = self.cur[~self.cur.Value.eq(0)]
# Add the row before anything else to make sure we process all rows
self.__add(row)
# remove the last window
self.cur = self.remove_last_window(self.cur,self.last_window)
# create a true/false series of identifying the records that are within the timedelta
window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
# This would mean, process the rows from 0 to the current row (Including it) from the original data
df = self.cur.iloc[:row.name]
if ~window_mask.all():
# Select all rows from 0 to one before the current row in "self.cur"
df=self.cur.loc[window_mask]
if len(df)>1:
if len(self.last_window_mask)>0:
# if the last window mask had more true counts
# then process the last df instead of this one
if self.last_window_mask.value_counts()[True]>=window_mask.value_counts()[True]:
self.process(self.last_window_mask,self.last_df)
# now remove the last window from the last df
# and run the handle_row function again
self.cur = self.remove_last_window(df,self.last_window)
self.cur.drop(self.cur.tail(1).index,inplace=True)
self.handle_row(row)
#os._exit(os.EX_OK)
else:
self.process(window_mask,df)
else:
self.last_window_mask = window_mask.copy()
self.last_df = df
return
def remove_last_window(self,df,last_window):
# Return df with the times of last window removed
return df[~df.Time.isin(last_window)]
def process(self,window_mask,df):
# Only record the time windows that have more than one record
df2=df[['Value']].value_counts().reset_index(inplace=False)
df2.columns = ["Value","Count"]
#this is what will be added to the new df
# add value count to the new df
self.new_df=pd.concat([self.new_df,df.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True),how='outer')])
self.last_window = self.last_window + (df["Time"].tolist())
self.last_df = pd.DataFrame()
self.last_window_mask = pd.Series()
return
def dump_last(self):
return self.new_df.reset_index(inplace=False).drop('index',axis=1).fillna(0)
rolling_list = ValueRollingList('5T')
df.apply(rolling_list.handle_row, axis=1)
ValCountTimeWin_df = rolling_list.dump_last()
print("ValCountTimeWin_df")
print(ValCountTimeWin_df)
I had to breakdown the class into some extra functions because I realized that I needed to compensate for when the previous iteration of the rows was either equal to or greater than the current dataframe. If there was such a case, I would have to process the previous dataframe and remove the current row that was added and the rows that were processed before looping the dataframe back through the handle_row function.
The final output was this:
ValCountTimeWin_df
Time 5 10
0 2022-06-22 14:00:06 11.0 0.0
1 2022-06-22 14:04:27 4.0 0.0
2 2022-06-22 14:15:08 4.0 0.0
3 2022-06-22 14:25:13 0.0 2.0