#Create synthetic dataframe
import pandas as pd
import numpy as np
# Temperatures 25..125 in steps of 1, each perturbed by uniform noise in [0, 3).
temp_series = pd.Series(range(25, 126, 1))
temp_noise = np.random.rand(len(temp_series)) * 3
df = pd.DataFrame({
    'temp': temp_series + temp_noise,
    'data': np.random.rand(len(temp_series)) * 400,
})
# Record the original length and keep an independent copy of the original.
# A plain assignment (df_dup = df) would only create a second name for the
# same object, so any in-place modification of df would show up in df_dup too.
before_length = len(df)
df_dup = df.copy()
temp_intervals = [50, 70, 92.7]
window = 5
From here, run a function based on the dataframe (df), the temperature observations (temp_intervals) and the window size (window):
def remove_window(df, intervals, window):
    """Drop a window of rows around each temperature crossing, in place.

    For every value in ``intervals`` the last row whose ``'temp'`` is less
    than or equal to that value is located. The rows labelled from
    ``cent_index - window`` up to (but not including) ``cent_index + window``
    are dropped, and the index is reset before the next value is processed.

    Args:
        df: DataFrame with a ``'temp'`` column. It is modified in place.
            The window is computed by arithmetic on index labels, so an
            integer index is expected.
        intervals: Iterable of temperature thresholds.
        window: Half-width of the window, in rows.

    Returns:
        The same ``df`` object, with the rows removed and a fresh 0..n-1 index.
    """
    for temp in intervals:
        # Find the last index label where temperature is at or below the
        # threshold; skip thresholds that no row satisfies.
        matches = df.index[df['temp'] <= temp]
        if matches.empty:
            continue
        cent_index = matches.max()
        # Define window of index labels to remove from the df
        drop_indices = list(range(cent_index - window, cent_index + window))
        # errors='ignore' lets the window run past either end of the frame
        # without raising KeyError for labels that do not exist.
        df.drop(drop_indices, inplace=True, errors='ignore')
        # reset_index returns a new frame unless inplace=True is given, so
        # without it the reset would silently be discarded.
        df.reset_index(drop=True, inplace=True)
    return df
So, is this a problem with the function I've defined, or is there a problem with df.drop?
Thank you, Brad
It can be tricky to repeatedly delete parts of the dataframe and keep track of what you're doing. A cleaner approach is to keep track of which rows you want to delete within the loop, but only delete them outside of the loop, all at once. This should also be faster.
def remove_window(df, intervals, window):
    """Return a copy of ``df`` without a window of rows around each crossing.

    For every value in ``intervals`` the last row whose ``'temp'`` is less
    than or equal to that value is located, and the rows at positions
    ``cent_pos - window`` up to (but not including) ``cent_pos + window``
    are marked for removal. All marked rows are removed in one step after
    the loop, so every window is located on the unmodified frame.

    Args:
        df: DataFrame with a ``'temp'`` column. It is not modified.
        intervals: Iterable of temperature thresholds.
        window: Half-width of the window, in rows.

    Returns:
        A new DataFrame with the marked rows removed and a fresh 0..n-1 index.
    """
    # Boolean array indicating which rows to keep
    keep_row = np.ones(len(df), dtype=bool)
    temps = df['temp'].to_numpy()
    for temp in intervals:
        # Positions (not index labels) of rows at or below the threshold;
        # skip thresholds that no row satisfies.
        positions = np.flatnonzero(temps <= temp)
        if positions.size == 0:
            continue
        cent_pos = int(positions[-1])
        # Clamp the start at 0: a negative start would wrap around and mark
        # rows at the end of the frame. Slicing already clamps the stop at
        # the array length.
        start = max(cent_pos - window, 0)
        stop = max(cent_pos + window, 0)
        keep_row[start:stop] = False
    # Delete all unwanted rows at once, outside the loop
    return df[keep_row].reset_index(drop=True)