I have two Pandas DataFrames df_x and df_y. df_x has two columns 'high target' and 'low target'. Per every row of df_x, I would like to search through the instances of df_y and see whether the 'high target' was reached before the 'low target'. Currently, I implemented the above using .apply. However, my code is too inefficient as it linearly scales with the number of rows in df_x. Any suggestions to optimize/vectorize my code?
def efficient_high_after_low(row, minute_df):
"""True, if high happened after the low, else False.
Args:
row: A pandas dataframe
with high, low,
minute_df: the whole dataframe
"""
minute_df_after = minute_df.loc[row.period_end_idx+pd.Timedelta(minutes=1):]
#print(minute_df_after)
first_highs = (minute_df_after.ge(row['high target']))
first_lows = (minute_df_after.le(row['low target']))
hi_sum, lo_sum = first_highs.sum(), first_lows.sum()
if (len(first_highs) != len(first_lows)):
raise Exception('Unequal length of first_highs and first_lows')
else:
if ((len(first_highs) == 0)):
return None
elif ((hi_sum == 0) & (lo_sum != 0)):
return True
elif ((hi_sum != 0) & (low_sum == 0)):
return False
elif ((hi_sum == 0) & (low_sum == 0)):
return None
elif (first_highs.idxmax() > first_lows.idxmax()):
return True
elif(first_highs.idxmax() < first_lows.idxmax()):
return False
else:
return None
And I do the following to get these boolean values:
df_x.apply(efficient_high_after_low, axis=1, args=(df_y['open'],))
Running the code above on the first 1000 lines takes 4 seconds.
def high_after_low(high_targets,
low_targets,
vals,
dic):
"""True, if the high target was hit before low; else, False.
Args:
high_target: A NumPy array, the high targets
of each row.
low_target: A NumPy array, the low targets
of each row.
vals: A NumPy array, the current values
dic: A dictionary; maps every row of the targets to the open values.
Returns:
high_after_low: A pandas series with True, False, and None values per row.
Meaning of each value:
True: the timeseries hit high after low (or low was never hit)
False: the timeseries hit high before low (or high was never hit)
None: (1) neither low or high were hit, (2) low and high were hit
at the same row
"""
dic_keys = list(dic.keys())
size = len(dic_keys)
high_hit_rows = [(((a).argmax()+dic[i] ) if((a:= (vals[dic[i]:]
>= high_target[i])).any()) else np.nan ) for i in range(size)]
low_hit_rows = [(((b).argmax()+dic[i]) if((b:= (vals[dic[i]:]
<= low_target[i])).any()) else np.nan ) for i in range(size)]
high_hit_rows = np.array(high_hit_rows, dtype=np.float32)
low_hit_rows = np.array(low_hit_rows, dtype=np.float32)
high_after_low = np.empty((size))
high_after_low[:] = np.nan
high_after_low[np.isnan(low_hit_rows) & (~np.isnan(high_hit_rows))] = False
high_after_low[(~np.isnan(low_hit_rows)) & np.isnan(high_hit_rows)] = True
high_after_low[((~np.isnan(low_hit_rows)) & (~np.isnan(high_hit_rows))
& (low_hit_rows < high_hit_rows))] = True
high_after_low[((~np.isnan(low_hit_rows)) & (~np.isnan(high_hit_rows))
& (low_hit_rows > high_hit_rows))] = False
return high_after_low
The vectorized solution requires pre-processing of the input array. We need to pre-process the input array into a 2d array such that the i-th row contains the array's values from [i:i+T]. Then,
def vectorized_high_after_low(df, high_values, low_values):
"""Args:
df: A pandas DataFrame, containing each value per row. Each column contains the values or row i, t rows ahead.
high_values: The high targets corresponding to each row
low_values: The low targets corresponding to each row.
"""
higher = (df.ge(high_values)).idxmax(axis=1)
lower = (df.le(low_values)).idxmax(axis=1)
higher[higher==0] = df.shape[1]
lower[lower==0] = df.shape[1]
high_after_low = higher < lower
high_after_low[higher==lower] = np.nan
return high_after_low