I am trying to compare different lines to find out whether one lies above the other and, if not, at which x the change happens.
If the lines had the same x values and the same length, this would be very easy: I would only need the difference between the y values of the lines.
However, the lines have different x values and the vectors do not have the same length, although the x interval (spacing) is the same for all curves.
As a very simple example I use the following data:
# curve 1: 9 points on the integer grid x = 5..13
x1 = np.arange(5, 14)
y1 = np.array([100, 101, 110, 130, 132, 170, 190, 192, 210])
# curve 2: 10 points on the integer grid x = 3..12
x2 = np.arange(3, 13)
y2 = np.array([90, 210, 211, 250, 260, 261, 265, 180, 200, 210])
# curve 3: 8 points, same spacing of 1 but shifted off the integer grid by 0.3
x3 = np.array([7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3])
y3 = np.array([300, 250, 270, 350, 380, 400, 390, 380])
They are supposed to be 2 regression lines. In this simple example, the expected result is that curve 2 has higher values than curve 1 at every x.
I was trying to bin x in the range 2.5-12.5 with a bin length of 1, so that I can compare the corresponding y values in each bin.
My actual data are big, and this comparison needs to be done many times, so I need to find a solution that does not take much time.
# Draw the three raw curves on one square figure for a visual comparison.
plt.figure(figsize=(6, 6))
for xs, ys, name in ((x1, y1, 'y1'), (x2, y2, 'y2'), (x3, y3, 'y3')):
    plt.plot(xs, ys, marker='o', label=name)
# anchor the legend's upper-left corner just to the right of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
get_new_x uses np.digitize to re-bin the x-axis values.
get_comparison adds a column of Booleans for each pair of adjacent columns compared.
The new columns are added to the dataframe itself; however, this can be updated to use a separate comparison dataframe.
combs is a list of column combinations, for example:
[Index(['y1', 'y2'], dtype='object'), Index(['y2', 'y3'], dtype='object')]
# function to create the bins
def get_bins(x_arrays: List[np.array]) -> np.array:
    """Build the bin edges shared by every curve.

    The bin length is the spacing between the first two x-values of the
    first array; the other arrays are assumed to use the same spacing.

    Args:
        x_arrays: x-coordinates of each curve. The first entry must hold at
            least two values so that the spacing can be measured.

    Returns:
        Evenly spaced edges starting at the smallest x-value found in any
        array and stepping by the bin length up to, but not including, the
        largest x-value plus one bin length.

    Raises:
        ValueError: If the first array holds fewer than two values.
    """
    first = np.asarray(x_arrays[0])
    if first.size < 2:
        raise ValueError(
            'the first x-array needs at least two values to derive the bin length'
        )
    # Keep the spacing as a scalar. np.diff(...) would return a one-element
    # array, and using that as the np.arange step relies on an implicit
    # array-to-scalar conversion that NumPy has deprecated.
    bin_len = first[1] - first[0]
    all_x = np.concatenate(x_arrays)  # join arrays
    # ndarray reductions run in native code; builtin min/max loop in Python
    min_x = all_x.min()
    max_x = all_x.max()
    return np.arange(min_x, max_x + bin_len, bin_len)
# function using np.digitize to bin the old x-axis into new bins
def get_new_x(x_arrays: List[np.array]) -> List[np.array]:
    """Map every curve's x-values onto the shared bin edges.

    Each x-value is rounded with ``np.round``, located among the edges from
    ``get_bins`` with ``np.digitize(..., right=True)``, and replaced by the
    edge found at the returned index.

    Args:
        x_arrays: x-coordinates of each curve.

    Returns:
        One array of bin-edge values per input array, in the same order.
    """
    edges = get_bins(x_arrays)
    return [
        edges[np.digitize(np.round(values), edges, right=True)]
        for values in x_arrays
    ]
# function to create dataframe for arrays with new x-axis as index
def get_df(x_arrays: List[np.array], y_arrays: List[np.array]) -> pd.DataFrame:
    """Collect the curves into one dataframe indexed by the re-binned x-axis.

    Args:
        x_arrays: x-coordinates of each curve.
        y_arrays: y-values of each curve, in the same order as ``x_arrays``.

    Returns:
        A dataframe with one column per curve, named ``y1``, ``y2``, ... in
        input order, whose index holds the x-values produced by
        ``get_new_x``.
    """
    rebinned = get_new_x(x_arrays)
    frames = []
    for number, y_values in enumerate(y_arrays, start=1):
        # curve number k uses the k-th re-binned x-array as its index
        frame = pd.DataFrame(
            y_values, columns=[f'y{number}'], index=rebinned[number - 1]
        )
        frames.append(frame)
    # place the single-column frames side by side, aligned on the index
    return pd.concat(frames, axis=1)
# compare each successive column of the dataframe
# if the left column is greater than the right column, then True
def get_comparison(df: pd.DataFrame):
    """Add a Boolean column for every pair of neighbouring columns.

    For each column and the one immediately to its right, a new column named
    ``'<left> > <right>'`` is appended; it is True where the left column's
    value is greater than the right column's value.

    Args:
        df: Dataframe to compare. It is modified in place; nothing is
            returned.
    """
    # snapshot the labels first so the columns added below are not paired up
    names = list(df.columns)
    for left, right in zip(names, names[1:]):
        df[f'{left} > {right}'] = df[left] > df[right]
from typing import List

import numpy as np
import pandas as pd
# put the arrays into lists, one entry per curve
y = [y1, y2, y3]
x = [x1, x2, x3]
# call get_df: y-values as columns, indexed by the re-binned x-values
df = get_df(x, y)
# call get_comparison: it adds the Boolean comparison columns to df in place;
# without this call there are no comparison columns to report below
get_comparison(df)
# the comparison columns come after the y-columns; for each of them, get only
# the index of True values with Boolean indexing
for col in df.columns[len(y):]:
    vals = df.index[df[col]].tolist()
    if vals:
        print(f'{col}: {vals}')
y2 > y3: [8.0]
y1 y2 y3 y1 > y2 y2 > y3
3.0 NaN 90.0 NaN False False
4.0 NaN 210.0 NaN False False
5.0 100.0 211.0 NaN False False
6.0 101.0 250.0 NaN False False
7.0 110.0 260.0 300.0 False False
8.0 130.0 261.0 250.0 False True
9.0 132.0 265.0 270.0 False False
10.0 170.0 180.0 350.0 False False
11.0 190.0 200.0 380.0 False False
12.0 192.0 210.0 400.0 False False
13.0 210.0 NaN 390.0 False False
14.0 NaN NaN 380.0 False False
# Plot the re-binned curves and highlight the points flagged by the comparison.
fig, ax = plt.subplots(figsize=(8, 6))
# add markers for problem values: the comparison column at position k after
# the three y-columns is paired with the y-column at position k
for position, flag_name in enumerate(df.columns[3:], start=1):
    mask = df[flag_name]
    flagged = df.iloc[:, position][mask]
    if flagged.empty:
        continue
    ax.scatter(flagged.index, flagged.values, color='red', s=110, label='bad')
# plot the three y-columns of the dataframe on the same axes
df.iloc[:, :3].plot(marker='o', ax=ax)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('y-values plotted against rebinned x-values')