For a regression problem, I have a training data set with : - 3 variables with a gaussian distribution - 20 variables with a uniform distribution.
All my variables are continious, between [0;1].
The problem is the test data, used to score my regression model has an uniform distribution for all the variables. Actually, I have bad results at tail-end distribution, so I want to oversample my training set, in order to duplicate the rarest rows.
So my idea is to bootstrap (using sampling with replacement) on my training set in order to have a set of data with the same distribution as the test set.
In order to do that, my idea (don't know if it's a good one !) is to add 3 columns with intervals for my 3 variables and use this columns to stratify the resampling.
Example : First, generating the data
from scipy.stats import truncnorm
def get_truncated_normal(mean=0.5, sd=0.15, min_value=0, max_value=1):
return truncnorm(
(min_value - mean) / sd, (max_value - mean) / sd, loc=mean, scale=sd)
generator = get_truncated_normal()
import numpy as np
from sklearn.preprocessing import MinMaxScaler
S1 = generator.rvs(1000)
S2 = generator.rvs(1000)
S3 = generator.rvs(1000)
u = np.random.uniform(0, 1, 1000)
Then check the distribution :
import seaborn as sns
sns.distplot(u);
sns.distplot(S2);
It's OK, so I'll add categories columns
import pandas as pd
df = pd.DataFrame({'S1':S1,'S2':S2,'S3':S3,'Unif':u})
BINS_NUMBER = 10
df['S1_range'] = pd.cut(df.S1,
bins=BINS_NUMBER,
precision=6,
right=True,
include_lowest=True)
df['S2_range'] = pd.cut(df.S2,
bins=BINS_NUMBER,
precision=6,
right=True,
include_lowest=True)
df['S3_range'] = pd.cut(df.S3,
bins=BINS_NUMBER,
precision=6,
right=True,
include_lowest=True)
a check
df.groupby('S1_range').size()
S1_range
(0.022025899999999998, 0.116709] 3
(0.116709, 0.210454] 15
(0.210454, 0.304199] 64
(0.304199, 0.397944] 152
(0.397944, 0.491689] 254
(0.491689, 0.585434] 217
(0.585434, 0.679179] 173
(0.679179, 0.772924] 86
(0.772924, 0.866669] 30
(0.866669, 0.960414] 6
dtype: int64
It's good for me. So now I'll try to resample but it's not working as intended
from sklearn.utils import resample
df_resampled = resample(df,replace=True,n_samples=1000, stratify=df['S1_range'])
df_resampled.groupby('S1_range').size()
S1_range
(0.022025899999999998, 0.116709] 3
(0.116709, 0.210454] 15
(0.210454, 0.304199] 64
(0.304199, 0.397944] 152
(0.397944, 0.491689] 254
(0.491689, 0.585434] 217
(0.585434, 0.679179] 173
(0.679179, 0.772924] 86
(0.772924, 0.866669] 30
(0.866669, 0.960414] 6
dtype: int64
So it's not working, I get the same distribution in output as in input...
Can you help me ? Perhaps it's not the good way to do this ?
Thanks !!
My solution:
def create_sampled_data_set(n_samples_by_bin=1000,
n_bins=10,
replace=True,
save_csv=True):
"""In order to have the same distribution for S1..S3 between training
set and test set, this function will generate a new
training set resampled
Return: (X_train, y_train)
"""
def stratified_sample_df_(df, col, n_samples, replace=True):
if replace:
n = n_samples
else:
n = min(n_samples, df[col].value_counts().min())
df_ = df.groupby(col).apply(lambda x: x.sample(n, replace=replace))
df_.index = df_.index.droplevel(0)
return df_
X_train, y_train = load_data_for_train()
# merge the dataframe for the sampling. Target will be removed after
X_train = pd.merge(
X_train, y_train[['Target']], left_index=True, right_index=True)
del y_train
# build a categorical feature, from S1..S3 distribution
disc = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans')
disc.fit(X_train[['S1', 'S2', 'S3']])
y_bin = disc.transform(X_train[['S1', 'S2', 'S3']])
del disc
vint = np.vectorize(np.int)
y_bin = vint(y_bin)
y_concat = []
for i in range(len(y_bin)):
a = y_bin[i, 0].astype('str')
b = y_bin[i, 1].astype('str')
c = y_bin[i, 2].astype('str')
y_concat.append(a + ';' + b + ';' + c)
del y_bin
X_train['S_Class'] = y_concat
del y_concat
X_train_resampled = stratified_sample_df_(
X_train, 'S_Class', n_samples_by_bin)
del X_train
y_train_resampled = X_train_resampled[['Target']].copy()
y_train_resampled.rename(
columns={y_train_resampled.columns[0]: 'Target'}, inplace=True)
X_train_resampled = X_train_resampled.drop(['S_Class', 'Target'], axis=1)
# save in file for further usage
if save_csv:
X_train_resampled.to_csv(
"./data/training_input_resampled.csv", sep=",")
y_train_resampled.to_csv(
"./data/training_output_resampled.csv", sep=",")
return(X_train_resampled,
y_train_resampled)