I'm trying to implement a GradientBoostingClassifier from scratch. I took the algorithm from the StatQuest video (Gradient Boost Part 4 (of 4): Classification Details) and am implementing it with numpy + sklearn's DecisionTreeRegressor as the base model.
Here is my code:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score


def sigmoid(x):
    # numerically stable sigmoid: avoids overflow in np.exp for large |x|
    if x > 0:
        z = np.exp(-x)
        return 1 / (1 + z)
    else:
        z = np.exp(x)
        return z / (1 + z)


class GradientBoostingClassifier():
    def __init__(self, n_estimators=20, lr=0.1):
        self.n_estimators = n_estimators
        self.lr = lr
        self.training_history = {
            'log_loss': [], 'roc_auc': [], 'pr_auc': []
        }
        self.base_learners = []

    def fit(self, X, y):
        data = X.copy()
        features = data.columns

        # 1. Initialize the model with a constant value: the log-odds of the positive class
        p = y.sum() / len(y)
        odds = p / (1 - p)
        log_odds = np.log(odds)
        data['cur_log_odds'] = log_odds
        data['prediction'] = data['cur_log_odds'].apply(sigmoid)

        self.training_history['log_loss'].append(log_loss(y, data['prediction']))
        self.training_history['roc_auc'].append(roc_auc_score(y, data['prediction']))
        self.training_history['pr_auc'].append(average_precision_score(y, data['prediction']))

        # 2. For m = 1 to M:
        for _ in tqdm(range(self.n_estimators)):
            # 2.1 Compute the so-called pseudo-residuals:
            data['residuals'] = (data['prediction'] - y)

            # 2.2 Fit a base learner (regression tree) to the pseudo-residuals:
            base_learner = DecisionTreeRegressor(max_depth=3, min_samples_split=2, random_state=42)
            base_learner.fit(data[features], data['residuals'])
            self.base_learners.append(base_learner)

            # 2.3 For each leaf, calculate its output log-odds
            # get the leaf index each row falls into from the regression tree
            data['leaf'] = base_learner.apply(data[features])
            # leaf output log-odds = sum(residuals) / sum(old_prediction * (1 - old_prediction))
            leafs_output = data.groupby('leaf', as_index=False).apply(
                lambda d: d['residuals'].sum() / (0.00001 + (d['prediction'] * (1 - d['prediction'])).sum())
            ).rename(columns={None: 'lambda_odds'})
            data = data.merge(leafs_output, on='leaf')

            # 2.4 Update current_log_odds = current_log_odds + lr * predicted_log_odds
            data['cur_log_odds'] += self.lr * data['lambda_odds']
            data = data.drop('lambda_odds', axis=1)
            data['prediction'] = data['cur_log_odds'].apply(sigmoid)

            self.training_history['log_loss'].append(log_loss(y, data['prediction']))
            self.training_history['roc_auc'].append(roc_auc_score(y, data['prediction']))
            self.training_history['pr_auc'].append(average_precision_score(y, data['prediction']))

        return data

    def predict_proba(self, X):
        pass
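I train it and inspect the metric curves roughly like this (X is a pandas DataFrame of features, y a binary 0/1 target; the names are placeholders for my actual data):

model = GradientBoostingClassifier(n_estimators=1000, lr=0.1)
model.fit(X, y)

# fit() records the metrics once for the initial constant prediction
# and once after every boosting round
print(model.training_history['roc_auc'][:5])
print(model.training_history['pr_auc'][:5])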
The problem is that even after 1000 iterations (n_estimators=1000) my roc_auc and pr_auc scores stay close to those of a random model (roc_auc=0.5, pr_auc=0.29, which is the proportion of the positive class). What am I doing wrong?
sklearn's GradientBoostingClassifier gives much higher scores on the same dataset even with n_estimators=10.
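For reference, the comparison against the library implementation looks roughly like this (a sketch: X and y again stand for my data, and the sklearn class is aliased so it doesn't clash with my own class of the same name):

from sklearn.ensemble import GradientBoostingClassifier as SkGBC
from sklearn.metrics import roc_auc_score, average_precision_score

sk_model = SkGBC(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)
sk_model.fit(X, y)
proba = sk_model.predict_proba(X)[:, 1]  # probability of the positive class

print('sklearn roc_auc:', roc_auc_score(y, proba))
print('sklearn pr_auc:', average_precision_score(y, proba))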
After a few hours of debugging I figured out that
data.merge(leafs_output, on='leaf')
(an inner merge by default) reorders the rows of the left dataframe, so my data dataframe gets reshuffled on every iteration and no longer lines up with y when the residuals and metrics are computed.
data.merge(leafs_output, on='leaf', how='left')
preserves the row order of data and solves the issue.
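A minimal reproduction of the ordering difference (the exact order produced by the default inner merge can vary across pandas versions, but how='left' is documented to keep the left frame's row order):

import pandas as pd

data = pd.DataFrame({'leaf': [3, 1, 3, 2, 1],
                     'prediction': [0.9, 0.2, 0.8, 0.5, 0.1]})
leafs_output = pd.DataFrame({'leaf': [1, 2, 3],
                             'lambda_odds': [-1.0, 0.3, 1.5]})

inner = data.merge(leafs_output, on='leaf')              # default how='inner'
left = data.merge(leafs_output, on='leaf', how='left')

print(inner['leaf'].tolist())  # rows may come back grouped by leaf, e.g. [3, 3, 1, 1, 2]
print(left['leaf'].tolist())   # [3, 1, 3, 2, 1] -- original row order preserved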