Tags: machine-learning, boosting

Custom Gradient Boosting Classifier implementation. No training progress


I'm trying to implement a GradientBoostingClassifier. I took the algorithm from the StatQuest video (Gradient Boost Part 4 (of 4): Classification Details) and am implementing it with numpy plus sklearn's DecisionTreeRegressor as the base model.

Here is my code:

import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score
from tqdm import tqdm

def sigmoid(x):
  if x > 0:   
    z = np.exp(-x)
    return 1/(1+z)
  else:
    z = np.exp(x)
    return z/(1+z)

class GradientBoostingClassifier():
  def __init__(self, n_estimators=20, lr=0.1):
    self.n_estimators = n_estimators
    self.lr = lr
    self.training_history = {
        'log_loss': [], 'roc_auc': [], 'pr_auc': []
    }
    self.base_learners = []

  def fit(self, X, y):
    data = X.copy()
    features = data.columns
    # 1. Initialize model with a constant value:
    p = y.sum() / len(y)
    odds = p / (1-p)
    log_odds = np.log(odds)
    data['cur_log_odds'] = log_odds
    data['prediction'] = data['cur_log_odds'].apply(sigmoid)

    self.training_history['log_loss'].append( log_loss(y, data['prediction']) )
    self.training_history['roc_auc'].append( roc_auc_score(y, data['prediction']) )
    self.training_history['pr_auc'].append( average_precision_score(y, data['prediction']) )

    # 2. For m = 1 to M:
    for _ in tqdm(range(self.n_estimators)):
      # 2.1 Compute so-called pseudo-residuals:
      data['residuals'] = (data['prediction'] - y)

      # 2.2 Fit a base learner regressor to predict pseudo-residuals:
      base_learner = DecisionTreeRegressor(max_depth=3,  min_samples_split=2, random_state=42)
      base_learner.fit(data[features], data['residuals'])
      self.base_learners.append(base_learner)

      # 2.3 For each leaf calculate its output log-odds
      # get leaf number from Regression tree
      data['leaf'] = base_learner.apply(data[features])

      # compute output log-odd as sum(residuals) / sum(old_prediction * (1 - old_prediction))
      leafs_output = data.groupby('leaf', as_index=False).apply(
          lambda d: d['residuals'].sum() / (0.00001+(d['prediction'] * (1-d['prediction']) ).sum())
      ).rename(columns={None: 'lambda_odds'})

      data = data.merge(leafs_output, on='leaf')
      # 2.4 Update current_log_odds =  current_log_odds + lr*predicted_log_odds
      data['cur_log_odds'] += self.lr*data['lambda_odds']
      data = data.drop('lambda_odds', axis=1)

      data['prediction'] = data['cur_log_odds'].apply(sigmoid)

      self.training_history['log_loss'].append( log_loss(y, data['prediction']) )
      self.training_history['roc_auc'].append( roc_auc_score(y, data['prediction']) )
      self.training_history['pr_auc'].append( average_precision_score(y, data['prediction']) )

    return data

  def predict_proba(self, X):
    pass

The problem is that even after 1000 iterations (n_estimators = 1000) my roc_auc and pr_auc scores stay close to what a random model gives (roc_auc = 0.5, pr_auc = 0.29, which is the proportion of the positive class).

What am I doing wrong?

The sklearn GradientBoostingClassifier gives much higher scores on the same dataset, even with n_estimators = 10.
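Roughly, the comparison I mean looks like this (a sketch on the same X / y; the hyperparameters other than n_estimators are just assumptions to make it runnable):

from sklearn.ensemble import GradientBoostingClassifier as SkGBC
from sklearn.metrics import roc_auc_score, average_precision_score

# sklearn's implementation with only 10 trees as a reference point
sk_model = SkGBC(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)
sk_model.fit(X, y)
sk_proba = sk_model.predict_proba(X)[:, 1]  # probability of the positive class

print('roc_auc:', roc_auc_score(y, sk_proba))
print('pr_auc :', average_precision_score(y, sk_proba))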


Solution

  • After a few hours of debugging I figured out that

    data.merge(leafs_output, on='leaf')
    

    reorders the rows of the left dataframe (the default is an inner join), so my data dataframe gets reordered on every iteration while y keeps its original row order, and the residuals and metrics end up computed against misaligned labels.

    data.merge(leafs_output, on='leaf', how='left')
    

    solves the issue, since how='left' preserves the order of the left frame's keys; a merge-free alternative is sketched below.
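
A way to sidestep row-ordering concerns entirely is to drop the merge and map each leaf's output back onto the rows. A minimal sketch of that step, reusing the names from the fit loop above (the small epsilon guard mirrors the original):

# compute each leaf's output log-odds as a Series indexed by leaf id
leaf_values = data.groupby('leaf').apply(
    lambda d: d['residuals'].sum()
              / (0.00001 + (d['prediction'] * (1 - d['prediction'])).sum())
)

# .map looks each row's leaf id up in leaf_values, so the row order of data
# (and its positional alignment with y) is never touched
data['cur_log_odds'] += self.lr * data['leaf'].map(leaf_values)
data['prediction'] = data['cur_log_odds'].apply(sigmoid)

Because .map is a pure per-row lookup, there is no join step that could shuffle or duplicate rows, which also removes the need for the how='left' argument.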