Tags: python, loss-function, lightgbm

Reproducing LightGBM's `logloss` in the Python API


I want to start using custom classification loss functions in LightGBM, and I thought that implementing binary_logloss myself would be a good place to start. Following the answer here, I managed to get a custom logloss with performance approximately identical to the built-in logloss (in the scikit-learn API).
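For context, a minimal sketch of what that scikit-learn-API version can look like (the name sk_logloss and this exact setup are my own illustration, not the linked answer's code; the sklearn API accepts a callable objective with signature (y_true, raw_score) returning (grad, hess)):

import numpy as np
from lightgbm import LGBMClassifier

# illustrative custom objective for the sklearn API: receives true labels
# and raw scores, returns per-sample (gradient, hessian) of the logloss
def sk_logloss(y_true, raw_score):
    p = 1.0 / (1.0 + np.exp(-raw_score))  # sigmoid of the raw margin
    return p - y_true, p * (1.0 - p)

clf = LGBMClassifier(objective=sk_logloss)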

I tried following the same logic in the Python API:

import lightgbm
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_data = lightgbm.Dataset(x, label=y, free_raw_data=False)
test_data = lightgbm.Dataset(x_test, label=y_test)

# DEFINE CUSTOM LOSS FUNCTION
def my_logloss(preds, data):
    y_true = data.get_label()
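    # numerically stable sigmoid: use exp(-preds) for non-negative scores
    # and exp(preds) for negative ones, so exp never overflows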
    preds = np.where(preds >= 0,
                 1. / (1. + np.exp(-preds)),
                 np.exp(preds) / (1. + np.exp(preds)))
    grad = -(y_true - preds)
    hess = preds * (1.0 - preds)
    return grad, hess

# DEFINE CUSTOM EVAL LOSS FUNCTION
def logloss_eval(preds, data):
    y_true = data.get_label()
    preds = np.where(preds >= 0,
             1. / (1. + np.exp(-preds)),
             np.exp(preds) / (1. + np.exp(preds)))
    loss = -(y_true * np.log(preds)) - ((1 - y_true) * np.log(1 - preds))
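    # feval contract: return (eval_name, eval_result, is_higher_better)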
    return "binary_logloss", np.mean(loss), False

# RUN MODEL WITH BUILTIN LOSS

parameters = {
    'application': 'binary',
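    # ('application' is an alias of 'objective' in LightGBM, so the line
    # above duplicates the line below)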
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 10,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1
}


model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[train_data, test_data], 
                       valid_names=['train','valid'],
                       num_boost_round=100
                      )

# RUN MODEL WITH CUSTOM LOSS

parameters = {
    'application': 'binary',
#     'objective': 'binary',
#     'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 10,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1
}

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[train_data, test_data], 
                       valid_names=['train','valid'],
                       num_boost_round=100,
                       fobj=my_logloss,
                       feval=logloss_eval
                      )

Unlike in the scikit-learn API case, I am not getting the same sequence of training/validation losses from the two models. For example, the first model ends with a training loss of 0.159872 while the second ends with a training loss of 0.157686.

Is there a way to modify my code above so that it matches the built-in logloss exactly? If so, I would appreciate knowing how. If not, I would like to know what causes the difference.


Solution

  • The differences in the results are due to:

    1. The different initialization LightGBM uses when a custom loss function is provided; this GitHub issue explains how it can be addressed. The easiest fix is to set 'boost_from_average': False.

    2. The sub-sampling of features caused by feature_fraction < 1. This may warrant opening an issue on GitHub, as it is not clear why the results are not reproducible given that feature_fraction_seed is fixed by default. Both fixes are applied in the full example below.

    import lightgbm
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    
    # define dataset
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
    x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    train_data = lightgbm.Dataset(data=x, label=y, free_raw_data=False)
    test_data = lightgbm.Dataset(data=x_test, label=y_test)
    
    # DEFINE CUSTOM LOSS FUNCTION
    def my_logloss(preds, data):
        y_true = data.get_label()
        preds = np.where(preds >= 0, 1. / (1. + np.exp(- preds)), np.exp(preds) / (1. + np.exp(preds)))
        grad = - (y_true - preds)
        hess = preds * (1.0 - preds)
        return grad, hess
    
    # DEFINE CUSTOM EVAL LOSS FUNCTION
    def logloss_eval(preds, data):
        y_true = data.get_label()
        preds = np.where(preds >= 0, 1. / (1. + np.exp(-preds)), np.exp(preds) / (1. + np.exp(preds)))
        loss = - (y_true * np.log(preds)) - ((1 - y_true) * np.log(1 - preds))
        return 'custom_loss', np.mean(loss), False
    
    # RUN MODEL WITH BUILTIN LOSS
    parameters = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'num_leaves': 10,
        # 'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 1,
        'boost_from_average': False,
    }
    
    model = lightgbm.train(params=parameters,
                           train_set=train_data,
                           valid_sets=[train_data, test_data],
                           valid_names=['train', 'valid'],
                           num_boost_round=100)
    
    # [100] train's binary_logloss: 0.133561    valid's binary_logloss: 0.271294
    
    # RUN MODEL WITH CUSTOM LOSS
    parameters = {
        'boosting': 'gbdt',
        'num_leaves': 10,
        # 'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 1,
        'boost_from_average': False,
    }
    
    model = lightgbm.train(params=parameters,
                           train_set=train_data,
                           valid_sets=[train_data, test_data],
                           valid_names=['train', 'valid'],
                           num_boost_round=100,
                           fobj=my_logloss,
                           feval=logloss_eval)
    
    # [100] train's custom_loss: 0.133561   valid's custom_loss: 0.271294
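To confirm the match programmatically rather than by eyeballing the logs, one option (a sketch, assuming the record_evaluation callback available in lightgbm.train) is to record each run's per-iteration metrics and compare the two histories; the names history_builtin and history_custom are illustrative:

    # Sketch: record per-iteration metrics for the custom-loss run.
    # The builtin-loss run would be retrained the same way with its own
    # record_evaluation callback filling history_builtin.
    history_custom = {}
    model = lightgbm.train(params=parameters,
                           train_set=train_data,
                           valid_sets=[train_data, test_data],
                           valid_names=['train', 'valid'],
                           num_boost_round=100,
                           fobj=my_logloss,
                           feval=logloss_eval,
                           callbacks=[lightgbm.record_evaluation(history_custom)])

    # after collecting history_builtin from the builtin-loss run:
    # np.allclose(history_builtin['train']['binary_logloss'],
    #             history_custom['train']['custom_loss'])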