How can I extract a LightGBM model's coefficients, or the rules it uses to turn features into predictions? I work in the insurance industry, where this is absolutely necessary to meet strict regulatory requirements. A set of rules such as "if feature 1 and feature 2 and feature 3 then y_pred = 104.62" would work.
Many thanks!
Philip W.
Example code:
import time
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import d2_tweedie_score
# Train a monotone-constrained LightGBM Tweedie regressor with early stopping,
# then score it on the validation set with the Tweedie D^2 metric.

# Tweedie variance power, shared by the training objective and the D^2 score.
tweedie_p = 1.50568

# One entry per feature column, in column order:
# 1 = increasing, -1 = decreasing, 0 = unconstrained.
mono_constraints = (
    1, 1, 1, 1, -1, -1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
)

# The timer starts before the params are built and stops after prediction and
# scoring, so the reported "Fit_time" covers more than the fit() call alone.
s_time = time.time()
print("Starting the LightGBM training")

params = {
    'task': 'train',
    'boosting': 'gbdt',
    'num_leaves': 174,
    'max_depth': 11,
    'n_jobs': 6,
    'n_estimators': 1000,
    'subsample_for_bin': 200000,
    'learning_rate': 0.06308264944407528,
    'random_state': seed2,  # seed2 is defined outside this snippet
    'min_gain_to_split': 9.540907629434999,
    'min_data_in_leaf': 1800,
    'bagging_fraction': 1.00,
    'bagging_seed': 49,
    'bagging_freq': 1,
    'feature_fraction': 1.00,
    'feature_fraction_seed': 1958,
    'extra_trees': True,
    'extra_seed': 1975,
    'lambda_l1': 70,
    'lambda_l2': 0,
    'importance_type': 'gain',
    # Disabled option, kept for reference: 'init_model': model_saved,
    'monotone_constraints': mono_constraints,
    'monotone_constraints_method': 'advanced',
}

mdl = lgb.LGBMRegressor(
    objective='tweedie',
    tweedie_variance_power=tweedie_p,
    **params,
)

# X_*, y_* and w_* (features, targets, sample weights) come from outside this
# snippet. Training stops once 40 rounds pass without improvement.
mdl.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    eval_set=[(X_valid, y_valid), (X_train, y_train)],
    eval_sample_weight=[w_valid, w_train],
    eval_metric=['auc', 'tweedie'],
    callbacks=[lgb.early_stopping(40)],
)

# Predict with the best iteration found by early stopping.
y_pred = mdl.predict(X_valid, num_iteration=mdl.best_iteration_)
d2_score = d2_tweedie_score(
    y_valid,
    y_pred,
    sample_weight=w_valid,
    power=tweedie_p,
)

run_time = round(time.time() - s_time, 3)
print(f"Fit_time = {run_time}")  # sample run: Fit_time = 7.203
print(f"Best_iteration = {mdl.best_iteration_}")  # sample run: best_iteration = 56
print(f"Best_score = {mdl.best_score_}")
print(f"\td2_tweedie_score: {d2_score:.5f}")
LightGBM model training creates an ensemble of decision trees.
When predicting for one sample, the sample is passed through each tree, and then all of the trees' outputs are added to produce a final prediction.
So LightGBM models do not have "coefficients" in the way that something like a linear model might.
There are several ways to inspect the structure of those trees in lightgbm, the LightGBM Python package. For example, given that you've trained a regression model in Python 3.10 using lightgbm==4.0.0 as follows...
import lightgbm as lgb
from sklearn.datasets import make_regression
# Build a small synthetic regression problem (1,000 samples) and wrap it in a
# LightGBM Dataset.
X, y = make_regression(n_samples=1000)
dtrain = lgb.Dataset(X, label=y)

# Train a tiny booster: 10 boosting iterations, with leaves allowed to hold as
# few as 3 samples. The resulting `bst` is the model inspected afterwards.
bst = lgb.train(
    {
        "objective": "regression",
        "min_data_per_leaf": 3,
        "num_iterations": 10,
    },
    dtrain,
)
... you could do the following to obtain the model structure.
As a pandas DataFrame:
bst.trees_to_dataframe().head(10)
tree_index node_depth node_index left_child right_child parent_index split_feature split_gain threshold decision_type missing_direction missing_type value weight count
0 0 1 0-S0 0-S1 0-S2 None Column_79 5864870.0 0.053069 <= left None -1.711640 0.0 1000
1 0 2 0-S1 0-S3 0-S6 0-S0 Column_23 2577040.0 0.319818 <= left None -9.143480 515.0 515
2 0 3 0-S3 0-S11 0-S7 0-S1 Column_88 1499690.0 -0.403728 <= left None -14.462300 329.0 329
3 0 4 0-S11 0-L0 0-S15 0-S3 Column_79 375834.0 -1.545806 <= left None -23.199700 123.0 123
4 0 5 0-L0 None None 0-S11 None NaN NaN None None None -37.494507 16.0 16
5 0 5 0-S15 0-L12 0-S29 0-S11 Column_80 303124.0 0.047732 <= left None -21.062200 107.0 107
6 0 6 0-L12 None None 0-S15 None NaN NaN None None None -26.434711 53.0 53
7 0 6 0-S29 0-L16 0-L30 0-S15 Column_22 132407.0 -0.607386 <= left None -15.789200 54.0 54
8 0 7 0-L16 None None 0-S29 None NaN NaN None None None -22.245471 20.0 20
9 0 7 0-L30 None None 0-S29 None NaN NaN None None None -11.991366 34.0 34
bst.dump_model()["tree_info"]
{"tree_index": 0, "num_leaves": 31, "num_cat": 0, "shrinkage": 1, "tree_structure":
{"split_index": 0, "split_feature": 79, "split_gain": 5864870, "threshold": 0.0530686,
"decision_type": "<=", "default_left": true, "missing_type": "None",
"internal_value": -1.71164, "internal_weight": 0, "internal_count": 1000,
"left_child": {"split_index": 1, "split_feature": 23, "split_gain": 2577040,
"threshold": 0.31981814026944827, "decision_type": "<=",
"default_left": true, "missing_type": "None",
...
import matplotlib.pyplot as plt
# Draw one tree of the trained booster as a diagram.
# tree_index is zero-based, so 2 selects the third tree (just as an example).
# The booster trained earlier in this example is named `bst`; the previous
# `gbm` was never defined and raised NameError.
# NOTE: lgb.plot_tree needs the graphviz package to render the tree.
ax = lgb.plot_tree(bst, tree_index=2)
plt.show()