I need to train a model progressively (continuing from a previously saved model) and I want the resulting model to stay small. However, as the following example shows, the saved model grows with every iteration, and I don't know how to control its size.
# -*- coding: utf-8 -*-
import xgboost as xgb
from sklearn.model_selection import train_test_split as ttsplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse
# Load the Boston housing dataset once and reuse it for both the feature
# matrix and the target vector (the loader was previously called twice).
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this script
# needs an older scikit-learn release to run as written.
boston = load_boston()
X = boston['data']
y = boston['target']
# Hold out 10% of the samples as a test set, then split the remaining
# training data into two equal halves.
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(X_train, y_train, test_size=0.5, random_state=0)
# Wrap every split in the DMatrix container that xgb.train consumes.
xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)
# max_depth / max_leaves constrain the shape of each individual tree; they do
# not limit how many trees the model contains.
params = {'objective': 'reg:squarederror', 'tree_method': 'hist', 'max_depth': 3, 'max_leaves': 100, 'grow_policy': 'lossguide'}
# Model fitted on the first half of the training data with 30 boosting rounds.
model_1 = xgb.train(params, xg_train_1, 30)
# model_1.save_model('t0.json')
# ============ train five successive versions of the model ==============#
# Every call after the first one resumes from the previously saved file via
# `xgb_model` and runs another 30 boosting rounds on the same data, so each
# saved version holds more trees than the one before it.
model_v1 = xgb.train(params, xg_train_2, 30)
model_v1.save_model('./t1.json')
print(model_v1.__dict__)
model_v2 = xgb.train(params, xg_train_2, 30, xgb_model='t1.json')
model_v2.save_model('./t2.json')
print(model_v2.__dict__)
model_v3 = xgb.train(params, xg_train_2, 30, xgb_model='t2.json')
model_v3.save_model('./t3.json')
print(model_v3.__dict__)
model_v4 = xgb.train(params, xg_train_2, 30, xgb_model='t3.json')
model_v4.save_model('./t4.json')
print(model_v4.__dict__)
model_v5 = xgb.train(params, xg_train_2, 30, xgb_model='t4.json')
# Version 5 gets its own file; saving to t4.json again would overwrite the
# version-4 model and leave no file for version 5.
model_v5.save_model('./t5.json')
print(model_v5.__dict__)
BTW, I have read through the XGBoost training parameters and tried max_depth, max_leaves, and many other combinations, but none of them works.
The saved JSON shows that more and more trees are added on every run. After reading the documentation carefully again, I found that two parameters, `process_type` and `updater`, are designed for exactly this. (max_depth and max_leaves only constrain the shape of each tree, not the number of trees.)
We can solve the problem with the following code.
# -*- coding: utf-8 -*-
import xgboost as xgb
from sklearn.model_selection import train_test_split as ttsplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse
# Load the Boston housing dataset once and reuse it for both the feature
# matrix and the target vector (the loader was previously called twice).
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this script
# needs an older scikit-learn release to run as written.
boston = load_boston()
X = boston['data']
y = boston['target']
# Hold out 10% of the samples as a test set, then split the remaining
# training data into two equal halves.
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(X_train, y_train, test_size=0.5, random_state=0)
# Wrap every split in the DMatrix container that xgb.train consumes.
xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)
# max_depth / max_leaves constrain the shape of each individual tree; they do
# not limit how many trees the model contains.
params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'max_depth': 3,
    'max_leaves': 100,
}
# Model fitted on the first half of the training data with 10 boosting rounds.
model_1 = xgb.train(params, xg_train_1, 10)
# model_1.save_model('t0.json')
# ------------ fit a baseline model, then refresh it in place ------------#
# Baseline: trained on the second half, with the number of boosting rounds
# left at xgb.train's default.
model_v1 = xgb.train(params, xg_train_2)
model_v1.save_model('./t1.json')
print(model_v1.__dict__)
# Same settings as `params` plus the two keys that matter here:
# process_type='update' with updater='refresh' makes xgb.train rework the
# trees of the loaded model instead of appending new ones.
params1 = {**params, 'process_type': 'update', 'updater': 'refresh'}
model_v2 = xgb.train(params=params1, dtrain=xg_train_2, xgb_model='t1.json')
model_v2.save_model('./t2.json')
print(model_v2.__dict__)
Reference:
[1] https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.training
[2] https://github.com/dmlc/xgboost/issues/3055