
How to control the model size of xgboost?


I am in a situation where I need to train a model incrementally, and I want the final model to stay small. But as the example below shows, the model grows with every round of continued training, and I don't know how to control its size.

# -*- coding: utf-8 -*-

import xgboost as xgb
from sklearn.model_selection import train_test_split as ttsplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse

boston = load_boston()
X = boston['data']
y = boston['target']

# split data into training and testing sets
# then split training set in half
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(X_train, y_train, test_size=0.5, random_state=0)

xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)

params = {'objective': 'reg:squarederror', 'tree_method': 'hist', 'max_depth': 3, 'max_leaves': 100, 'grow_policy': 'lossguide'}
model_1 = xgb.train(params, xg_train_1, 30)
# model_1.save_model('t0.json')

# ================= train two versions of the model =====================#
model_v1 = xgb.train(params, xg_train_2, 30)
model_v1.save_model('./t1.json')
print(model_v1.__dict__)

model_v2 = xgb.train(params, xg_train_2, 30, xgb_model='t1.json')
model_v2.save_model('./t2.json')
print(model_v2.__dict__)

model_v3 = xgb.train(params, xg_train_2, 30, xgb_model='t2.json')
model_v3.save_model('./t3.json')
print(model_v3.__dict__)

model_v4 = xgb.train(params, xg_train_2, 30, xgb_model='t3.json')
model_v4.save_model('./t4.json')
print(model_v4.__dict__)

model_v5 = xgb.train(params, xg_train_2, 30, xgb_model='t4.json')
model_v5.save_model('./t5.json')
print(model_v5.__dict__)

[Screenshot: the printed model details show each saved model growing larger than the last.]
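Even without the screenshot, the growth is easy to measure: count the trees in each saved file and check the file size. A minimal sketch, assuming the files saved above:

import os

for path in ['t1.json', 't2.json', 't3.json', 't4.json', 't5.json']:
    booster = xgb.Booster()
    booster.load_model(path)
    # every continued run appends another 30 trees, so both numbers grow
    print(path, len(booster.get_dump()), 'trees,', os.path.getsize(path), 'bytes')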

By the way, I have read through all of XGBoost's training parameters and tried max_depth, max_leaves, and many other combinations, but none of them works.


Solution

  • Inspecting the JSON shows that more and more trees are added on every call to xgb.train, which is why the file keeps growing. After reading the documentation carefully again, I found the two parameters designed for exactly this: process_type and updater. (max_depth and max_leaves only constrain the shape of each individual tree, not the number of trees.)

    We can solve the problem with the following code:

    # -*- coding: utf-8 -*-
    
    import xgboost as xgb
    from sklearn.model_selection import train_test_split as ttsplit
    from sklearn.datasets import load_boston
    from sklearn.metrics import mean_squared_error as mse
    
    boston = load_boston()
    X = boston['data']
    y = boston['target']
    
    # split data into training and testing sets
    # then split training set in half
    X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
    X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(X_train, y_train, test_size=0.5, random_state=0)
    
    xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
    xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'max_depth': 3,
        'max_leaves': 100,
    }
    model_1 = xgb.train(params, xg_train_1, 10)
    # model_1.save_model('t0.json')
    
    # ================= train two versions of the model =====================#
    model_v1 = xgb.train(params, xg_train_2)
    model_v1.save_model('./t1.json')
    print(model_v1.__dict__)
    
    params1 = params.copy()
    # the key arguments: 'update' re-uses the existing trees instead of
    # appending new ones, and 'refresh' recomputes the statistics of those
    # trees on the new data, so the number of trees stays fixed
    params1['process_type'] = 'update'
    params1['updater'] = 'refresh'
    
    model_v2 = xgb.train(params1, xg_train_2, xgb_model='t1.json')
    model_v2.save_model('./t2.json')
    print(model_v2.__dict__)
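
    To confirm that continued training no longer grows the model, compare the tree counts and file sizes of the two checkpoints. A minimal sketch, reusing the boosters from above:

    import os

    # both boosters should contain the same number of trees (10, the
    # default num_boost_round), and the two files should be about the
    # same size
    print(len(model_v1.get_dump()), len(model_v2.get_dump()))
    print(os.path.getsize('t1.json'), os.path.getsize('t2.json'))

    One caveat: with process_type='update', a run can only refresh trees that already exist, so num_boost_round should not exceed the number of trees in the loaded model.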
    

    References:

    [1] https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.training
    [2] https://github.com/dmlc/xgboost/issues/3055