I have been using LightGBM models with great satisfaction, as I have big datasets with tens of features and millions of rows, many of them categorical columns.
I really like the way LightGBM can take a pandas DataFrame with categorical features defined simply with astype('category'),
without any one-hot encoding.
I also have some float columns, which I am trying to convert into categorical bins to speed up convergence and force the boundaries of the split points.
The problem is that binning the float columns with pd.cut
causes the fit method to fail with ValueError: Circular reference detected.
There is a similar question here, and the traceback does mention the JSON encoder, but I have no DateTime columns as suggested by the answer there. I guess the pd.cut categories may not be supported by LightGBM, but I can't find any information about this in the docs.
There is no need for a big dataset to replicate the problem. Here is a toy example where I build a dataset of 100 rows and 10 columns: 5 columns hold integers, which I convert to categorical with astype, and 5 columns hold floats. Keeping the float columns as floats, everything is OK; converting one or more of them to categorical with pd.cut makes the fit method throw the error.
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
rows = 100
fcols = 5
ccols = 5
# Let's define some ascii readable names for convenience
fnames = ['Float_'+str(chr(97+n)) for n in range(fcols)]
cnames = ['Cat_'+str(chr(97+n)) for n in range(ccols)]
# The dataset is built by concatenation of the float and the int blocks
dff = pd.DataFrame(np.random.rand(rows,fcols),columns=fnames)
dfc = pd.DataFrame(np.random.randint(0,20,(rows,ccols)),columns=cnames)
df = pd.concat([dfc,dff],axis=1)
# Target column with random output
df['Target'] = (np.random.rand(rows)>0.5).astype(int)
# Conversion into categorical
df[cnames] = df[cnames].astype('category')
df['Float_a'] = pd.cut(x=df['Float_a'],bins=10)
# Dataset split
X = df.drop('Target',axis=1)
y = df['Target'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
# Model instantiation
lgbmc = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           is_unbalance=True,
                           metric=['binary_logloss'])
lgbmc.fit(X_train,y_train)
Here is the error, which does not appear if there is no pd.cut column.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-207-751795a98846> in <module>
4 metric = ['binary_logloss'])
5
----> 6 lgbmc.fit(X_train,y_train)
7
8 prob_pred = lgbmc.predict(X_test)
~\AppData\Local\conda\conda\envs\py36\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks)
740 verbose=verbose, feature_name=feature_name,
741 categorical_feature=categorical_feature,
--> 742 callbacks=callbacks)
743 return self
744
~\AppData\Local\conda\conda\envs\py36\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, group, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_group, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks)
540 verbose_eval=verbose, feature_name=feature_name,
541 categorical_feature=categorical_feature,
--> 542 callbacks=callbacks)
543
544 if evals_result:
~\AppData\Local\conda\conda\envs\py36\lib\site-packages\lightgbm\engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
238 booster.best_score[dataset_name][eval_name] = score
239 if not keep_training_booster:
--> 240 booster.model_from_string(booster.model_to_string(), False).free_dataset()
241 return booster
242
~\AppData\Local\conda\conda\envs\py36\lib\site-packages\lightgbm\basic.py in model_to_string(self, num_iteration, start_iteration)
2064 ptr_string_buffer))
2065 ret = string_buffer.value.decode()
-> 2066 ret += _dump_pandas_categorical(self.pandas_categorical)
2067 return ret
2068
~\AppData\Local\conda\conda\envs\py36\lib\site-packages\lightgbm\basic.py in _dump_pandas_categorical(pandas_categorical, file_name)
299 pandas_str = ('\npandas_categorical:'
300 + json.dumps(pandas_categorical, default=json_default_with_numpy)
--> 301 + '\n')
302 if file_name is not None:
303 with open(file_name, 'a') as f:
~\AppData\Local\conda\conda\envs\py36\lib\json\__init__.py in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
--> 238 **kw).encode(obj)
239
240
~\AppData\Local\conda\conda\envs\py36\lib\json\encoder.py in encode(self, o)
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
~\AppData\Local\conda\conda\envs\py36\lib\json\encoder.py in iterencode(self, o, _one_shot)
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
258
259 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
ValueError: Circular reference detected
As in here, your problem is related to JSON serialization. The serializer 'doesn't like' the interval labels of the categories created by pd.cut (labels such as '(0.109, 0.208]').
You can override the generated labels using the optional labels parameter of pd.cut (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html).
In your example, you could replace the line:
df['Float_a'] = pd.cut(x=df['Float_a'],bins=10)
with the lines:
bins = 10
df['Float_a'] = pd.cut(x=df['Float_a'],bins=bins, labels=[f'bin_{i}' for i in range(bins)])
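As an alternative sketch (not part of the fix above, just something that should work under the same reasoning), you can keep the interval edges visible in the labels by casting the pd.cut result to plain strings before converting it back to categorical:
# Keep pd.cut's interval edges but store them as plain strings,
# which the JSON encoder used during model serialization can handle.
df['Float_a'] = pd.cut(x=df['Float_a'], bins=10).astype(str).astype('category')
# Note: the string categories are ordered lexicographically rather than numerically,
# which is fine here since LightGBM treats categorical features as unordered.
Either way, the categories end up as plain strings, which is what the JSON serialization step needs.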