I am creating a CatBoost pool from a pandas DataFrame (the columns have strings as names; I am not sure whether that's relevant), then quantizing it and saving it to disk using this code:
import catboost as cb
import numpy as np
import pandas as pd

# Synthetic data set: 100 rows with an integer label and nine float features.
data = {
    'label': np.random.randint(0, 5, 100),  # Random integers 0-4 for label
    'feature1': np.random.randn(100),  # Random normal distribution
    'feature2': np.random.randn(100),
    'feature3': np.random.randn(100),
    'feature4': np.random.randn(100),
    'feature5': np.random.randn(100),
    'feature6': np.random.randn(100),
    'feature7': np.random.randn(100),
    'feature8': np.random.randn(100),
    'feature9': np.random.randn(100),
}
train = pd.DataFrame(data)

# Every column except the first one ('label') is used as a feature.
factors = train.columns.values.tolist()[1:]

# Build the pool, quantize it, and write it to the file 'cbpool'.
pool = cb.Pool(data=train[factors], label=train['label'])
pool.quantize()
pool.save('cbpool')

# Reload the saved file by its bare path; this is the call that raises
# the CatBoostError.
pool2 = cb.Pool('cbpool')
This will yield the following error on the last line:
---------------------------------------------------------------------------
CatBoostError Traceback (most recent call last)
<ipython-input-10-440a13636ec9> in <cell line: 0>()
----> 1 pool = cb.Pool('/content/drive/MyDrive/temp/clarifi/traincb'+endtrain.strftime("%Y%m%d"))
1 frames
/usr/local/lib/python3.11/dist-packages/catboost/core.py in __init__(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, column_description, pairs, graph, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count, log_cout, log_cerr, data_can_be_none)
785 "feature_names should have None or string or pathlib.Path type when the pool is read from the file."
786 )
--> 787 self._read(data, column_description, pairs, graph, feature_names, delimiter, has_header, ignore_csv_quoting, thread_count)
788 else:
789 if isinstance(data, FeaturesData):
/usr/local/lib/python3.11/dist-packages/catboost/core.py in _read(self, pool_file, column_description, pairs, graph, feature_names_path, delimiter, has_header, ignore_csv_quoting, thread_count, quantization_params, log_cout, log_cerr)
1334 item = ''
1335 self._check_thread_count(thread_count)
-> 1336 self._read_pool(
1337 pool_file,
1338 column_description,
_catboost.pyx in _catboost._PoolBase._read_pool()
_catboost.pyx in _catboost._PoolBase._read_pool()
CatBoostError: library/cpp/string_utils/csv/csv.cpp:30: RFC4180 violation: quotation mark must be in the escaped string only
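quantize() converts the data to a binary format, so the saved file is no longer a delimited text file.
You have to load the file with the "quantized://" prefix. When you load the pool, this prefix tells CatBoost to expect binary quantized data instead of trying to parse the file as CSV, which is what produces the RFC4180 error above.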
# Load the saved quantized pool from Drive; the "quantized://" prefix goes
# in front of the file path.
# NOTE(review): "path_to_dir" is presumably a placeholder for the directory
# that contains the saved 'cbpool' file - replace it with the real location.
pool2 = cb.Pool("quantized://path_to_dir/cbpool")

# You can verify the loaded pool
print("Number of features:", pool2.num_col())
print("Number of samples:", pool2.num_row())
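Note: "quantized://" is a scheme (protocol) identifier placed in front of the path; it tells CatBoost which loader to use and is not part of the file name on disk.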