This is how I load my train and test datasets with HF:
dataset = {name.replace('/', '.'): f'{name}/*.parquet' for name in ["train", "test"]}
dataset = load_dataset("parquet", data_files=dataset)
However, there is a problem: my train dataset has extra columns that the test set
doesn't have (and I need to load those extra columns in order to prepare the
training data).
Since there is a column mismatch, I get an error on load:
datasets.table.CastError: Couldn't cast
column1: string
column2: list<item: string>
child 0, item: string
column3: list<item: int32>
child 0, item: int32
__index_level_0__: int64
-- schema metadata --
pandas: '{"index_columns": ["__index_level_0__"], "column_indexes": [{"na' + 1438
to
{'column1': Value(dtype='string', id=None), 'column2': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), '__index_level_0__': Value(dtype='int64', id=None)}
because column names don't match (column3
is missing).
What is the correct way to load train and test splits that have different sets of columns?
from datasets import load_dataset
def load_datasets_with_varying_columns(train_path, test_path):
    """Load train and test parquet splits whose schemas differ.

    Loading each split with its own ``load_dataset`` call avoids the
    ``CastError`` raised when both splits are passed as ``data_files`` of a
    single call (``datasets`` then tries to cast every split to one schema).

    Args:
        train_path: Glob pattern for the train parquet files.
        test_path: Glob pattern for the test parquet files.

    Returns:
        dict with keys ``"train"`` and ``"test"``; the test split is
        restricted to the columns it shares with train.
    """
    train_dataset = load_dataset("parquet", data_files={"train": train_path})
    test_dataset = load_dataset("parquet", data_files={"test": test_path})
    common_columns = set(train_dataset["train"].column_names).intersection(
        test_dataset["test"].column_names
    )
    # BUG FIX: the original used ``map`` with a dict comprehension over the
    # common columns. ``Dataset.map`` keeps every original column unless
    # ``remove_columns`` is passed, so that call rewrote the data at full
    # cost while dropping nothing. ``select_columns`` actually restricts the
    # schema, with no per-row copy. Sorting makes the column order
    # deterministic (set iteration order is not).
    test_dataset = test_dataset.select_columns(sorted(common_columns))
    return {"train": train_dataset["train"], "test": test_dataset["test"]}
# Glob patterns locating the parquet files of each split on disk.
TRAIN_GLOB = "train/*.parquet"
TEST_GLOB = "test/*.parquet"

# Load both splits without forcing them into a single shared schema.
datasets = load_datasets_with_varying_columns(TRAIN_GLOB, TEST_GLOB)
Hope this helps