python deep-learning dataset coco mscoco

custom load datset error with MSCOCO2017 data

I made my own dataset loader python file to load MSCOCO2017 with caption data

this is my coco.py

    _CITATION = """\
@article{DBLP:journals/corr/LinMBHPRDZ14,
  author    = {Tsung{-}Yi Lin and
           Michael Maire and
           Serge J. Belongie and
           Lubomir D. Bourdev and
           Ross B. Girshick and
           James Hays and
           Pietro Perona and
           Deva Ramanan and
           Piotr Doll{'{a} }r and
           C. Lawrence Zitnick},
  title     = {Microsoft {COCO:} Common Objects in Context},
  journal   = {CoRR},
  volume    = {abs/1405.0312},
  year      = {2014},
  url       = {http://arxiv.org/abs/1405.0312},
  archivePrefix = {arXiv},
  eprint    = {1405.0312},
  timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

# Add description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
COCO is a large-scale object detection, segmentation, and captioning 
dataset.
"""

# Add a link to an official homepage for the dataset here
_HOMEPAGE = "http://cocodataset.org/#home"

# Add the licence for the dataset here if you can find it
_LICENSE = ""

# Add link to the official dataset URLs here
# The HuggingFace dataset library don't host the datasets but only point to the original files
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)

# This script is supposed to work with local (downloaded) COCO dataset.
_URLs = {}

class NewDataset(datasets.GeneratorBasedBuilder):


VERSION = datasets.Version("0.0.0")

# You will be able to load one or the other configurations in the following list with
# data = datasets.load_dataset('my_dataset', 'first_domain')
# data = datasets.load_dataset('my_dataset', 'second_domain')

def _info(self):
    feature_dict = {
        'filename': datasets.Value(dtype='string'),
        'imgid': datasets.Value(dtype='int64'),
        'tokens': datasets.Sequence(feature=datasets.Sequence(feature=datasets.Value(dtype='string'), length=-1), length=1),
        'sentences': datasets.Sequence(datasets.Value(dtype='string'), length=1),
        'split': datasets.Value(dtype='string'),
        'sentids': datasets.Sequence(feature=datasets.Value(dtype='int64'), length=1),
        'image': datasets.Array3D(shape=(224, 224, 3), dtype='uint8')
    }

    features = datasets.Features(feature_dict)

    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # This defines the different columns of the dataset and their types
        features=features,  # Here we define them above because they are different between the two configurations
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )

def _split_generators(self, dl_manager):
    """Returns SplitGenerators."""
    
    data_dir = self.config.data_dir
    train_root = os.path.join(data_dir, 'images/train2017')
    valid_root = os.path.join(data_dir, 'images/val2017')

    train_captions = os.path.join(data_dir, 'annotations/captions_train2017.json')
    valid_captions = os.path.join(data_dir, 'annotations/captions_val2017.json')
    
    
    train_img_list = glob(train_root+ '/*.jpg')
    valid_img_list = glob(valid_root+ '/*.jpg')

    with open(train_captions) as f:
        train_data = json.load(f)
    with open(valid_captions) as f:
        valid_data = json.load(f)
    train_cap_dict={}
    valid_cap_dict={}
    for d in train_data['annotations']:
        train_cap_dict[d['image_id']] = d['caption']
    for d in valid_data['annotations']:
        valid_cap_dict[d['image_id']] = d['caption']
        
    _items = defaultdict(list)
    '''
    _items = {
    'train' : {'filename': _____, 'sentences' : ______,....},
    'val' :
    'test':
    }
    '''
    valid_len =  int(len(train_data['images']) * 0.8)
    random.shuffle(train_img_list)
    train_list_=[]
    valid_list_=[]
    test_list_=[]

    for idx,data in enumerate(train_img_list):
        if idx < valid_len:
            filename = os.path.basename(data)
            file_num = filename[6:12]
            temp_dict={}
            temp_dict['filename'] = filename
            temp_dict['imgid'] = idx
            temp_dict['sentids'] = [idx]
            temp_dict['split']='train'
            temp_dict['filepath'] = f"{data_dir}images/train2017/000000{file_num}.jpg"
            sent_temp =  train_cap_dict[int(file_num)]
            temp_dict['sentences'] = [{"raw":sent_temp,"tokens":sent_temp.split(),"sentid":idx,"imgid":idx}]
            train_list_.append(temp_dict)
        else:
            filename = os.path.basename(data)
            file_num = filename[6:12]
            temp_dict={}
            temp_dict['filename'] = filename
            temp_dict['imgid'] = idx
            temp_dict['sentids'] = [idx]
            temp_dict['split']='val'
            temp_dict['filepath'] = f"{data_dir}images/train2017/000000{file_num}.jpg"
            sent_temp =  train_cap_dict[int(file_num)]
            temp_dict['sentences'] = [{"raw":sent_temp,"tokens":sent_temp.split(),"sentid":idx,"imgid":idx}]
            valid_list_.append(temp_dict)
    _items["train"] = train_list_
    _items["val"]=valid_list_
    
    for idx,data in enumerate(valid_img_list):
        filename = os.path.basename(data)
        file_num = filename[6:12]
        temp_dict={}
        temp_dict['filename'] = filename
        temp_dict['imgid'] = (idx + len(train_img_list))
        temp_dict['sentids'] = [(idx + len(train_img_list))]
        temp_dict['split']='test'
        temp_dict['filepath'] = f"{data_dir}images/val2017/000000{file_num}.jpg"
        sent_temp =  valid_cap_dict[int(file_num)]
        temp_dict['sentences'] = [{"raw":sent_temp,"tokens":sent_temp.split(),"sentid":(idx + len(train_img_list)),"imgid":(idx + len(train_img_list))}]
        test_list_.append(temp_dict)
    _items["test"] = test_list_
            
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"items":_items["train"], "data_dir":data_dir},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={"items":_items["test"], "data_dir":data_dir},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={"items":_items["val"], "data_dir":data_dir},
        ),
    ]

def _generate_examples(self, items, data_dir):
    """ Yields examples as (key, example) tuples. """
    # "sentences": [{"tokens": ["many", "planes", "are", "parked", "next", "to", "a", "long", "building", "in", "an", "airport"], "raw": "many planes are parked next to a long building in an airport .",
    # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
    # The `key` is here for legacy reason (tfds) and is not important in itself.
    #'filename' 'imgid''tokens' 'sentences''split''sentids'image
    for _id, item in enumerate(items):
        image = Image.open(item.pop('filepath'))
        newsize = (224,224)
        image = image.resize(newsize)
        image = np.asarray(image)
        sentences = item.pop('sentences')
        #print('\n',item)
        #print(sentences)
        sample = {"image":image,
                  "sentences":[s["raw"] for s in sentences], 
                  "tokens":[s["tokens"] for s in sentences],
                  **item}
        yield _id, sample

and when i load coco dataset

from datasets import load_dataset
squad_it_dataset = load_dataset('coco.py', data_files="data/coco.json", data_dir='~/coco2017/')

i get this error

---------------------------------------------------------------------------
ArrowTypeError                            Traceback (most recent call last)
File ~/.local/lib/python3.8/site-packages/datasets/builder.py:1588, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1587 example = self.info.features.encode_example(record) if self.info.features is not None else record
-> 1588 writer.write(example, key)
   1589 num_examples_progress_update += 1

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:488, in ArrowWriter.write(self, example, key, writer_batch_size)
    486     self.hkey_record = []
--> 488 self.write_examples_on_file()

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:446, in ArrowWriter.write_examples_on_file(self)
    442         batch_examples[col] = [
    443             row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
    444             for row in self.current_examples
    445         ]
--> 446 self.write_batch(batch_examples=batch_examples)
    447 self.current_examples = []

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:551, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
    550 typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
--> 551 arrays.append(pa.array(typed_sequence))
    552 inferred_features[col] = typed_sequence.get_inferred_type()

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:231, in pyarrow.lib.array()

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:110, in pyarrow.lib._handle_arrow_array_protocol()

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:179, in TypedSequence.__arrow_array__(self, type)
    178 if isinstance(pa_type, _ArrayXDExtensionType):
--> 179     storage = to_pyarrow_listarray(data, pa_type)
    180     return pa.ExtensionArray.from_storage(pa_type, storage)

File ~/.local/lib/python3.8/site-packages/datasets/features/features.py:1438, in to_pyarrow_listarray(data, pa_type)
   1437 else:
-> 1438     return pa.array(data, pa_type.storage_dtype)

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:317, in pyarrow.lib.array()

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:39, in pyarrow.lib._sequence_to_array()

File ~/.local/lib/python3.8/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/.local/lib/python3.8/site-packages/pyarrow/error.pxi:123, in pyarrow.lib.check_status()

ArrowTypeError: Could not convert 255 with type int: was not a sequence or recognized null for conversion to list type

During handling of the above exception, another exception occurred:

ArrowTypeError                            Traceback (most recent call last)
File ~/.local/lib/python3.8/site-packages/datasets/builder.py:1597, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1596 num_shards = shard_id + 1
-> 1597 num_examples, num_bytes = writer.finalize()
   1598 writer.close()

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:581, in ArrowWriter.finalize(self, close_stream)
    580     self.hkey_record = []
--> 581 self.write_examples_on_file()
    582 # If schema is known, infer features even if no examples were written

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:446, in ArrowWriter.write_examples_on_file(self)
    442         batch_examples[col] = [
    443             row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
    444             for row in self.current_examples
    445         ]
--> 446 self.write_batch(batch_examples=batch_examples)
    447 self.current_examples = []

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:551, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
    550 typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
--> 551 arrays.append(pa.array(typed_sequence))
    552 inferred_features[col] = typed_sequence.get_inferred_type()

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:231, in pyarrow.lib.array()

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:110, in pyarrow.lib._handle_arrow_array_protocol()

File ~/.local/lib/python3.8/site-packages/datasets/arrow_writer.py:179, in TypedSequence.__arrow_array__(self, type)
    178 if isinstance(pa_type, _ArrayXDExtensionType):
--> 179     storage = to_pyarrow_listarray(data, pa_type)
    180     return pa.ExtensionArray.from_storage(pa_type, storage)

File ~/.local/lib/python3.8/site-packages/datasets/features/features.py:1438, in to_pyarrow_listarray(data, pa_type)
   1437 else:
-> 1438     return pa.array(data, pa_type.storage_dtype)

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:317, in pyarrow.lib.array()

File ~/.local/lib/python3.8/site-packages/pyarrow/array.pxi:39, in pyarrow.lib._sequence_to_array()

File ~/.local/lib/python3.8/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/.local/lib/python3.8/site-packages/pyarrow/error.pxi:123, in pyarrow.lib.check_status()

ArrowTypeError: Could not convert 255 with type int: was not a sequence or recognized null for conversion to list type

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Input In [67], in <cell line: 2>()
      1 from datasets import load_dataset
----> 2 squad_it_dataset = load_dataset('coco.py', data_files="data/coco.json", data_dir='/home/vision-ai/nas_data/vision_inspection/coco2017/')

File ~/.local/lib/python3.8/site-packages/datasets/load.py:1757, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
   1754 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1756 # Download and prepare data
-> 1757 builder_instance.download_and_prepare(
   1758     download_config=download_config,
   1759     download_mode=download_mode,
   1760     ignore_verifications=ignore_verifications,
   1761     try_from_hf_gcs=try_from_hf_gcs,
   1762     num_proc=num_proc,
   1763 )
   1765 # Build dataset for splits
   1766 keep_in_memory = (
   1767     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1768 )

File ~/.local/lib/python3.8/site-packages/datasets/builder.py:860, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    858     if num_proc is not None:
    859         prepare_split_kwargs["num_proc"] = num_proc
--> 860     self._download_and_prepare(
    861         dl_manager=dl_manager,
    862         verify_infos=verify_infos,
    863         **prepare_split_kwargs,
    864         **download_and_prepare_kwargs,
    865     )
    866 # Sync info
    867 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/.local/lib/python3.8/site-packages/datasets/builder.py:1611, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_splits_kwargs)
   1610 def _download_and_prepare(self, dl_manager, verify_infos, **prepare_splits_kwargs):
-> 1611     super()._download_and_prepare(
   1612         dl_manager, verify_infos, check_duplicate_keys=verify_infos, **prepare_splits_kwargs
   1613     )

File ~/.local/lib/python3.8/site-packages/datasets/builder.py:953, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    949 split_dict.add(split_generator.split_info)
    951 try:
    952     # Prepare split will record examples associated to the split
--> 953     self._prepare_split(split_generator, **prepare_split_kwargs)
    954 except OSError as e:
    955     raise OSError(
    956         "Cannot find data file. "
    957         + (self.manual_download_instructions or "")
    958         + "\nOriginal error:\n"
    959         + str(e)
    960     ) from None

File ~/.local/lib/python3.8/site-packages/datasets/builder.py:1449, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)
   1447 gen_kwargs = split_generator.gen_kwargs
   1448 job_id = 0
-> 1449 for job_id, done, content in self._prepare_split_single(
   1450     gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1451 ):
   1452     if done:
   1453         result = content

File ~/.local/lib/python3.8/site-packages/datasets/builder.py:1606, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1604     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1605         e = e.__context__
-> 1606     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1608 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

Is there anything wrong in my code ?

I don't get what

'ArrowTypeError: Could not convert 255 with type int: was not a sequence or recognized null for conversion to list type'

means

I kept seeing those error messages but i still dont' get it..

Solution

The features should be like that (pay attention on the type of the "image"):

feature_dict = {
    'filename': datasets.Value(dtype='string'),
    'imgid': datasets.Value(dtype='int64'),
    'tokens': datasets.Sequence(feature=datasets.Sequence(feature=datasets.Value(dtype='string'), length=-1), length=1),
    'sentences': datasets.Sequence(datasets.Value(dtype='string'), length=1),
    'split': datasets.Value(dtype='string'),
    'sentids': datasets.Sequence(feature=datasets.Value(dtype='int64'), length=1),
    'image': datasets.Image(decode=True)
}

'image': datasets.Image(decode=True)