Search code examples
c, hdf5

How to write fixed length strings in HDF5?


I have some C++ code that is calling the HDF5 C API to write out fixed length strings. For some reason, the result is total garbage. Here is an example of the h5dump:

DATASET "simple" {
   DATATYPE  H5T_STRING {
      STRSIZE 10;
      STRPAD H5T_STR_NULLTERM;
      CSET H5T_CSET_ASCII;
      CTYPE H5T_C_S1;
   }
   DATASPACE  SIMPLE { ( 100 ) / ( 100 ) }
   DATA {
   (0): "\001\026", "", "\37777777635\177", "", "y\026", "",
   (6): "\37777777635\177", "", "\37777777761\026", "",
   (10): "\37777777635\177", "", "i\027", "", "\37777777635\177", "",
   (16): "\37777777741\027", "", "\37777777635\177", "", "Y\030", "",
   (22): "\37777777635\177", "", "\37777777721\030", "",
   (26): "\37777777635\177", "", "I\031", "", "\37777777635\177", "",
   (32): "\37777777701\031", "", "\37777777635\177", "", "9\032", "",

But if I instead set the size with H5Tset_size (datatype_id, H5T_VARIABLE); the output looks as expected.

--- EDIT ---- Here is a smaller example, in plain C, which has the same problem. It can be built with h5cc:

#include <string.h>
#include <stdio.h>

#include "hdf5.h"
#define FILE "chard.h5"

/* Minimal reproducer: creates the file named by FILE and writes a 4-element
 * dataset "simp" whose element type is a copy of H5T_C_S1 resized to 5 bytes.
 * Every return code is stored in `status` but none is ever inspected. */
int main() {

   hid_t       file_id, dataset_id, dataspace_id;  /* identifiers */
   herr_t      status;
   hid_t       dtype;
   size_t      size;

   file_id = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

   /* one-dimensional dataspace with 4 elements */
   hsize_t dims[1] =  {4};
   dataspace_id = H5Screate_simple(1, dims, NULL);


   /* fixed-size string type: 5 bytes per element */
   dtype = H5Tcopy (H5T_C_S1);
   size = 5;
   status = H5Tset_size (dtype, size);

   /* strs is an array of four POINTERS to string literals; the characters
    * themselves are not stored inside this array. */
   char *strs[4] = {
     "this",
     "this",
     "this",
     "this"};

   dataset_id = H5Dcreate(file_id, "simp", dtype, dataspace_id, H5P_DEFAULT,
       H5P_DEFAULT, H5P_DEFAULT);

   /* the pointer array itself is what gets passed as the data buffer */
   status = H5Dwrite (dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs);

   /* dtype is not closed here; only the dataset, dataspace and file are */
   status = H5Dclose(dataset_id);
   status = H5Sclose(dataspace_id);
   status = H5Fclose(file_id);
}

--- END EDIT ---

Here is my code:

#include <iostream>
#include <assert.h>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "hdf5.h"

// Builds a heap-allocated array holding, in order, the c_str() pointer of
// every string in v. The array is allocated with new[] and handed to the
// caller, who must release it with delete[]. The character data is not
// copied: each entry points into the corresponding string of v.
const char** vec_to_ptr(const std::vector<std::string>& v) {
  const char** out = new const char*[v.size()];
  const char** cursor = out;
  for (const std::string& s : v) {
    *cursor = s.c_str();
    ++cursor;
  }
  return out;
}

// Creates the string datatype used for the dataset: a copy of H5T_C_S1 whose
// size is set to a fixed 10. The vector argument is not inspected. The
// returned identifier is not closed by this function.
hid_t get_datatype_id(const std::vector<std::string>& v) {
  (void)v;  // parameter is deliberately unused; silences the compiler

  const hid_t string_type = H5Tcopy (H5T_C_S1);

  /* H5Tset_size (string_type, H5T_VARIABLE); */

  // With the commented-out variable-length call above in place of the
  // fixed-size call below, this code works well... but compression isn't
  // nearly as good since almost all my strings are of the same length
  const herr_t rc = H5Tset_size (string_type, 10);
  assert( rc >= 0 );

  return string_type;
}

// Writes a vector of strings to a new one-dimensional dataset that is stored
// as a single gzip-compressed chunk.
//
// str_vec: a vector of string to be written out
// group_id: a group_id which has already been opened
// dataset_name: the name of the dataset to create and write out to
// release_type: if 'true', close the datatype once the data is written
// compression_level: the level of compression (6 seems reasonable)
//
// return: the status of the last H5 close call that was made (H5Tclose when
//         release_type is true, H5Sclose otherwise)
template <typename T>
herr_t data_to_h5(
    const std::vector<T>& str_vec,
    hid_t group_id,
    const std::string& dataset_name,
    bool release_type,
    unsigned int compression_level = 6
    ) {
  herr_t status;

  hsize_t dims[1] = {str_vec.size()};

  // create the property list which allows for compression
  hid_t prop_id = H5Pcreate(H5P_DATASET_CREATE);
  assert( prop_id >= 0 );
  // chunk size is same size as vector
  status = H5Pset_chunk(prop_id, 1, dims);
  assert( status >= 0 );
  // compress using gzip
  status = H5Pset_deflate(prop_id, compression_level);
  assert( status >= 0 );

  // create the data type
  hid_t datatype_id = get_datatype_id(str_vec);
  assert( datatype_id >= 0 );

  // create the dataspace
  hid_t dataspace_id = H5Screate_simple(1, dims, NULL);
  assert( dataspace_id >= 0 );

  // create the dataset
  hid_t dataset_id = H5Dcreate(group_id, dataset_name.c_str(), datatype_id,
      dataspace_id, H5P_DEFAULT, prop_id, H5P_DEFAULT);
  assert( dataset_id >= 0 );

  // get the ptrs from the strings and write out. The unique_ptr owns the
  // pointer array so it is released whether or not release_type is set
  // (previously it leaked when release_type was false).
  // NOTE(review): the buffer passed to H5Dwrite is an array of char pointers,
  // not contiguous character data -- confirm this matches the memory layout
  // expected for the datatype returned by get_datatype_id.
  std::unique_ptr<const char*[]> ptr(vec_to_ptr(str_vec));
  status = H5Dwrite(dataset_id, datatype_id, H5S_ALL, H5S_ALL, H5P_DEFAULT,
      ptr.get());
  assert( status >= 0 );

  status = H5Pclose(prop_id);
  assert( status >= 0 );
  status = H5Dclose(dataset_id);
  assert( status >= 0 );
  status = H5Sclose(dataspace_id);
  assert( status >= 0 );
  if (release_type) {
    status = H5Tclose(datatype_id);
    assert( status >= 0 );
  }

  return status;
}

int main(int argc, char *argv[])
{
  if (argc != 2) {
    std::cerr << "Usage: h5_string output.h5" << std::endl;
    return 1;
  }

  std::string fname(argv[1]);
  std::cout << "Opening file " << fname << std::endl;

  std::vector<std::string> str_vec;

  // make a bunch of strings
  const auto NSTR = 100;
  for (auto i = 0; i < NSTR; ++i) {
    std::string cur_str = std::to_string( i );
    str_vec.push_back( cur_str );
  }

  hid_t file_id;
  file_id = H5Fcreate(fname.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

  herr_t status;

  hid_t root;
  root = H5Gopen(file_id, "/", H5P_DEFAULT);
  assert(status >= 0);

  status = data_to_h5(str_vec, root, "simple", true, 6);
  assert(status >= 0);

  status = H5Gclose(root);
  assert(status >= 0);
  status = H5Fclose(file_id);
  assert(status >= 0);

  return 0;
}

And the corresponding CMakeLists.txt:

# Builds the h5_string example. HDF5 is mandatory: configuration stops with a
# fatal error when it cannot be found.
find_package(HDF5)

add_executable(h5_string h5_string.cpp)

if(HDF5_FOUND)
    # HDF5_INCLUDE_DIRS is the variable FindHDF5 documents; HDF5_INCLUDE_DIR is
    # its deprecated spelling.
    include_directories( ${HDF5_INCLUDE_DIRS} )
    target_link_libraries( h5_string ${HDF5_LIBRARIES} )
else()
    message(FATAL_ERROR "HDF5 not found. Required to output files")
endif()

Any help is much appreciated. Thanks in advance!


Solution

  • Your code works fine in setting the size (however I think you are off by one). The real problem is in how you define your strings.

    char *strs[4] = {
      "this",
      "this",
      "this",
      "this"};
    

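    You are defining them as an array of pointers, so the buffer you hand to H5Dwrite contains addresses rather than characters. A fixed-length string dataset expects the characters themselves stored contiguously, i.e. a 2D char array: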

    char strs[4][4] = {
      "this",
      "this",
      "this",
      "this"};
    

    And also fix the off by one:

    dtype = H5Tcopy (H5T_C_S1);
    size = 4;
    status = H5Tset_size (dtype, size);
    

    I also tend to close the data type once I am done with it:

    status = H5Tclose(dtype);
    

    Here's your test program in full:

    #include <string.h>
    #include <stdio.h>
    
    #include "hdf5.h"
    #define FILE "chard.h5"
    
    /* Writes four 4-character strings to the dataset "simp" in the file named
     * by FILE, using a fixed-size string type (H5T_C_S1 resized to 4 bytes).
     * Return codes are stored in `status` but not inspected. */
    int main() {
    
       hid_t       file_id, dataset_id, dataspace_id;  /* identifiers */
       herr_t      status;
       hid_t       dtype;
       size_t      size;
    
       file_id = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    
       /* one-dimensional dataspace with 4 elements */
       hsize_t dims[1] =  {4};
       dataspace_id = H5Screate_simple(1, dims, NULL);
    
    
       /* element size matches the row width of strs below */
       dtype = H5Tcopy (H5T_C_S1);
       size = 4 * sizeof(char);
       status = H5Tset_size (dtype, size);
    
       /* 2D char array: the 16 characters live contiguously, 4 per row.
        * "this" fills each row exactly, so no terminating '\0' is stored
        * (C accepts this initializer; NOTE(review): C++ rejects it). */
       char strs[4][4] = {
         "this",
         "this",
         "this",
         "this"};
    
       dataset_id = H5Dcreate(file_id, "simp", dtype, dataspace_id, H5P_DEFAULT,
           H5P_DEFAULT, H5P_DEFAULT);
    
       /* strs decays to a pointer to the contiguous character data */
       status = H5Dwrite (dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs);
    
       /* release every identifier that was opened, including the datatype */
       status = H5Dclose(dataset_id);
       status = H5Sclose(dataspace_id);
       status = H5Tclose(dtype);
       status = H5Fclose(file_id);
    
    }
    

    If you compile it, run it, and dump the output file, you get:

     ~$ h5pcc -g -O0 -Wall -o foo foo.c 
     ~$ ./foo
     ~$ h5dump chard.h5
    HDF5 "chard.h5" {
    GROUP "/" {
       DATASET "simp" {
          DATATYPE  H5T_STRING {
             STRSIZE 4;
             STRPAD H5T_STR_NULLTERM;
             CSET H5T_CSET_ASCII;
             CTYPE H5T_C_S1;
          }
          DATASPACE  SIMPLE { ( 4 ) / ( 4 ) }
          DATA {
          (0): "this", "this", "this", "this"
          }
       }
    }
    }