Search code examples
c++ stl hdf5

How to best write out a std::vector<std::string> container to an HDF5 dataset?


Given a vector of strings, what is the best way to write them out to an HDF5 dataset? At the moment I'm doing something like the following:

  // Size, in bytes, of the character buffer inside each TempContainer record.
  const unsigned int MaxStrLength = 512;

  // One fixed-size record holding a single string. writeVector copies each
  // std::string into one of these and describes the `string` member to HDF5
  // via sizeof (TempContainer) and HOFFSET (TempContainer, string).
  struct TempContainer {
    char string[MaxStrLength];
  };

  // Writes every string in `v` to a new one-dimensional dataset named "files"
  // created under `group`. Each element is stored as one compound record with
  // a single fixed-length member "string" of MaxStrLength bytes.
  //
  // Strings longer than MaxStrLength - 1 characters are truncated so that the
  // stored buffer is always NUL-terminated.
  //
  // group : identifier of the HDF5 group (or file) that will own the dataset.
  // v     : the strings to write; may be empty, in which case the dataset is
  //         created with zero records and nothing is written.
  void writeVector (hid_t group, std::vector<std::string> const & v)
  {
    //
    // Firstly copy the contents of the vector into a temporary container
    std::vector<TempContainer> tc;
    tc.reserve (v.size ());
    for (std::vector<std::string>::const_iterator i = v.begin ()
                                              , end = v.end ()
      ; i != end
      ; ++i)
    {
      // Value-initialise so the whole buffer starts zeroed, then copy at most
      // MaxStrLength - 1 characters: strncpy does not append a terminator
      // when the source fills the destination, so the last byte is reserved.
      TempContainer t = TempContainer ();
      strncpy (t.string, i->c_str (), MaxStrLength - 1);
      tc.push_back (t);
    }


    //
    // Write the temporary container to a dataset
    hsize_t     dims[] = { static_cast<hsize_t> (tc.size ()) } ;
    hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                               , dims
                               , NULL);

    hid_t strtype = H5Tcopy (H5T_C_S1);
    H5Tset_size (strtype, MaxStrLength);

    hid_t datatype = H5Tcreate (H5T_COMPOUND, sizeof (TempContainer));
    H5Tinsert (datatype
      , "string"
      , HOFFSET(TempContainer, string)
      , strtype);

    hid_t dataset = H5Dcreate1 (group
                          , "files"
                          , datatype
                          , dataspace
                          , H5P_DEFAULT);

    // &tc[0] is undefined for an empty vector, so only write when there is
    // at least one record.
    if (!tc.empty ())
    {
      H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &tc[0] );
    }

    H5Dclose (dataset);
    H5Sclose (dataspace);
    H5Tclose (strtype);
    H5Tclose (datatype);
  }

At a minimum, I would really like to change the above so that:

  1. It uses variable length strings
  2. I don't need to have a temporary container

I have no restrictions on how I store the data; for example, it doesn't have to be a COMPOUND datatype if there is a better way to do this.

EDIT: Just to narrow the problem down: I'm relatively familiar with manipulating the data on the C++ side; it's the HDF5 side where I need the most help.

Thanks for your help.


Solution

  • [Many thanks to dirkgently for his help in answering this.]

    To write a variable length string in HDF5 use the following:

    // Create the datatype as follows: start from a copy of the C string
    // type and set its size to H5T_VARIABLE.
    hid_t datatype = H5Tcopy (H5T_C_S1);
    H5Tset_size (datatype, H5T_VARIABLE);
    
    // 
    // Pass the string to be written to H5Dwrite
    // using the address of the pointer!
    // Here `v` is a single std::string and `dataset` is an already-created
    // dataset identifier (neither is declared in this fragment). The buffer
    // argument is &s, i.e. a pointer to the `const char *`, not the
    // character data itself.
    const char * s = v.c_str ();
    H5Dwrite (dataset
      , datatype
      , H5S_ALL
      , H5S_ALL
      , H5P_DEFAULT
      , &s );
    

    One solution for writing a container is to write each element individually. This can be achieved using hyperslabs.

    For example:

    class WriteString
    {
    public:
      WriteString (hid_t dataset, hid_t datatype
          , hid_t dataspace, hid_t memspace)
        : m_dataset (dataset), m_datatype (datatype)
        , m_dataspace (dataspace), m_memspace (memspace)
        , m_pos () {}
    
    private:
      hid_t m_dataset;
      hid_t m_datatype;
      hid_t m_dataspace;
      hid_t m_memspace;
      int m_pos;
    

    //...

    public:
      void operator ()(std::vector<std::string>::value_type const & v)
      {
        // Select the file position, 1 record at position 'pos'
        hsize_t count[] = { 1 } ;
        hsize_t offset[] = { m_pos++ } ;
        H5Sselect_hyperslab( m_dataspace
          , H5S_SELECT_SET
          , offset
          , NULL
          , count
          , NULL );
    
        const char * s = v.c_str ();
        H5Dwrite (m_dataset
          , m_datatype
          , m_memspace
          , m_dataspace
          , H5P_DEFAULT
          , &s );
        }    
    };
    

    // ...

    // Writes every string in `v` to a new one-dimensional dataset named
    // "files" under `group`, storing each element as a variable-length
    // string. Elements are written one at a time by a WriteString functor,
    // which selects a one-record hyperslab in the file dataspace per call.
    //
    // group : identifier of the HDF5 group (or file) that will own the dataset.
    // v     : the strings to write, in order.
    void writeVector (hid_t group, std::vector<std::string> const & v)
    {
      // File dataspace: one record per element of v.
      hsize_t     dims[] = { static_cast<hsize_t> (v.size ()) } ;
      hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                        , dims, NULL);

      // Memory dataspace: a single record, reused for every write.
      dims[0] = 1;
      hid_t memspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                        , dims, NULL);

      hid_t datatype = H5Tcopy (H5T_C_S1);
      H5Tset_size (datatype, H5T_VARIABLE);

      hid_t dataset = H5Dcreate1 (group, "files", datatype
                                 , dataspace, H5P_DEFAULT);

      //
      // Select the "memory" to be written out - just 1 record.
      hsize_t offset[] = { 0 } ;
      hsize_t count[] = { 1 } ;
      H5Sselect_hyperslab( memspace, H5S_SELECT_SET, offset
                         , NULL, count, NULL );

      // std::for_each takes the functor by value and applies that one copy
      // to every element, so its internal position advances across calls.
      std::for_each (v.begin ()
          , v.end ()
          , WriteString (dataset, datatype, dataspace, memspace));

      H5Dclose (dataset);
      H5Sclose (dataspace);
      H5Sclose (memspace);
      H5Tclose (datatype);
    }