Search code examples
c++memorydata-structuresboost-serializationoverhead

Why is there space overhead when deserializing from a binary archive into a std::map


This is my program:

/// Deserializes the parameter map from the Boost binary archive stored at
/// /tmp/all_params.
///
/// @param myParams  Map that receives the deserialized entries.
///
/// Failures are reported through syslog instead of being propagated: a file
/// that cannot be opened is logged and the function returns, and a
/// boost::archive::archive_exception is caught and logged together with its
/// reason. Any other exception type is not caught here and reaches the caller.
/// NOTE(review): after an archive exception myParams may hold a partial
/// result -- confirm whether callers need it cleared.
void loadB(map<unsigned int,myParam> & myParams)
{
    std::ifstream ifs("/tmp/all_params", std::ios::in | std::ios::binary);
    if( !ifs ){
        // Previously this case was silently ignored, leaving no trace of why
        // the map stayed untouched.
        syslog(LOG_NOTICE, "Unable to open /tmp/all_params for deserializing params");
        return;
    }

    try{
        boost::archive::binary_iarchive ia(ifs);
        ia >> myParams;
        // No explicit close(): the ifstream destructor releases the file on
        // every path, including the exceptional one.
    }catch(const boost::archive::archive_exception& ex){
        syslog(LOG_NOTICE, "Archive Exception during deserializing params: %s", ex.what());
    }
}

The size of the file "/tmp/all_params" is 133 MB, but when I load it with the loadB() function, memory consumption exceeds 650 MB resident (1.7 GB virtual). Does that make sense?

PID   USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
16619 root      20   0 1767468 653772   2988 S   3.7  8.0   0:06.21 engine                                                                                                                                           

Solution

  • Of course it makes sense.

    E.g. when /tmp/all_params is a file generated with the following program:

    Live On Coliru

    #include <boost/serialization/map.hpp>
    #include <boost/archive/binary_oarchive.hpp>
    #include <boost/archive/binary_iarchive.hpp>
    #include <boost/random.hpp>
    #include <boost/bind.hpp>
    
    // Parameter record holding a single string payload.
    // Kept as a plain aggregate so it can be brace-initialized from a string.
    struct myParam {
        std::string data;

        // Single entry point used for both saving and loading: hands `data`
        // to the archive through its operator&. The version argument is unused.
        template <class Archive>
        void serialize(Archive& archive, const unsigned int /*version*/) {
            archive & data;
        }
    };
    
    // Returns a random byte string. Both the length (0-255) and every character
    // are drawn from one function-local generator that persists across calls.
    static inline std::string generate_value() {
        static auto next_byte = boost::bind(boost::uniform_int<unsigned char>(0,255), boost::mt19937{});

        // The first draw fixes how many further draws make up the string.
        const unsigned length = next_byte();

        std::string result;
        for (unsigned i = 0; i < length; ++i)
            result.push_back(next_byte());
        return result;
    }
    
    using Map = std::map<unsigned int,myParam>;
    
    // Builds a map with the keys 0 .. n-1, each mapped to a myParam wrapping a
    // freshly generated random string.
    Map generate_data(unsigned n) {
        Map result;

        unsigned key = 0;
        while (key < n) {
            result.emplace(key, myParam { generate_value() });
            ++key;
        }

        return result;
    }
    
    #include <fstream>
    #include <iostream>
    
    int main() {
        {
            std::ofstream ofs("/tmp/all_params", std::ios::binary);
            boost::archive::binary_oarchive oa(ofs);
    
            auto data = generate_data(10ul<<19);
            oa << data;
            std::cout << "Serialized " << data.size() << " entries\n";
        }
    }
    

    The file was 698 MiB on my system. The memory footprint looks like this (it takes a while to run):

    ==27420== Memcheck, a memory error detector
    ==27420== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
    ==27420== Using Valgrind-3.10.0.SVN and LibVEX; rerun with -h for copyright info
    ==27420== Command: ./test
    ==27420== 
    Serialized 5242880 entries
    ==27420== 
    ==27420== HEAP SUMMARY:
    ==27420==     in use at exit: 0 bytes in 0 blocks
    ==27420==   total heap usage: 47,021,247 allocs, 47,021,247 frees, 3,069,877,283 bytes allocated
    ==27420== 
    ==27420== All heap blocks were freed -- no leaks are possible
    ==27420== 
    

    The peak usage snapshot was at 1.2 GiB:

    enter image description here

    Of course you can optimize the memory layout, e.g. by using Boost Flat Map (with the ordered_unique_range_t insertion overloads!) and a custom allocator for e.g. the strings there. This will reduce/eliminate the overhead:

    enter image description here

    The tweaked code:

    #include <boost/serialization/map.hpp>
    #include <boost/serialization/collections_load_imp.hpp>
    #include <boost/serialization/collections_save_imp.hpp>
    #include <boost/container/flat_map.hpp>
    #include <boost/archive/binary_oarchive.hpp>
    #include <boost/archive/binary_iarchive.hpp>
    #include <boost/random.hpp>
    #include <boost/bind.hpp>
    #include <boost/utility/string_ref.hpp>
    #include <cassert>
    
    // Append-only character pool. Strings are copied into one large buffer and
    // handed out as non-owning views, so there is no per-string allocation.
    namespace string_pool {
        // Backing buffer with its full capacity reserved up front. Entries are
        // views into this buffer, so it must never reallocate after the first
        // entry has been handed out.
        static auto pool = []{
            std::vector<char> init;
            init.reserve(700ul<<20); // 700MiB
            return init;
        }();

        // Non-owning (pointer, length) view of characters stored in `pool`.
        using entry = boost::string_ref;

        // Copies `s` to the end of the pool and returns a view of the copy.
        // Precondition (checked by assert only): the reserved capacity can
        // hold the additional characters, so the insert does not reallocate.
        entry add(std::string const& s) {
            assert((pool.capacity() >= (pool.size() + s.size())));

            // Remember the position as an offset: an iterator taken before the
            // insert is invalidated by it, and for an empty `s` it would be
            // end(), which must not be dereferenced.
            auto const offset = pool.size();
            pool.insert(pool.end(), s.begin(), s.end());
            return { pool.data() + offset, s.size() };
        }

        // Generates a random string of 0-255 random bytes and stores it in the
        // pool. The generator and the scratch string are function-local
        // statics shared by all calls.
        static inline entry generate_random() {
            static auto rand_char = boost::bind(boost::uniform_int<unsigned char>(0,255), boost::mt19937{});

            static std::string s; // non-reentrant, but for lazy demo
            s.resize(rand_char());
            std::generate_n(s.begin(), s.size(), rand_char);
            return add(s);
        }
    }
    
    // Parameter record whose characters live in string_pool. It is serialized
    // as a plain std::string through separate save/load members.
    struct myParam {
        string_pool::entry data;

        // Saving: materialize the referenced characters as a std::string and
        // hand that to the archive. The version argument is unused.
        template <typename Archive>
        void save(Archive& archive, unsigned /*version*/) const {
            std::string text = data.to_string();
            archive & text;
        }

        // Loading: read a std::string from the archive, copy it into the pool
        // and keep only the returned view. The version argument is unused.
        template <typename Archive>
        void load(Archive& archive, unsigned /*version*/) {
            std::string text;
            archive & text;
            data = string_pool::add(text);
        }

        BOOST_SERIALIZATION_SPLIT_MEMBER()
    };
    
    // flat map serialization
    namespace boost { 
    namespace serialization {
    
        template<class Archive, typename...TArgs>
        inline void save(
            Archive & ar,
            const boost::container::flat_map<TArgs...> &t,
            const unsigned int /* file_version */
        ){
            boost::serialization::stl::save_collection<
                Archive, 
                boost::container::flat_map<TArgs...> 
            >(ar, t);
        }
    
        template<class Archive, typename...TArgs>
        inline void load(Archive & ar, boost::container::flat_map<TArgs...> &t, const unsigned int /* file_version */) {
            boost::serialization::stl::load_collection<Archive, boost::container::flat_map<TArgs...>,
                boost::serialization::stl::archive_input_map<Archive, boost::container::flat_map<TArgs...> >, 
                boost::serialization::stl::reserve_imp   <boost::container::flat_map<TArgs...> >
            >(ar, t);
        }
    
        // split non-intrusive serialization function member into separate
        // non intrusive save/load member functions
        template<class Archive, typename...TArgs>
        inline void serialize(Archive & ar, boost::container::flat_map<TArgs...> &t, const unsigned int file_version) {
            boost::serialization::split_free(ar, t, file_version);
        }
    }
    }
    
    using Map = boost::container::flat_map<unsigned int,myParam>;
    
    // Builds a flat map with the keys 0 .. n-1, each mapped to a myParam that
    // references a freshly pooled random string. Prints the map capacity before
    // and after filling, followed by the summed length of all strings.
    Map generate_data(unsigned n) {
        Map result;
        result.reserve(n);
        std::cout << "Capacity: " << result.capacity() << "\n";

        for (unsigned key = 0; key != n; ++key)
            result.emplace(key, myParam { string_pool::generate_random() });

        std::cout << "Capacity: " << result.capacity() << "\n";

        // Sum the payload lengths of all entries.
        unsigned long total_length = 0ul;
        for (Map::value_type const& item : result)
            total_length += item.second.data.size();
        std::cout << "Total length: " << total_length << "\n";

        return result;
    }
    
    #include <fstream>
    #include <iostream>
    
    int main() {
        {
            std::ofstream ofs("/tmp/all_params", std::ios::binary);
            boost::archive::binary_oarchive oa(ofs);
    
            auto data = generate_data(10ul<<19);
            oa << data;
            std::cout << "Serialized " << data.size() << " entries\n";
        }
    }
    

    The md5sum of the generated /tmp/all_params file matched that of the first version: ac75521dc0dc65585368677c834613cb, proving that the data serialized is actually the same.