Search code examples
c++boostboost-multi-index

How to find most commonly occurring non-unique keys in Boost MultiIndex?


Boost MultiIndex Container, when defined to have hashed_non_unique keys, can group equivalent keys together and return them all against an equal_range query, as mentioned here. But I see no way of querying the largest range (or n largest ranges) in a set. Without comparing between the range sizes of distinct hashes, which can become computationally very expensive, is there a way to query the largest equal ranges?

If we consider a simple example, such as this one, I would like to query by frequency and get Tom as the first result, and then Jack and Leo in no particular order.


Solution

  • Ok, if you're using non-unique hashed indices, turns out equal_range does not invoke equality comparison for all the elements in the returned range (unlike common implementations of std::unordered_multimap, BTW), so the following can be very efficient:

    template<typename HashIndex>
    std::multimap<
      std::size_t,
      std::reference_wrapper<const typename HashIndex::value_type>,
      std::greater<std::size_t>
    > group_sizes(const HashIndex& i)
    {
      decltype(group_sizes(i)) res;
      for(auto it=i.begin(),end=i.end();it!=end;){
        auto next=i.equal_range(*it).second;
        res.emplace((std::size_t)std::distance(it,next),*it);
        it=next;
      }
      return res;
    }
    

    To check how efficient this actually is, let's try instrumenting the element type:

    Live Coliru Demo

    #include <boost/multi_index_container.hpp>
    #include <boost/multi_index/hashed_index.hpp>
    #include <boost/multi_index/identity.hpp>
    #include <cstring>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <tuple>
    #include <map>
    
    template<typename HashIndex>
    std::multimap<
      std::size_t,
      std::reference_wrapper<const typename HashIndex::value_type>,
      std::greater<std::size_t>
    > group_sizes(const HashIndex& i)
    {
      decltype(group_sizes(i)) res;
      for(auto it=i.begin(),end=i.end();it!=end;){
        auto next=i.equal_range(*it).second;
        res.emplace((std::size_t)std::distance(it,next),*it);
        it=next;
      }
      return res;
    }
    
    struct instrumented_string:std::string
    {
      using std::string::string;
      
      static void reset_nums()
      {
        num_hashes=0;
        num_eqs=0;
      }
      
      static std::size_t num_hashes,num_eqs;
    };
    
    std::size_t instrumented_string::num_hashes=0;
    std::size_t instrumented_string::num_eqs=0;
    
    bool operator==(const instrumented_string& x,const instrumented_string& y)
    {
      ++instrumented_string::num_eqs;
      return static_cast<std::string>(x)==y;
    }
    
    std::size_t hash_value(const instrumented_string& x)
    {
      ++instrumented_string::num_hashes;
      return boost::hash<std::string>{}(x);
    }
    
    using namespace boost::multi_index;
    using container=multi_index_container<
      instrumented_string,
      indexed_by<
        hashed_non_unique<identity<instrumented_string>>
      >
    >;
    
    int main()
    {
      auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
      container c;
      for(auto& v:values){
        for(auto i=100*std::strlen(v);i--;)c.insert(v);
      }
      
      instrumented_string::reset_nums();
      
      auto gs=group_sizes(c);
      for(const auto& g:gs){
        std::cout<<g.first<<": "<<g.second.get()<<"\n";
      }
      
      std::cout<<"# hashes: "<<instrumented_string::num_hashes<<"\n";
      std::cout<<"# eqs: "<<instrumented_string::num_eqs<<"\n";
    }
    

    Output

    800: Subhamoy
    600: Bjarne
    400: Jack
    300: Tom
    300: Leo
    # hashes: 5
    # eqs: 5
    

    So, for a container with 2,400 elements, invoking group_sizes has resulted in just 5 hash calculations and 5 equality comparisons (plus ~2,400 iterator increments, of course).

    If you really want to get rid of hashes, the following can do:

    Live Coliru Demo

    #include <boost/multi_index_container.hpp>
    #include <boost/multi_index/hashed_index.hpp>
    #include <boost/multi_index/identity.hpp>
    #include <cstring>
    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>
    #include <map>
    
    template<typename HashIndex>
    struct internal_reference
    {
      const HashIndex&                      i;
      const typename HashIndex::value_type& r;
      std::size_t                           buc;
    };
    
    template<typename HashIndex>
    struct internal_reference_equal_to
    {
      bool operator()(
        const typename HashIndex::value_type& x,
        const internal_reference<HashIndex>& y)const
      {
        return
          std::addressof(x)==std::addressof(y.r)||
          y.i.key_eq()(y.i.key_extractor()(x),y.i.key_extractor()(y.r));
      }
    
      bool operator()(
        const internal_reference<HashIndex>& x,
        const typename HashIndex::value_type& y)const
      {
        return (*this)(y,x);
      }
    };
    
    template<typename HashIndex>
    struct internal_reference_hash
    {
      std::size_t operator()(const internal_reference<HashIndex>& x)const
      {
        return x.buc;
      }
    };
    
    template<typename HashIndex>
    std::multimap<
      std::size_t,
      std::reference_wrapper<const typename HashIndex::value_type>,
      std::greater<std::size_t>
    > group_sizes(const HashIndex& i)
    {
      decltype(group_sizes(i)) res;
      for(std::size_t buc=0,buc_count=i.bucket_count();buc<buc_count;++buc){
        for(auto it=i.begin(buc),end=i.end(buc);it!=end;){
          auto        p=i.equal_range(
                          internal_reference<HashIndex>{i,*it,buc},
                          internal_reference_hash<HashIndex>{},
                          internal_reference_equal_to<HashIndex>{});
          std::size_t dist=0;
          auto        next=it;
          while(p.first!=p.second){
            ++p.first;
            ++dist;
            ++next;
          }
          res.emplace(dist,*it);
          it=next;
        }
      }
      return res;
    }
    
    struct instrumented_string:std::string
    {
      using std::string::string;
      
      static void reset_nums()
      {
        num_hashes=0;
        num_eqs=0;
      }
      
      static std::size_t num_hashes,num_eqs;
    };
    
    std::size_t instrumented_string::num_hashes=0;
    std::size_t instrumented_string::num_eqs=0;
    
    bool operator==(const instrumented_string& x,const instrumented_string& y)
    {
      ++instrumented_string::num_eqs;
      return static_cast<std::string>(x)==y;
    }
    
    std::size_t hash_value(const instrumented_string& x)
    {
      ++instrumented_string::num_hashes;
      return boost::hash<std::string>{}(x);
    }
    
    using namespace boost::multi_index;
    using container=multi_index_container<
      instrumented_string,
      indexed_by<
        hashed_non_unique<identity<instrumented_string>>
      >
    >;
    
    int main()
    {
      auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
      container c;
      for(auto& v:values){
        for(auto i=100*std::strlen(v);i--;)c.insert(v);
      }
      
      instrumented_string::reset_nums();
      
      auto gs=group_sizes(c);
      for(const auto& g:gs){
        std::cout<<g.first<<": "<<g.second.get()<<"\n";
      }
      
      std::cout<<"# hashes: "<<instrumented_string::num_hashes<<"\n";
      std::cout<<"# eqs: "<<instrumented_string::num_eqs<<"\n";
    }
    

    Output

    800: Subhamoy
    600: Bjarne
    400: Jack
    300: Tom
    300: Leo
    # hashes: 0
    # eqs: 0
    

    But please bear in mind this version of group_sizes exploits the undocumented fact that elements with hash value h get placed in the bucket h%bucket_count() (or, put another way, internal_reference<HashIndex> hashing is technically not a conformant compatible extension of the index hash function).