Boost does not give a merge method to merge set or multiset.
So I need to do something like this to merge.
int main() {
boost::container::flat_multiset<int> set1 = {1, 2, 3};
boost::container::flat_multiset<int> set2 = {3, 4, 5};
// Manually insert elements from set2 into set1
for (const auto& value : set2) {
set1.insert(value);
}
}
Is it efficient? How to deal with multiple sets ?
So, I went ahead and checked my prediction:
@TedZach why? The ordered_range insertion is likely (much) more efficient. – sehe 1 hour ago
Indeed, merging 100 random flat multisets of 10'000 elements is MUCH faster using the specialized order insertion than the naive approach. Doing it correctly with std::merge
is very clumsy because you cannot insert directly into the destination anyways:
The behavior is undefined if the destination range overlaps either of the input ranges (the input ranges may overlap each other).
So no, you most definitely don't want std::merge
.
You CAN do a lot of work and manually inplace-merge into a fixed sequence to then adopt into a multiset, but that's not worth the effort, and merely does the same as the ordered insertion anyways. This becomes even more felt if you need to support unique as well as multi-sets, or sets with custom order predicates.
#include <algorithm>
#include <boost/container/flat_set.hpp>
#include <boost/container_hash/hash.hpp>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>
#include <ranges>
#include <vector>
namespace bc = boost::container;
using set_t = bc::flat_multiset<int>;
using data_t = std::vector<set_t>;
using namespace std::chrono_literals;
namespace /*file static*/ {
auto generate_sets(unsigned num_sets, unsigned set_size = 10'000) {
data_t sets(num_sets);
std::mt19937 prng{std::random_device{}()};
auto gen = bind(std::uniform_int_distribution<int>{-9'999, +9'999}, prng);
for (auto& set : sets)
generate_n(inserter(set, set.end()), set_size, gen);
return sets;
}
// the naive approach
set_t naive_merge(data_t const& sets) {
set_t r;
for (auto const& s : sets)
r.insert(s.begin(), s.end());
return r;
}
// std::merge really doesn't fit the bill here:
set_t std_merge(data_t const& sets) {
if (sets.empty())
return {};
set_t r(sets.front()), tmp;
for (auto const& s : sets | std::views::drop(1)) {
tmp.clear();
std::merge(r.begin(), r.end(), s.begin(), s.end(), inserter(tmp, tmp.end()));
r.swap(tmp);
}
return r;
}
// the correct usage of ordered flatmultiset::insert!
set_t fast_merge(data_t const& sets) {
set_t r;
for (auto const& s : sets)
r.insert(bc::ordered_range, s.begin(), s.end());
return r;
}
// this is the manual to do similar to fast_merge:
set_t manual_inplace(data_t const& sets) {
set_t::sequence_type underlying;
// preallocate
underlying.reserve( //
accumulate(sets.begin(), sets.end(), size_t{},
[](size_t acc, set_t const& s) { return acc + s.size(); }));
// using repeated inplace_merge:
for (auto const& s : sets) {
underlying.reserve(underlying.size() + s.size()); // avoid iterator invalidation
auto m = underlying.end();
underlying.insert(m, s.begin(), s.end());
std::inplace_merge(underlying.begin(), m, underlying.end());
}
// adopt the sequence!
return {bc::ordered_range, underlying.begin(), underlying.end()};
}
} // namespace
int main() {
auto bench = [sets = generate_sets(100, 10'000)](auto caption, set_t (*f)(data_t const&)) {
static auto constexpr now = std::chrono::high_resolution_clock::now;
auto start = now();
auto r = f(sets);
auto stop = now();
size_t checksum = boost::hash_range(begin(r), end(r));
std::cout << caption << "\t" << (stop - start) / 1.ms << "ms\t" //
<< std::hex << checksum << std::dec << "\n";
return r;
};
// benchmark
bench("naive_merge", naive_merge);
bench("std_merge", std_merge);
bench("fast_merge", fast_merge);
bench("manual_inplace", manual_inplace);
}
Printing
naive_merge 96.4958ms e81a4707eb9e51c6
std_merge 573.727ms e81a4707eb9e51c6
fast_merge 67.776ms e81a4707eb9e51c6
manual_inplace 78.4506ms e81a4707eb9e51c6
Or locally: