Search code examples
algorithmdata-structuressubsetbinary-decision-diagram

Map<bitset,object>-like data structure that can check subsets of bitsets?


I have a big huge hashtable (so big I can't check every row) (in C++ using boost::unordered_map) where the keys are std::bitset and the values are some structure that I have.

Say I have this in the table:

00010101 -> {"Hello"}
00100100 -> {"Good Bye"}
01111101 -> {"Whatever"}

If I query the map as map[01111101] I want it to return "Whatever". That is fine, its what a map is for. BUT, if I query map[00110101] I want it to return "Hello", BECAUSE "00010101" (the key for Hello) is a subset of "00110101" of my query. I represent sets with bits, I think that's self-explanatory.

If there ever is more than one entry in the table such that the key is a subset of the query, I want them all.

I have no idea if there is anything like this. I am looking at Binary Decision Diagrams, but I have never used them and I am not sure if they would do the trick.

Thanks.


Edit: Set representations. Say I have a group of objects A,B,C,D,E,F,G I have two sets A,B,C and D,F. I would represent them as 1110000 and 0001010 respectively. Therefore: 1110000 is not a subset of 0001010 (or viceversa), but 1000100 is a subset of 1010101.


Solution

  • A map based on a hash table is the wrong data structure here.

    You can gain some efficiency in discovering all matches by storing the bit strings in a trie, where trie nodes contain corresponding strings.

    Unlike the tries in the link's examples, each node in your case will have 0, 1, or 2 children labeled 0 and/or 1.

    Now a lookup in your case moves traverses the trie in a customized manner. For each 1 in the search key, you will search both the corresponding 0 and 1 link of the trie. For each 0, search only the 0 branch. The nodes that you find will be just the ones you want.

    Search time will be proportional to the total bit string length of key values searched, which in the worst case is all elements in the tree.

    Adding code

    Here is a toy C implementation for reference.

    #include <stdio.h>
    #include <stdlib.h>
    #include <limits.h>
    
    // Simple bit vectors of arbitrary length.
    typedef struct {
      unsigned n_bits;
      unsigned *bits;
    } BIT_VECTOR;
    
    void init_bit_vector(BIT_VECTOR *v) {
      v->n_bits = 0;
      v->bits = NULL;
    }
    
    void setup_bit_vector(BIT_VECTOR *v, unsigned n_bits) {
      v->n_bits = n_bits;
      v->bits = calloc((n_bits + WORD_BIT - 1) / WORD_BIT, sizeof(unsigned));
    }
    
    void clear_bit_vector(BIT_VECTOR *v) {
      free(v->bits);
      v->n_bits = 0;
    }
    
    void set_bit_vector(BIT_VECTOR *v, unsigned *bits, unsigned n_bits) {
      unsigned n_words = (n_bits + WORD_BIT - 1) / WORD_BIT;
      for (int i = 0; i < n_words; i++) v->bits[i] = bits[i];
      v->n_bits = n_bits;
    }
    
    unsigned get_bit(BIT_VECTOR *v, int i) {
      unsigned mask = 1u << (i % WORD_BIT);
      return !!(v->bits[i / WORD_BIT] & mask);
    }
    
    // A trie map from bit vectors to strings.
    typedef struct trie_s {
      struct trie_s *b[2];
      char *val;
    } TRIE;
    
    TRIE *make_trie(void) {
      TRIE *trie = malloc(sizeof *trie);
      trie->b[0] = trie->b[1] = NULL;
      trie->val = NULL;
      return trie;
    }
    
    // Add a key/value entry to the given trie map.
    void put(TRIE *trie, BIT_VECTOR *key, char *val) {
      TRIE *p = trie;
      for (int i = 0; i < key->n_bits; ++i) {
        unsigned bit = get_bit(key, i);
        if (!p->b[bit]) p->b[bit] = make_trie();
        p = p->b[bit];
      }
      p->val = val;
    }
    
    // Recursive search that implements the subset membership check.
    static void search(TRIE *trie, BIT_VECTOR *key, int i, char **buf, unsigned *n) {
      if (!trie) return;
      if (i == key->n_bits) {
        if (trie->val) buf[(*n)++] = trie->val;
        return;
      }
      unsigned bit = get_bit(key, i);
      // A standard trie search does this.
      search(trie->b[bit], key, i + 1, buf, n);
      // But here, add a search of the 0 branch if the key bit is 1.
      if (bit) search(trie->b[0], key, i + 1, buf, n);
    }
    
    // Get all entries with keys a subset of the search key.
    unsigned get_all(TRIE *trie, BIT_VECTOR *key, char **buf) {
      int n = 0;
      search(trie, key, 0, buf, &n);
      return n;
    }
    
    typedef struct {
      unsigned bits;
      char *val;
    } EXAMPLE_DATA;
    
    int main(void) {
      TRIE *trie = make_trie();
      #define N (sizeof data / sizeof data[0])
      EXAMPLE_DATA data[] = {
        { 0b00010101, "Hello" },
        { 0b00100100, "Goodbye" },
        { 0b00101101, "Farewell" },
        { 0b01111101, "Whatever"},
      };
      BIT_VECTOR key[1];
      init_bit_vector(key);
      setup_bit_vector(key, 8);
      for (int i = 0; i < N; i++) {
        set_bit_vector(key, &data[i].bits, 8);
        put(trie, key, data[i].val);
      }
      unsigned search_val = 0b00110101;
      set_bit_vector(key, &search_val, 8);
      char *buf[N];
      unsigned n = get_all(trie, key, buf);
      printf("Found:\n");
      for (int i = 0; i < n; i++) 
        printf(" %s", buf[i]);
      printf(".\n");
      clear_bit_vector(key);
      return 0;
    }