Search code examples
cperformancelookup-tables

Small Lookup Table for Large Integers


I want to make a quick, relatively small lookup table, but with a large input range: -Input: 32bit Value max. (a 32bit color value) -Output: 8bit Index max. (index to a table)

Something like the code below. (If more than 256 values are indexed, the index is just going to be 0)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static uint8_t getIndx(uint32_t value);
static uint8_t **indx;
static uint8_t count = 0;

int main(void) {
  // set up the indx
  const uint32_t size = 0xFFFF;  // for demonstrative purposes not even nearly as large as wished to be (0xFFFFFFFF) plus my for loop below would get in trouble, I think
  indx = (uint8_t**)malloc(sizeof(uint8_t*) * size);
  if(indx == NULL) {
    printf("could not allocate memory\n");
    return 0;
  }
  for(int i = 0; i < size + 1; i++) {
    indx[i] = NULL;
  }

  printf("%d\n", getIndx(111));
  printf("%d\n", getIndx(222));
  printf("%d\n", getIndx(333));
  printf("%d\n", getIndx(111));
  printf("%d\n", getIndx(222));
  printf("%d\n", getIndx(333));

  return 0;
}

static uint8_t getIndx(uint32_t value) {
  if(indx[value] == NULL) {
    if(count > 255) return 0;
    indx[value] = (uint8_t*)malloc(sizeof(uint8_t));
    *indx[value] = count;
    count++;
  }
  return *(indx[value]);
}

The output is:

0
1
2
0
1
2

No matter what I think of, I always end up with something like that. With an input range of 32bit (4294967296 states) I would need to allocate way too much memory to only get 256 possible outputs. And forming 256 if else, inside a for loop or not, is also not what I want.

Is there some method that is quick, either a table or not which in the end has the same function, that I have not yet heard about?

Many thanks in advance!


Solution

  • You can use a hash table to map the 32-bit values to the look-up table indices. The length of the hash table should be significantly longer than the look-up table to reduce the chance of hash collisions.

    The following example uses a linear search of the hash table starting at the hash value derived from the input value until a matching entry is found or an empty hash table is found. If the input value is not found, and there is room in the look-up table and room in the hash table (which there should be since it is longer than the look-up table), the input value is added to the look-up table and the hash table is updated. It uses a hash table four times the size of the look-up table.

    The example does two passes with the same pseudo-random sequence in each pass. The first pass should mostly fill up the look-up table and update the hash table. The second pass shouldn't make any more changes to the look-up table or hash table because it is just repeating the first sequence.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include <inttypes.h>
    #include <string.h>
    
    struct hash_index {
      uint8_t index;
      char used;
    };
    
    #define HASHBITS  10
    #define HASHSIZE  (1U << HASHBITS)
    #define HASHMASK  (HASHSIZE - 1)
    #define LOOKUPSIZE  256U
    
    static uint8_t getIndx(uint32_t value);
    static uint32_t lookup[LOOKUPSIZE];
    static struct hash_index hashtab[HASHSIZE];
    static unsigned int count = 0;
    static unsigned int tot_collisions = 0;
    static unsigned int max_collisions = 0;
    static unsigned int hash_used = 0;
    
    int main(void) {
      int pass;
      unsigned int i;
      uint32_t value;
      uint8_t index;
      unsigned int successes;
      unsigned int failures;
      int ok;
    
      printf("Lookup size: %u, Hash size: %u\n", LOOKUPSIZE, HASHSIZE);
      for (pass = 1; pass <= 2; pass++) {
        successes = 0;
        failures = 0;
        tot_collisions = 0;
        max_collisions = 0;
        /* Not resetting hash_used because it shouldn't change after first pass. */
        printf("\nPass %d, currently used hashes: %u\n\n", pass, hash_used);
        srand(1);
        for (i = 0; i < 260; i++) {
          static const char * const outcomes[2] = {"FAIL", "OK"};
          value = rand();
          index = getIndx(value);
          ok = lookup[index] == value;
          printf("%" PRIu32 " -> %" PRIu8 " (%s)\n", value, index, outcomes[ok]);
          if (ok) {
            successes++;
          } else {
            failures++;
          }
        }
        printf("\nSuccesses: %u, Failures: %u\n", successes, failures);
        printf("Used hashes: %u, Total collisions: %u, Max collisions: %u\n\n",
               hash_used, tot_collisions, max_collisions);
      }
    
      return 0;
    }
    
    static uint8_t getIndx(uint32_t value) {
      unsigned int initial_hash;
      unsigned int hash;
      unsigned int collisions = 0;
      uint8_t index;
    
      /*
       * Search for value using hash table, starting at position hashed from value.
       *
       * The hash table is longer than the maximum number of used entries,
       * so we should always be able to find an unused entry in the hash table.
       */
      initial_hash = ((value * UINT32_C(0x61c88647)) >> (32 - HASHBITS)) & HASHMASK;
      for (hash = initial_hash;
           collisions < HASHSIZE && hashtab[hash].used;
           hash = (hash + 1) & HASHMASK) {
        /*
         * This hash table entry is used.  Get the corresponding index in the
         * main lookup table to check if the value matches.
         */
        index = hashtab[hash].index;
        if (lookup[index] == value) {
          /* Matching value found.  Return its index in the main table. */
          return index;
        }
        /* Count hash collisions and total hash collisions. */
        collisions++;
        tot_collisions++;
        if (max_collisions < collisions) {
          /* Update max hash collisions for diagnostics. */
          max_collisions = collisions;
        }
      }
      /* Value not found. */
      if (count < LOOKUPSIZE && collisions < HASHSIZE) {
        /*
         * There is room in the main lookup table for the new value
         * and room in the hash table.  The index of the new value in the
         * main lookup table will be the current count, which will be incremented.
         */
        index = count++;
        /*
         * Add value to main lookup table,
         * add index in main lookup table to hash table,
         * and return the index in the main lookup table.
         */
        lookup[index] = value;
        hashtab[hash].index = index;
        hashtab[hash].used = 1;
        hash_used++;  /* Count of used hash table entries for diagnostics. */
        return index;
      }
      /*
       * Value not found and main lookup table is full or hash table is full.
       * Give up.
       */
      return 0;
    }
    

    Another approach for dealing with collisions is to make each hash table entry point to a linked list of matching values, but that is more complicated.