Small Lookup Table for Large Integers

I want to make a quick, relatively small lookup table, but with a large input range: -Input: 32bit Value max. (a 32bit color value) -Output: 8bit Index max. (index to a table)

Something like the code below. (If more than 256 values are indexed, the index is just going to be 0)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static uint8_t getIndx(uint32_t value);
static uint8_t **indx;
static uint8_t count = 0;

int main(void) {
  // set up the indx
  const uint32_t size = 0xFFFF;  // for demonstrative purposes not even nearly as large as wished to be (0xFFFFFFFF) plus my for loop below would get in trouble, I think
  indx = (uint8_t**)malloc(sizeof(uint8_t*) * size);
  if(indx == NULL) {
    printf("could not allocate memory\n");
    return 0;
  }
  for(int i = 0; i < size + 1; i++) {
    indx[i] = NULL;
  }

  printf("%d\n", getIndx(111));
  printf("%d\n", getIndx(222));
  printf("%d\n", getIndx(333));
  printf("%d\n", getIndx(111));
  printf("%d\n", getIndx(222));
  printf("%d\n", getIndx(333));

  return 0;
}

static uint8_t getIndx(uint32_t value) {
  if(indx[value] == NULL) {
    if(count > 255) return 0;
    indx[value] = (uint8_t*)malloc(sizeof(uint8_t));
    *indx[value] = count;
    count++;
  }
  return *(indx[value]);
}

The output is:

No matter what I think of, I always end up with something like that. With an input range of 32bit (4294967296 states) I would need to allocate way too much memory to only get 256 possible outputs. And forming 256 if else, inside a for loop or not, is also not what I want.

Is there some method that is quick, either a table or not which in the end has the same function, that I have not yet heard about?

Many thanks in advance!

Solution

You can use a hash table to map the 32-bit values to the look-up table indices. The length of the hash table should be significantly longer than the look-up table to reduce the chance of hash collisions.

The following example uses a linear search of the hash table starting at the hash value derived from the input value until a matching entry is found or an empty hash table is found. If the input value is not found, and there is room in the look-up table and room in the hash table (which there should be since it is longer than the look-up table), the input value is added to the look-up table and the hash table is updated. It uses a hash table four times the size of the look-up table.

The example does two passes with the same pseudo-random sequence in each pass. The first pass should mostly fill up the look-up table and update the hash table. The second pass shouldn't make any more changes to the look-up table or hash table because it is just repeating the first sequence.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>

struct hash_index {
  uint8_t index;
  char used;
};

#define HASHBITS  10
#define HASHSIZE  (1U << HASHBITS)
#define HASHMASK  (HASHSIZE - 1)
#define LOOKUPSIZE  256U

static uint8_t getIndx(uint32_t value);
static uint32_t lookup[LOOKUPSIZE];
static struct hash_index hashtab[HASHSIZE];
static unsigned int count = 0;
static unsigned int tot_collisions = 0;
static unsigned int max_collisions = 0;
static unsigned int hash_used = 0;

int main(void) {
  int pass;
  unsigned int i;
  uint32_t value;
  uint8_t index;
  unsigned int successes;
  unsigned int failures;
  int ok;

  printf("Lookup size: %u, Hash size: %u\n", LOOKUPSIZE, HASHSIZE);
  for (pass = 1; pass <= 2; pass++) {
    successes = 0;
    failures = 0;
    tot_collisions = 0;
    max_collisions = 0;
    /* Not resetting hash_used because it shouldn't change after first pass. */
    printf("\nPass %d, currently used hashes: %u\n\n", pass, hash_used);
    srand(1);
    for (i = 0; i < 260; i++) {
      static const char * const outcomes[2] = {"FAIL", "OK"};
      value = rand();
      index = getIndx(value);
      ok = lookup[index] == value;
      printf("%" PRIu32 " -> %" PRIu8 " (%s)\n", value, index, outcomes[ok]);
      if (ok) {
        successes++;
      } else {
        failures++;
      }
    }
    printf("\nSuccesses: %u, Failures: %u\n", successes, failures);
    printf("Used hashes: %u, Total collisions: %u, Max collisions: %u\n\n",
           hash_used, tot_collisions, max_collisions);
  }

  return 0;
}

static uint8_t getIndx(uint32_t value) {
  unsigned int initial_hash;
  unsigned int hash;
  unsigned int collisions = 0;
  uint8_t index;

  /*
   * Search for value using hash table, starting at position hashed from value.
   *
   * The hash table is longer than the maximum number of used entries,
   * so we should always be able to find an unused entry in the hash table.
   */
  initial_hash = ((value * UINT32_C(0x61c88647)) >> (32 - HASHBITS)) & HASHMASK;
  for (hash = initial_hash;
       collisions < HASHSIZE && hashtab[hash].used;
       hash = (hash + 1) & HASHMASK) {
    /*
     * This hash table entry is used.  Get the corresponding index in the
     * main lookup table to check if the value matches.
     */
    index = hashtab[hash].index;
    if (lookup[index] == value) {
      /* Matching value found.  Return its index in the main table. */
      return index;
    }
    /* Count hash collisions and total hash collisions. */
    collisions++;
    tot_collisions++;
    if (max_collisions < collisions) {
      /* Update max hash collisions for diagnostics. */
      max_collisions = collisions;
    }
  }
  /* Value not found. */
  if (count < LOOKUPSIZE && collisions < HASHSIZE) {
    /*
     * There is room in the main lookup table for the new value
     * and room in the hash table.  The index of the new value in the
     * main lookup table will be the current count, which will be incremented.
     */
    index = count++;
    /*
     * Add value to main lookup table,
     * add index in main lookup table to hash table,
     * and return the index in the main lookup table.
     */
    lookup[index] = value;
    hashtab[hash].index = index;
    hashtab[hash].used = 1;
    hash_used++;  /* Count of used hash table entries for diagnostics. */
    return index;
  }
  /*
   * Value not found and main lookup table is full or hash table is full.
   * Give up.
   */
  return 0;
}

Another approach for dealing with collisions is to make each hash table entry point to a linked list of matching values, but that is more complicated.