Search code examples
chexperfect-hashgperf

How to use gperf to create hash for a range of values?


I have range of hex numbers like these

0xabcd****
0xabdc**89
0x****abcd
0xde****ab
# 50 or so more entries like these
# where * is any hex number

I need a hash a function that will accept a 4byte value and generate Y/N answer for membership.

I tried using gperf but unfortunately it doesn't interpret * as wildcard. Has anyone faced this problem before? My code is in C.


Solution

  • If I can trust my arithmetic, each **** has 16^4 possible values, so the four wild-card specs enumerate 3 * 16^4 + 16^2 values - something on the order of 200,000 - a bit out of the remit of gperf (its docs say a "large" key set is 15,000).

    Wildcards mean regular expressions for me, so why not try that? Here's a try that defines "4byte value" as a uint32_t, and presents a text encoding of such a value to the regex(3) machinery. This may well not be what you were looking for, but as it was fun to cobble up, here you go.

    #include <sys/types.h>
    #include <regex.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <unistd.h>
    
    static regex_t the_re;
    
    int
    matcher_init(void)
    {
        static const char the_re_string[] = "(abcd....|abdc..89|....abcd|de....ab|"
                                             /* OP's 4, plus 46 more... */
                                             "..d80a..|..7bc5..|c6..514a|..b7585a|"
                                             "4732ecc4|7c22e4da|5a5e..63|....e866|"
                                             "..fdc367|ac....b4|70249edc|..e97e32|"
                                             "....94d8|....fa6c|4591..ff|..e4..67|"
                                             "aab285..|....f81b|15bb22ba|3cf4....|"
                                             "57d3ad86|..bd..1e|..ec67b7|..693aaf|"
                                             "323c..18|cab237cb|d4b2c6b4|2a15..2f|"
                                             "....d196|..5e..10|....b1f1|b54e9838|"
                                             "..0cf1..|5c1a..fb|....f34d|19..d34c|"
                                             "..cacb48|..4c2d09|48..bc..|f98cc7..|"
                                             "ac..2b1a|..beb5..|98..03..|..61c35e|"
                                             "....1245|61..5ca8)";
        int res;
    
        if ((res = regcomp(&the_re, the_re_string, REG_EXTENDED|REG_NOSUB)) != 0) {
            char ebuf[256];
    
            (void) regerror(res, &the_re, ebuf, sizeof ebuf);
            (void) fprintf(stderr, "regcomp failed: %s\n", ebuf);
    
            return -1;
        }
    
        return 0;
    }
    
    int
    matcher_matches(uint32_t u)
    {
        char ubuf[9];
    
        (void) sprintf(ubuf, "%08x", u);
    
        return regexec(&the_re, ubuf, 0, 0, 0) == 0;
    }
    
    int
    main(void)
    {
        int i;
        unsigned tf, iterations, matches;
        time_t start;
    
        uint32_t tvals[] = {
                    0xabcd0000, 0xabdc0089, 0x0000abcd, 0xde0000ab,
                    0x00d80a00, 0x007bc500, 0xc600514a, 0x00b7585a,
                    0x4732ecc4, 0x7c22e4da, 0x5a5e0063, 0x0000e866,
                    0x00fdc367, 0xac0000b4, 0x70249edc, 0x00e97e32,
                    0x000094d8, 0x0000fa6c, 0x459100ff, 0x00e40067,
                    0xaab28500, 0x0000f81b, 0x15bb22ba, 0x3cf40000,
                    0x57d3ad86, 0x00bd001e, 0x00ec67b7, 0x00693aaf,
                    0x323c0018, 0xcab237cb, 0xd4b2c6b4, 0x2a15002f,
                    0x0000d196, 0x005e0010, 0x0000b1f1, 0xb54e9838,
                    0x000cf100, 0x5c1a00fb, 0x0000f34d, 0x1900d34c,
                    0x00cacb48, 0x004c2d09, 0x4800bc00, 0xf98cc700,
                    0xac002b1a, 0x00beb500, 0x98000300, 0x0061c35e,
                    0x00001245, 0x61005ca8 };
    
        if (matcher_init() == -1) {
            return 1;
        }
    
        /* test known values */
    
        tf = 0;
    
        for (i = 0; i < sizeof(tvals) / sizeof(tvals[0]); i++) {
            if (!matcher_matches(tvals[i])) {
                (void) printf("0x%08x should match; didn't...\n", tvals[i]);
                tf = 1;
            }
        }
    
        if (tf) {
            return 1;
        }
    
        /* some random probes */
    
        srand((time(0) << 16) | (getpid() & 0xFFFF));
    
        iterations = matches = 0;
    
        (void) time(&start);
    
        for (i = 0; i < 1000000; i++) {
            uint32_t u = (uint32_t) ((rand() << 16) | (rand() & 0xFFFF));
    
            /* printf("Test: 0x%08x\n", u); */
    
            if (matcher_matches(u)) {
                (void) printf("Match: %08x\n", u);
                (void) fflush(stdout);
                matches++;
            }
    
            iterations++;
        }
    
        printf("iterations: %d; matches: %d (%u seconds)\n",
                            iterations, matches,
                            (unsigned) (time(0) - start));
    
        return 0;
    }
    

    The responses to this reminded me of the question itself, and on reflection a more straightforward way occurred to me. Why the better answers don't just occur first, I'll never know.

    Anyway, don't use regular expressions, just a value and a mask. The code above loses its matcher_init call (along with everything regex-related), and the matcher_matches call support could look like the following. The looked-for values match the additional 46 I produced for the first answer, so the same test code in the first answer's main will continue to work.

    I suppose you could sort the struct vm array so that entries with fewer bits set in the mask occur first and buy a modest performance gain, but even as is this second attempt is about five times faster than the regex-based one for me.

    static struct {
        uint32_t val, mask;
    } vm [] = {
        { 0xabcd0000, 0xffff0000 },
        { 0xabdc0089, 0xffff00ff },
        { 0x0000abcd, 0x0000ffff },
        { 0xde0000ab, 0xff0000ff },
        { 0x00d80a00, 0x00ffff00 },
        { 0x007bc500, 0x00ffff00 },
        { 0xc600514a, 0xff00ffff },
        { 0x00b7585a, 0x00ffffff },
        { 0x4732ecc4, 0xffffffff },
        { 0x7c22e4da, 0xffffffff },
        { 0x5a5e0063, 0xffff00ff },
        { 0x0000e866, 0x0000ffff },
        { 0x00fdc367, 0x00ffffff },
        { 0xac0000b4, 0xff0000ff },
        { 0x70249edc, 0xffffffff },
        { 0x00e97e32, 0x00ffffff },
        { 0x000094d8, 0x0000ffff },
        { 0x0000fa6c, 0x0000ffff },
        { 0x459100ff, 0xffff00ff },
        { 0x00e40067, 0x00ff00ff },
        { 0xaab28500, 0xffffff00 },
        { 0x0000f81b, 0x0000ffff },
        { 0x15bb22ba, 0xffffffff },
        { 0x3cf40000, 0xffff0000 },
        { 0x57d3ad86, 0xffffffff },
        { 0x00bd001e, 0x00ff00ff },
        { 0x00ec67b7, 0x00ffffff },
        { 0x00693aaf, 0x00ffffff },
        { 0x323c0018, 0xffff00ff },
        { 0xcab237cb, 0xffffffff },
        { 0xd4b2c6b4, 0xffffffff },
        { 0x2a15002f, 0xffff00ff },
        { 0x0000d196, 0x0000ffff },
        { 0x005e0010, 0x00ff00ff },
        { 0x0000b1f1, 0x0000ffff },
        { 0xb54e9838, 0xffffffff },
        { 0x000cf100, 0x00ffff00 },
        { 0x5c1a00fb, 0xffff00ff },
        { 0x0000f34d, 0x0000ffff },
        { 0x1900d34c, 0xff00ffff },
        { 0x00cacb48, 0x00ffffff },
        { 0x004c2d09, 0x00ffffff },
        { 0x4800bc00, 0xff00ff00 },
        { 0xf98cc700, 0xffffff00 },
        { 0xac002b1a, 0xff00ffff },
        { 0x00beb500, 0x00ffff00 },
        { 0x98000300, 0xff00ff00 },
        { 0x0061c35e, 0x00ffffff },
        { 0x00001245, 0x0000ffff },
        { 0x61005ca8, 0xff00ffff }
    };
    
    int
    matcher_matches(uint32_t u)
    {
        size_t i;
    
        for (i = 0; i < sizeof(vm) / sizeof(vm[0]); i++) {
            if ((u & vm[i].mask) == vm[i].val) {
                return 1;
            }
        }
        return 0;
    }
    

    Wildcards are now the zeros in the mask field of the struct, with corresponding bits "don't care" values (set to zero) in the val field.