Search code examples
hashgobenchmarkingbloom-filter

Bob Jenkins' Hash getting bad performance


I was building a Bloom filter and looked into what hashes to use and the Bob Jenkins' hash seemed like a good choice because of the evenness of the distribution. I adapted the given C++ code to Go (possibly making a mistake but it seems to work).

I got around to benchmarking the cost of the hash and found that the SHA1 hash in the Go std library was much faster.

PASS
BenchmarkJenkins     1000000          2649 ns/op
BenchmarkSHA256  1000000          1218 ns/op
BenchmarkSHA1    5000000           462 ns/op

Was I misled when I read that you shouldn't use cryptographic hashes in this use case? Or is the standard library code much more optimized than mine?

package jenkins

import (
    "bytes"
    "encoding/gob"
)

// adapted from http://bretmulvey.com/hash/7.html
func ComputeHash(key interface{}) (uint64, error) {
    var buf bytes.Buffer
    enc := gob.NewEncoder(&buf)
    err := enc.Encode(key)
    if err != nil {
        return 0, err
    }
    data := buf.Bytes()

    var a, b, c uint64
    a, b = 0x9e3779b9, 0x9e3779b9
    c = 0
    i := 0

    for i = 0; i < len(data)-12; {
        a += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        b += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        c += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)

        a, b, c = mix(a, b, c)
    }

    c += uint64(len(data))

    if i < len(data) {
        a += uint64(data[i])
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        b += uint64(data[i])
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        c += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 24
        i++
    }

    a, b, c = mix(a, b, c)
    return c, nil
}

func mix(a, b, c uint64) (uint64, uint64, uint64) {
    a -= b
    a -= c
    a ^= (c >> 13)
    b -= c
    b -= a
    b ^= (a << 8)
    c -= a
    c -= b
    c ^= (b >> 13)
    a -= b
    a -= c
    a ^= (c >> 12)
    b -= c
    b -= a
    b ^= (a << 16)
    c -= a
    c -= b
    c ^= (b >> 5)
    a -= b
    a -= c
    a ^= (c >> 3)
    b -= c
    b -= a
    b ^= (a << 10)
    c -= a
    c -= b
    c ^= (b >> 15)
    return a, b, c
}

EDIT:

Benchmarking code:

package bloom

import (
    "testing"

    "crypto/sha1"
    "crypto/sha256"
)

func BenchmarkJenkins(b *testing.B) {
    j := jenkinsHash{}

    for i := 0; i < b.N; i++ {
        j.ComputeHash(i)
    }
}

func BenchmarkSHA1(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha1.Sum([]byte{byte(i)})
    }
}


func BenchmarkSHA256(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha256.Sum256([]byte{byte(i)})
    }
}

Solution

  • @JensG was on the right track. The calls to gob to encode the key in binary made up the vast majority of the cost. When I transitioned to passing in byte arrays the benchmark started getting the results I was expecting. Thanks for the help!

    BenchmarkJenkins    100000000           20.4 ns/op
    BenchmarkSHA1    5000000           463 ns/op
    BenchmarkSHA256  1000000          1223 ns/op
    

    Benchmark code:

    package bloom
    
    import (
        "testing"
    
        "crypto/sha1"
        "crypto/sha256"
    )
    
    func BenchmarkJenkins(b *testing.B) {
        j := jenkinsHash{}
    
        for i := 0; i < b.N; i++ {
            j.ComputeHash([]byte{byte(i)})
        }
    }
    
    func BenchmarkSHA1(b *testing.B) {
        for i := 0; i < b.N; i++ {
            sha1.Sum([]byte{byte(i)})
        }
    }
    
    
    func BenchmarkSHA256(b *testing.B) {
        for i := 0; i < b.N; i++ {
            sha256.Sum256([]byte{byte(i)})
        }
    }
    

    Altered code:

    package bloom
    
    type jenkinsHash struct {
    }
    
    // adapted from http://bretmulvey.com/hash/7.html
    func (_ jenkinsHash) ComputeHash(data []byte) (uint64, error) {    
        var a, b, c uint64
        a, b = 0x9e3779b9, 0x9e3779b9
        c = 0
        i := 0
    
        for i = 0; i < len(data)-12; {
            a += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
            i += 4
            b += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
            i += 4
            c += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
    
            a, b, c = mix(a, b, c)
        }
    
        c += uint64(len(data))
    
        if i < len(data) {
            a += uint64(data[i])
            i++
        }
        if i < len(data) {
            a += uint64(data[i]) << 8
            i++
        }
        if i < len(data) {
            a += uint64(data[i]) << 16
            i++
        }
        if i < len(data) {
            a += uint64(data[i]) << 24
            i++
        }
    
        if i < len(data) {
            b += uint64(data[i])
            i++
        }
        if i < len(data) {
            b += uint64(data[i]) << 8
            i++
        }
        if i < len(data) {
            b += uint64(data[i]) << 16
            i++
        }
        if i < len(data) {
            b += uint64(data[i]) << 24
            i++
        }
    
        if i < len(data) {
            c += uint64(data[i]) << 8
            i++
        }
        if i < len(data) {
            c += uint64(data[i]) << 16
            i++
        }
        if i < len(data) {
            c += uint64(data[i]) << 24
            i++
        }
    
        a, b, c = mix(a, b, c)
        return c, nil
    }
    
    func mix(a, b, c uint64) (uint64, uint64, uint64) {
        a -= b
        a -= c
        a ^= (c >> 13)
        b -= c
        b -= a
        b ^= (a << 8)
        c -= a
        c -= b
        c ^= (b >> 13)
        a -= b
        a -= c
        a ^= (c >> 12)
        b -= c
        b -= a
        b ^= (a << 16)
        c -= a
        c -= b
        c ^= (b >> 5)
        a -= b
        a -= c
        a ^= (c >> 3)
        b -= c
        b -= a
        b ^= (a << 10)
        c -= a
        c -= b
        c ^= (b >> 15)
        return a, b, c
    }