douba05167 2014-05-02 20:24
浏览 249
已采纳

鲍勃·詹金斯(Bob Jenkins)的哈希表现不佳

I was building a Bloom filter and looked into what hashes to use and the Bob Jenkins' hash seemed like a good choice because of the evenness of the distribution. I adapted the given C++ code to Go (possibly making a mistake but it seems to work).

I got around to benchmarking the cost of the hash and found that the SHA1 hash in the Go std library was much faster.

PASS
BenchmarkJenkins     1000000          2649 ns/op
BenchmarkSHA256  1000000          1218 ns/op
BenchmarkSHA1    5000000           462 ns/op

Was I misled when I read that you shouldn't use cryptographic hashes in this use case? Or is the standard library code much more optimized than mine?

package jenkins

import (
    "bytes"
    "encoding/gob"
)

// adapted from http://bretmulvey.com/hash/7.html
func ComputeHash(key interface{}) (uint64, error) {
    var buf bytes.Buffer
    enc := gob.NewEncoder(&buf)
    err := enc.Encode(key)
    if err != nil {
        return 0, err
    }
    data := buf.Bytes()

    var a, b, c uint64
    a, b = 0x9e3779b9, 0x9e3779b9
    c = 0
    i := 0

    for i = 0; i < len(data)-12; {
        a += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        b += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        c += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)

        a, b, c = mix(a, b, c)
    }

    c += uint64(len(data))

    if i < len(data) {
        a += uint64(data[i])
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        b += uint64(data[i])
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        c += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 24
        i++
    }

    a, b, c = mix(a, b, c)
    return c, nil
}

func mix(a, b, c uint64) (uint64, uint64, uint64) {
    a -= b
    a -= c
    a ^= (c >> 13)
    b -= c
    b -= a
    b ^= (a << 8)
    c -= a
    c -= b
    c ^= (b >> 13)
    a -= b
    a -= c
    a ^= (c >> 12)
    b -= c
    b -= a
    b ^= (a << 16)
    c -= a
    c -= b
    c ^= (b >> 5)
    a -= b
    a -= c
    a ^= (c >> 3)
    b -= c
    b -= a
    b ^= (a << 10)
    c -= a
    c -= b
    c ^= (b >> 15)
    return a, b, c
}

EDIT:

Benchmarking code:

package bloom

import (
    "testing"

    "crypto/sha1"
    "crypto/sha256"
)

func BenchmarkJenkins(b *testing.B) {
    j := jenkinsHash{}

    for i := 0; i < b.N; i++ {
        j.ComputeHash(i)
    }
}

func BenchmarkSHA1(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha1.Sum([]byte{byte(i)})
    }
}


func BenchmarkSHA256(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha256.Sum256([]byte{byte(i)})
    }
}
  • 写回答

3条回答 默认 最新

  • duangan6731 2014-05-02 23:28
    关注

    @JensG was on the right track. The calls to gob to encode the key in binary made up the vast majority of the cost. When I transitioned to passing in byte arrays the benchmark started getting the results I was expecting. Thanks for the help!

    BenchmarkJenkins    100000000           20.4 ns/op
    BenchmarkSHA1    5000000           463 ns/op
    BenchmarkSHA256  1000000          1223 ns/op
    

    Benchmark code:

    package bloom
    
    import (
        "testing"
    
        "crypto/sha1"
        "crypto/sha256"
    )
    
    func BenchmarkJenkins(b *testing.B) {
        j := jenkinsHash{}
    
        for i := 0; i < b.N; i++ {
            j.ComputeHash([]byte{byte(i)})
        }
    }
    
    func BenchmarkSHA1(b *testing.B) {
        for i := 0; i < b.N; i++ {
            sha1.Sum([]byte{byte(i)})
        }
    }
    
    
    func BenchmarkSHA256(b *testing.B) {
        for i := 0; i < b.N; i++ {
            sha256.Sum256([]byte{byte(i)})
        }
    }
    

    Altered code:

    package bloom
    
    type jenkinsHash struct {
    }
    
    // adapted from http://bretmulvey.com/hash/7.html
    func (_ jenkinsHash) ComputeHash(data []byte) (uint64, error) {    
        var a, b, c uint64
        a, b = 0x9e3779b9, 0x9e3779b9
        c = 0
        i := 0
    
        for i = 0; i < len(data)-12; {
            a += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
            i += 4
            b += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
            i += 4
            c += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
    
            a, b, c = mix(a, b, c)
        }
    
        c += uint64(len(data))
    
        if i < len(data) {
            a += uint64(data[i])
            i++
        }
        if i < len(data) {
            a += uint64(data[i]) << 8
            i++
        }
        if i < len(data) {
            a += uint64(data[i]) << 16
            i++
        }
        if i < len(data) {
            a += uint64(data[i]) << 24
            i++
        }
    
        if i < len(data) {
            b += uint64(data[i])
            i++
        }
        if i < len(data) {
            b += uint64(data[i]) << 8
            i++
        }
        if i < len(data) {
            b += uint64(data[i]) << 16
            i++
        }
        if i < len(data) {
            b += uint64(data[i]) << 24
            i++
        }
    
        if i < len(data) {
            c += uint64(data[i]) << 8
            i++
        }
        if i < len(data) {
            c += uint64(data[i]) << 16
            i++
        }
        if i < len(data) {
            c += uint64(data[i]) << 24
            i++
        }
    
        a, b, c = mix(a, b, c)
        return c, nil
    }
    
    func mix(a, b, c uint64) (uint64, uint64, uint64) {
        a -= b
        a -= c
        a ^= (c >> 13)
        b -= c
        b -= a
        b ^= (a << 8)
        c -= a
        c -= b
        c ^= (b >> 13)
        a -= b
        a -= c
        a ^= (c >> 12)
        b -= c
        b -= a
        b ^= (a << 16)
        c -= a
        c -= b
        c ^= (b >> 5)
        a -= b
        a -= c
        a ^= (c >> 3)
        b -= c
        b -= a
        b ^= (a << 10)
        c -= a
        c -= b
        c ^= (b >> 15)
        return a, b, c
    }
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(2条)

报告相同问题?

悬赏问题

  • ¥15 #MATLAB仿真#车辆换道路径规划
  • ¥15 java 操作 elasticsearch 8.1 实现 索引的重建
  • ¥15 数据可视化Python
  • ¥15 要给毕业设计添加扫码登录的功能!!有偿
  • ¥15 kafka 分区副本增加会导致消息丢失或者不可用吗?
  • ¥15 微信公众号自制会员卡没有收款渠道啊
  • ¥15 stable diffusion
  • ¥100 Jenkins自动化部署—悬赏100元
  • ¥15 关于#python#的问题:求帮写python代码
  • ¥20 MATLAB画图图形出现上下震荡的线条