I benchmarked Go's standard library package "math/bits", and it is fast. I then benchmarked the same code copied verbatim from "math/bits", and it is about 3 times slower.
What is the difference between user code and standard library code when it comes to compiling, linking, or benchmarking?
// x_test.go
package x_test
import (
"math/bits"
"testing"
)
// copied from "math/bits"
// Constants used by the copied bit-counting code.
const (
	DeBruijn64 = 0x03f79d71b4ca8b09

	// Alternating bit masks of doubling group width.
	m0 = 0x5555555555555555 // 01010101 ...
	m1 = 0x3333333333333333 // 00110011 ...
	m2 = 0x0f0f0f0f0f0f0f0f // 00001111 ...
	m3 = 0x00ff00ff00ff00ff // etc.
	m4 = 0x0000ffff0000ffff
)

// Package-level variables. Output receives the accumulated result of each
// benchmark loop; Input is initialized to DeBruijn64.
var (
	Input  uint64 = DeBruijn64
	Output int
)
// OnesCount64 returns the number of one bits ("population count") in x.
//
// It sums bits in parallel: adjacent 1-bit fields are added into 2-bit
// counts, those into 4-bit counts, then into per-byte counts, and finally
// the byte counts are folded together with shifts and adds.
func OnesCount64(x uint64) int {
	// Count of set bits within each 2-bit pair.
	sum := (x & m0) + ((x >> 1) & m0)
	// Count of set bits within each 4-bit nibble.
	sum = (sum & m1) + ((sum >> 2) & m1)
	// Count of set bits within each byte; a byte holds at most 8, so the
	// nibble sums cannot overflow and a single mask after the add suffices.
	sum = (sum + (sum >> 4)) & m2
	// Fold the eight byte counts into the low byte.
	sum += sum >> 8
	sum += sum >> 16
	sum += sum >> 32
	// The total is at most 64, so only the low 7 bits are meaningful.
	return int(sum & 0x7f)
}
// copied from "math/bits" END
// BenchmarkMine times the locally defined OnesCount64 over the inputs
// 0..b.N-1. The running total is stored in the package-level Output so the
// calls have an observable effect.
func BenchmarkMine(b *testing.B) {
	total := 0
	for n := 0; n < b.N; n++ {
		total += OnesCount64(uint64(n))
	}
	Output = total
}
// BenchmarkGo times the standard library's bits.OnesCount64 over the inputs
// 0..b.N-1. The running total is stored in the package-level Output so the
// calls have an observable effect.
func BenchmarkGo(b *testing.B) {
	total := 0
	for n := 0; n < b.N; n++ {
		total += bits.OnesCount64(uint64(n))
	}
	Output = total
}
Running it shows that the two benchmarks give different results:
go test x_test.go -bench=.
goos: darwin
goarch: amd64
BenchmarkMine-4 500000000 3.32 ns/op
BenchmarkGo-4 2000000000 0.96 ns/op
The two benchmarks should produce similar results, but they do not.