ASM version
TEXT ·CountBitsUint64PopCnt(SB),NOSPLIT,$0
POPCNTQ x+0(FP), AX
MOVQ AX, ret+8(FP)
RET
Go Version
const (
m1q uint64 = 0x5555555555555555
m2q = 0x3333333333333333
m4q = 0x0f0f0f0f0f0f0f0f
hq = 0x0101010101010101
)
func CountBitsUint64(x uint64) int {
x -= (x >> 1) & m1q // put count of each 2 bits into those 2 bits
x = (x & m2q) + ((x >> 2) & m2q) // put count of each 4 bits into those 4 bits
x = (x + (x >> 4)) & m4q // put count of each 8 bits into those 8 bits
return int((x * hq) >> 56) // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
}
Go Version (go tool compile -S popcount.go
)
"".CountBitsUint64 t=1 size=101 args=0x10 locals=0x0
0x0000 00000 (popcount.go:81) TEXT "".CountBitsUint64(SB), $0-16
0x0000 00000 (popcount.go:81) NOP
0x0000 00000 (popcount.go:81) NOP
0x0000 00000 (popcount.go:81) FUNCDATA $0, gclocals·23e8278e2b69a3a75fa59b23c49ed6ad(SB)
0x0000 00000 (popcount.go:81) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (popcount.go:82) MOVQ "".x+8(FP), AX
0x0005 00005 (popcount.go:82) MOVQ AX, CX
0x0008 00008 (popcount.go:82) SHRQ $1, AX
0x000b 00011 (popcount.go:82) MOVQ $6148914691236517205, DX
0x0015 00021 (popcount.go:82) ANDQ DX, AX
0x0018 00024 (popcount.go:82) SUBQ AX, CX
0x001b 00027 (popcount.go:83) MOVQ $3689348814741910323, AX
0x0025 00037 (popcount.go:83) MOVQ CX, DX
0x0028 00040 (popcount.go:83) ANDQ AX, CX
0x002b 00043 (popcount.go:83) SHRQ $2, DX
0x002f 00047 (popcount.go:83) ANDQ AX, DX
0x0032 00050 (popcount.go:83) LEAQ (CX)(DX*1), AX
0x0036 00054 (popcount.go:84) MOVQ AX, CX
0x0039 00057 (popcount.go:84) SHRQ $4, AX
0x003d 00061 (popcount.go:84) ADDQ CX, AX
0x0040 00064 (popcount.go:84) MOVQ $1085102592571150095, CX
0x004a 00074 (popcount.go:84) ANDQ CX, AX
0x004d 00077 (popcount.go:85) MOVQ $72340172838076673, CX
0x0057 00087 (popcount.go:85) IMULQ AX, CX
0x005b 00091 (popcount.go:85) SHRQ $56, CX
0x005f 00095 (popcount.go:85) MOVQ CX, "".~r1+16(FP)
0x0064 00100 (popcount.go:85) RET
0x0000 48 8b 44 24 08 48 89 c1 48 d1 e8 48 ba 55 55 55 H.D$.H..H..H.UUU
0x0010 55 55 55 55 55 48 21 d0 48 29 c1 48 b8 33 33 33 UUUUUH!.H).H.333
0x0020 33 33 33 33 33 48 89 ca 48 21 c1 48 c1 ea 02 48 33333H..H!.H...H
0x0030 21 c2 48 8d 04 11 48 89 c1 48 c1 e8 04 48 01 c8 !.H...H..H...H..
0x0040 48 b9 0f 0f 0f 0f 0f 0f 0f 0f 48 21 c8 48 b9 01 H.........H!.H..
0x0050 01 01 01 01 01 01 01 48 0f af c8 48 c1 e9 38 48 .......H...H..8H
0x0060 89 4c 24 10 c3
"".CountBitsUint64Alt t=1 size=142 args=0x10 locals=0x0
0x0000 00000 (popcount.go:88) TEXT "".CountBitsUint64Alt(SB), $0-16
0x0000 00000 (popcount.go:88) NOP
0x0000 00000 (popcount.go:88) NOP
0x0000 00000 (popcount.go:88) FUNCDATA $0, gclocals·23e8278e2b69a3a75fa59b23c49ed6ad(SB)
0x0000 00000 (popcount.go:88) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (popcount.go:89) MOVQ "".x+8(FP), AX
0x0005 00005 (popcount.go:89) MOVQ AX, CX
0x0008 00008 (popcount.go:89) SHRQ $32, AX
0x000c 00012 (popcount.go:89) MOVQ AX, DX
0x000f 00015 (popcount.go:89) SHRL $1, AX
0x0011 00017 (popcount.go:89) ANDL $1431655765, AX
0x0016 00022 (popcount.go:89) SUBL AX, DX
0x0018 00024 (popcount.go:89) MOVL DX, AX
0x001a 00026 (popcount.go:89) ANDL $858993459, DX
0x0020 00032 (popcount.go:89) SHRL $2, AX
0x0023 00035 (popcount.go:89) ANDL $858993459, AX
0x0028 00040 (popcount.go:89) ADDL DX, AX
0x002a 00042 (popcount.go:89) MOVL AX, DX
0x002c 00044 (popcount.go:89) SHRL $4, AX
0x002f 00047 (popcount.go:89) ADDL DX, AX
0x0031 00049 (popcount.go:89) ANDL $252645135, AX
0x0036 00054 (popcount.go:89) MOVL AX, DX
0x0038 00056 (popcount.go:89) SHRL $8, AX
0x003b 00059 (popcount.go:89) ADDL DX, AX
0x003d 00061 (popcount.go:89) MOVL AX, DX
0x003f 00063 (popcount.go:89) SHRL $16, AX
0x0042 00066 (popcount.go:89) ADDL DX, AX
0x0044 00068 (popcount.go:89) MOVQ CX, DX
0x0047 00071 (popcount.go:89) SHRL $1, CX
0x0049 00073 (popcount.go:89) ANDL $1431655765, CX
0x004f 00079 (popcount.go:89) SUBL CX, DX
0x0051 00081 (popcount.go:89) MOVL DX, CX
0x0053 00083 (popcount.go:89) ANDL $858993459, DX
0x0059 00089 (popcount.go:89) SHRL $2, CX
0x005c 00092 (popcount.go:89) ANDL $858993459, CX
0x0062 00098 (popcount.go:89) ADDL DX, CX
0x0064 00100 (popcount.go:89) MOVL CX, DX
0x0066 00102 (popcount.go:89) SHRL $4, CX
0x0069 00105 (popcount.go:89) ADDL DX, CX
0x006b 00107 (popcount.go:89) ANDL $252645135, CX
0x0071 00113 (popcount.go:89) MOVL CX, DX
0x0073 00115 (popcount.go:89) SHRL $8, CX
0x0076 00118 (popcount.go:89) ADDL DX, CX
0x0078 00120 (popcount.go:89) MOVL CX, DX
0x007a 00122 (popcount.go:89) SHRL $16, CX
0x007d 00125 (popcount.go:89) ADDL DX, CX
0x007f 00127 (popcount.go:89) ANDL $63, AX
0x0082 00130 (popcount.go:89) ANDL $63, CX
0x0085 00133 (popcount.go:89) ADDQ CX, AX
0x0088 00136 (popcount.go:89) MOVQ AX, "".~r1+16(FP)
0x008d 00141 (popcount.go:89) RET
0x0000 48 8b 44 24 08 48 89 c1 48 c1 e8 20 48 89 c2 d1 H.D$.H..H.. H...
0x0010 e8 25 55 55 55 55 29 c2 89 d0 81 e2 33 33 33 33 .%UUUU).....3333
0x0020 c1 e8 02 25 33 33 33 33 01 d0 89 c2 c1 e8 04 01 ...%3333........
0x0030 d0 25 0f 0f 0f 0f 89 c2 c1 e8 08 01 d0 89 c2 c1 .%..............
0x0040 e8 10 01 d0 48 89 ca d1 e9 81 e1 55 55 55 55 29 ....H......UUUU)
0x0050 ca 89 d1 81 e2 33 33 33 33 c1 e9 02 81 e1 33 33 .....3333.....33
0x0060 33 33 01 d1 89 ca c1 e9 04 01 d1 81 e1 0f 0f 0f 33..............
0x0070 0f 89 ca c1 e9 08 01 d1 89 ca c1 e9 10 01 d1 83 ................
0x0080 e0 3f 83 e1 3f 48 01 c8 48 89 44 24 10 c3 .?..?H..H.D$..
Benchmark
$ go test -bench=.
BenchmarkCountBitsInt8PopCnt-4 500000000 3.96 ns/op
BenchmarkCountBitsInt16PopCnt-4 500000000 3.24 ns/op
BenchmarkCountBitsInt32PopCnt-4 500000000 3.36 ns/op
BenchmarkCountBitsInt64PopCnt-4 500000000 3.44 ns/op
BenchmarkCountBitsIntPopCnt-4 300000000 5.42 ns/op
BenchmarkCountBitsUint8PopCnt-4 1000000000 2.60 ns/op
BenchmarkCountBitsUint16PopCnt-4 1000000000 2.59 ns/op
BenchmarkCountBitsUint32PopCnt-4 1000000000 2.55 ns/op
> BenchmarkCountBitsUint64PopCnt-4 1000000000 2.51 ns/op
BenchmarkCountBitsUintPopCnt-4 300000000 4.38 ns/op
BenchmarkCountBitsBytePopCnt-4 500000000 3.21 ns/op
BenchmarkCountBitsRunePopCnt-4 500000000 3.29 ns/op
BenchmarkCountBitsInt8-4 2000000000 0.38 ns/op
BenchmarkCountBitsInt16-4 2000000000 0.41 ns/op
BenchmarkCountBitsInt32-4 2000000000 0.36 ns/op
BenchmarkCountBitsInt64-4 2000000000 0.37 ns/op
BenchmarkCountBitsInt-4 200000000 6.36 ns/op
BenchmarkCountBitsUint16-4 2000000000 0.36 ns/op
BenchmarkCountBitsUint32-4 2000000000 0.35 ns/op
> BenchmarkCountBitsUint64-4 2000000000 0.37 ns/op
> BenchmarkCountBitsUint64Alt-4 200000000 7.06 ns/op
BenchmarkCountBitsUint-4 300000000 4.16 ns/op
BenchmarkCountBitsUintReference-4 100000000 16.9 ns/op
BenchmarkCountBitsByte-4 2000000000 0.36 ns/op
BenchmarkCountBitsByteAlt-4 2000000000 0.36 ns/op
BenchmarkCountBitsRune-4 2000000000 0.37 ns/op
PASS
ok github.com/steakknife/hamming 42.730s
$
From https://github.com/steakknife/hamming
The testing call is calling straight into the assembly version:
0x0177 00375 (popcnt_amd64_test.go:189) CALL "".CountBitsUint64PopCnt(SB)