I need to calculate SHA-256 checksums for files over 1 GB (reading the file in chunks). Currently I am using Python with this:
import hashlib
import time

start_time = time.time()

def sha256sum(filename="big.txt", block_size=2 ** 13):
    sha = hashlib.sha256()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(block_size), b''):
            sha.update(chunk)
    return sha.hexdigest()

input_file = '/tmp/1GB.raw'
print 'checksum is: %s\n' % sha256sum(input_file)
print 'Elapsed time: %s' % str(time.time() - start_time)
I wanted to give Go a try, thinking I could get faster results, but after trying the following code, it runs a couple of seconds slower:
package main

import (
    "crypto/sha256"
    "fmt"
    "io"
    "math"
    "os"
    "time"
)

const fileChunk = 8192

func File(file string) string {
    fh, err := os.Open(file)
    if err != nil {
        panic(err.Error())
    }
    defer fh.Close()
    stat, _ := fh.Stat()
    size := stat.Size()
    chunks := uint64(math.Ceil(float64(size) / float64(fileChunk)))
    h := sha256.New()
    for i := uint64(0); i < chunks; i++ {
        csize := int(math.Min(fileChunk, float64(size-int64(i*fileChunk))))
        buf := make([]byte, csize)
        fh.Read(buf)
        io.WriteString(h, string(buf))
    }
    return fmt.Sprintf("%x", h.Sum(nil))
}

func main() {
    start := time.Now()
    fmt.Printf("checksum is: %s\n", File("/tmp/1GB.raw"))
    elapsed := time.Since(start)
    fmt.Printf("Elapsed time: %s\n", elapsed)
}
Any idea how to improve the Go code, if possible? Maybe use more than one CPU core, say one for reading and another for hashing?
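To make that idea concrete, here is a rough sketch of the split I have in mind, one goroutine reading and the main goroutine hashing. This is my own attempt, not code from any answer; the 1 MiB chunk size and the channel depth of 4 are just guesses to experiment with:

package main

import (
    "crypto/sha256"
    "fmt"
    "io"
    "os"
    "runtime"
)

// reader streams the file to the hasher as chunks over a channel, so disk
// reads and hashing can overlap. A fresh buffer is allocated per chunk so
// the hasher owns each slice it receives.
func reader(fh *os.File, out chan<- []byte) {
    defer close(out)
    for {
        buf := make([]byte, 1<<20) // 1 MiB chunks: a guess to experiment with
        n, err := fh.Read(buf)
        if n > 0 {
            out <- buf[:n]
        }
        if err == io.EOF {
            return
        }
        if err != nil {
            panic(err.Error())
        }
    }
}

func main() {
    runtime.GOMAXPROCS(runtime.NumCPU()) // go1.4 defaults to 1

    fh, err := os.Open("/tmp/1GB.raw")
    if err != nil {
        panic(err.Error())
    }
    defer fh.Close()

    chunks := make(chan []byte, 4) // a small queue lets the reader run ahead
    go reader(fh, chunks)

    h := sha256.New()
    for c := range chunks {
        h.Write(c) // hash.Hash writes never return an error
    }
    fmt.Printf("checksum is: %x\n", h.Sum(nil))
}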
Update
As suggested, I am now using this code:
package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"
    "time"
)

func main() {
    start := time.Now()
    fh, err := os.Open("/tmp/1GB.raw")
    if err != nil {
        panic(err.Error())
    }
    defer fh.Close()

    h := sha256.New()
    // io.Copy streams the whole file into the hash using its own internal buffer
    _, err = io.Copy(h, fh)
    if err != nil {
        panic(err.Error())
    }
    fmt.Println(hex.EncodeToString(h.Sum(nil)))
    fmt.Printf("Elapsed time: %s\n", time.Since(start))
}
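One knob I can think of: as far as I can tell, io.Copy falls back to a fixed 32 KB internal buffer here, so a hand-rolled read loop would at least make the buffer size tunable. A minimal sketch; the 1 MiB buffer is an arbitrary value to experiment with, not a recommendation:

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"
    "time"
)

func main() {
    start := time.Now()
    fh, err := os.Open("/tmp/1GB.raw")
    if err != nil {
        panic(err.Error())
    }
    defer fh.Close()

    h := sha256.New()
    buf := make([]byte, 1<<20) // 1 MiB: an arbitrary size to experiment with
    for {
        n, err := fh.Read(buf)
        if n > 0 {
            h.Write(buf[:n]) // hash.Hash writes never return an error
        }
        if err == io.EOF {
            break
        }
        if err != nil {
            panic(err.Error())
        }
    }
    fmt.Println(hex.EncodeToString(h.Sum(nil)))
    fmt.Printf("Elapsed time: %s\n", time.Since(start))
}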
For testing I am creating the 1GB file with this:
# mkfile 1G /tmp/1GB.raw
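mkfile is OS X-specific; on other systems a small helper like this one (my own, it just writes 1 GiB of zero bytes) should produce an equivalent test file:

package main

import "os"

// Writes 1 GiB of zero bytes to /tmp/1GB.raw, as a stand-in for mkfile on
// systems that lack it. Writing real zeros (rather than Truncate) avoids
// creating a sparse file, which would skew the read benchmark.
func main() {
    f, err := os.Create("/tmp/1GB.raw")
    if err != nil {
        panic(err.Error())
    }
    defer f.Close()
    buf := make([]byte, 1<<20)  // 1 MiB of zeros
    for i := 0; i < 1024; i++ { // 1024 * 1 MiB = 1 GiB
        if _, err := f.Write(buf); err != nil {
            panic(err.Error())
        }
    }
}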
The new version is faster, but not by much. What about using channels? Could using more than one CPU/core help? I was expecting an improvement of at least 20%, but unfortunately I am seeing almost no gain.
time result for Python:
5.867u 0.250s 0:06.15 99.3% 0+0k 0+0io 0pf+0w
time result for Go, after compiling (go build) and executing the binary:
5.687u 0.198s 0:05.93 98.9% 0+0k 0+0io 0pf+0w
Any more ideas?
Test results
Using the version with channels posted in the accepted answer by @icza:
Elapsed time: 5.894779733s
Using the version with no channels:
Elapsed time: 5.823489239s
I thought that using channels would help a little, but it seems not to.
I am running this on a MacBook Pro with OS X Yosemite, using this Go version:
go version go1.4.1 darwin/amd64
Update 2
Setting runtime.GOMAXPROCS to 4:
runtime.GOMAXPROCS(4)
Made things faster:
Elapsed time: 5.741511748s
Update 3
Changing the chunk size to 8192 (like in the Python version) gives the expected result:
...
for b, hasMore := make([]byte, 8192<<10), true; hasMore; {
...
Also, I am now using only runtime.GOMAXPROCS(2).
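Fully assembled, what I am running now looks like this. The loop shape comes from the accepted answer, but the surrounding code is my own reconstruction (8192<<10 is an 8 MiB buffer, the value from my test above):

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"
    "runtime"
    "time"
)

func main() {
    runtime.GOMAXPROCS(2)
    start := time.Now()
    fh, err := os.Open("/tmp/1GB.raw")
    if err != nil {
        panic(err.Error())
    }
    defer fh.Close()

    h := sha256.New()
    // 8192<<10 is an 8 MiB buffer; the loop shape follows the accepted answer
    for b, hasMore := make([]byte, 8192<<10), true; hasMore; {
        n, err := fh.Read(b)
        if err != nil {
            if err != io.EOF {
                panic(err.Error())
            }
            hasMore = false // last (possibly short) read
        }
        h.Write(b[:n])
    }
    fmt.Println(hex.EncodeToString(h.Sum(nil)))
    fmt.Printf("Elapsed time: %s\n", time.Since(start))
}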