I'm working on a little program with Go (v1.0.2). I'm trying to load lines into memory, which I can recall by index (the first column of a line). To save space, I compress every line with zlib. Lines are grouped by blocks.
Items are loaded from a file. An item is represented by a line. This line has many properties separated by tabulation.
It's working just fine, but it's currently really slow. It took me almost 50 sec to just load a 40 MB file! (Note that if I disable the "compress" part, it takes only 0.87 sec). I'm obviously doing something wrong, but I can't find what. Please note that I'm a beginner in this language.
Note: I'm stuck with Go 1.0.2 and can't update.
package main
import (
"bytes"
"compress/zlib"
"encoding/json"
"flag"
"fmt"
"gotwcc/mylib"
"io"
"net/http"
"os"
)
// Block owns the bytes of one group of up to maxRowGroup concatenated
// data-file lines — zlib-compressed when Cache.enable_compress is set,
// raw otherwise. It is shared by every Row packed into it.
type Block struct {
	data []byte // compressed (or raw) concatenation of the group's lines
}
// Row locates one line inside its containing Block.
type Row struct {
	offset, len uint32 // byte offset and length of the line within the *uncompressed* block data
	block       *Block // the group block this line was packed into (shared with sibling rows)
}
// Cache maps item ids (the first tab-separated column of each line) to
// the block-packed lines loaded by loadFile.
type Cache struct {
	blocks          []Block         // NOTE(review): never written by the visible code — possibly dead; confirm before removing
	indexes         map[string]*Row // item_id -> locator of the line inside its block
	enable_compress bool            // when false, compress/uncompress are identity pass-throughs
	maxRowGroup     uint            // number of lines packed into each Block
}
func (this *Cache) dump() {
for key, value := range this.indexes {
fmt.Printf("[%s] = \"%s\"
", key, value.block)
}
}
// search returns the raw line stored under item_id, or the sentinel
// string "(Not found)" when the key is unknown. Note that the row's
// entire block is decompressed on every call.
func (c *Cache) search(item_id string) string {
	row, ok := c.indexes[item_id]
	if !ok {
		return "(Not found)"
	}
	raw := c.uncompress(row.block.data)
	return string(raw[row.offset : row.offset+row.len])
}
// compress returns data zlib-compressed, or data unchanged when
// compression is disabled. The returned slice may alias an internal
// buffer, so callers that retain it must copy it first.
//
// Fix: the Write and Close errors were silently discarded; a dropped
// Close error in particular would leave the stream truncated and the
// block silently corrupt. Errors now panic, matching the file's
// existing panic-on-load error style.
func (c *Cache) compress(data []byte) []byte {
	if !c.enable_compress {
		return data
	}
	var b bytes.Buffer
	w := zlib.NewWriter(&b)
	if _, err := w.Write(data); err != nil {
		panic(err)
	}
	// Close flushes the final compressed bytes into b.
	if err := w.Close(); err != nil {
		panic(err)
	}
	return b.Bytes()
}
// uncompress inflates a zlib stream produced by compress, or returns
// data unchanged when compression is disabled. Panics on a malformed
// stream, matching the file's existing error style.
//
// Fix: the io.Copy error was silently discarded, so a truncated or
// corrupt stream would return partial data without any diagnostic.
func (c *Cache) uncompress(data []byte) []byte {
	if !c.enable_compress {
		return data
	}
	r, err := zlib.NewReader(bytes.NewReader(data))
	if err != nil {
		panic(err)
	}
	var res bytes.Buffer
	if _, err := io.Copy(&res, r); err != nil {
		panic(err)
	}
	r.Close()
	return res.Bytes()
}
// loadFile reads the tab-separated data file at path s and populates
// c.indexes. Lines are concatenated into groups of maxRowGroup rows;
// each group is (optionally) compressed into a single shared Block and
// every row records its offset/length inside that block.
// Panics if the file cannot be opened.
//
// Fixes relative to the original:
//   - The key slice returned by the temporary buffer aliased storage
//     that was overwritten on the next iteration, corrupting the keys
//     of earlier rows in the same group; the key is now copied.
//   - A line with no tab had its last byte chopped off; it now uses
//     the whole line as its key.
//   - The trailing flush ran even when the group was empty (file size
//     an exact multiple of maxRowGroup, or an empty file), creating a
//     useless empty block.
//   - maxRowGroup == 0 underflowed the uint countdown so no group was
//     ever sealed; it is now clamped to 1.
func (c *Cache) loadFile(s string) {
	// TempRowBuf holds a row's key and its position inside the group
	// buffer until the group is sealed into its final Block.
	type TempRowBuf struct {
		item_id     []byte
		offset, len uint32
	}

	file, err := os.Open(s)
	if err != nil {
		panic(err.Error())
	}
	defer file.Close()

	scanner := mybufio.NewScanner(file)
	scanner.Split(mybufio.ScanLines)

	var buffer bytes.Buffer  // concatenated raw lines of the current group
	var tmp_list []TempRowBuf // per-row metadata for the current group
	c.indexes = make(map[string]*Row)

	// seal compresses the current group and registers one Row per line,
	// all sharing the same *Block. A no-op on an empty group.
	seal := func() {
		if len(tmp_list) == 0 {
			return
		}
		compressed := c.compress(buffer.Bytes())
		// compress may return a slice aliasing a reusable buffer, so
		// take a private copy before storing it in the block.
		buff := make([]byte, len(compressed))
		copy(buff, compressed)
		block := &Block{buff}
		for _, tmp := range tmp_list {
			c.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
		}
	}

	groupSize := c.maxRowGroup
	if groupSize == 0 {
		groupSize = 1 // a zero group size would underflow the countdown below
	}
	var offset uint32 = 0
	nb := groupSize
	for scanner.Scan() {
		nb--
		line := scanner.Bytes()
		// The key is the first tab-separated column; a tab-less line
		// keys on the whole line.
		end := bytes.IndexByte(line, '\t')
		if end < 0 {
			end = len(line)
		}
		// scanner.Bytes() aliases a buffer overwritten by the next
		// Scan, so the key bytes must be copied before being retained.
		item_id := make([]byte, end)
		copy(item_id, line[:end])

		size := uint32(len(line))
		buffer.Write(line)
		tmp_list = append(tmp_list, TempRowBuf{item_id, offset, size})
		offset += size
		if nb <= 0 {
			seal()
			nb = groupSize
			offset = 0
			tmp_list = nil
			buffer.Reset()
		}
	}
	// NOTE(review): mybufio is assumed to mirror bufio.Scanner; if it
	// exposes Err(), a read error should be checked here — TODO confirm.
	seal() // flush the final, possibly partial, group
}
// wsCacheHandler answers an HTTP request by looking up the "item_id"
// form value in the cache and writing a JSON object containing the id
// and the raw row text (or the "(Not found)" sentinel).
//
// Fix: on a json.Marshal failure the original logged server-side and
// sent the client an empty 200 response; it now answers with a 500.
func wsCacheHandler(cache *Cache, writer http.ResponseWriter, request *http.Request) {
	item_id := request.FormValue("item_id")
	value := map[string]string{
		"item_id": item_id,
		"raw":     cache.search(item_id),
	}
	jsonResp, err := json.Marshal(value)
	if err != nil {
		fmt.Println("error:", err)
		http.Error(writer, "internal error", http.StatusInternalServerError)
		return
	}
	writer.Header().Set("Content-Type", "application/json")
	writer.Write(jsonResp)
}
// main parses the command-line flags, loads the data file into the
// cache, optionally dumps it, and (unless -no-http) serves lookups on
// port 8585.
//
// Fix: the http.ListenAndServe error was silently discarded — a bind
// failure made the program exit 0 with no message. The ten copy-pasted
// spot-check lookups are folded into a loop.
func main() {
	filename := flag.String("data", "default.txt", "The data filename")
	no_http := flag.Bool("no-http", false, "Do not start an http server")
	dumpMap := flag.Bool("dump", false, "If we should dump the map to stdout")
	noCompression := flag.Bool("no-compress", false, "Disable compression")
	maxRowGroup := flag.Uint("max-row-group", 100, "How much line to group when doing compression")
	flag.Parse()

	var cache Cache
	cache.enable_compress = !*noCompression
	cache.maxRowGroup = *maxRowGroup
	cache.loadFile(*filename)

	if *dumpMap {
		cache.dump()
		// Spot-check the ten known keys of the sample data file.
		for id := 100001; id <= 100010; id++ {
			fmt.Println(cache.search(fmt.Sprintf("%d", id)))
		}
	}
	if !*no_http {
		http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
			wsCacheHandler(&cache, writer, request)
		})
		fmt.Println("Cache loaded, now listening on port 8585...")
		// ListenAndServe only returns on failure.
		if err := http.ListenAndServe(":8585", nil); err != nil {
			fmt.Fprintln(os.Stderr, "http server error:", err)
			os.Exit(1)
		}
	}
}
This is the test file I use (I'm not pasting the 40 MB file here :p):
data.txt:
100001 bar
100002 foo
100003 bob
100004 nuts
100005 gogopowran
100006 green
100007 test
100008 alongwordwithlotofletters
100009
100010 space space space
I launch my application like this:
time ./mybin -data=data.txt -no-http -no-compress => ok (0.6 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=100 => slow (12.1 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=1000 => still slow (10.9 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=10000 => still slow (10.6 sec to load)
(Note: the flag is defined as "max-row-group", so the underscored form "--max_row_group" used earlier would be rejected by the flag package.)
Edit: Apply go fmt on the code. Add an option to choose the size of the packing. Test 3 different sizes.