douyinzha5820 2014-08-15 20:24

How to improve speed when using zlib compression in Go 1.0.2

I'm working on a little program in Go (v1.0.2). I'm trying to load lines into memory and recall them by index (the first column of a line). To save space, I compress every line with zlib. Lines are grouped into blocks.

Items are loaded from a file. Each item is represented by a line, whose properties are separated by tabs.

It's working just fine, but it's currently really slow. It took me almost 50 seconds just to load a 40 MB file! (Note that if I disable the "compress" part, it takes only 0.87 seconds.) I'm obviously doing something wrong, but I can't find what. Please note that I'm a beginner in this language.

Note: I'm stuck with Go 1.0.2 and can't update.

package main

import (
    "bytes"
    "compress/zlib"
    "encoding/json"
    "flag"
    "fmt"
    "gotwcc/mylib"
    "io"
    "net/http"
    "os"
)

type Block struct {
    data []byte
}

type Row struct {
    offset, len uint32
    block       *Block
}

type Cache struct {
    blocks          []Block
    indexes         map[string]*Row
    enable_compress bool
    maxRowGroup     uint
}

func (this *Cache) dump() {
    for key, value := range this.indexes {
        fmt.Printf("[%s] = \"%s\"
", key, value.block)
    }
}

func (this *Cache) search(item_id string) string {
    row := this.indexes[item_id]
    if row == nil {
        return "(Not found)"
    }
    block := this.uncompress(row.block.data)
    slice := block[row.offset : row.offset+row.len]
    return string(slice)
}

func (this *Cache) compress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }

    var b bytes.Buffer
    w := zlib.NewWriter(&b)
    w.Write(data)
    w.Close()
    return b.Bytes()
}

func (this *Cache) uncompress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }

    var res bytes.Buffer
    b := bytes.NewReader(data)
    r, err := zlib.NewReader(b)
    if err != nil {
        panic(err)
    }
    io.Copy(&res, r)

    r.Close()
    return res.Bytes()

}

func (this *Cache) loadFile(s string) {
    type TempRowBuf struct {
        item_id     []byte
        offset, len uint32
    }

    file, err := os.Open(s)
    if err != nil {
        panic(err.Error())
    }
    defer file.Close()

    scanner := mybufio.NewScanner(file)
    scanner.Split(mybufio.ScanLines)

    var tmp_buf bytes.Buffer
    var buffer bytes.Buffer
    var tmp_list []TempRowBuf
    this.indexes = make(map[string]*Row)

    var offset uint32 = 0
    nb := this.maxRowGroup
    for scanner.Scan() {
        nb--
        tmp_buf.Reset()
        tmp_buf.Write(scanner.Bytes())
        line := tmp_buf.Bytes()
        item_id, _ := tmp_buf.ReadBytes('\t')
        item_id = item_id[0 : len(item_id)-1]
        size := uint32(len(line))
        buffer.Write(line)
        tmp_list = append(tmp_list, TempRowBuf{item_id, offset, size})
        offset += size
        if nb <= 0 {
            compressed := this.compress(buffer.Bytes())
            buff := make([]byte, len(compressed))
            copy(buff, compressed)
            var block *Block = &Block{buff}
            for _, tmp := range tmp_list {
                this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
            }
            nb = this.maxRowGroup
            offset = 0
            tmp_list = nil
            buffer.Reset()
        }
    }
    if nb > 0 {
        compressed := this.compress(buffer.Bytes())
        buff := make([]byte, len(compressed))
        copy(buff, compressed)
        var block *Block = &Block{buff}
        for _, tmp := range tmp_list {
            this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
        }
    }
}

func wsCacheHandler(cache *Cache, writer http.ResponseWriter, request *http.Request) {
    var value map[string]string = make(map[string]string)

    item_id := request.FormValue("item_id")
    value["item_id"] = item_id
    value["raw"] = cache.search(item_id)
    jsonResp, err := json.Marshal(value)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Fprintf(writer, "%s", string(jsonResp))
    }
}

func main() {
    filename := flag.String("data", "default.txt", "The data filename")
    no_http := flag.Bool("no-http", false, "Do not start an http server")
    dumpMap := flag.Bool("dump", false, "If we should dump the map to stdout")
    noCompression := flag.Bool("no-compress", false, "Disable compression")
    maxRowGroup := flag.Uint("max-row-group", 100, "How many lines to group together when compressing")

    flag.Parse()
    var cache Cache
    cache.enable_compress = !*noCompression
    cache.maxRowGroup = *maxRowGroup

    cache.loadFile(*filename)

    if *dumpMap {
        cache.dump()
        fmt.Println(cache.search("100001"))
        fmt.Println(cache.search("100002"))
        fmt.Println(cache.search("100003"))
        fmt.Println(cache.search("100004"))
        fmt.Println(cache.search("100005"))
        fmt.Println(cache.search("100006"))
        fmt.Println(cache.search("100007"))
        fmt.Println(cache.search("100008"))
        fmt.Println(cache.search("100009"))
        fmt.Println(cache.search("100010"))

    }

    if !*no_http {
        http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
            wsCacheHandler(&cache, writer, request)
        })
        fmt.Println("Cache loaded, now listening on port 8585...")
        http.ListenAndServe(":8585", nil)
    }
}

This is the test file I use (I'm not pasting the 40 MB file here :p):

data.txt:

100001  bar
100002  foo
100003  bob
100004  nuts
100005  gogopowran
100006  green
100007  test
100008  alongwordwithlotofletters
100009  
100010  space space space

I launch my application like this:

time ./mybin -data=data.txt -no-http -no-compress => ok (0.6 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=100 => slow (12.1 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=1000 => still slow (10.9 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=10000 => still slow (10.6 sec to load)

Edit: applied go fmt to the code, added an option to choose the size of the packing, and tested 3 different sizes.


1 answer

  • dongxie45083 2014-08-15 22:02

    Compressing each line individually is just going to be very slow, and provide relatively inefficient compression to boot.

    Is there a reason you're not simply compressing the entire file? Or at least one "block" at a time?
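
    To make the trade-off concrete, here is a minimal, self-contained sketch (not taken from the question's program; the repeated row and the repeat count are hypothetical, chosen only for illustration) that compresses the same data once per line and then once as a single block, using only compress/zlib calls that already exist in Go 1.0.2:

    package main

    import (
        "bytes"
        "compress/zlib"
        "fmt"
        "time"
    )

    // compressOnce compresses data with a fresh zlib.Writer, mirroring the
    // compress() method in the question.
    func compressOnce(data []byte) []byte {
        var b bytes.Buffer
        w := zlib.NewWriter(&b)
        w.Write(data)
        w.Close()
        return b.Bytes()
    }

    func main() {
        // Hypothetical input: one 18-byte row repeated 200000 times (~3.6 MB).
        line := []byte("100005\tgogopowran\n")
        all := bytes.Repeat(line, 200000)

        // One zlib.Writer per row: fresh deflate state for every tiny input.
        start := time.Now()
        perLine := 0
        for i := 0; i < 200000; i++ {
            perLine += len(compressOnce(line))
        }
        fmt.Printf("per line:  %v, %d compressed bytes\n", time.Since(start), perLine)

        // One zlib.Writer over the whole block.
        start = time.Now()
        block := compressOnce(all)
        fmt.Printf("one block: %v, %d compressed bytes\n", time.Since(start), len(block))
    }

    The per-line loop has to set up and tear down a writer for every row and cannot share any compression state between rows, so it should come out both much slower and much larger. Grouping more rows per block (a larger -max-row-group in the question's code) or compressing the whole file once moves the cost toward the single-block case.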

