douyinzha5820 2014-08-15 20:24

How to improve speed when using zlib compression in Go 1.0.2

I'm working on a little program in Go (v1.0.2). I'm trying to load lines into memory and recall them by index (the first column of a line). To save space, I compress every line with zlib. Lines are grouped into blocks.

Items are loaded from a file. Each item is represented by a line, whose properties are separated by tabs.

It's working just fine, but it's currently really slow. It took me almost 50 seconds just to load a 40 MB file! (Note that if I disable the "compress" part, it takes only 0.87 seconds.) I'm obviously doing something wrong, but I can't find what. Please note that I'm a beginner in this language.

Note: I'm stuck with Go 1.0.2 and can't update.

package main

import (
    "bytes"
    "compress/zlib"
    "encoding/json"
    "flag"
    "fmt"
    "gotwcc/mylib"
    "io"
    "net/http"
    "os"
)

type Block struct {
    data []byte
}

type Row struct {
    offset, len uint32
    block       *Block
}

type Cache struct {
    blocks          []Block
    indexes         map[string]*Row
    enable_compress bool
    maxRowGroup     uint
}

func (this *Cache) dump() {
    for key, value := range this.indexes {
        fmt.Printf("[%s] = \"%s\"
", key, value.block)
    }
}

func (this *Cache) search(item_id string) string {
    row := this.indexes[item_id]
    if row == nil {
        return "(Not found)"
    }
    block := this.uncompress(row.block.data)
    slice := block[row.offset : row.offset+row.len]
    return string(slice)
}

func (this *Cache) compress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }

    var b bytes.Buffer
    w := zlib.NewWriter(&b)
    w.Write(data)
    w.Close()
    return b.Bytes()
}

func (this *Cache) uncompress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }

    var res bytes.Buffer
    b := bytes.NewReader(data)
    r, err := zlib.NewReader(b)
    if err != nil {
        panic(err)
    }
    io.Copy(&res, r)

    r.Close()
    return res.Bytes()

}

func (this *Cache) loadFile(s string) {
    type TempRowBuf struct {
        item_id     []byte
        offset, len uint32
    }

    file, err := os.Open(s)
    if err != nil {
        panic(err.Error())
    }
    defer file.Close()

    scanner := mybufio.NewScanner(file)
    scanner.Split(mybufio.ScanLines)

    var tmp_buf bytes.Buffer
    var buffer bytes.Buffer
    var tmp_list []TempRowBuf
    this.indexes = make(map[string]*Row)

    var offset uint32 = 0
    nb := this.maxRowGroup
    for scanner.Scan() {
        nb--
        tmp_buf.Reset()
        tmp_buf.Write(scanner.Bytes())
        line := tmp_buf.Bytes()
        item_id, _ := tmp_buf.ReadBytes('\t')
        item_id = item_id[0 : len(item_id)-1]
        size := uint32(len(line))
        buffer.Write(line)
        tmp_list = append(tmp_list, TempRowBuf{item_id, offset, size})
        offset += size
        if nb <= 0 {
            compressed := this.compress(buffer.Bytes())
            buff := make([]byte, len(compressed))
            copy(buff, compressed)
            var block *Block = &Block{buff}
            for _, tmp := range tmp_list {
                this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
            }
            nb = this.maxRowGroup
            offset = 0
            tmp_list = nil
            buffer.Reset()
        }
    }
    if nb > 0 {
        compressed := this.compress(buffer.Bytes())
        buff := make([]byte, len(compressed))
        copy(buff, compressed)
        var block *Block = &Block{buff}
        for _, tmp := range tmp_list {
            this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
        }
    }
}

func wsCacheHandler(cache *Cache, writer http.ResponseWriter, request *http.Request) {
    var value map[string]string = make(map[string]string)

    item_id := request.FormValue("item_id")
    value["item_id"] = item_id
    value["raw"] = cache.search(item_id)
    jsonResp, err := json.Marshal(value)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Fprintf(writer, "%s", string(jsonResp))
    }
}

func main() {
    filename := flag.String("data", "default.txt", "The data filename")
    no_http := flag.Bool("no-http", false, "Do not start an http server")
    dumpMap := flag.Bool("dump", false, "If we should dump the map to stdout")
    noCompression := flag.Bool("no-compress", false, "Disable compression")
    maxRowGroup := flag.Uint("max-row-group", 100, "How many lines to group together when compressing")

    flag.Parse()
    var cache Cache
    cache.enable_compress = !*noCompression
    cache.maxRowGroup = *maxRowGroup

    cache.loadFile(*filename)

    if *dumpMap {
        cache.dump()
        fmt.Println(cache.search("100001"))
        fmt.Println(cache.search("100002"))
        fmt.Println(cache.search("100003"))
        fmt.Println(cache.search("100004"))
        fmt.Println(cache.search("100005"))
        fmt.Println(cache.search("100006"))
        fmt.Println(cache.search("100007"))
        fmt.Println(cache.search("100008"))
        fmt.Println(cache.search("100009"))
        fmt.Println(cache.search("100010"))

    }

    if !*no_http {
        http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
            wsCacheHandler(&cache, writer, request)
        })
        fmt.Println("Cache loaded, now listening on port 8585...")
        http.ListenAndServe(":8585", nil)
    }
}

This is the test file I use (I'm not pasting the 40 MB file here :p):

data.txt:

100001  bar
100002  foo
100003  bob
100004  nuts
100005  gogopowran
100006  green
100007  test
100008  alongwordwithlotofletters
100009  
100010  space space space

I launch my application like this:

time ./mybin -data=data.txt -no-http -no-compress => ok (0.6 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=100 => slow (12.1 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=1000 => still slow (10.9 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=10000 => still slow (10.6 sec to load)

Edit: applied go fmt to the code, added an option to choose the size of the packing, and tested 3 different sizes.


1 answer

  • dongxie45083 2014-08-15 22:02

    Compressing each line individually is just going to be very slow, and provide relatively inefficient compression to boot.

    Is there a reason you're not simply compressing the entire file? Or at least one "block" at a time?
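
    To make the trade-off concrete, here is a minimal, self-contained sketch (not taken from the question's program; the repeated row and the repeat count are hypothetical, chosen only for illustration) that compresses the same data once per line and then once as a single block, using only compress/zlib calls that already exist in Go 1.0.2:

    package main

    import (
        "bytes"
        "compress/zlib"
        "fmt"
        "time"
    )

    // compressOnce compresses data with a fresh zlib.Writer, mirroring the
    // compress() method in the question.
    func compressOnce(data []byte) []byte {
        var b bytes.Buffer
        w := zlib.NewWriter(&b)
        w.Write(data)
        w.Close()
        return b.Bytes()
    }

    func main() {
        // Hypothetical input: one 18-byte row repeated 200000 times (~3.6 MB).
        line := []byte("100005\tgogopowran\n")
        all := bytes.Repeat(line, 200000)

        // One zlib.Writer per row: fresh deflate state for every tiny input.
        start := time.Now()
        perLine := 0
        for i := 0; i < 200000; i++ {
            perLine += len(compressOnce(line))
        }
        fmt.Printf("per line:  %v, %d compressed bytes\n", time.Since(start), perLine)

        // One zlib.Writer over the whole block.
        start = time.Now()
        block := compressOnce(all)
        fmt.Printf("one block: %v, %d compressed bytes\n", time.Since(start), len(block))
    }

    The per-line loop has to set up and tear down a writer for every row and cannot share any compression state between rows, so it should come out both much slower and much larger. Grouping more rows per block (a larger -max-row-group in the question's code) or compressing the whole file once moves the cost toward the single-block case.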

