doujingao6210 2014-10-12 09:48
浏览 607
已采纳

为什么以下golang程序会抛出运行时内存不足错误?

This program is supposed to read a file consisting of pairs of ints (one pair per line) and remove duplicate pairs. While it works on small files, it throws a runtime error on huge files (say a file of 1.5 GB). Initially, I thought that it is the map data structure which is causing this, but even after commenting it out, it still runs out of memory. Any ideas why this is happening? How to rectify it? Here's a data file on which it runs out of memory: http://snap.stanford.edu/data/com-Orkut.html

package main
import (
    "fmt"
    "bufio"
    "os"
    "strings"
    "strconv"
)

func main() {
    file, err := os.Open(os.Args[1])
    if err != nil {
        panic(err.Error())
    }
    defer file.Close()
    type Edge struct {
        u, v int
    }
    //seen := make(map[Edge]bool)
    edges := []Edge{}
    scanner := bufio.NewScanner(file)

    for i, _ := strconv.Atoi(os.Args[2]); i > 0; i-- {
        scanner.Scan()
    }

    for scanner.Scan() {
        str := scanner.Text()
        edge := strings.Split(str, ",")
        u, _ := strconv.Atoi(edge[0])
        v, _ := strconv.Atoi(edge[1])
        var key Edge
        if u < v {
            key = Edge{u,v}
        } else {
            key = Edge{v,u}
        }
        //if seen[key] {
        //  continue
        //}
        //seen[key] = true
        edges = append(edges, key)
    }
    for _, e := range edges {
        s := strconv.Itoa(e.u) + "," + strconv.Itoa(e.v)
        fmt.Println(s)
    }
}

A sample input is given below. The program can be run as follows (where the last input says how many lines to skip). go run undup.go a.txt 1

# 3072441,117185083
1,2
1,3
1,4
1,5
1,6
1,7
1,8
  • 写回答

2条回答 默认 最新

  • douyue2313 2014-10-12 15:09
    关注

    I looked at this file: com-orkut.ungraph.txt and it contains 117,185,082 lines. The way your data is structured, that's at least 16 bytes per line. (Edge is two 64bit ints) That alone is 1.7GB. I have had this problem in the past, and it can be a tricky one. Are you trying to solve this for a specific use case (the file in question) or the general case?

    In the specific case there are a few things about the data you could leverage: (1) the keys are sorted and (2) it looks it stores every connection twice, (3) the numbers don't seem huge. Here are a couple ideas:

    1. If you use a smaller type for the key you will use less memory. Try a uint32.

    2. You could stream (without using a map) the keys to another file by simply seeing if the 2nd column is greater than the first:

      if u < v {
          // write the key to another file
      } else {
          // skip it because v will eventually show v -> u
      }
      

    For the general case there are a couple strategies you could use:

    1. If the order of the resulting list doesn't matter: Use an on-disk hash table to store the map. There are a bunch of these: leveldb, sqlite, tokyo tyrant, ... A really nice one for go is bolt.

      In your for loop you would just check to see if a bucket contains the given key. (You can convert the ints into byte slices using encoding/binary) If it does, just skip it and continue. You will need to move the second for loop processing step into the first for loop so that you don't have to store all the keys.

    2. If the order of the resulting list does matter (and you can't guarantee the input is in order): You can also use an on-disk hash table, but it needs to be sorted. Bolt is sorted so that will work. Add all the keys to it, then traverse it in the second loop.

    Here is an example: (this program will take a while to run with 100 million records)

    package main
    
    import (
        "bufio"
        "encoding/binary"
        "fmt"
        "github.com/boltdb/bolt"
        "os"
        "strconv"
        "strings"
    )
    
    type Edge struct {
        u, v int
    }
    
    func FromKey(bs []byte) Edge {
        return Edge{int(binary.BigEndian.Uint64(bs[:8])), int(binary.BigEndian.Uint64(bs[8:]))}
    }
    
    func (e Edge) Key() [16]byte {
        var k [16]byte
        binary.BigEndian.PutUint64(k[:8], uint64(e.u))
        binary.BigEndian.PutUint64(k[8:], uint64(e.v))
        return k
    }
    
    func main() {
        file, err := os.Open(os.Args[1])
        if err != nil {
            panic(err.Error())
        }
        defer file.Close()
    
        scanner := bufio.NewScanner(file)
    
        for i, _ := strconv.Atoi(os.Args[2]); i > 0; i-- {
            scanner.Scan()
        }
    
        db, _ := bolt.Open("ex.db", 0777, nil)
        defer db.Close()
    
        bucketName := []byte("edges")
        db.Update(func(tx *bolt.Tx) error {
            tx.CreateBucketIfNotExists(bucketName)
            return nil
        })
    
        batchSize := 10000
        total := 0
        batch := make([]Edge, 0, batchSize)
        writeBatch := func() {
            total += len(batch)
            fmt.Println("write batch. total:", total)
            db.Update(func(tx *bolt.Tx) error {
                bucket := tx.Bucket(bucketName)
                for _, edge := range batch {
                    key := edge.Key()
                    bucket.Put(key[:], nil)
                }
                return nil
            })
        }
    
        for scanner.Scan() {
            str := scanner.Text()
            edge := strings.Split(str, "\t")
            u, _ := strconv.Atoi(edge[0])
            v, _ := strconv.Atoi(edge[1])
            var key Edge
            if u < v {
                key = Edge{u, v}
            } else {
                key = Edge{v, u}
            }
            batch = append(batch, key)
            if len(batch) == batchSize {
                writeBatch()
                // reset the batch length to 0
                batch = batch[:0]
            }
        }
        // write anything leftover
        writeBatch()
    
        db.View(func(tx *bolt.Tx) error {
            tx.Bucket(bucketName).ForEach(func(k, v []byte) error {
                edge := FromKey(k)
                fmt.Println(edge)
                return nil
            })
            return nil
        })
    }
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

悬赏问题

  • ¥15 C#算法问题, 不知道怎么处理这个数据的转换
  • ¥15 YoloV5 第三方库的版本对照问题
  • ¥15 请完成下列相关问题!
  • ¥15 drone 推送镜像时候 purge: true 推送完毕后没有删除对应的镜像,手动拷贝到服务器执行结果正确在样才能让指令自动执行成功删除对应镜像,如何解决?
  • ¥15 求daily translation(DT)偏差订正方法的代码
  • ¥15 js调用html页面需要隐藏某个按钮
  • ¥15 ads仿真结果在圆图上是怎么读数的
  • ¥20 Cotex M3的调试和程序执行方式是什么样的?
  • ¥20 java项目连接sqlserver时报ssl相关错误
  • ¥15 一道python难题3