doujingao6210
2014-10-12 09:48 阅读 531
已采纳

为什么以下golang程序会抛出运行时内存不足错误?

This program is supposed to read a file consisting of pairs of ints (one pair per line) and remove duplicate pairs. While it works on small files, it throws a runtime error on huge files (say a file of 1.5 GB). Initially, I thought that it is the map data structure which is causing this, but even after commenting it out, it still runs out of memory. Any ideas why this is happening? How to rectify it? Here's a data file on which it runs out of memory: http://snap.stanford.edu/data/com-Orkut.html

package main

import (
    "bufio"
    "fmt"
    "os"
    "strconv"
    "strings"
)

// main reads a file of comma-separated integer pairs (one pair per line),
// optionally skips a number of leading lines, normalizes each pair so the
// smaller value comes first, and prints all pairs.
//
// Usage: undup <file> <lines-to-skip>
//
// NOTE(review): every pair is accumulated in memory before printing; for
// very large inputs this slice is the dominant memory cost.
func main() {
    file, err := os.Open(os.Args[1])
    if err != nil {
        panic(err.Error())
    }
    defer file.Close()
    type Edge struct {
        u, v int
    }
    //seen := make(map[Edge]bool)
    edges := []Edge{}
    scanner := bufio.NewScanner(file)

    // Skip the requested number of header lines.
    // BUG FIX: the Atoi error was silently discarded; a non-numeric
    // argument used to skip zero lines without any diagnostic.
    skip, err := strconv.Atoi(os.Args[2])
    if err != nil {
        panic(err.Error())
    }
    for i := skip; i > 0; i-- {
        scanner.Scan()
    }

    for scanner.Scan() {
        str := scanner.Text()
        edge := strings.Split(str, ",")
        // BUG FIX: a line without a comma used to panic with an
        // index-out-of-range on edge[1]; malformed lines are now skipped.
        if len(edge) < 2 {
            continue
        }
        u, err := strconv.Atoi(edge[0])
        if err != nil {
            continue // non-numeric field (e.g. a comment line)
        }
        v, err := strconv.Atoi(edge[1])
        if err != nil {
            continue
        }
        // Canonicalize so (u,v) and (v,u) map to the same key.
        var key Edge
        if u < v {
            key = Edge{u, v}
        } else {
            key = Edge{v, u}
        }
        //if seen[key] {
        //  continue
        //}
        //seen[key] = true
        edges = append(edges, key)
    }
    // BUG FIX: scanner errors (e.g. a line exceeding the buffer) were
    // silently swallowed, truncating the output.
    if err := scanner.Err(); err != nil {
        panic(err.Error())
    }
    for _, e := range edges {
        s := strconv.Itoa(e.u) + "," + strconv.Itoa(e.v)
        fmt.Println(s)
    }
}

A sample input is given below. The program can be run as follows (where the last input says how many lines to skip). go run undup.go a.txt 1

# 3072441,117185083
1,2
1,3
1,4
1,5
1,6
1,7
1,8
  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享

2条回答 默认 最新

  • 已采纳
    douyue2313 douyue2313 2014-10-12 15:09

    I looked at this file: com-orkut.ungraph.txt and it contains 117,185,082 lines. The way your data is structured, that's at least 16 bytes per line. (Edge is two 64bit ints) That alone is 1.7GB. I have had this problem in the past, and it can be a tricky one. Are you trying to solve this for a specific use case (the file in question) or the general case?

    In the specific case there are a few things about the data you could leverage: (1) the keys are sorted, (2) it looks like it stores every connection twice, and (3) the numbers don't seem huge. Here are a couple ideas:

    1. If you use a smaller type for the key you will use less memory. Try a uint32.

    2. You could stream (without using a map) the keys to another file by simply seeing if the 2nd column is greater than the first:

      if u < v {
          // write the key to another file
      } else {
          // skip it because v will eventually show v -> u
      }
      

    For the general case there are a couple strategies you could use:

    1. If the order of the resulting list doesn't matter: Use an on-disk hash table to store the map. There are a bunch of these: leveldb, sqlite, tokyo tyrant, ... A really nice one for go is bolt.

      In your for loop you would just check to see if a bucket contains the given key. (You can convert the ints into byte slices using encoding/binary) If it does, just skip it and continue. You will need to move the second for loop processing step into the first for loop so that you don't have to store all the keys.

    2. If the order of the resulting list does matter (and you can't guarantee the input is in order): You can also use an on-disk hash table, but it needs to be sorted. Bolt is sorted so that will work. Add all the keys to it, then traverse it in the second loop.

    Here is an example: (this program will take a while to run with 100 million records)

    package main
    
    import (
        "bufio"
        "encoding/binary"
        "fmt"
        "github.com/boltdb/bolt"
        "os"
        "strconv"
        "strings"
    )
    
    // Edge is an undirected graph edge identified by its two endpoints.
    type Edge struct {
        u, v int
    }

    // FromKey decodes a 16-byte big-endian key back into an Edge.
    func FromKey(bs []byte) Edge {
        first := binary.BigEndian.Uint64(bs[:8])
        second := binary.BigEndian.Uint64(bs[8:])
        return Edge{u: int(first), v: int(second)}
    }

    // Key encodes the edge as a fixed 16-byte big-endian value, suitable
    // for use as a sorted on-disk database key.
    func (e Edge) Key() [16]byte {
        var buf [16]byte
        binary.BigEndian.PutUint64(buf[:8], uint64(e.u))
        binary.BigEndian.PutUint64(buf[8:], uint64(e.v))
        return buf
    }
    
    // main streams tab-separated (u,v) pairs into an on-disk bolt bucket
    // keyed by the canonical big-endian encoding of each edge, so
    // duplicates collapse in the store itself and memory use stays
    // bounded by the batch size. It then prints every unique edge by
    // iterating the bucket (bolt iterates keys in sorted order).
    //
    // Usage: prog <file> <lines-to-skip>
    func main() {
        file, err := os.Open(os.Args[1])
        if err != nil {
            panic(err.Error())
        }
        defer file.Close()

        scanner := bufio.NewScanner(file)

        // Skip the requested number of header lines.
        // BUG FIX: the Atoi error was silently discarded.
        skip, err := strconv.Atoi(os.Args[2])
        if err != nil {
            panic(err.Error())
        }
        for i := skip; i > 0; i-- {
            scanner.Scan()
        }

        // BUG FIX: the open error was ignored (db, _ :=); a failed open
        // used to nil-pointer panic on the first db.Update instead of
        // reporting the real cause.
        db, err := bolt.Open("ex.db", 0777, nil)
        if err != nil {
            panic(err.Error())
        }
        defer db.Close()

        bucketName := []byte("edges")
        if err := db.Update(func(tx *bolt.Tx) error {
            _, err := tx.CreateBucketIfNotExists(bucketName)
            return err
        }); err != nil {
            panic(err.Error())
        }

        // Buffer edges and write them in batches to amortize bolt's
        // one-fsync-per-transaction cost.
        batchSize := 10000
        total := 0
        batch := make([]Edge, 0, batchSize)
        writeBatch := func() {
            total += len(batch)
            fmt.Println("write batch. total:", total)
            err := db.Update(func(tx *bolt.Tx) error {
                bucket := tx.Bucket(bucketName)
                for _, edge := range batch {
                    key := edge.Key()
                    // Value is nil: only key presence matters.
                    if err := bucket.Put(key[:], nil); err != nil {
                        return err
                    }
                }
                return nil
            })
            if err != nil {
                panic(err.Error())
            }
        }

        for scanner.Scan() {
            str := scanner.Text()
            edge := strings.Split(str, "\t")
            // BUG FIX: a line without a tab used to panic on edge[1];
            // malformed or comment lines are now skipped.
            if len(edge) < 2 {
                continue
            }
            u, err := strconv.Atoi(edge[0])
            if err != nil {
                continue
            }
            v, err := strconv.Atoi(edge[1])
            if err != nil {
                continue
            }
            // Canonical order (min, max) so (u,v) and (v,u) share a key.
            var key Edge
            if u < v {
                key = Edge{u, v}
            } else {
                key = Edge{v, u}
            }
            batch = append(batch, key)
            if len(batch) == batchSize {
                writeBatch()
                // Reset length, keep capacity.
                batch = batch[:0]
            }
        }
        // BUG FIX: scanner errors were never checked.
        if err := scanner.Err(); err != nil {
            panic(err.Error())
        }
        // Write anything leftover.
        writeBatch()

        if err := db.View(func(tx *bolt.Tx) error {
            return tx.Bucket(bucketName).ForEach(func(k, v []byte) error {
                fmt.Println(FromKey(k))
                return nil
            })
        }); err != nil {
            panic(err.Error())
        }
    }
    
    点赞 评论 复制链接分享
  • dpo15099 dpo15099 2014-10-14 05:40

    You are squandering memory. Here's how to rectify it.

    You give the sample input a.txt, 48 bytes.

    # 3072441,117185083
    1,2
    1,3
    1,4
    1,5
    

    On http://snap.stanford.edu/data/com-Orkut.html, I found http://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz, 1.8 GB uncompressed, 117,185,083 edges.

    # Undirected graph: ../../data/output/orkut.txt
    # Orkut
    # Nodes: 3072441 Edges: 117185083
    # FromNodeId    ToNodeId
    1   2
    1   3
    1   4
    1   5
    

    On http://socialnetworks.mpi-sws.org/data-imc2007.html, I found http://socialnetworks.mpi-sws.mpg.de/data/orkut-links.txt.gz, 3.4 GB uncompressed, 223,534,301 edges.

    1   2
    1   3
    1   4
    1   5
    

    Since they are similar, one program can handle all formats.

    Your Edge type is

    type Edge struct {
        u, v int
    }
    

    which is 16 bytes on a 64-bit architecture.

    Use

    type Edge struct {
        U, V uint32
    }
    

    which is 8 bytes, it is adequate.

    If the capacity of a slice is not large enough to fit the additional values, append allocates a new, sufficiently large underlying array that fits both the existing slice elements and the additional values. Otherwise, append re-uses the underlying array. For a large slice, the new array is 1.25 times the size of the old array. While the old array is being copied to the new array, 1 + 1.25 = 2.25 times the memory for the old array is required. Therefore, allocate the underlying array so that all values fit.

    make(T, n) initializes a map of type T with initial space for n elements. Provide a value for n to limit the cost of reorganization and fragmentation as elements are added. Hashing functions are often imperfect, which leads to wasted space. Eliminate the map as it's unnecessary. To eliminate duplicates, sort the slice in place and move the unique elements down.

    A string is immutable, therefore a new string is allocated for scanner.Text() to convert from a byte slice buffer. To parse numbers we use strconv. To minimize temporary allocations, use scanner.Bytes() and adapt strconv.ParseUint to accept a byte array argument (bytconv).

    For example,

    orkut.go

    package main
    
    import (
        "bufio"
        "bytes"
        "errors"
        "fmt"
        "os"
        "runtime"
        "sort"
        "strconv"
    )
    
    // Edge is an undirected edge stored as (U, V); uint32 endpoints halve
    // the footprint of the 64-bit int version.
    type Edge struct {
        U, V uint32
    }

    // String renders the edge as "U,V".
    func (e Edge) String() string {
        return fmt.Sprintf("%d,%d", e.U, e.V)
    }

    // ByKey sorts edges lexicographically by (U, V).
    type ByKey []Edge

    func (a ByKey) Len() int      { return len(a) }
    func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    func (a ByKey) Less(i, j int) bool {
        x, y := a[i], a[j]
        if x.U != y.U {
            return x.U < y.U
        }
        return x.V < y.V
    }
    
    // countEdges scans the stream for an edge count. A header line of the
    // form "# Nodes: N Edges: M" or "# N,M" yields the declared count and
    // stops the scan; otherwise every non-comment line is counted as one
    // edge. The matched header (if any) and the count are echoed to
    // stdout.
    func countEdges(scanner *bufio.Scanner) int {
        var nNodes, nEdges int
        for scanner.Scan() {
            line := string(scanner.Bytes())
            if len(line) == 0 || line[0] != '#' {
                // Data line: count it.
                nEdges++
                continue
            }
            // Comment line: try both known header formats.
            matched := false
            if n, err := fmt.Sscanf(line, "# Nodes: %d Edges: %d", &nNodes, &nEdges); err == nil && n == 2 {
                matched = true
            } else if n, err := fmt.Sscanf(line, "# %d,%d", &nNodes, &nEdges); err == nil && n == 2 {
                matched = true
            }
            if !matched {
                continue
            }
            fmt.Println(line)
            break
        }
        if err := scanner.Err(); err != nil {
            panic(err.Error())
        }
        fmt.Println(nEdges)
        return nEdges
    }
    
    // loadEdges reads the named edge-list file and returns its edges,
    // canonicalized (U <= V), sorted, and with duplicates removed.
    //
    // The file is scanned twice: once to learn the edge count so the
    // slice can be allocated exactly once (avoiding append's growth
    // copies), then again to parse the edges. Fields may be tab- or
    // comma-separated; lines starting with '#' are skipped.
    func loadEdges(filename string) []Edge {
        file, err := os.Open(filename)
        if err != nil {
            panic(err.Error())
        }
        defer file.Close()

        scanner := bufio.NewScanner(file)
        nEdges := countEdges(scanner)
        edges := make([]Edge, 0, nEdges)

        // Rewind for the second pass.
        // BUG FIX: the original panicked with err.Error() when
        // offset != 0 even though err could be nil there, which would
        // itself crash with a nil dereference instead of a useful
        // message. A successful absolute seek to 0 always returns
        // offset 0, so checking err alone is sufficient.
        if _, err := file.Seek(0, os.SEEK_SET); err != nil {
            panic(err.Error())
        }

        // Separator detection: try tab first, fall back to comma and
        // stay there (the two known formats never mix separators).
        var sep byte = '\t'
        scanner = bufio.NewScanner(file)
        for scanner.Scan() {
            line := scanner.Bytes()
            if len(line) > 0 && line[0] == '#' {
                continue
            }
            i := bytes.IndexByte(line, sep)
            if i < 0 || i+1 >= len(line) {
                sep = ','
                i = bytes.IndexByte(line, sep)
                if i < 0 || i+1 >= len(line) {
                    panic("Invalid line format: " + string(line))
                }
            }
            // ParseUint works on the raw scanner bytes, avoiding a
            // string allocation per field.
            u, err := ParseUint(line[:i], 10, 32)
            if err != nil {
                panic(err.Error())
            }
            v, err := ParseUint(line[i+1:], 10, 32)
            if err != nil {
                panic(err.Error())
            }
            // Canonical order: smaller endpoint first.
            if u > v {
                u, v = v, u
            }
            edges = append(edges, Edge{uint32(u), uint32(v)})
        }
        if err := scanner.Err(); err != nil {
            panic(err.Error())
        }

        if len(edges) <= 1 {
            return edges
        }
        // Sort, then compact duplicates in place: j trails i and only
        // advances when a new distinct value appears.
        sort.Sort(ByKey(edges))
        j := 0
        for i := 1; i < len(edges); i++ {
            if edges[i] != edges[j] {
                j++
                edges[j] = edges[i]
            }
        }
        return edges[:j+1]
    }
    
    func main() {
        if len(os.Args) <= 1 {
            err := errors.New("Missing file name")
            panic(err.Error())
        }
        filename := os.Args[1]
        fmt.Println(filename)
        edges := loadEdges(filename)
    
        var ms runtime.MemStats
        runtime.ReadMemStats(&ms)
        fmt.Println(ms.Alloc, ms.TotalAlloc, ms.Sys, ms.Mallocs, ms.Frees)
        fmt.Println(len(edges), cap(edges))
        for i, e := range edges {
            fmt.Println(e)
            if i >= 10 {
                break
            }
        }
    }
    
    // bytconv from strconv

    // cutoff64 returns the first number n such that n*base >= 1<<64,
    // i.e. the smallest accumulator value at which another multiply by
    // base would overflow a uint64.
    func cutoff64(base int) uint64 {
        if base < 2 {
            return 0
        }
        return (1<<64-1)/uint64(base) + 1
    }

    // ParseUint is like strconv.ParseUint but accepts a byte slice,
    // avoiding the string allocation scanner.Text() would force on every
    // line. base 0 means "infer from prefix": 0x... is hex, 0... is
    // octal, otherwise decimal. bitSize caps the result (0 means the
    // platform int size). On failure it returns a *strconv.NumError
    // wrapping strconv.ErrSyntax or strconv.ErrRange.
    func ParseUint(s []byte, base int, bitSize int) (n uint64, err error) {
        var cutoff, maxVal uint64

        if bitSize == 0 {
            bitSize = int(strconv.IntSize)
        }

        s0 := s // remember the original input for error reporting
        switch {
        case len(s) < 1:
            err = strconv.ErrSyntax
            goto Error

        case 2 <= base && base <= 36:
            // valid base; nothing to do

        case base == 0:
            // Look for octal, hex prefix.
            switch {
            case s[0] == '0' && len(s) > 1 && (s[1] == 'x' || s[1] == 'X'):
                base = 16
                s = s[2:]
                if len(s) < 1 {
                    err = strconv.ErrSyntax
                    goto Error
                }
            case s[0] == '0':
                base = 8
            default:
                base = 10
            }

        default:
            err = errors.New("invalid base " + strconv.Itoa(base))
            goto Error
        }

        n = 0
        cutoff = cutoff64(base)
        maxVal = 1<<uint(bitSize) - 1

        for i := 0; i < len(s); i++ {
            var v byte
            d := s[i]
            switch {
            case '0' <= d && d <= '9':
                v = d - '0'
            case 'a' <= d && d <= 'z':
                v = d - 'a' + 10
            case 'A' <= d && d <= 'Z':
                v = d - 'A' + 10
            default:
                n = 0
                err = strconv.ErrSyntax
                goto Error
            }
            if int(v) >= base {
                n = 0
                err = strconv.ErrSyntax
                goto Error
            }

            if n >= cutoff {
                // n*base overflows
                n = 1<<64 - 1
                err = strconv.ErrRange
                goto Error
            }
            n *= uint64(base)

            n1 := n + uint64(v)
            if n1 < n || n1 > maxVal {
                // n+v overflows
                n = 1<<64 - 1
                err = strconv.ErrRange
                goto Error
            }
            n = n1
        }

        return n, nil

    Error:
        // FIX (go vet): keyed fields in the composite literal, so the
        // code is robust to field reordering in strconv.NumError.
        return n, &strconv.NumError{Func: "ParseUint", Num: string(s0), Err: err}
    }
    

    Output:

    $ go build orkut.go
    $ time ./orkut ~/release-orkut-links.txt
    /home/peter/release-orkut-links.txt
    223534301
    1788305680 1788327856 1904683256 135 50
    117185083 223534301
    1,2
    1,3
    1,4
    1,5
    1,6
    1,7
    1,8
    1,9
    1,10
    1,11
    1,12
    real    2m53.203s
    user    2m51.584s
    sys 0m1.628s
    $
    

    The orkut.go program with the release-orkut-links.txt file (3,372,855,860 (3.4 GB) bytes with 223,534,301 edges) uses about 1.8 GiB of memory. After eliminating duplicates, 117,185,083 unique edges remain. This matches the 117,185,083 unique edge com-orkut.ungraph.txt file.

    With 8 GB of memory on your machine, you can load much larger files.

    点赞 评论 复制链接分享

相关推荐