du21064 2017-04-27 10:18
浏览 94
已采纳

一个间隔和一组间隔之间的区别?

I have a set of non-overlapping, non-adjacent intervals, eg. [{10,15}, {30,35}, {20,25}]. They are not sorted, but I can sort them if necessary.

Now I am given some new interval, eg. {5,32} and want to generate a new set of intervals describing the difference: the ranges covered by this new interval that aren't in the set. In this example the answer would be: [{5,9}, {16,19}, {26,29}].

What's a fast algorithm for calculating this? Note that the set will typically have 1, sometimes 2, rarely 3 or more items in it, so I want to optimise for this case.

For context, here's the code for initially creating the set from an input stream of start+end data, where I merge as I go:

type Interval struct {
    start int
    end   int
}

func (i *Interval) OverlapsOrAdjacent(j Interval) bool {
    return i.end+1 >= j.start && j.end+1 >= i.start
}

func (i *Interval) Merge(j Interval) bool {
    if !i.OverlapsOrAdjacent(j) {
        return false
    }
    if j.start < i.start {
        i.start = j.start
    }
    if j.end > i.end {
        i.end = j.end
    }
    return true
}

type Intervals []Interval

func (ivs Intervals) Len() int           { return len(ivs) }
func (ivs Intervals) Swap(i, j int)      { ivs[i], ivs[j] = ivs[j], ivs[i] }
func (ivs Intervals) Less(i, j int) bool { return ivs[i].start < ivs[j].start }

func (ivs Intervals) Merge(iv Interval) Intervals {
    ivs = append(ivs, iv)
    merged := make(Intervals, 0, len(ivs))
    for _, iv := range ivs {
        for i := 0; i < len(merged); {
            if iv.Merge(merged[i]) {
                merged = append(merged[:i], merged[i+1:]...)
            } else {
                i++
            }
        }
        merged = append(merged, iv)
    }
    return merged
}

func (ivs Intervals) MergeUsingSort(iv Interval) Intervals {
    ivs = append(ivs, iv)
    sort.Sort(ivs)
    merged := make(Intervals, 0, len(ivs))
    merged = append(merged, ivs[0])
    for i := 1; i < len(ivs); i++ {
        last := len(merged) - 1
        if !merged[last].Merge(ivs[i]) {
            merged = append(merged, ivs[i])
        }
    }
    return merged
}

func (ivs Intervals) Difference(iv Interval) Intervals {
    // ???
    return ivs
}

func main() {
    var ivs Intervals
    for _, input := range inputsFromSomewhere { // in reality, I don't have all these inputs at once, they come in one at a time
        iv := Interval{input.start, input.end}
        diffs := ivs.Difference(iv) // not yet implemented...
        // do something with diffs
        ivs = ivs.Merge(iv)
    }
}

I find that the above Intervals.Merge() is 2x faster than MergeUsingSort(), so I wonder if there's also a simple non-sorting way of answering my question.

  • 写回答

2条回答 默认 最新

  • douseda0009 2017-04-28 14:33
    关注

    The question and answer code is incomplete and doesn't compile. There are no benchmarks. From a quick glance at the code, it's likely inefficient.

    Usable code for interval.go and interval_test.go was obtained from https://github.com/VertebrateResequencing/wr/tree/develop/minfys.

    Let's start by writing a benchmark for the interval difference example.

    package minfys
    
    import (
        "fmt"
        "testing"
    )
    
    // Example
    var (
        xA = Intervals{{10, 15}, {30, 35}, {20, 25}}
        xB = Interval{5, 32}
        xD = Intervals{{5, 9}, {16, 19}, {26, 29}}
        xR = Intervals{}
    )
    
    func BenchmarkExample(b *testing.B) {
        b.ReportAllocs()
        a := make(Intervals, len(xA))
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
            copy(a, xA)
            xR = a.Difference(xB)
        }
        b.StopTimer()
        if fmt.Sprint(xD) != fmt.Sprint(xR) {
            b.Fatal(xD, xR)
        }
    }
    

    Next, write a Difference method.

    package minfys
    
    func (a Intervals) Difference(b Interval) Intervals {
        // If A and B are sets, then the relative complement of A in B
        // is the set of elements in B but not in A.
        // The relative complement of A in B is denoted B ∖  A:
        //     B \ A = {x ∈ A | x ∉ B}
        //     B \ A = B ∩ A'
        //
        // For example. d = a\b,
        //     a: [{10, 15}, {30, 35}, {20, 25}]
        //     b: {5,32}
        //     d: [{5,9}, {16,19}, {26,29}]
        // The elements of set a are non-overlapping, non-adjacent,
        // and unsorted intervals.
    
        if len(a) <= 0 {
            return Intervals{b}
        }
    
        d := make(Intervals, 0, 3)
        for ; len(a) > 0; a = a[1:] {
            for i := 1; i < len(a); i++ {
                if a[i].Start < a[0].Start {
                    a[i], a[0] = a[0], a[i]
                }
            }
    
            if b.Start < a[0].Start {
                if b.End < a[0].Start {
                    d = append(d, b)
                    break
                }
                d = append(d, Interval{b.Start, a[0].Start - 1})
                b.Start = a[0].Start
            }
            if b.End <= a[0].End {
                break
            }
            if b.Start <= a[0].End {
                b.Start = a[0].End + 1
            }
            if len(a) == 1 {
                if b.Start <= a[0].End {
                    b.Start = a[0].End + 1
                }
                d = append(d, b)
                break
            }
        }
        return d
    }
    

    Now, benchmark the Difference method.

    BenchmarkExample-4     20000000     62.4 ns/op    48 B/op      1 allocs/op
    

    sbs wrote a Difference method.

    // Interval struct is used to describe something with a start and end. End must
    // be greater than start.
    type Interval struct {
        Start int64
        End   int64
    }
    
    // Overlaps returns true if this interval overlaps with the supplied one.
    func (i *Interval) Overlaps(j Interval) bool {
        // https://nedbatchelder.com/blog/201310/range_overlap_in_two_compares.html
        return i.End >= j.Start && j.End >= i.Start
    }
    
    // Intervals type is a slice of Interval.
    type Intervals []Interval
    
    // Difference returns any portions of iv that do not overlap with any of our
    // intervals. Assumes that all of our intervals have been Merge()d in.
    func (ivs Intervals) Difference(iv Interval) (diffs Intervals) {
        diffs = append(diffs, iv)
        for _, prior := range ivs {
            for i := 0; i < len(diffs); {
                if left, right, overlapped := prior.Difference(diffs[i]); overlapped {
                    if len(diffs) == 1 {
                        diffs = nil
                    } else {
                        diffs = append(diffs[:i], diffs[i+1:]...)
                    }
    
                    if left != nil {
                        diffs = append(diffs, *left)
                    }
                    if right != nil {
                        diffs = append(diffs, *right)
                    }
                } else {
                    i++
                }
            }
            if len(diffs) == 0 {
                break
            }
        }
    
        return
    }
    

    Benchmark sbs's Difference method.

    BenchmarkExample-4      5000000    365 ns/op     128 B/op      4 allocs/op
    

    peterSO's Difference method is significantly faster.

    old.txt (sbs) versus new.txt (peterSO):
    
    benchmark              old ns/op     new ns/op     delta
    BenchmarkExample-4     365           62.4          -82.90%
    
    benchmark              old allocs     new allocs   delta
    BenchmarkExample-4     4              1            -75.00%
    
    benchmark              old bytes     new bytes     delta
    BenchmarkExample-4     128           48            -62.50%
    

    This is just a beginning. There are likely other improvements that can be made.

    There were some errors in interval_test.go. ShouldBeNil is for pointers; ShouldBeEmpty is for collections. ShouldResemble does not handle set equality (two sets which contain the same elements are the same set). Change ShouldResemble order to match implementation dependent order.

    $ go test
    ..........................................................................................................................x......................................................x................x
    Failures:
    
      * interval_test.go 
      Line 247:
      Expected: nil
      Actual:   '[]'
    
      * interval_test.go 
      Line 375:
      Expected: 'minfys.Intervals{minfys.Interval{Start:5, End:6}, minfys.Interval{Start:31, End:32}, minfys.Interval{Start:11, End:14}, minfys.Interval{Start:19, End:19}}'
      Actual:   'minfys.Intervals{minfys.Interval{Start:5, End:6}, minfys.Interval{Start:11, End:14}, minfys.Interval{Start:19, End:19}, minfys.Interval{Start:31, End:32}}'
      (Should resemble)!
    
      * interval_test.go 
      Line 413:
      Expected: 'minfys.Intervals{minfys.Interval{Start:7, End:10}, minfys.Interval{Start:1, End:3}, minfys.Interval{Start:15, End:17}}'
      Actual:   'minfys.Intervals{minfys.Interval{Start:1, End:3}, minfys.Interval{Start:7, End:10}, minfys.Interval{Start:15, End:17}}'
      (Should resemble)!
    
    
    195 total assertions
    
    ...
    198 total assertions
    
    --- FAIL: TestIntervals (0.04s)
    FAIL
    

    .

    $ diff -a -u ../interval_test.go interval_test.go
    --- ../interval_test.go 2017-04-29 20:23:29.365344008 -0400
    +++ interval_test.go    2017-04-29 20:54:14.349344903 -0400
    @@ -244,19 +244,19 @@
                So(len(ivs), ShouldEqual, 1)
    
                newIvs = ivs.Difference(twoSix)
    -           So(newIvs, ShouldBeNil)
    +           So(newIvs, ShouldBeEmpty)
                ivs = ivs.Merge(twoSix)
                So(len(ivs), ShouldEqual, 1)
    
                newIvs = ivs.Difference(oneThree)
    -           So(newIvs, ShouldBeNil)
    +           So(newIvs, ShouldBeEmpty)
                ivs = ivs.Merge(oneThree)
                So(len(ivs), ShouldEqual, 1)
    
                oneSeven := Interval{1, 7}
    
                newIvs = ivs.Difference(oneSix)
    -           So(newIvs, ShouldBeNil)
    +           So(newIvs, ShouldBeEmpty)
                ivs = ivs.Merge(oneSix)
                So(len(ivs), ShouldEqual, 1)
    
    @@ -372,7 +372,7 @@
    
                fiveThirtyTwo := Interval{5, 32}
                newIvs = ivs.Difference(fiveThirtyTwo)
    -           So(newIvs, ShouldResemble, Intervals{Interval{5, 6}, Interval{31, 32}, Interval{11, 14}, Interval{19, 19}})
    +           So(newIvs, ShouldResemble, Intervals{Interval{5, 6}, Interval{11, 14}, Interval{19, 19}, Interval{31, 32}})
                ivs = ivs.Merge(fiveThirtyTwo)
                So(len(ivs), ShouldEqual, 3)
    
    @@ -409,7 +409,7 @@
    
                ivs = ivs.Truncate(17)
    
    -           expected := Intervals{sevenTen, oneThree, Interval{15, 17}}
    +           expected := Intervals{oneThree, sevenTen, Interval{15, 17}}
                So(ivs, ShouldResemble, expected)
            })
        })
    

    .

    $ go test
    .............................................................................................................................................................................................................
    205 total assertions
    
    ...
    208 total assertions
    
    PASS
    $ 
    

    I [@sbs] confirm it's faster than my solution. Though if you just measure the wall-time that using Difference() takes (put a before := time.Now() before the last Difference() call in the interval_test.go, and a time.Since(before) after it and sum those durations over the loop), it seems to make surprisingly little difference (on my machine it takes ~31ms with my solution and ~29ms with yours).

    As requested, interval_test.go was modified to measure wall time:

    $ diff -a -u ../interval_test.go walltime_test.go
    --- ../interval_test.go 2017-04-29 20:23:29.365344008 -0400
    +++ walltime_test.go    2017-04-30 13:39:29.000000000 -0400
    @@ -24,6 +24,7 @@
        "math/rand"
        "testing"
        "time"
    +   "fmt"
     )
    
     func TestIntervals(t *testing.T) {
    @@ -459,16 +460,20 @@
    
            var ivs Intervals
            errors := 0
    +       var diffTime time.Duration
            t := time.Now()
            for i, input := range inputs {
                iv := NewInterval(int64(input), int64(readSize))
    +           before := time.Now()
                newIvs := ivs.Difference(iv)
    +           diffTime += time.Since(before)
                if (len(newIvs) == 1) != exepectedNew[i] {
                    errors++
                }
                ivs = ivs.Merge(iv)
            }
    -       // fmt.Printf("
    took %s
    ", time.Since(t))
    +       fmt.Printf("took %s
    ", time.Since(t))
    +       fmt.Printf("
      Difference took %s
    ", diffTime)
            So(errors, ShouldEqual, 0)
            So(len(ivs), ShouldEqual, 1)
            So(time.Since(t).Seconds(), ShouldBeLessThan, 1) // 42ms on my machine
    $ 
    

    The interval_test.go benchmark input sizes and frequencies were

    size    frequency
    0       1
    1       94929
    2       50072
    3       4998
    

    Output sizes and frequencies were

    size    frequency
    0       50000
    1       100000
    

    Tuning peterSo's Difference method for this distribution gives

    package minfys
    
    func (a Intervals) Difference(b Interval) Intervals {
        // If A and B are sets, then the relative complement of A in B
        // is the set of elements in B but not in A.
        // The relative complement of A in B is denoted B ∖  A:
        //     B \ A = {x ∈ A | x ∉ B}
        //     B \ A = B ∩ A'
        //
        // For example. d = a\b,
        //     a: [{10, 15}, {30, 35}, {20, 25}]
        //     b: {5,32}
        //     d: [{5,9}, {16,19}, {26,29}]
        // The elements of set a are non-overlapping, non-adjacent,
        // and unsorted intervals.
    
        if len(a) <= 0 {
            return Intervals{b}
        }
    
        var d Intervals
        for ; len(a) > 0; a = a[1:] {
            for i := 1; i < len(a); i++ {
                if a[i].Start < a[0].Start {
                    a[i], a[0] = a[0], a[i]
                }
            }
    
            if b.Start < a[0].Start {
                if b.End < a[0].Start {
                    d = append(d, b)
                    break
                }
                d = append(d, Interval{b.Start, a[0].Start - 1})
                b.Start = a[0].Start
            }
            if b.End <= a[0].End {
                break
            }
            if b.Start <= a[0].End {
                b.Start = a[0].End + 1
            }
            if len(a) == 1 {
                if b.Start <= a[0].End {
                    b.Start = a[0].End + 1
                }
                d = append(d, b)
                break
            }
        }
        return d
    }
    

    Running the interval_test.go benchmark for peterSO's and sbs's Difference methods gives

    $ go test -v
    
      Merging many intervals is fast took 26.208614ms
    
      Difference took 10.706858ms
    

    and

    $ go test -v
    
      Merging many intervals is fast took 30.799216ms
    
      Difference took 14.414488ms
    

    peterSo's Difference method is significantly faster than sbs's: 10.706858ms versus 14.414488ms or minus 25.7 percent.

    Updating the earlier example benchmark results for peterSO's revised Difference method gives

    old.txt (sbs) versus new.txt (peterSO):
    
    benchmark              old ns/op     new ns/op     delta
    BenchmarkExample-4     365           221           -39.45%
    
    benchmark              old allocs     new allocs   delta
    BenchmarkExample-4     4              3            -25.00%
    
    benchmark              old bytes     new bytes     delta
    BenchmarkExample-4     128           112           -12.50%
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

悬赏问题

  • ¥15 关于#hadoop#的问题
  • ¥15 (标签-Python|关键词-socket)
  • ¥15 keil里为什么main.c定义的函数在it.c调用不了
  • ¥50 切换TabTip键盘的输入法
  • ¥15 可否在不同线程中调用封装数据库操作的类
  • ¥15 微带串馈天线阵列每个阵元宽度计算
  • ¥15 keil的map文件中Image component sizes各项意思
  • ¥20 求个正点原子stm32f407开发版的贪吃蛇游戏
  • ¥15 划分vlan后,链路不通了?
  • ¥20 求各位懂行的人,注册表能不能看到usb使用得具体信息,干了什么,传输了什么数据