duanning9110 2019-07-04 07:30
浏览 50
已采纳

进行巡回练习:Web爬网程序-所有goroutine都处于睡眠状态-死锁

Exercise from: https://tour.golang.org/concurrency/10

Description:

In this exercise you'll use Go's concurrency features to parallelize a web crawler.

Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

Hint: you can keep a cache of the URLs that have been fetched on a map, but maps alone are not safe for concurrent use!

Here's my answer:

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

var crawledURLs = make(map[string]bool)
var mux sync.Mutex

func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
    defer func() { quit <- true }()
    if depth <= 0 {
        return
    }

    mux.Lock()
    _, isCrawled := crawledURLs[url]
    if isCrawled {
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q
", url, body)
    quitThis := make(chan bool)
    for _, u := range urls {
        go CrawlURL(u, depth-1, fetcher, quitThis)
    }
    for range urls {
        <-quitThis
    }
    return
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    CrawlURL(url, depth, fetcher, make(chan bool))
    return
}

func main() {
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

And the output:

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
fatal error: all goroutines are asleep - deadlock!

I was wondering why will deadlock happen? Is it because I use channels in the wrong way?


Noting that I forgot to release the mutex in the if isCrawled {} branch, so I've edited my code like this:

...
    if isCrawled {
        mux.Unlock() // added this line
        return
    }
...

But the deadlock still exists, and the output is different:

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"
fatal error: all goroutines are asleep - deadlock!
  • 写回答

1条回答 默认 最新

  • dtxf759200 2019-07-04 07:58
    关注

    The main issue is that you forgot to release the mutex before returning in the if isCrawled {} branch.

    Moreover, I would suggest to use synchronization APIs if you actually need to synchronize goroutines. Channels are better used for communicating and sharing data.

    This is the solution with sync.WaitGroup: https://play.golang.org/p/slrnmr3sPrs

    Here is instead you solution with only channels: https://play.golang.org/p/FbPXxPSXvFL

    The problem was that the very first time you call CrawlURL() you are not reading from the channel you pass as argument. Therefore, once that function tries to send something into it through defer func() { quit <- true }(), it block forever and never returns.

    package main
    
    import (
        "fmt"
        "sync"
    )
    
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    var crawledURLs = make(map[string]bool)
    var mux sync.Mutex
    
    func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
        //For very first function instance, this would block forever if 
        //nobody is receiving from the other end of this channel
        defer func() { quit <- true }()
    
        if depth <= 0 {
            return
        }
    
        mux.Lock()
        _, isCrawled := crawledURLs[url]
        if isCrawled {
            mux.Unlock()
            return
        }
        crawledURLs[url] = true
        mux.Unlock()
    
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q
    ", url, body)
        quitThis := make(chan bool)
        for _, u := range urls {
            go CrawlURL(u, depth-1, fetcher, quitThis)
        }
        for range urls {
            <-quitThis
        }
        return
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        lastQuit := make(chan bool)
        go CrawlURL(url, depth, fetcher, lastQuit)
        //You need to receive from this channel in order to
        //unblock the called function
        <-lastQuit
        return
    }
    
    func main() {
        Crawl("https://golang.org/", 10, fetcher)
    }
    
    // fakeFetcher is Fetcher that returns canned results.
    type fakeFetcher map[string]*fakeResult
    
    type fakeResult struct {
        body string
        urls []string
    }
    
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        if res, ok := f[url]; ok {
            return res.body, res.urls, nil
        }
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    
    // fetcher is a populated fakeFetcher.
    var fetcher = fakeFetcher{
        "https://golang.org/": &fakeResult{
            "The Go Programming Language",
            []string{
                "https://golang.org/pkg/",
                "https://golang.org/cmd/",
            },
        },
        "https://golang.org/pkg/": &fakeResult{
            "Packages",
            []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": &fakeResult{
            "Package fmt",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
        "https://golang.org/pkg/os/": &fakeResult{
            "Package os",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
    }
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

悬赏问题

  • ¥15 综合布线实例设计,就好看好看不恐怖可好滤镜好聚
  • ¥15 使用moviepy库视频合并时出错
  • ¥30 FLUENT液固传质UDF
  • ¥15 怎么看梯度直方图以,怎么判断梯度消失/爆炸,怎么解决
  • ¥15 aspnetdll文件访问拒绝
  • ¥15 wpf中在模版中寻找元素
  • ¥15 MFC平台生成指定圆
  • ¥15 jmeter出现403
  • ¥500 求华为P30PRO手机硬盘数据恢复
  • ¥15 关于#vscode#的问题:ESP32开发板对接MQTT实现小灯泡的开关