duanning9110
2019-07-04 07:30
浏览 49
已采纳

进行巡回练习:Web爬网程序-所有goroutine都处于睡眠状态-死锁

Exercise from: https://tour.golang.org/concurrency/10

Description:

In this exercise you'll use Go's concurrency features to parallelize a web crawler.

Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

Hint: you can keep a cache of the URLs that have been fetched on a map, but maps alone are not safe for concurrent use!

Here's my answer:

package main

import (
    "fmt"
    "sync"
)

// Fetcher abstracts page retrieval so the crawler can be exercised
// against the canned fakeFetcher defined further down.
type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// crawledURLs records every URL already claimed for fetching; mux guards
// it because Go maps are not safe for concurrent use (see the exercise hint).
var crawledURLs = make(map[string]bool)
var mux sync.Mutex

func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
    defer func() { quit <- true }()
    if depth <= 0 {
        return
    }

    mux.Lock()
    _, isCrawled := crawledURLs[url]
    if isCrawled {
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q
", url, body)
    quitThis := make(chan bool)
    for _, u := range urls {
        go CrawlURL(u, depth-1, fetcher, quitThis)
    }
    for range urls {
        <-quitThis
    }
    return
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
//
// BUG(review): nothing ever receives from the channel created here, so
// the deferred `quit <- true` inside CrawlURL blocks forever once the
// crawl finishes — this is the deadlock the question asks about.
func Crawl(url string, depth int, fetcher Fetcher) {
    CrawlURL(url, depth, fetcher, make(chan bool))
    return
}

// main runs the crawler over the canned site graph defined below.
func main() {
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is a Fetcher backed by an in-memory map of canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned page body and outgoing links for one URL.
type fakeResult struct {
    body string
    urls []string
}

// Fetch looks url up in the map, reporting a "not found" error for any
// URL without a canned entry.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    res, ok := f[url]
    if !ok {
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    return res.body, res.urls, nil
}

// fetcher is a populated fakeFetcher describing a small canned site
// graph rooted at https://golang.org/. The &fakeResult is elided, as Go
// permits inside map literals with a pointer element type.
var fetcher = fakeFetcher{
    "https://golang.org/": {
        body: "The Go Programming Language",
        urls: []string{"https://golang.org/pkg/", "https://golang.org/cmd/"},
    },
    "https://golang.org/pkg/": {
        body: "Packages",
        urls: []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": {
        body: "Package fmt",
        urls: []string{"https://golang.org/", "https://golang.org/pkg/"},
    },
    "https://golang.org/pkg/os/": {
        body: "Package os",
        urls: []string{"https://golang.org/", "https://golang.org/pkg/"},
    },
}

And the output:

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
fatal error: all goroutines are asleep - deadlock!

I was wondering why the deadlock happens. Is it because I am using channels in the wrong way?


I noticed that I forgot to release the mutex in the if isCrawled {} branch, so I've edited my code like this:

...
    if isCrawled {
        mux.Unlock() // added this line
        return
    }
...

But the deadlock still exists, and the output is different:

found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"
fatal error: all goroutines are asleep - deadlock!
  • 写回答
  • 关注问题
  • 收藏
  • 邀请回答

1条回答 默认 最新

  • dtxf759200 2019-07-04 07:58
    已采纳

    The main issue is that you forgot to release the mutex before returning in the if isCrawled {} branch.

    Moreover, I would suggest to use synchronization APIs if you actually need to synchronize goroutines. Channels are better used for communicating and sharing data.

    This is the solution with sync.WaitGroup: https://play.golang.org/p/slrnmr3sPrs

    And here is your solution using only channels: https://play.golang.org/p/FbPXxPSXvFL

    The problem was that the very first time you call CrawlURL() you are not reading from the channel you pass as an argument. Therefore, once that function tries to send something into it through defer func() { quit <- true }(), it blocks forever and never returns.

    package main
    
    import (
        "fmt"
        "sync"
    )
    
    // Fetcher abstracts page retrieval so the crawler can run against
    // the canned fakeFetcher defined below.
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    // crawledURLs records every URL already claimed for fetching; mux
    // guards it because Go maps are not safe for concurrent use.
    var crawledURLs = make(map[string]bool)
    var mux sync.Mutex
    
    func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
        //For very first function instance, this would block forever if 
        //nobody is receiving from the other end of this channel
        defer func() { quit <- true }()
    
        if depth <= 0 {
            return
        }
    
        mux.Lock()
        _, isCrawled := crawledURLs[url]
        if isCrawled {
            mux.Unlock()
            return
        }
        crawledURLs[url] = true
        mux.Unlock()
    
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q
    ", url, body)
        quitThis := make(chan bool)
        for _, u := range urls {
            go CrawlURL(u, depth-1, fetcher, quitThis)
        }
        for range urls {
            <-quitThis
        }
        return
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        done := make(chan bool)
        go CrawlURL(url, depth, fetcher, done)
        // Receive the completion signal; without this, the deferred send
        // inside the top-level CrawlURL would block forever.
        <-done
    }
    
    // main crawls the canned site graph below to a maximum depth of 10.
    func main() {
        Crawl("https://golang.org/", 10, fetcher)
    }
    
    // fakeFetcher is a Fetcher backed by an in-memory map of canned results.
    type fakeFetcher map[string]*fakeResult

    // fakeResult holds the canned page body and outgoing links for one URL.
    type fakeResult struct {
        body string
        urls []string
    }

    // Fetch looks url up in the map, reporting a "not found" error for
    // any URL without a canned entry.
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        res, ok := f[url]
        if !ok {
            return "", nil, fmt.Errorf("not found: %s", url)
        }
        return res.body, res.urls, nil
    }
    
    // fetcher is a populated fakeFetcher describing a small canned site
    // graph rooted at https://golang.org/. The &fakeResult is elided, as
    // Go permits inside map literals with a pointer element type.
    var fetcher = fakeFetcher{
        "https://golang.org/": {
            body: "The Go Programming Language",
            urls: []string{"https://golang.org/pkg/", "https://golang.org/cmd/"},
        },
        "https://golang.org/pkg/": {
            body: "Packages",
            urls: []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": {
            body: "Package fmt",
            urls: []string{"https://golang.org/", "https://golang.org/pkg/"},
        },
        "https://golang.org/pkg/os/": {
            body: "Package os",
            urls: []string{"https://golang.org/", "https://golang.org/pkg/"},
        },
    }
    
    已采纳该答案
    打赏 评论

相关推荐 更多相似问题