dongtui2029 2018-06-11 03:30

Goroutine not running when called via recursion

I'm doing the Web Crawler problem from the tour of go. Here's my solution so far:

func GatherUrls(url string, fetcher Fetcher) []string {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q
", url, body)
    }
    return urls
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q...
", url)
    if depth <= 0 {
        return
    }
    urls := GatherUrls(url, fetcher)
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q
", u)
            go Crawl(u, depth - 1, fetcher)
        } else {
            cache[u] = true
        }
    }
}

func main() {
    cache = make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher)
}

When I run this code, Crawl() is never called when the function recurses (I know this because fmt.Printf("crawling %q...\n", url) only ever prints once).

Here are the logs:

crawling "https://golang.org/"...
found: https://golang.org/ "The Go Programming Language"
urls: [https://golang.org/pkg/ https://golang.org/cmd/]
currentUrl: https://golang.org/pkg/
about to crawl "https://golang.org/pkg/"
currentUrl: https://golang.org/cmd/
about to crawl "https://golang.org/cmd/"

What am I doing wrong? I suspect that spawning a goroutine to do the recursion is the wrong way to do this. Please advise.

Please note that I want to do this with as few libraries as possible. I've seen some answers that use sync.WaitGroup; I don't want to use that.

NOTE: The full code, including the lesson boilerplate, is below:

package main

import (
    "fmt"
)

var cache map[string]bool

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

func GatherUrls(url string, fetcher Fetcher) []string {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q
", url, body)
    }
    return urls
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q...
", url)
    if depth <= 0 {
        return
    }
    urls := GatherUrls(url, fetcher)
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q
", u)
            go Crawl(u, depth - 1, fetcher)
        } else {
            cache[u] = true
        }
    }
}

func main() {
    cache = make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

3 Answers

doushang2571 2018-06-11 08:55

    As you can see in this exercise (https://tour.golang.org/concurrency/10), we need to do the following:

    • Fetch URLs in parallel.
    • Don't fetch the same URL twice.
    • Cache URLs that have already been fetched in a map, but maps alone are not safe for concurrent use!

    So we can take the following steps to accomplish these tasks:

    Create a struct to store the fetch result:

    type Result struct {
        body string
        urls []string
        err  error
    }
    

    Create a struct that records which URLs have already been fetched in a map, guarded by a sync.Mutex so the map is safe for concurrent use:

    type Cache struct {
        store map[string]bool
        mux   sync.Mutex
    }
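
    As a usage note, the same check-and-mark logic could also be wrapped in a small method on Cache so callers never touch the map without holding the mutex. This helper is purely illustrative; it is not part of this answer's code, which locks inline in Crawl below:

    func (c *Cache) Visit(url string) bool {
        c.mux.Lock()
        defer c.mux.Unlock()
        if c.store[url] {
            // already fetched by some goroutine
            return false
        }
        // mark as fetched; the caller should go on to crawl it
        c.store[url] = true
        return true
    }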
    

    Fetch the URL and body in parallel: add each discovered URL to the cache as it is found, but first guard the concurrent reads and writes with the mutex. So we can modify the Crawl function like this:

    func Crawl(url string, depth int, fetcher Fetcher) {
        if depth <= 0 {
            return
        }
    
        ch := make(chan Result)
    
        go func(url string, res chan Result) {
            body, urls, err := fetcher.Fetch(url)
    
            if err != nil {
                ch <- Result{body, urls, err}
                return
            }
    
            var furls []string
            cache.mux.Lock()
            for _, u := range urls {
                if _, exists := cache.store[u]; !exists {
                    furls = append(furls, u)
                }
                cache.store[u] = true
            }
            cache.mux.Unlock()
    
            ch <- Result{body: body, urls: furls, err: err}
    
        }(url, ch)
    
        res := <-ch
    
        if res.err != nil {
            fmt.Println(res.err)
            return
        }
    
        fmt.Printf("found: %s %q
    ", url, res.body)
    
        for _, u := range res.urls {
            Crawl(u, depth-1, fetcher)
        }
    }
    

    You can view the full code and run this in the playground: https://play.golang.org/p/iY9uBXchx3w
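
    For completeness, here is a minimal sketch of how the shared cache and main could be wired up. The exact declarations live in the playground link above; the initialization shown here is an assumption:

    var cache = Cache{store: make(map[string]bool)}

    func main() {
        // Optionally mark the root URL as fetched up front so pages that
        // link back to it do not schedule it again.
        cache.mux.Lock()
        cache.store["https://golang.org/"] = true
        cache.mux.Unlock()

        Crawl("https://golang.org/", 4, fetcher)
    }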

    Hope this helps.

    This answer was accepted by the asker.
