douzhuochao4027 2017-02-08 12:28
浏览 25
已采纳

go中的对象被替换

I'm learning go by writing a web spider. I'm trying to get a list of all the business categories from allpages.com.

Below is my entire program. Unfortunately I can't isolate the issue so I've pasted it all.

If you run this program, you'll see that first of all it correctly downloads the first page, and adds all the extracted categories to the list of categories.

However, when it then downloads subsequent pages, it seems to mess up the reference to the parent category. E.g. it incorrectly calculates the URL http://www.allpages.com/travel-tourism/political-ideological-organizations/, when in fact political-ideological-organizations/ is not a subcategory of travel-tourism/. Digging through the logs it seems to overwrite the data in the parent object. The error is more pronounced the more workers there are.

This was working a bit better before I started passing data by reference to the goroutine, but I had essentially the same issue.

I've got several questions:

  1. How can I debug this without resorting to picking through log lines?
  2. What's wrong/why isn't it working and how can it be fixed?

    package main
    
    import (
            "fmt"
            "github.com/PuerkitoBio/goquery"
            "log"
            "strconv"
            "strings"
            "regexp"
    )
    
    // domain is the site root. main prepends it to categoryPage to form the
    // first URL, and extractCategories prepends it to the links found on the
    // root page.
    const domain = "http://www.allpages.com/"
    // categoryPage is the path, relative to domain, of the first page downloaded.
    const categoryPage = "category.html"
    
    // Category describes one category page and its place in the category tree.
    //
    // Values are built with positional composite literals elsewhere in the
    // file, so the field order must not change.
    type Category struct {
            // url is the address of the category's page; worker passes it to
            // goquery.NewDocument.
            url string
            // level is the depth in the tree: 0 for the root, parent level + 1
            // for every extracted category.
            level uint
            // name is the link text extracted for the category.
            name string
            // entries is the number parsed from the listing's own text
            // (presumably the record count shown next to the link).
            entries int
            // parent is the category on whose page this one was found; nil for
            // the root.
            parent *Category
    }
    
    // DownloadResult pairs a downloaded page with the category it was fetched
    // for. Workers send one per completed download.
    type DownloadResult struct {
            // doc is the parsed document returned by goquery.NewDocument.
            doc *goquery.Document
            // category is the download target the worker received.
            category *Category
    }
    
    // WORKERS is the number of worker goroutines started by main.
    const WORKERS = 2
    // SEPARATOR delimits the title, link and record text that the extractor
    // functions join into one string and extractCategories splits again.
    const SEPARATOR = "§§§"
    
    func main() {
    
            allCategories := make([]Category, 0)
    
            downloadChannel := make(chan *Category)
            resultsChannel := make(chan *DownloadResult, 100)
    
            for w := 1; w <= WORKERS; w++ {
                    go worker(downloadChannel, resultsChannel)
            }
    
            numRequests := 1
            downloadChannel <- &Category{ domain + categoryPage, 0, "root", 0, nil }
    
            for result := range resultsChannel {
                    var extractor func(doc *goquery.Document) []string
    
                    if result.category.level == 0 {
                            extractor = topLevelExtractor
                    } else if result.category.level == 1 {
                            extractor = secondLevelExtractor
                    } else {
                            extractor = thirdLevelExtractor
                    }
    
                    categories := extractCategories(result.doc, result.category, extractor)
                    allCategories = append(allCategories, *categories...)
    
                    //fmt.Printf("Appending categories: %v", *categories)
    
                    fmt.Printf("total categories = %d, total requests = %d
    ", len(allCategories), numRequests)
    
                    for _, category := range *categories {
                            numRequests += 1
                            downloadChannel <- &category
                    }
    
                    // close the channels when there are no more jobs
                    if len(allCategories) > numRequests {
                            close(downloadChannel)
                            close(resultsChannel)
                    }
            }
    
            fmt.Println("Done")
    }
    
    func worker(downloadChannel <-chan *Category, results chan<- *DownloadResult) {
            for target := range downloadChannel {
                    fmt.Printf("Downloading %v (addr %p) ...", target, &target)
    
                    doc, err := goquery.NewDocument(target.url)
                    if err != nil {
                            log.Fatal(err)
                            panic(err)
                    }
    
                    fmt.Print("done 
    ")
    
                    results <- &DownloadResult{doc, target}
            }
    }
    
    func extractCategories(doc *goquery.Document, parent *Category, extractor func(doc *goquery.Document) []string) *[]Category {
    
            numberRegex, _ := regexp.Compile("[0-9,]+")
    
            log.Printf("Extracting subcategories for page %s
    ", parent)
    
            subCategories := extractor(doc)
    
            categories := make([]Category, 0)
    
            for _, subCategory := range subCategories {
                    log.Printf("Got subcategory=%s from parent=%s", subCategory, parent)
                    extracted := strings.Split(subCategory, SEPARATOR)
    
                    numberWithComma := numberRegex.FindString(extracted[2])
                    number := strings.Replace(numberWithComma, ",", "", -1)
    
                    numRecords, err := strconv.Atoi(number)
                    if err != nil {
                            log.Fatal(err)
                            panic(err)
                    }
    
                    var category Category
    
                    level := parent.level + 1
    
                    if parent.level == 0 {
                            category = Category{ domain + extracted[1], level, extracted[0], numRecords, parent }
                    } else {
                            log.Printf("category URL=%s, parent=%s, parent=%v", extracted[1], parent.url, parent)
                            category = Category{ parent.url + extracted[1], level, extracted[0], numRecords, parent }
                    }
    
                    log.Printf("Appending category=%v (pointer=%p)", category, &category)
    
                    categories = append(categories, category)
            }
    
            return &categories
    }
    
    func topLevelExtractor(doc *goquery.Document) []string {
            return doc.Find(".cat-listings-td .c-1s-2m-1-td1").Map(func(i int, s *goquery.Selection) string {
                    title := s.Find("a").Text()
                    url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                            v, _ := a.Attr("href")
                            return v
                    })
                    records := s.Clone().Children().Remove().End().Text()
    
                    //log.Printf("Item %d: %s, %s - %s
    ", i, title, records, url)
    
                    res := []string{title, url[0], records}
                    return strings.Join(res, SEPARATOR)
            })
    }
    
    func secondLevelExtractor(doc *goquery.Document) []string {
            return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string {
                    title := s.Find("a").Text()
                    url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                            v, _ := a.Attr("href")
                            return v
                    })
                    records := s.Clone().Children().Remove().End().Text()
    
                    //log.Printf("Item %d: %s, %s - %s
    ", i, title, records, url)
    
                    res := []string{title, url[0], records}
                    return strings.Join(res, SEPARATOR)
            })
    }
    
    func thirdLevelExtractor(doc *goquery.Document) []string {
            return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string {
                    title := s.Find("a").Text()
                    url := s.Find("a").Map(func(x int, a *goquery.Selection) string {
                            v, _ := a.Attr("href")
                            return v
                    })
                    records := s.Clone().Children().Remove().End().Text()
    
                    //log.Printf("Item %d: %s, %s - %s
    ", i, title, records, url)
    
                    res := []string{title, url[0], records}
                    return strings.Join(res, SEPARATOR)
            })
    }
    

Update: Fixed - see the answer below.

  • 写回答

1条回答 默认 最新

  • douyasihefu6214 2017-02-08 12:49
    关注

    Looping over:

                // Buggy version, kept for reference: before Go 1.22 the range loop
                // reuses one `category` variable for every iteration, so &category
                // is the same pointer each time it is sent.
                for _, category := range *categories {
                        numRequests += 1
                        downloadChannel <- &category
                }
    

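    meant I was sending the address of the loop variable `category` to the channel on every iteration. The range loop copies each element into that one variable in turn, so every worker received a pointer to the same storage, whose contents kept changing, instead of a pointer to the distinct slice element.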

    I've fixed this by using a different loop:

        for i := 0; i < len(*categories); i++ {
            fmt.Printf("Queuing category: %v (%p)", categoriesValues[i], categoriesValues[i])
    
            downloadChannel <- &categoriesValues[i]
        }
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

悬赏问题

  • ¥15 r语言神经网络自变量重要性分析
  • ¥15 基于双目测规则物体尺寸
  • ¥15 wegame打不开英雄联盟
  • ¥15 公司的电脑,win10系统自带远程协助,访问家里个人电脑,提示出现内部错误,各种常规的设置都已经尝试,感觉公司对此功能进行了限制(我们是集团公司)
  • ¥15 救!ENVI5.6深度学习初始化模型报错怎么办?
  • ¥30 eclipse开启服务后,网页无法打开
  • ¥30 雷达辐射源信号参考模型
  • ¥15 html+css+js如何实现这样子的效果?
  • ¥15 STM32单片机自主设计
  • ¥15 如何在node.js中或者java中给wav格式的音频编码成sil格式呢