I'm learning go by writing a web spider. I'm trying to get a list of all the business categories from allpages.com
.
Below is my entire program. Unfortunately I can't isolate the issue so I've pasted it all.
If you run this program, you'll see that first of all it correctly downloads the first page, and adds all the extracted categories to the list of categories.
However, when it then downloads subsequent pages, it seems to mess up the reference to the parent category. E.g. it incorrectly calculates the URL http://www.allpages.com/travel-tourism/political-ideological-organizations/
, when in fact political-ideological-organizations/
is not a subcategory of travel-tourism/
. Digging through the logs it seems to overwrite the data in the parent
object. The error is more pronounced the more workers there are.
This was working a bit better before I started passing data by reference to the goroutine, but I had essentially the same issue.
I've got several questions:
- How can I debug this without resorting to picking through log lines?
-
What's wrong/why isn't it working and how can it be fixed?
package main import ( "fmt" "github.com/PuerkitoBio/goquery" "log" "strconv" "strings" "regexp" ) const domain = "http://www.allpages.com/" const categoryPage = "category.html" type Category struct { url string level uint name string entries int parent *Category } type DownloadResult struct { doc *goquery.Document category *Category } const WORKERS = 2 const SEPARATOR = "§§§" func main() { allCategories := make([]Category, 0) downloadChannel := make(chan *Category) resultsChannel := make(chan *DownloadResult, 100) for w := 1; w <= WORKERS; w++ { go worker(downloadChannel, resultsChannel) } numRequests := 1 downloadChannel <- &Category{ domain + categoryPage, 0, "root", 0, nil } for result := range resultsChannel { var extractor func(doc *goquery.Document) []string if result.category.level == 0 { extractor = topLevelExtractor } else if result.category.level == 1 { extractor = secondLevelExtractor } else { extractor = thirdLevelExtractor } categories := extractCategories(result.doc, result.category, extractor) allCategories = append(allCategories, *categories...) //fmt.Printf("Appending categories: %v", *categories) fmt.Printf("total categories = %d, total requests = %d ", len(allCategories), numRequests) for _, category := range *categories { numRequests += 1 downloadChannel <- &category } // close the channels when there are no more jobs if len(allCategories) > numRequests { close(downloadChannel) close(resultsChannel) } } fmt.Println("Done") } func worker(downloadChannel <-chan *Category, results chan<- *DownloadResult) { for target := range downloadChannel { fmt.Printf("Downloading %v (addr %p) ...", target, &target) doc, err := goquery.NewDocument(target.url) if err != nil { log.Fatal(err) panic(err) } fmt.Print("done ") results <- &DownloadResult{doc, target} } } func extractCategories(doc *goquery.Document, parent *Category, extractor func(doc *goquery.Document) []string) *[]Category { numberRegex, _ := regexp.Compile("[0-9,]+") log.Printf("Extracting subcategories for page %s ", parent) subCategories := extractor(doc) categories := make([]Category, 0) for _, subCategory := range subCategories { log.Printf("Got subcategory=%s from parent=%s", subCategory, parent) extracted := strings.Split(subCategory, SEPARATOR) numberWithComma := numberRegex.FindString(extracted[2]) number := strings.Replace(numberWithComma, ",", "", -1) numRecords, err := strconv.Atoi(number) if err != nil { log.Fatal(err) panic(err) } var category Category level := parent.level + 1 if parent.level == 0 { category = Category{ domain + extracted[1], level, extracted[0], numRecords, parent } } else { log.Printf("category URL=%s, parent=%s, parent=%v", extracted[1], parent.url, parent) category = Category{ parent.url + extracted[1], level, extracted[0], numRecords, parent } } log.Printf("Appending category=%v (pointer=%p)", category, &category) categories = append(categories, category) } return &categories } func topLevelExtractor(doc *goquery.Document) []string { return doc.Find(".cat-listings-td .c-1s-2m-1-td1").Map(func(i int, s *goquery.Selection) string { title := s.Find("a").Text() url := s.Find("a").Map(func(x int, a *goquery.Selection) string { v, _ := a.Attr("href") return v }) records := s.Clone().Children().Remove().End().Text() //log.Printf("Item %d: %s, %s - %s ", i, title, records, url) res := []string{title, url[0], records} return strings.Join(res, SEPARATOR) }) } func secondLevelExtractor(doc *goquery.Document) []string { return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string { title := s.Find("a").Text() url := s.Find("a").Map(func(x int, a *goquery.Selection) string { v, _ := a.Attr("href") return v }) records := s.Clone().Children().Remove().End().Text() //log.Printf("Item %d: %s, %s - %s ", i, title, records, url) res := []string{title, url[0], records} return strings.Join(res, SEPARATOR) }) } func thirdLevelExtractor(doc *goquery.Document) []string { return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string { title := s.Find("a").Text() url := s.Find("a").Map(func(x int, a *goquery.Selection) string { v, _ := a.Attr("href") return v }) records := s.Clone().Children().Remove().End().Text() //log.Printf("Item %d: %s, %s - %s ", i, title, records, url) res := []string{title, url[0], records} return strings.Join(res, SEPARATOR) }) }
Update Fixed - see comment below.