I am trying to use gocolly's Parallelism setting to limit scraping to a maximum number of URLs at a time.
Using the code I've pasted below, I am getting this output:
Visiting https://www.google.com/search?q=GrkZmM
Visiting https://www.google.com/search?q=eYSGmF
Visiting https://www.google.com/search?q=MtYvWU
Visiting https://www.google.com/search?q=yMDfIa
Visiting https://www.google.com/search?q=sQuKLv
Done visiting https://www.google.com/search?q=MtYvWU
Done visiting https://www.google.com/search?q=GrkZmM
Done visiting https://www.google.com/search?q=eYSGmF
Done visiting https://www.google.com/search?q=yMDfIa
Done visiting https://www.google.com/search?q=sQuKLv
This shows that the visits are not being limited to the maximum number of parallel requests given. When more URLs are added, they are all sent at once, resulting in a ban from the server.
How can I configure the library to get the following output:
Visiting https://www.google.com/search?q=GrkZmM
Visiting https://www.google.com/search?q=eYSGmF
Done visiting https://www.google.com/search?q=eYSGmF
Done visiting https://www.google.com/search?q=GrkZmM
Visiting https://www.google.com/search?q=MtYvWU
Visiting https://www.google.com/search?q=yMDfIa
Done visiting https://www.google.com/search?q=MtYvWU
Done visiting https://www.google.com/search?q=yMDfIa
Visiting https://www.google.com/search?q=sQuKLv
Done visiting https://www.google.com/search?q=sQuKLv
Here is the code:
// Constants shared by the random query generator and the crawl loop.
const (
// letterBytes is the alphabet that random query strings are drawn from.
letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
// URL is the search URL prefix; a random query string is appended to it.
URL = "https://www.google.com/search?q="
)
// RandStringBytes starts a goroutine that generates five random strings of n
// bytes each, drawn from letterBytes, and sends them on the returned channel.
// The channel is closed after the last string has been sent, so callers can
// simply range over it.
func RandStringBytes(n int) chan string {
	const count = 5 // number of strings produced per call

	out := make(chan string)
	go func() {
		// Closing out is the only completion signal the consumer needs.
		// Sending on a separate unbuffered channel that nobody receives
		// from would block this goroutine forever and leak it.
		defer close(out)
		for i := 0; i < count; i++ {
			b := make([]byte, n)
			for j := range b {
				b[j] = letterBytes[rand.Intn(len(letterBytes))]
			}
			out <- string(b)
		}
	}()
	return out
}
// main queues one search request per random query string on an asynchronous
// collector restricted to www.google.com, with a limit rule asking for at
// most two parallel requests and a random delay of up to five seconds, then
// waits for all requests to complete.
func main() {
	c := RandStringBytes(6)

	collector := colly.NewCollector(
		colly.AllowedDomains("www.google.com"),
		colly.Async(true),
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"),
	)

	// Do not start crawling if the limit rule was rejected: the whole point
	// of this program is to send throttled requests.
	limitErr := collector.Limit(&colly.LimitRule{
		DomainRegexp: "www.google.com",
		Parallelism:  2,
		RandomDelay:  5 * time.Second,
	})
	if limitErr != nil {
		fmt.Println(limitErr)
		return
	}

	collector.OnResponse(func(r *colly.Response) {
		// The URL was stored in the request context by the OnRequest callback.
		visited := r.Ctx.Get("url")
		fmt.Println("Done visiting", visited)
	})

	// NOTE(review): this callback seems to run before the limit rule holds a
	// request back, so the "Visiting" lines may all be printed up front even
	// when the actual HTTP requests are throttled — confirm against colly's
	// documentation before relying on this log line as proof of parallelism.
	collector.OnRequest(func(r *colly.Request) {
		target := r.URL.String()
		r.Ctx.Put("url", target)
		fmt.Println("Visiting", target)
	})

	collector.OnError(func(r *colly.Response, err error) {
		fmt.Println(err)
	})

	for w := range c {
		// Visit returns an error of its own; report it instead of dropping it.
		if err := collector.Visit(URL + w); err != nil {
			fmt.Println(err)
		}
	}

	// Async mode: block until every queued request has finished.
	collector.Wait()
}
Visiting https://www.google.com/search?q=GrkZmM
Visiting https://www.google.com/search?q=eYSGmF
Visiting https://www.google.com/search?q=MtYvWU
Visiting https://www.google.com/search?q=yMDfIa
Visiting https://www.google.com/search?q=sQuKLv
Done visiting https://www.google.com/search?q=MtYvWU
Done visiting https://www.google.com/search?q=GrkZmM
Done visiting https://www.google.com/search?q=eYSGmF
Done visiting https://www.google.com/search?q=yMDfIa
Done visiting https://www.google.com/search?q=sQuKLv