Assume I have an HTML page that contains something like:
<ul class ="good">
<li>1</li>
<li>2</li>
<li>3</li>
</ul>
<ul class ="bad">
<li>a</li>
<li>b</li>
<li>c</li>
</ul>
I want to grab the <li> elements inside the first <ul>. I have basically copied the code below from here (note: code edited per @twotwotwo's comment):
// Parse the response body into a node tree. The second return value of
// html.Parse is discarded here.
page, _ := html.Parse(httpBody)

// f reports every node it visits, then descends into that node's children.
// A <ul> element gets the "ul found" message; every other node is printed
// together with its Data field.
var f func(*html.Node)
f = func(n *html.Node) {
	isList := n.Type == html.ElementNode && n.Data == "ul"
	if !isList {
		fmt.Println(n.Data, "is not the correct one")
	} else {
		fmt.Println("ul found -> ", n)
	}

	// Both cases continue the walk through the children in document order.
	child := n.FirstChild
	for child != nil {
		f(child)
		child = child.NextSibling
	}
}
f(page)
But the only output I obtain is:
is not the correct one
html is not the correct one
head is not the correct one
body is not the correct one
I wonder why the recursion stops at body. I have tried it with motherfuckingwebsite.com, which has tags inside the body.
P.S. I have also tried:
// Alternative attempt: build a tokenizer over the same body instead of a tree.
page := html.NewTokenizer(httpBody)
for {
// Advance the tokenizer and record the type of the token it produced.
tokenType := page.Next()
// An error token ends the loop; links is declared outside this snippet.
if tokenType == html.ErrorToken {
return links
}
// Current token from the tokenizer; the rest of the loop body is not shown.
token := page.Token()
but this seems to show all the tokens, without regard for the tree structure.
EDIT: