How can I extract the positional offset of a specific node in an already parsed HTML document? For example, for the document <div>Hello, <b>World!</b></div>
I want to be able to determine that the offset of World! is 15:21. The document may be changed while parsing.
I have a solution that renders the whole document with special markers, but it's really bad for performance. Any ideas?
package main
import (
"bytes"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"log"
"strings"
)
// nodeIndexOffset reports where the text of node begins and ends inside the
// HTML produced by rendering the children of context one after another.
//
// If node is not a text node, the first text node reached by following
// FirstChild links from it is used instead. The text is temporarily wrapped
// in marker strings, the children of context are rendered until the markers
// show up, and the marker positions are translated back into offsets of the
// unmarked output. node.Data is always restored before returning.
//
// The result is (start, end) in bytes of the rendered output, end exclusive.
// (-1, -1) is returned when context or node is nil, when no text node is
// found, when rendering fails before the text has been located, or when the
// text node is not rendered as part of context's children.
//
// Limitation: text that already contains "|start|" or "|end|" can yield
// wrong offsets, because the markers are located by plain substring search.
func nodeIndexOffset(context *html.Node, node *html.Node) (int, int) {
	const (
		startMark = "|start|"
		endMark   = "|end|"
	)

	if context == nil {
		return -1, -1
	}

	// Only text nodes carry their content in Data; writing the markers into
	// an element's Data would corrupt its tag name instead.
	for node != nil && node.Type != html.TextNode {
		node = node.FirstChild
	}
	if node == nil {
		return -1, -1
	}

	originalData := node.Data
	node.Data = startMark + originalData + endMark
	// Deferred so the tree is left untouched on every exit path, including a
	// panic inside the renderer.
	defer func() { node.Data = originalData }()

	var buf bytes.Buffer
	base := 0 // bytes rendered by the siblings that precede the current child
	for child := context.FirstChild; child != nil; child = child.NextSibling {
		buf.Reset()
		if err := html.Render(&buf, child); err != nil {
			// Offsets into a partially rendered document would be misleading.
			return -1, -1
		}
		out := buf.Bytes()

		rel := bytes.Index(out, []byte(startMark))
		if rel < 0 {
			base += len(out)
			continue
		}

		// The end marker is searched only after the start marker; its distance
		// from the start marker is the rendered length of the text itself.
		length := bytes.Index(out[rel+len(startMark):], []byte(endMark))
		if length < 0 {
			return -1, -1
		}
		start := base + rel
		return start, start + length
	}
	return -1, -1
}
// main parses a fixed HTML fragment in the context of a synthetic <body>
// element, attaches the parsed nodes to that element, and logs the "World!"
// text node together with the offsets nodeIndexOffset reports for it.
func main() {
	const fragment = "<div>Hello, <b>World!</b></div>"

	// The fragment is parsed as if it were the inner HTML of this element.
	root := &html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}

	parsed, err := html.ParseFragment(strings.NewReader(fragment), root)
	if err != nil {
		log.Fatal(err)
	}

	// ParseFragment returns detached nodes; hang them under root so they can
	// be reached through root.FirstChild.
	for i := range parsed {
		root.AppendChild(parsed[i])
	}

	// Walk <div> -> "Hello, " -> <b> -> "World!".
	div := parsed[0]
	bold := div.FirstChild.NextSibling
	target := bold.FirstChild

	log.Println("target", target)
	log.Println(nodeIndexOffset(root, target))
}