drbi19093 2016-01-15 13:34
浏览 298
已采纳

在Golang中提取* html.Node的位置偏移

How can I extract the positional offset of a specific node in an already parsed HTML document? For example, for the document <div>Hello, <b>World!</b></div> I want to be able to determine that the offset of World! is 15:21. The document may be changed while parsing.

I have a solution that renders the whole document with special markers, but it is really bad for performance. Any ideas?

package main

import (
    "bytes"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
    "log"
    "strings"
)

// nodeIndexOffset returns the byte offsets at which node's text starts and
// ends inside the HTML rendered from context.FirstChild.
//
// If node is not a text node, its first child is measured instead. The
// function temporarily wraps that node's Data in the sentinel strings
// "|start|" and "|end|", renders the tree once, and locates the sentinels in
// the output. The offsets therefore refer to the re-serialized markup, not to
// the bytes that were originally parsed. node.Data is always restored before
// returning.
//
// It returns -1, -1 when there is nothing to measure (a nil argument, a
// context without children, a non-text node without children), when rendering
// fails, or when the node is not part of the rendered subtree.
//
// Limitation: if the rendered output already contains the literal text
// "|start|" before the node, the reported offsets are wrong.
func nodeIndexOffset(context *html.Node, node *html.Node) (int, int) {
    const (
        startMark = "|start|"
        endMark   = "|end|"
    )

    if context == nil || context.FirstChild == nil || node == nil {
        return -1, -1
    }
    if node.Type != html.TextNode {
        node = node.FirstChild
        if node == nil {
            // e.g. an element with no children: there is no text to locate.
            return -1, -1
        }
    }

    originalData := node.Data
    // Restore the node even if rendering panics or we return early.
    defer func() { node.Data = originalData }()

    // One render with both sentinels is enough: the end offset in the
    // unmarked output equals the end sentinel's position minus the length of
    // the start sentinel that precedes it.
    node.Data = startMark + originalData + endMark

    var buf bytes.Buffer
    if err := html.Render(&buf, context.FirstChild); err != nil {
        // Output may be truncated; offsets taken from it would be unreliable.
        return -1, -1
    }
    out := buf.String()

    start := strings.Index(out, startMark)
    if start < 0 {
        return -1, -1
    }
    // Search for the end sentinel only after the start sentinel so that an
    // earlier literal "|end|" elsewhere in the document cannot be picked up.
    rel := strings.Index(out[start+len(startMark):], endMark)
    if rel < 0 {
        return -1, -1
    }
    return start, start + rel
}

// main parses a fixed fragment inside a synthetic <body> context, attaches the
// parsed nodes to that context, and logs the text node inside <b> followed by
// the offsets nodeIndexOffset reports for it.
func main() {
    const fragment = "<div>Hello, <b>World!</b></div>"

    body := html.Node{
        Type:     html.ElementNode,
        Data:     "body",
        DataAtom: atom.Body,
    }

    parsed, err := html.ParseFragment(strings.NewReader(fragment), &body)
    if err != nil {
        log.Fatal(err)
    }
    // nodeIndexOffset renders body.FirstChild, so the parsed nodes have to be
    // attached to the context node first.
    for i := range parsed {
        body.AppendChild(parsed[i])
    }

    // <div> -> first child "Hello, " -> next sibling <b> -> its text child.
    bold := parsed[0].FirstChild.NextSibling
    world := bold.FirstChild

    log.Println("target", world)
    log.Println(nodeIndexOffset(&body, world))
}
  • 写回答

2条回答 默认 最新

  • dpz90118 2016-01-23 06:50
    关注

    I came up with a solution where we extend the original HTML package (please correct me if there's another way to do it) with an additional custom.go file containing a new exported function. This function is able to access the unexported data property of the Tokenizer, which holds exactly the start and end position of the current Node. We have to adjust the positions after each buffer read; see globalBufDif.

    I don't really like that I have to fork the package just to access a couple of properties, but it seems like this is the Go way.

    // parseWithIndexes drives the parser until the tokenizer reports io.EOF and
    // returns a map from nodes to a {start, end} offset pair.
    //
    // Before each token is read, the last child of the node returned by p.top()
    // is associated with the offsets captured for the previously read token.
    // Offsets are the tokenizer's unexported data.start / data.end values plus
    // globalBufDif. When p.top() has no children, nothing is recorded for that
    // step, so the map never contains a nil key.
    //
    // A tokenizer error other than io.EOF aborts parsing; the partially filled
    // map is returned together with that error.
    func parseWithIndexes(p *parser) (map[*Node][2]int, error) {
        var err error
        // Accumulated shift added to the tokenizer-relative offsets.
        // NOTE(review): presumably compensates for the tokenizer refilling its
        // buffer; the adjustment is a heuristic and should be verified.
        var globalBufDif int
        // Value of p.tokenizer.data.end observed on the previous iteration.
        var prevEndBuf int
        // Offsets of the most recently read token.
        var tokenIndex [2]int
        tokenMap := make(map[*Node][2]int)

        // Iterate until EOF. Any other error will cause an early return.
        for err != io.EOF {
            // CDATA sections are allowed only in foreign content.
            n := p.oe.top()
            p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")

            // LastChild is the same node the sibling chain from FirstChild ends
            // at, without walking every sibling on every token.
            if last := p.top().LastChild; last != nil {
                tokenMap[last] = tokenIndex
            }

            // data.end moving backwards is taken as the signal that the buffer
            // positions were reset; fold the previous end into the running shift.
            if prevEndBuf > p.tokenizer.data.end {
                globalBufDif += prevEndBuf
            }
            prevEndBuf = p.tokenizer.data.end

            // Read and parse the next token.
            p.tokenizer.Next()
            tokenIndex = [2]int{
                p.tokenizer.data.start + globalBufDif,
                p.tokenizer.data.end + globalBufDif,
            }

            p.tok = p.tokenizer.Token()
            if p.tok.Type == ErrorToken {
                err = p.tokenizer.Err()
                if err != nil && err != io.EOF {
                    return tokenMap, err
                }
            }
            p.parseCurrentToken()
        }
        return tokenMap, nil
    }
    
    // ParseFragmentWithIndexes parses an HTML fragment read from r and returns
    // the nodes that were found together with the node-to-offset map produced by
    // parseWithIndexes. If the fragment is the InnerHTML for an existing element,
    // pass that element in context; context may be nil.
    //
    // An error is returned when context is not an element node, when its
    // DataAtom and Data disagree, or when parsing fails.
    func ParseFragmentWithIndexes(r io.Reader, context *Node) ([]*Node, map[*Node][2]int, error) {
        var tagName string
        if context != nil {
            if context.Type != ElementNode {
                return nil, nil, errors.New("html: ParseFragment of non-element Node")
            }
            // Compare against the atom looked up from Data instead of comparing
            // DataAtom.String() with Data: an element whose tag is not a known
            // atom is valid, e.g. DataAtom == 0 with Data = "tagfromthefuture".
            if a.Lookup([]byte(context.Data)) != context.DataAtom {
                return nil, nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
            }
            tagName = context.DataAtom.String()
        }

        p := &parser{
            tokenizer: NewTokenizerFragment(r, tagName),
            doc:       &Node{Type: DocumentNode},
            scripting: true,
            fragment:  true,
            context:   context,
        }

        // A synthetic <html> element is the sole child of the document and the
        // initial content of the open-elements stack.
        htmlRoot := &Node{
            Type:     ElementNode,
            DataAtom: a.Html,
            Data:     a.Html.String(),
        }
        p.doc.AppendChild(htmlRoot)
        p.oe = nodeStack{htmlRoot}
        p.resetInsertionMode()

        // Walk from context up through its ancestors; the nearest <form> element
        // becomes the parser's form pointer.
        for anc := context; anc != nil; anc = anc.Parent {
            if anc.Type != ElementNode || anc.DataAtom != a.Form {
                continue
            }
            p.form = anc
            break
        }

        indexes, err := parseWithIndexes(p)
        if err != nil {
            return nil, nil, err
        }

        // With a context the results hang off the synthetic <html> root;
        // without one they are taken from the document node.
        holder := htmlRoot
        if context == nil {
            holder = p.doc
        }

        // Detach every child in order so the returned nodes have no parent.
        var fragment []*Node
        for child := holder.FirstChild; child != nil; child = holder.FirstChild {
            holder.RemoveChild(child)
            fragment = append(fragment, child)
        }
        return fragment, indexes, nil
    }
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

悬赏问题

  • ¥20 ML307A在使用AT命令连接EMQX平台的MQTT时被拒绝
  • ¥20 腾讯企业邮箱邮件可以恢复么
  • ¥15 有人知道怎么将自己的迁移策略布到edgecloudsim上使用吗?
  • ¥15 错误 LNK2001 无法解析的外部符号
  • ¥50 安装pyaudiokits失败
  • ¥15 计组这些题应该咋做呀
  • ¥60 更换迈创SOL6M4AE卡的时候,驱动要重新装才能使用,怎么解决?
  • ¥15 让node服务器有自动加载文件的功能
  • ¥15 jmeter脚本回放有的是对的有的是错的
  • ¥15 r语言蛋白组学相关问题