dqtok88424 2017-08-12 16:50
浏览 74
已采纳

如何在XML字符串中获取“ <”和“>”?

Is it possible to get the '<' and '>' characters out of this XML string? I have a problem with unmarshalling, and I can't change the strings. Can anyone help me with this? Here is my code:

package main

import (
    "encoding/xml"
    "fmt"
)

// main tries to unmarshal two XML documents whose ShopName text contains an
// unescaped '<' ... '>' pair and prints either the decoded shop name or the
// decoder's error for each of them.
//
// Errors are printed with Printf and the %v verb: Println does not interpret
// format directives, and %e is a floating-point verb that garbles an error
// value instead of printing its message.
func main() {
    type Example struct {
        XMLName  xml.Name `xml:"Shop"`
        ShopName string   `xml:"ShopName"`
    }

    myString1 := `<Shop> 
        <ShopName>Fresh Fruit <Fruit Shop></ShopName>
    </Shop>`

    myString2 := `<Shop> 
        <ShopName>Fresh Fruit < Fruit Shop ></ShopName>
    </Shop>`

    // example 1: a failure is reported, then the program carries on.
    var example1 Example
    if err := xml.Unmarshal([]byte(myString1), &example1); err != nil {
        fmt.Printf("error: example1: %v\n", err)
    } else {
        fmt.Println(example1.ShopName)
    }

    // example 2: a failure is reported and ends the program.
    var example2 Example
    if err := xml.Unmarshal([]byte(myString2), &example2); err != nil {
        fmt.Printf("error: example2: %v\n", err)
        return
    }
    fmt.Println(example2.ShopName)
}

I get the errors below:

error: %example1 XML syntax error on line 2: attribute name without = in element
error: &{%!e(string=expected element name after <) %!e(int=2)}xample2

What I want to get:

Fresh Fruit <Fruit Shop>
Fresh Fruit < Fruit Shop >
  • 写回答

1条回答 默认 最新

  • doudui1850 2017-08-12 21:38
    关注

    The input you have is definitely invalid XML. There is a bug in the creation routine of the XML.

    Idea

    Since you say you have to deal with it the way it is... here is a suggestion:

    1. Replace all closing tags via regex with something you will basically never have in your input (e.g. @#lt#@/tagname@#gt#@). While doing that, save all the distinct tag names to a slice.
    2. Using the slice of tag names, replace the matching start tags with the same placeholders.
    3. Now escape all remaining < and >.
    4. Last but not least, turn the placeholders back into the original tags: @#lt#@ to < and @#gt#@ to >.

    Now you should have valid XML that is parseable.

    Proof of Concept

    Playground

    package main
    
    import (
        "bytes"
        "fmt"
        "log"
        "regexp"
        "sort"
    )
    
    // Placeholder byte sequences that temporarily stand in for the '<' and '>'
    // of tags that must survive escaping; main turns them back into '<' and '>'
    // as its final step.
    var (
        rlt = []byte("@#lt#@")
        rgt = []byte("@#gt#@")
    )

    // XML entities that main substitutes for every '<' and '>' that was not
    // protected by a placeholder.
    var (
        lt = []byte("&lt;")
        gt = []byte("&gt;")
    )
    
    // ByLength is a slice of strings ordered by ascending length. It provides
    // the Len, Swap and Less methods so it can be handed to the sort package.
    type ByLength []string

    // Len reports how many strings the slice holds.
    func (names ByLength) Len() int { return len(names) }

    // Swap exchanges the strings at positions i and j.
    func (names ByLength) Swap(i, j int) {
        tmp := names[i]
        names[i] = names[j]
        names[j] = tmp
    }

    // Less reports whether the string at i is strictly shorter than the one at j.
    func (names ByLength) Less(i, j int) bool {
        shorter := len(names[j]) > len(names[i])
        return shorter
    }
    
    func main() {
        s := `<Shop>
        <ShopName>Fresh Fruit <Fruit Shop></ShopName>
        <ShopName attr="val1">Fresh Fruit <Shop test></ShopName>
    </Shop>`
    
        r1, err := regexp.Compile("</([^<>]*)>")
        if err != nil {
            log.Fatal(err)
        }
    
        names := []string{}
        out := r1.ReplaceAllFunc([]byte(s), func(b []byte) []byte {
            name := b[2 : len(b)-1]
    
            // TODO: only append name if not already in list
            names = append(names, string(name))
    
            // probably optimizable
            bytes := make([]byte, 0, len(name)+12)
            bytes = append(bytes, rlt...)
            bytes = append(bytes, name...)
            bytes = append(bytes, rgt...)
            return bytes
        })
    
        // sort names descending by length otherwise we risk replacing parts of names like with <Shop and <ShopName
        sort.Sort(sort.Reverse(ByLength(names)))
    
        for _, name := range names {
            // replace only exact start tags
            out = bytes.Replace(out, []byte(fmt.Sprintf("<%s>", name)), []byte(fmt.Sprintf("@#lt#@%s@#gt#@", name)), -1)
    
            // replace start tags with attributes
            r3, err := regexp.Compile(fmt.Sprintf("<%s( [^<>=]+=\"[^<>]+)>", name))
            if err != nil {
                // handle error
            }
            out = r3.ReplaceAll(out, []byte(fmt.Sprintf("@#lt#@%s$1@#gt#@", name)))
        }
    
        out = bytes.Replace(out, []byte{'<'}, lt, -1)
        out = bytes.Replace(out, []byte{'>'}, gt, -1)
    
        out = bytes.Replace(out, rlt, []byte{'<'}, -1)
        out = bytes.Replace(out, rgt, []byte{'>'}, -1)
    
        fmt.Println(string(out))
    }
    

    Notes

    1. This is a proof of concept. It is not optimised for performance.
    2. You might still run into content that is not escaped properly; then you will need to refine the algorithm further. If the content contains something like <tagname> or <tagname something ="something>, it will be falsely considered a tag. Therefore, expect some XML to still be invalid. Log the invalid XML so you can improve the algorithm.
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?