The input you have is definitely invalid XML. There is a bug in the creation routine of the XML.
Idea
Since you say you have to deal with it the way it is, here is a suggestion:
- Replace all closing tags via regex with a placeholder that will practically never occur in your input (e.g.
@#lt#@/tagname@#gt#@
). While doing that save all the distinct tag names to a slice.
- With the slice of tag names replace the start tags
- Now escape all remaining
<
and >
- Last but not least replace the original tags back:
@#lt#@
to <
and @#gt#@
to >
Now you should have valid xml that is parseable.
Proof of Concept
Playground
package main
import (
"bytes"
"fmt"
"log"
"regexp"
"sort"
)
// Byte sequences used while repairing the input.
var (
	// rlt and rgt are placeholders that temporarily stand in for the angle
	// brackets of recognised tags; they are turned back into '<' and '>' at
	// the very end.
	rlt = []byte("@#lt#@")
	rgt = []byte("@#gt#@")
	// lt and gt are the XML entity forms that every remaining (stray) '<'
	// and '>' is escaped to. They must not be the raw characters, otherwise
	// the escaping pass replaces each bracket with itself.
	lt = []byte("&lt;")
	gt = []byte("&gt;")
)
// ByLength is a string slice that provides the Len, Swap and Less methods
// required by sort.Sort, ordering its elements by byte length with the
// shortest string first.
type ByLength []string

// Less reports whether the string at index i is strictly shorter than the
// string at index j.
func (bl ByLength) Less(i, j int) bool {
	return len(bl[j]) > len(bl[i])
}

// Swap exchanges the strings at indexes i and j.
func (bl ByLength) Swap(i, j int) {
	tmp := bl[i]
	bl[i] = bl[j]
	bl[j] = tmp
}

// Len returns the number of strings in the slice.
func (bl ByLength) Len() int {
	return len(bl)
}
// closingTagRe matches a closing tag such as </ShopName>; the text between
// "</" and ">" may be anything except angle brackets.
var closingTagRe = regexp.MustCompile("</([^<>]*)>")

// protect wraps inner in the rlt/rgt placeholders so that the angle brackets
// of a recognised tag are left alone by the later escaping pass.
func protect(inner []byte) []byte {
	p := make([]byte, 0, len(rlt)+len(inner)+len(rgt))
	p = append(p, rlt...)
	p = append(p, inner...)
	return append(p, rgt...)
}

// escapeStrayBrackets tries to turn markup whose text content contains
// unescaped '<' and '>' into parseable XML.
//
// It works in four passes:
//  1. every closing tag is replaced by a placeholder and its name recorded;
//  2. for each recorded name, the matching start tags (bare, or carrying
//     attributes) are replaced by placeholders as well;
//  3. every '<' and '>' that is still present is replaced by lt and gt;
//  4. the placeholders are turned back into real angle brackets.
//
// The input is not modified. A non-nil error is returned only if a start-tag
// pattern cannot be compiled.
func escapeStrayBrackets(in []byte) ([]byte, error) {
	seen := make(map[string]struct{})
	var names []string

	out := closingTagRe.ReplaceAllFunc(in, func(tag []byte) []byte {
		// inner is "/name": the slash is kept so that the placeholder is
		// restored as a closing tag rather than as a start tag.
		inner := tag[1 : len(tag)-1]
		if len(inner) == 1 {
			// "</>" has no name; leave it to be escaped like any other
			// stray bracket instead of registering an empty tag name.
			return tag
		}
		name := string(inner[1:])
		if _, dup := seen[name]; !dup {
			seen[name] = struct{}{}
			names = append(names, name)
		}
		return protect(inner)
	})

	// Handle longer names first so that a name which is a prefix of another
	// (Shop / ShopName) is processed after the longer one.
	sort.Sort(sort.Reverse(ByLength(names)))

	for _, name := range names {
		// Bare start tags: <name>
		out = bytes.Replace(out, []byte("<"+name+">"), protect([]byte(name)), -1)

		// Start tags with attributes: <name key="value ...">
		// The name is quoted because a closing tag may contain characters
		// that are regular-expression metacharacters.
		attrRe, err := regexp.Compile("<" + regexp.QuoteMeta(name) + "( [^<>=]+=\"[^<>]+)>")
		if err != nil {
			return nil, fmt.Errorf("building start-tag pattern for %q: %w", name, err)
		}
		// A func replacement is used instead of a "$1" template so that a
		// '$' inside the tag name or attributes is never expanded.
		out = attrRe.ReplaceAllFunc(out, func(tag []byte) []byte {
			return protect(tag[1 : len(tag)-1])
		})
	}

	// Whatever angle brackets remain are content, not markup: escape them.
	out = bytes.Replace(out, []byte{'<'}, lt, -1)
	out = bytes.Replace(out, []byte{'>'}, gt, -1)

	// Restore the brackets of the tags that were protected above.
	out = bytes.Replace(out, rlt, []byte{'<'}, -1)
	out = bytes.Replace(out, rgt, []byte{'>'}, -1)
	return out, nil
}

// main runs the repair on a small sample document and prints the result.
func main() {
	s := `<Shop>
<ShopName>Fresh Fruit <Fruit Shop></ShopName>
<ShopName attr="val1">Fresh Fruit <Shop test></ShopName>
</Shop>`
	out, err := escapeStrayBrackets([]byte(s))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out))
}
Notes
- this is a proof of concept. This is not optimised for performance.
- you might still run into content that might not be escaped properly. Then you will need to further optimise. If there is something like this in the content it will be falsely considered a tag:
<tagname>
or <tagname something ="something>
. Therefore expect some XML to still be invalid. Log invalid XML so you can improve the algorithm.