donglu5047 2019-02-07 08:08
浏览 48
已采纳

运行基准测试时XML解析速度慢

I want to measure the time it takes for GoLang to parse an XML file. So, I've decided the write a benchmark.

I do have a function which generates an io.Reader that contains an XML document.

// PRIVATE: createSampleXMLReader creates an io.Reader instance that contains 10.000 '<Node />' elements which are
//          suitable for running a benchmark test.
func createSampleXMLReader(
    nodeElementCount int) io.Reader {
    xmlContents := new(strings.Builder)

    xmlContents.WriteString("<ROOT>
")

    for i := 0; i < nodeElementCount; i++ {
        appendNodeXMLElement(xmlContents)
    }

    xmlContents.WriteString("</ROOT>")

    return strings.NewReader(xmlContents.String())
}

// PRIVATE: appendNodeXMLElement appends a '<Node />' elements to an existing io.Reader instance.
func appendNodeXMLElement(
    xmlDocument *strings.Builder) {

    xmlDocument.WriteString("<Node id=\"0\" position=\"0\" depth=\"0\" parent=\"0\">
")
    xmlDocument.WriteString("    <Name>Name</Name>
")
    xmlDocument.WriteString("    <Description>Description</Description>
")
    xmlDocument.WriteString("    <OwnInformation>
")
    xmlDocument.WriteString("        <Title>Title</Title>
")
    xmlDocument.WriteString("        <Description>Description</Description>
")
    xmlDocument.WriteString("    </OwnInformation>
")
    xmlDocument.WriteString("    <Assets>
")
    xmlDocument.WriteString("        <Asset id=\"0\" position=\"0\" type=\"0\" category=\"0\">
")
    xmlDocument.WriteString("            <OriginalFile>OriginalFile</OriginalFile>
")
    xmlDocument.WriteString("            <Description>Description</Description>
")
    xmlDocument.WriteString("            <Uri>Uri</Uri>
")
    xmlDocument.WriteString("        </Asset>
")
    xmlDocument.WriteString("        <Asset id=\"1\" position=\"1\" type=\"1\" category=\"1\">
")
    xmlDocument.WriteString("            <OriginalFile>OriginalFile</OriginalFile>
")
    xmlDocument.WriteString("            <Description>Description</Description>
")
    xmlDocument.WriteString("            <Uri>Uri</Uri>
")
    xmlDocument.WriteString("        </Asset>
")
    xmlDocument.WriteString("        <Asset id=\"2\" position=\"2\" type=\"2\" category=\"2\">
")
    xmlDocument.WriteString("            <OriginalFile>OriginalFile</OriginalFile>
")
    xmlDocument.WriteString("            <Description>Description</Description>
")
    xmlDocument.WriteString("            <Uri>Uri</Uri>
")
    xmlDocument.WriteString("        </Asset>
")
    xmlDocument.WriteString("        <Asset id=\"3\" position=\"3\" type=\"3\" category=\"3\">
")
    xmlDocument.WriteString("            <OriginalFile>OriginalFile</OriginalFile>
")
    xmlDocument.WriteString("            <Description>Description</Description>
")
    xmlDocument.WriteString("            <Uri>Uri</Uri>
")
    xmlDocument.WriteString("        </Asset>
")
    xmlDocument.WriteString("        <Asset id=\"4\" position=\"4\" type=\"4\" category=\"4\">
")
    xmlDocument.WriteString("            <OriginalFile>OriginalFile</OriginalFile>
")
    xmlDocument.WriteString("            <Description>Description</Description>
")
    xmlDocument.WriteString("            <Uri>Uri</Uri>
")
    xmlDocument.WriteString("        </Asset>
")
    xmlDocument.WriteString("    </Assets>
")
    xmlDocument.WriteString("    <Synonyms>
")
    xmlDocument.WriteString("        <Synonym>Synonym 0</Synonym>
")
    xmlDocument.WriteString("        <Synonym>Synonym 1</Synonym>
")
    xmlDocument.WriteString("        <Synonym>Synonym 2</Synonym>
")
    xmlDocument.WriteString("        <Synonym>Synonym 3</Synonym>
")
    xmlDocument.WriteString("        <Synonym>Synonym 4</Synonym>
")
    xmlDocument.WriteString("    </Synonyms>
")
    xmlDocument.WriteString("</Node>
")
}

Then, I have the function that actually parses this XML document.

// PRIVATE: parseXML parses an io.Reader instance into a 'Node' struct.
func parseXML(
    xmlReader io.Reader) {

    xmlDecoder := xml.NewDecoder(xmlReader)

    for {
        token, _ := xmlDecoder.Token()
        if token == nil {
            break
        }

        switch element := token.(type) {
        case xml.StartElement:
            if element.Name.Local == "Node" {
                decodeNodeElement(xmlDecoder, &element)
            }
        }
    }
}

// PRIVATE: decodeNodeElement decodes a '<Node />' element into a 'Node' struct.
func decodeNodeElement(
    xmlDecoder *xml.Decoder,
    element *xml.StartElement) {

    node := new(model.Node)

    xmlDecoder.DecodeElement(node, element)
}

Then, I have the function that performs the benchmark:

// PRIVATE: runBenchmarkParseXML performs a benchmark that parses an XML document that contains the given number of
//          '<Node />' element.
func runBenchmarkParseXML(
    nodeCount int,
    benchmark *testing.B) {

    // Arrange.
    xmlReader := createSampleXMLReader(nodeCount)

    // Act.
    for i := 0; i < benchmark.N; i++ {
        parseXML(xmlReader)
    }
}

And then I have 5 functions that performs the benchmark. These are for 1, 10, 100, 1000 and 10000 elements in the XML document.

func BenchmarkParseXML1(benchmark *testing.B)     { runBenchmarkParseXML(1, benchmark) }
func BenchmarkParseXML10(benchmark *testing.B)    { runBenchmarkParseXML(10, benchmark) }
func BenchmarkParseXML100(benchmark *testing.B)   { runBenchmarkParseXML(100, benchmark) }
func BenchmarkParseXML1000(benchmark *testing.B)  { runBenchmarkParseXML(1000, benchmark) }
func BenchmarkParseXML10000(benchmark *testing.B) { runBenchmarkParseXML(10000, benchmark) }

When I'm running this benchmark, I see the following output:

BenchmarkParseXML1-4             5000000               226 ns/op
BenchmarkParseXML10-4           10000000               230 ns/op
BenchmarkParseXML100-4           5000000               226 ns/op
BenchmarkParseXML1000-4          5000000               254 ns/op
BenchmarkParseXML10000-4               1        1690998100 ns/op

How is it possible that there's such a difference in the benchmark that parses an XML file that contains 10.000 elements, which the rest of the benchmark timings are stable?

Is my benchmark new or it the implementation of the ParseXML method not correct.

Edit: Node struct

// Node represents a '<Node />' element in the XML document.
type Node struct {
    ID             int    `xml:"id,attr"`
    Position       int    `xml:"position,attr"`
    Depth          int    `xml:"depth,attr"`
    Parent         string `xml:"parent,attr"`
    Name           string `xml:"Name"`
    Description    string `xml:"Description"`
    OwnInformation struct {
        Title       string `xml:"Title"`
        Description string `xml:"Description"`
    } `xml:"OwnInformation"`
    Assets []struct {
        ID           string `xml:"id,attr"`
        Position     int    `xml:"position,attr"`
        Type         string `xml:"type,attr"`
        Category     int    `xml:"category,attr"`
        OriginalFile string `xml:"OriginalFile"`
        Description  string `xml:"Description"`
        URI          string `xml:"Uri"`
    } `xml:"Assets>Asset"`
    Synonyms []string `xml:"Synonyms>Synonym"`
}

Thanks in advance for any guidance.

  • 写回答

1条回答 默认 最新

  • dongpiao1983 2019-02-07 08:50
    关注

    Your benchmark is flawed. The fact that you are increasing the input by 10x, but the duration stays roughly constant throughout should have made you very suspicious of this benchmark.

    You are re-using the same reader every single time. Only the first iteration of each benchmark will actually do anything. Further calls will read from a reader that is already at EOF.

    Change your setup so it returns a byte slice and construct a new reader for each iteration. That will yield the expected results:

    func createSampleXMLDoc(nodeElementCount int) []byte {
        xmlContents := &bytes.Buffer{}
    
        xmlContents.WriteString("<ROOT>
    ")
    
        for i := 0; i < nodeElementCount; i++ {
            appendNodeXMLElement(xmlContents)
        }
    
        xmlContents.WriteString("</ROOT>")
        return xmlContents.Bytes()
    }
    
    func runBenchmarkParseXML(nodeCount int, b *testing.B) {
        doc := createSampleXMLDoc(nodeCount)
    
        for i := 0; i < b.N; i++ {
            xmlReader := bytes.NewReader(doc)
            parseXML(xmlReader)
        }
    }
    

    Results on my machine (as expected, increasing input by 10x increases time by 10x):

    $ go test -benchtime=5s -bench .
    goos: linux
    goarch: amd64
    BenchmarkParseXML1-8              100000            115978 ns/op
    BenchmarkParseXML10-8              10000           1147605 ns/op
    BenchmarkParseXML100-8              1000          11586980 ns/op
    BenchmarkParseXML1000-8               50         124199120 ns/op
    BenchmarkParseXML10000-8               5        1003668966 ns/op
    

    -benchtime=5s increases the default time per benchmark from one second to five seconds. One second isn't enough for more than one iteration in the last case, yielding unreliable results. That is also why see the large number in your original benchmark. The first iteration is slow, but every one after that returns instantly, so the average time drops dramatically. Seeing just one iteration in a benchmark is another red flag, by the way. Even five iterations is still pretty low for reliable measurements, actually.

    Moral of the story: Check your errors!

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

悬赏问题

  • ¥15 VMBox虚拟机无法访问
  • ¥15 skd显示找不到头文件
  • ¥15 机器视觉中图片中长度与真实长度的关系
  • ¥15 fastreport table 怎么只让每页的最下面和最顶部有横线
  • ¥15 R语言卸载之后无法重装,显示电脑存在下载某些较大二进制文件行为,怎么办
  • ¥15 java 的protected权限 ,问题在注释里
  • ¥15 这个是哪里有问题啊?
  • ¥15 关于#vue.js#的问题:修改用户信息功能图片无法回显,数据库中只存了一张图片(相关搜索:字符串)
  • ¥15 texstudio的问题,
  • ¥15 spaceclaim模型变灰色