I have a directory containing several large XML files (about 10 GB in total). Is there a way to iterate over the XML files in that directory, read each one 50 bytes at a time, and parse them with high performance?
// Loadxml opens the XML file at path, decodes its root element into mdc and
// prints mdc to standard output. wg.Done is deferred, so it runs when the
// method returns normally.
//
// The document is decoded as a stream from a buffered reader instead of being
// read into a fixed-size byte slice first, so the file size does not have to
// fit any buffer and the decoder always sees the real document bytes.
//
// Any error while opening or decoding the file terminates the process through
// log.Fatal (which exits without running deferred calls).
//
// NOTE(review): wg is received by value, so Done acts on this method's copy of
// the WaitGroup rather than on the caller's. The signature is left unchanged
// for compatibility; consider switching to *sync.WaitGroup once callers can be
// updated.
func (mdc *Mdc) Loadxml(path string, wg sync.WaitGroup) {
	defer wg.Done()

	file, err := os.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// A 1 MiB read buffer keeps the number of read syscalls low on large files
	// while memory use stays constant.
	reader := bufio.NewReaderSize(file, 1024*1024)

	// mdc is already a pointer, so it is handed to the decoder directly.
	if err := xml.NewDecoder(reader).Decode(mdc); err != nil {
		log.Fatal(err)
	}
	fmt.Println(mdc)
}
You can do something even better: you can tokenize your XML files.
Say you have an XML document like this:
<inventory>
<item name="ACME Unobtainium">
<tag>Foo</tag>
<count>1</count>
</item>
<item name="Dirt">
<tag>Bar</tag>
<count>0</count>
</item>
</inventory>
You can then describe it with the following data model:
type (
	// Inventory maps the document root; it collects every <item> child
	// element.
	Inventory struct {
		Items []Item `xml:"item"`
	}

	// Item maps a single <item> element.
	Item struct {
		// Name is read from the element's "name" attribute.
		Name string `xml:"name,attr"`
		// Tags holds the text of each <tag> child element.
		Tags []string `xml:"tag"`
		// Count holds the integer value of the <count> child element.
		Count int `xml:"count"`
	}
)
Now all you have to do is walk the directory with filepath.Walk and do something like this for each file you want to process:
// Pull tokens from the file one at a time instead of unmarshalling the whole
// document in a single call.
decoder := xml.NewDecoder(file)
for {
	tok, err := decoder.Token()
	if err == io.EOF {
		// The input is exhausted.
		log.Println("The end")
		break
	}
	if err != nil {
		log.Fatalf("Error decoding token: %s", err)
	}
	if tok == nil {
		break
	}

	// Only the opening tag of an <item> element is of interest; every other
	// token is skipped.
	start, ok := tok.(xml.StartElement)
	if !ok || start.Name.Local != "item" {
		continue
	}

	// Decode the element that begins at this start tag into the data model.
	var item Item
	if err = decoder.DecodeElement(&item, &start); err != nil {
		log.Fatalf("Error decoding item: %s", err)
	}

	// Report the decoded item, followed by its tags when it has any.
	log.Printf("'%s' in stock: %d", item.Name, item.Count)
	if len(item.Tags) == 0 {
		continue
	}
	log.Println("Tags")
	for _, tag := range item.Tags {
		log.Printf("\t%s", tag)
	}
}
Working example with dummy XML: https://play.golang.org/p/MiLej7ih9Jt