Search code examples

xml.etree.ElementTree iterparse() still using lots of memory?

I've been experimenting with iterparse to reduce the memory footprint of my scripts that need to process large XML docs. Here's an example. I wrote this simple script to read a TMX file and split it into one or more output files not to exceed a user-specified size. Despite using iterparse, when I split a 886MB file into 100MB files, the script runs away with all available memory (grinding to a crawl at using 6.5 of my 8MB).

Am I doing something wrong? Why does the memory usage go so high?

#! /usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import codecs
from xml.etree.ElementTree import iterparse, tostring
from sys import getsizeof

def startNewOutfile(infile, i, root, header):
    out = open(infile.replace('tmx', str(i) + '.tmx'), 'w')
    print >>out, '<?xml version="1.0" encoding="UTF-8"?>'
    print >>out, '<!DOCTYPE tmx SYSTEM "tmx14.dtd">'
    print >>out, roottxt
    print >>out, headertxt
    print >>out, '<body>'
    return out

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--maxsize', dest='maxsize', required=True, type=float, help='max size (in MB) of output files')
    parser.add_argument(dest='infile', help='.tmx file to be split')
    args = parser.parse_args()

    maxsize = args.maxsize * 1024 * 1024

    nodes = iter(iterparse(args.infile, events=['start','end']))

    _, root = next(nodes)
    _, header = next(nodes)

    roottxt = tostring(root).strip()
    headertxt = tostring(header).strip()

    i = 1
    curr_size = getsizeof(roottxt) + getsizeof(headertxt)
    out = startNewOutfile(args.infile, i, roottxt, headertxt)

    for event, node in nodes:
        if event =='end' and node.tag == 'tu':
            nodetxt = tostring(node, encoding='utf-8').strip()
            curr_size += getsizeof(nodetxt)
            print >>out, nodetxt
        if curr_size > maxsize:
            curr_size = getsizeof(roottxt) + getsizeof(headertxt)
            print >>out, '</body>'
            print >>out, '</tmx>'
            i += 1
            out = startNewOutfile(args.infile, i, roottxt, headertxt)

    print >>out, '</body>'
    print >>out, '</tmx>'


  • Found the answer in a related question: Why is elementtree.ElementTree.iterparse using so much memory?

    One needs not only root.clear(), but node.clear() at each iteration of the for loop. Because we're processing both start & end events, though, we need to be careful not to remove tu nodes too soon:

    for e, node in nodes:
        if e == 'end' and node.tag == 'tu':
            nodetxt = tostring(node, encoding='utf-8').strip()
            curr_size += getsizeof(nodetxt)
            print >>out, nodetxt
        if curr_size > maxsize:
            curr_size = getsizeof(roottxt) + getsizeof(headertxt)
            print >>out, '</body>'
            print >>out, '</tmx>'
            i += 1
            out = startNewOutfile(args.infile, i, roottxt, headertxt)