Search code examples
python · xml · lxml · ram · large-files

lxml iterparse eats up memory for an 4GB XML file, even when clear() is used


The purpose of the script is to extract how many articles/books have been published in each year, taking this information from the <year> elements in the XML file dblp-2023-10-01.xml. The file can be found here: https://dblp.org/xml/release/

from lxml import etree

xmlfile = 'dblp-2023-10-01.xml'

# Count how many publications carry each <year> value, streaming the file
# with iterparse so the 4 GB document is never fully materialized.
#
# Pitfalls fixed here:
#  * With tag='year', next(doc) would yield the FIRST <year> element, not the
#    document root -- so the original "root.clear()" cleared a leaf and the
#    first year was silently skipped.
#  * clear() alone leaves each emptied element attached to its parent; the
#    tree still grows by one (empty) node per record.  Deleting the
#    already-processed preceding siblings keeps memory flat.
doc = etree.iterparse(xmlfile, tag='year', load_dtd=True)
counter_dict = {}
for event, element in doc:
    year = element.text
    counter_dict[year] = counter_dict.get(year, 0) + 1
    # Free this element and everything parsed before it.
    element.clear()
    while element.getprevious() is not None:
        del element.getparent()[0]

When I run the code on a small file it runs smoothly. What puzzles me is that when I run it with the dblp file, memory usage exceeds 4GB (the size of the file itself), which doesn't make sense to me.

I tried also to run alternative versions to make sure it clears what it parses like:

    # Memory-saving idiom from the lxml FAQ: after handling the current
    # element, delete every fully-parsed element that precedes it (and the
    # preceding siblings of each of its ancestors), so the partially built
    # tree cannot keep growing.
    # NOTE(review): `elem` is presumably the element just yielded by
    # iterparse -- it is not defined in this snippet; confirm against the
    # surrounding loop.
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]

Unfortunately, this brought no improvement.


Solution

  • Option 1: My machine does it in 2.5 minutes locally, so you need to add the download time. You should not clear only the root with root.clear(); instead call elem.clear() on every element:

    from lxml import etree
    import gzip

    import psutil
    import time
    time_start = time.time()

    # Count publications per <year> element while streaming the gzipped dump,
    # so the 4 GB document is never decompressed or parsed in one piece.
    # `with` guarantees the file handle is closed even on parse errors.
    counter_dict = {}
    with gzip.open('dblp-2023-10-01.xml.gz', "r") as fd:
        for event, elem in etree.iterparse(fd, events=['end'], recover=True):
            if elem.tag == "year":
                counter_dict[elem.text] = counter_dict.get(elem.text, 0) + 1
            # clear() empties the element but leaves it attached to its
            # parent; also drop already-processed preceding siblings so the
            # in-memory tree stays flat instead of accumulating empty nodes.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]

    print(dict(sorted(counter_dict.items())))

    print("RAM:")
    print(psutil.Process().memory_info().rss / (1024 * 1024))
    print("Time:")
    print((time.time() - time_start))
    

    Output:

    {'1936': 12, '1937': 16, '1938': 11, '1939': 18, '1940': 10, '1941': 13, '1942': 13, '1943': 8, '1944': 5, '1945': 9, '1946': 31, '1947': 10, '1948': 41, '1949': 52, '1950': 29, '1951': 46, '1952': 114, '1953': 173, '1954': 225, '1955': 213, '1956': 355, '1957': 343, '1958': 464, '1959': 715, '1960': 625, '1961': 903, '1962': 1186, '1963': 1032, '1964': 1108, '1965': 1291, '1966': 1503, '1967': 1763, '1968': 2182, '1969': 2113, '1970': 2227, '1971': 3120, '1972': 3751, '1973': 4414, '1974': 5007, '1975': 5246, '1976': 5695, '1977': 5961, '1978': 6786, '1979': 6913, '1980': 7787, '1981': 8662, '1982': 9939, '1983': 10860, '1984': 12334, '1985': 13890, '1986': 16475, '1987': 17549, '1988': 21633, '1989': 24001, '1990': 28166, '1991': 31084, '1992': 34900, '1993': 40695, '1994': 45290, '1995': 47712, '1996': 52809, '1997': 57099, '1998': 64297, '1999': 71138, '2000': 80955, '2001': 86798, '2002': 97758, '2003': 116385, '2004': 135735, '2005': 158268, '2006': 176458, '2007': 189562, '2008': 203544, '2009': 222653, '2010': 228955, '2011': 250783, '2012': 263294, '2013': 280709, '2014': 292279, '2015': 302656, '2016': 314744, '2017': 339456, '2018': 374688, '2019': 417602, '2020': 433127, '2021': 456839, '2022': 470484, '2023': 305500, '2024': 536}
    
    RAM:
    1141.16796875
    Time:
    151.0215344429016
    

    Option 2 - Download and parse the stream:

    import gzip
    from contextlib import closing
    from urllib.request import urlopen
    from lxml import etree

    import psutil
    import time
    time_start = time.time()

    url = "https://dblp.org/xml/release/"
    file = "dblp-2023-10-01.xml.gz"
    fd = url + file

    # Stream the gzipped dump straight from the server, decompressing on the
    # fly -- no temporary file, and the XML is parsed incrementally.
    # closing() ensures the HTTP response is released; GzipFile is closed by
    # its own `with`.
    counter_dict = {}
    with closing(urlopen(fd)) as f, gzip.GzipFile(fileobj=f, mode="r") as fz:
        for event, elem in etree.iterparse(fz, events=['end'], recover=True):
            if elem.tag == "year":
                counter_dict[elem.text] = counter_dict.get(elem.text, 0) + 1
            # Free the element and all already-processed preceding siblings
            # so memory use stays bounded regardless of document size.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]

    print(dict(sorted(counter_dict.items())))

    print("RAM:")
    print(psutil.Process().memory_info().rss / (1024 * 1024))
    print("Time:")
    print((time.time() - time_start))
    

    Output:

    {'1936': 12, '1937': 16, '1938': 11, '1939': 18, '1940': 10, '1941': 13, '1942': 13, '1943': 8, '1944': 5, '1945': 9, '1946': 31, '1947': 10, '1948': 41, '1949': 52, '1950': 29, '1951': 46, '1952': 114, '1953': 173, '1954': 225, '1955': 213, '1956': 355, '1957': 343, '1958': 464, '1959': 715, '1960': 625, '1961': 903, '1962': 1186, '1963': 1032, '1964': 1108, '1965': 1291, '1966': 1503, '1967': 1763, '1968': 2182, '1969': 2113, '1970': 2227, '1971': 3120, '1972': 3751, '1973': 4414, '1974': 5007, '1975': 5246, '1976': 5695, '1977': 5961, '1978': 6786, '1979': 6913, '1980': 7787, '1981': 8662, '1982': 9939, '1983': 10860, '1984': 12334, '1985': 13890, '1986': 16475, '1987': 17549, '1988': 21633, '1989': 24001, '1990': 28166, '1991': 31084, '1992': 34900, '1993': 40695, '1994': 45290, '1995': 47712, '1996': 52809, '1997': 57099, '1998': 64297, '1999': 71138, '2000': 80955, '2001': 86798, '2002': 97758, '2003': 116385, '2004': 135735, '2005': 158268, '2006': 176458, '2007': 189562, '2008': 203544, '2009': 222653, '2010': 228955, '2011': 250783, '2012': 263294, '2013': 280709, '2014': 292279, '2015': 302656, '2016': 314744, '2017': 339456, '2018': 374688, '2019': 417602, '2020': 433127, '2021': 456839, '2022': 470484, '2023': 305500, '2024': 536}
    

    RAM: 1084.80859375 Time: 148.59651041030884

    See also: memory efficiency of common data structures, here.