lxml iterparse eats up memory for an 4GB XML file, even when clear() is used

The purpose of the script is to extract how many articles/books have been published for each year and get this information from element in the xml file dblp-2023-10-01.xml. The file can be found here: https://dblp.org/xml/release/

from lxml import etree

xmlfile = 'dblp-2023-10-01.xml'

doc = etree.iterparse(xmlfile, tag='year', load_dtd=True)
_, root = next(doc)
counter_dict = {}
for event, element in doc:
    if element.text not in counter_dict:
        counter_dict[element.text] = 1
    else:
        counter_dict[element.text] += 1
    root.clear()

When I run the code for a small file it runs smoothly. What puzzles me is that when I run with the dblp file that it exceeds the 4GB (which is the file size) which doesn't make sense to me.

I tried also to run alternative versions to make sure it clears what it parses like:

    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]

Without any improvement

Solution

Option 1: My machine does it in 2,5 minutes locally, so you need the download time too. You should not only clear the root.clear(), instead elem.clear():

from lxml import etree
import gzip

import psutil
import time
time_start = time.time()

fd = gzip.open('dblp-2023-10-01.xml.gz', "r")

counter_dict = {}
for event, elem in etree.iterparse(fd, events=['end'], recover=True):
    if elem.tag == "year":
        if elem.text not in counter_dict:
            counter_dict[elem.text] = 1
        else:
            counter_dict[elem.text] +=1
    elem.clear()
    
#print(counter_dict)
print(dict(sorted(counter_dict.items())))

print("RAM:")
print(psutil.Process().memory_info().rss / (1024 * 1024))
print("Time:")
print((time.time() - time_start))

Output:

{'1936': 12, '1937': 16, '1938': 11, '1939': 18, '1940': 10, '1941': 13, '1942': 13, '1943': 8, '1944': 5, '1945': 9, '1946': 31, '1947': 10, '1948': 41, '1949': 52, '1950': 29, '1951': 46, '1952': 114, '1953': 173, '1954': 225, '1955': 213, '1956': 355, '1957': 343, '1958': 464, '1959': 715, '1960': 625, '1961': 903, '1962': 1186, '1963': 1032, '1964': 1108, '1965': 1291, '1966': 1503, '1967': 1763, '1968': 2182, '1969': 2113, '1970': 2227, '1971': 3120, '1972': 3751, '1973': 4414, '1974': 5007, '1975': 5246, '1976': 5695, '1977': 5961, '1978': 6786, '1979': 6913, '1980': 7787, '1981': 8662, '1982': 9939, '1983': 10860, '1984': 12334, '1985': 13890, '1986': 16475, '1987': 17549, '1988': 21633, '1989': 24001, '1990': 28166, '1991': 31084, '1992': 34900, '1993': 40695, '1994': 45290, '1995': 47712, '1996': 52809, '1997': 57099, '1998': 64297, '1999': 71138, '2000': 80955, '2001': 86798, '2002': 97758, '2003': 116385, '2004': 135735, '2005': 158268, '2006': 176458, '2007': 189562, '2008': 203544, '2009': 222653, '2010': 228955, '2011': 250783, '2012': 263294, '2013': 280709, '2014': 292279, '2015': 302656, '2016': 314744, '2017': 339456, '2018': 374688, '2019': 417602, '2020': 433127, '2021': 456839, '2022': 470484, '2023': 305500, '2024': 536}

RAM:
1141.16796875
Time:
151.0215344429016

Option 2 - Download and parse the stream:

import gzip
from urllib.request import urlopen
from lxml import etree

import psutil
import time
time_start = time.time()

url = "https://dblp.org/xml/release/"
file = "dblp-2023-10-01.xml.gz"
fd = url+file

f = urlopen(fd)
fz = gzip.GzipFile(fileobj=f, mode="r")

counter_dict = {}
for event, elem in etree.iterparse(fz, events=['end'], recover=True):
    if elem.tag == "year":
        if elem.text not in counter_dict:
            counter_dict[elem.text] = 1
        else:
            counter_dict[elem.text] +=1
    elem.clear()
    
#print(counter_dict)
print(dict(sorted(counter_dict.items())))

print("RAM:")
print(psutil.Process().memory_info().rss / (1024 * 1024))
print("Time:")
print((time.time() - time_start))

Output:

{'1936': 12, '1937': 16, '1938': 11, '1939': 18, '1940': 10, '1941': 13, '1942': 13, '1943': 8, '1944': 5, '1945': 9, '1946': 31, '1947': 10, '1948': 41, '1949': 52, '1950': 29, '1951': 46, '1952': 114, '1953': 173, '1954': 225, '1955': 213, '1956': 355, '1957': 343, '1958': 464, '1959': 715, '1960': 625, '1961': 903, '1962': 1186, '1963': 1032, '1964': 1108, '1965': 1291, '1966': 1503, '1967': 1763, '1968': 2182, '1969': 2113, '1970': 2227, '1971': 3120, '1972': 3751, '1973': 4414, '1974': 5007, '1975': 5246, '1976': 5695, '1977': 5961, '1978': 6786, '1979': 6913, '1980': 7787, '1981': 8662, '1982': 9939, '1983': 10860, '1984': 12334, '1985': 13890, '1986': 16475, '1987': 17549, '1988': 21633, '1989': 24001, '1990': 28166, '1991': 31084, '1992': 34900, '1993': 40695, '1994': 45290, '1995': 47712, '1996': 52809, '1997': 57099, '1998': 64297, '1999': 71138, '2000': 80955, '2001': 86798, '2002': 97758, '2003': 116385, '2004': 135735, '2005': 158268, '2006': 176458, '2007': 189562, '2008': 203544, '2009': 222653, '2010': 228955, '2011': 250783, '2012': 263294, '2013': 280709, '2014': 292279, '2015': 302656, '2016': 314744, '2017': 339456, '2018': 374688, '2019': 417602, '2020': 433127, '2021': 456839, '2022': 470484, '2023': 305500, '2024': 536}

RAM: 1084.80859375 Time: 148.59651041030884

Read also memory efficiency of common data structure, here.