Tags: python, pandas, csv, concatenation, out-of-memory

MemoryError when trying to convert txt to csv and concatenate into one file


I have approximately 400 txt files, each ranging from a few MB up to 1 GB in size. These txt files contain length-separated (fixed-width) data. As an example, consider these two txt files (there is no header in the source files):

File1.txt:

AB;12345A;400E;500C
CD;12345B;400E;500C

File2.txt:

EC;12345E;400E;500C
EF;12345E;400D;500E

(Once again: although there are semicolons, these files are not ";"-separated; they are length-separated, which is why I read them in with pandas read_fwf.)
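
To make the layout concrete, here is a minimal sketch (not part of my actual script) that parses the two sample lines from an in-memory string with read_fwf and the corresponding character positions:

import io
import pandas as pd

sample = "AB;12345A;400E;500C\nCD;12345B;400E;500C\n"

# character positions (start, stop) of the four fixed-width fields
colspecs = [(0, 2), (3, 9), (10, 14), (15, 19)]

df = pd.read_fwf(io.StringIO(sample), colspecs=colspecs, header=None,
                 names=["Column1", "Column2", "Column3", "Column4"])
print(df)
# expected output (roughly):
#   Column1 Column2 Column3 Column4
# 0      AB  12345A    400E    500C
# 1      CD  12345B    400E    500C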

My first approach was to read each txt file, store it in a dataframe, and append/concatenate it to one big dataframe:

import pandas as pd
import glob
import csv


path = r'C:\folder\Test'
all_files = glob.glob(path + r"\*.txt")  # raw string so "\*" is not treated as an escape sequence

li = []

for filename in all_files:
    print(filename)
    col_lengths = {'Column1': range(0, 2), 
                   'Column2': range(3, 9), 
                   'Column3': range(10, 14),
                   'Column4': range(15, 19),
                  }
    col_lengths = {k: set(v) for k, v in col_lengths.items()}

    df = pd.read_fwf(filename, colspecs=[(min(x), max(x)+1) for x in col_lengths.values()], encoding='cp1252', header=None, names=col_lengths.keys(), 
               converters={'Column1':lambda x : str(x),
                          'Column2':lambda x : str(x),
                          'Column3':lambda x : str(x),
                          'Column4':lambda x : str(x),
                          }
               )
   
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

frame.to_csv(r'C:\folder\Python test.csv', encoding='utf-8', index=False, sep=";", decimal=",", quoting=csv.QUOTE_NONE)

This code works with the sample files, but not with my original data; there I get a memory error ("Unable to allocate ... for an array with shape ... and data type object").
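
A quick check on a single parsed file already hints at why this blows up: four object-dtype string columns take far more RAM than the raw text. A minimal sketch (assuming df is one of the dataframes from the loop above):

# inspect the deep (string-aware) memory footprint of one parsed file
mem = df.memory_usage(deep=True)   # bytes per column, including Python string overhead
print(mem)
print(f"total: {mem.sum() / 1024**3:.2f} GiB")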

My second approach was to read each txt file, write it out as a csv file, and then read the csv files again and concatenate them into one big dataframe:

import pandas as pd
import glob
import csv
import os

path = r'C:\folder\Test'
all_files = glob.glob(path + r"\*.txt")


for filename in all_files:
    col_lengths = {'Column1': range(0, 2), 
                   'Column2': range(3, 9), 
                   'Column3': range(10, 14),
                   'Column4': range(15, 19),
                  }
    col_lengths = {k: set(v) for k, v in col_lengths.items()}

    df = pd.read_fwf(filename, colspecs=[(min(x), max(x)+1) for x in col_lengths.values()], encoding='cp1252', header=None, names=col_lengths.keys(), 
               converters={'Column1':lambda x : str(x),
                          'Column2':lambda x : str(x),
                          'Column3':lambda x : str(x),
                          'Column4':lambda x : str(x),
                          }
               )
    # convert each txt file to a csv file
    df.to_csv(os.path.join(path, os.path.splitext(os.path.basename(filename))[0] + '.' + "csv"), encoding='utf-8', index=False, sep=";", decimal=",", date_format='%d.%m.%Y', quoting=csv.QUOTE_MINIMAL)

# read in csv files and concatenate
path = r'C:\folder\Test'

all_files = glob.glob(os.path.join(path, "*.csv"))

dtypes= {"Column1": str, "Column2": str, "Column3": str, "Column4": str}

df = pd.concat((pd.read_csv(f, sep=";", encoding='utf-8', dtype=dtypes, decimal=",") for f in all_files), ignore_index=True)

df.to_csv(os.path.join(path,"File.csv"), encoding='utf-8', index=False, sep=";", decimal=",", quoting=csv.QUOTE_MINIMAL)

Unfortunately, the same problem again: it works with the sample files, but with my original data I get a memory error again.

The problem is always the pd.concat: it has to hold all of the individual dataframes plus the concatenated result in memory at once. I thought the second approach might be better, but it turns out the memory error appears even earlier.

As a third approach I read in the txt files and exported them as pickle files instead of csv, and then tried to concatenate these pickle files:

import pandas as pd
import glob
import csv

path = r'C:\folder\Test'
all_files = glob.glob(path + r"\*.pkl")

li = []

for filename in all_files:
    print(filename)
    
    df = pd.read_pickle(filename)
   
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

frame.to_csv(r'C:\folder\Test\Test.csv', encoding='utf-8', index=False, sep=";", decimal=",", quoting=csv.QUOTE_MINIMAL)

It seems to run faster, but soon there is a memory error again.

Before trying other data formats like parquet or hdf5, or other tools like dask, my question is: is there a more memory-efficient approach that somehow allows me to get around the memory error?

My RAM is unfortunately limited to 16 GB (64-bit Windows, 64-bit Python), and of course the OS and other background processes take away some of the usable RAM. Maybe there is an option to change system parameters and make use of more RAM, since pandas does not seem to use the full capacity and just throws this error. However, I am not a big fan of playing around with such system parameters, because I think the root cause is the way I am tackling the problem, and I cannot say for sure that more RAM, or even increasing the physical RAM to 32 GB, would solve the issue.

(Regarding my code above: I know that lambda x : str(x) can simply be reduced to str, but that does not change my question; the memory error remains.)


Solution

  • Chances are you don't need Pandas at all; just read each input file line by line and stream the rows out to a single output file:

    import glob
    import csv
    import os
    
    path = r"C:\folder\Test"
    all_files = glob.glob(os.path.join(path, "*.txt"))
    
    with open(os.path.join(path, "output.csv"), "w", encoding="utf-8", newline="") as outf:  # newline="" keeps csv.writer from adding blank rows on Windows
        cw = csv.writer(outf, delimiter=";", quoting=csv.QUOTE_MINIMAL)
        cw.writerow(["Column1", "Column2", "Column3", "Column4"])
        for filename in all_files:
            with open(filename, "r", encoding="cp1252") as inf:
                for line in inf:
                    # slice the fixed character positions (same layout as the read_fwf colspecs)
                    col1 = line[0:2]
                    col2 = line[3:9]
                    col3 = line[10:14]
                    col4 = line[15:19]
                    cw.writerow([col1, col2, col3, col4])
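
    Because the csv module writes one row at a time, peak memory stays essentially flat no matter how large the inputs are, which is what avoids the MemoryError here.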
    

    However, if you want to use Pandas, you can similarly stream the dataframes one at a time into a single file handle you've opened beforehand:

    import pandas as pd
    import glob
    import csv
    import os
    
    col_lengths = {
        "Column1": range(0, 2),
        "Column2": range(3, 9),
        "Column3": range(10, 14),
        "Column4": range(15, 19),
    }
    converters = {col: str for col in col_lengths}
    colspecs = [(x.start, x.stop) for x in col_lengths.values()]  # e.g. range(0, 2) -> colspec (0, 2), i.e. characters 0-1
    
    path = r"C:\folder\Test"
    all_files = glob.glob(os.path.join(path, "*.txt"))
    
    with open(os.path.join(path, "output.csv"), "wb") as f:  # binary handle; recent pandas (1.2+) can write to it directly
        for i, filename in enumerate(all_files):
            df = pd.read_fwf(
                filename,
                colspecs=colspecs,
                encoding="cp1252",
                header=None,
                names=list(col_lengths),
                converters=converters,
            )
            df.to_csv(
                f,
                header=(i == 0),  # write header only for first file
                encoding="utf-8",
                index=False,
                sep=";",
                decimal=",",
                date_format="%d.%m.%Y",
                mode="wb",
                quoting=csv.QUOTE_MINIMAL,
            )
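
    If even a single input file is too large to parse in one go, read_fwf accepts the same chunksize argument as read_csv, so each file can be streamed in pieces as well. A sketch reusing path, all_files, colspecs, converters and col_lengths from above; the output file name and the chunk size of 1,000,000 rows are just placeholders to tune against your RAM:

    with open(os.path.join(path, "output_chunked.csv"), "w", encoding="utf-8", newline="") as f:
        header_written = False
        for filename in all_files:
            reader = pd.read_fwf(
                filename,
                colspecs=colspecs,
                encoding="cp1252",
                header=None,
                names=list(col_lengths),
                converters=converters,
                chunksize=1_000_000,  # rows per chunk; adjust to available memory
            )
            for chunk in reader:
                chunk.to_csv(
                    f,
                    header=not header_written,  # write the header only once, at the very top
                    index=False,
                    sep=";",
                    decimal=",",
                    quoting=csv.QUOTE_MINIMAL,
                )
                header_written = True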