Tags: python, pandas, csv, parquet

Out of memory trying to convert csv file to parquet using python


I am trying to convert a very large csv file to parquet.

I have tried the following method:

df1 = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv')
df1.to_parquet('/kaggle/input/amex-default-prediction/train.parquet')

but pd.read_csv throws an out-of-memory error.

Is there any way to convert the file without loading it entirely?


Solution

  • To solve the memory problem, you can read the data with pandas' chunk mechanism (the chunksize argument of read_csv) and save each chunk as a separate parquet file. For your case, create a folder "train_data" and save into it the parquet files that correspond to the chunks:

    import os
    import sys
    import pandas as pd
    import fastparquet  # engine used by DataFrame.to_parquet
    import pyarrow.parquet as pq
    
    path = "C:/.../amex-default-prediction/"
    parquet = "parquet/"
    # one output sub-folder per csv file
    path_sample_submission = "sample_submission/"
    path_test_data = "test_data/"
    path_train_data = "train_data/"
    path_train_label = "train_labels/"
    # create the output folders if they do not exist yet
    for folder in (path_sample_submission, path_test_data, path_train_data, path_train_label):
        os.makedirs(path + parquet + folder, exist_ok=True)
    
    
    def get_path_parquet(file):
        """Return the output sub-folder that matches a csv file name."""
        if file.split('.')[0] == "sample_submission":
            return path_sample_submission
        elif file.split('.')[0] == "test_data":
            return path_test_data
        elif file.split('.')[0] == "train_data":
            return path_train_data
        elif file.split('.')[0] == "train_labels":
            return path_train_label
    def csv_to_parquet(df, title, path, i):
        """
        Convert one csv chunk to parquet.
        df    : chunk of csv data
        title : csv file name
        path  : folder into which the parquet data is saved
        i     : chunk index, appended to the output file name
        """
        try:
            title_prefix = title.split(".")[0] + str(i)
            out_title = os.path.join(path, f'{title_prefix}.parquet')
            df.to_parquet(out_title, engine='fastparquet')
        except Exception as e:
            print(e)
            sys.exit(-1)
    def loading_csv_with_chunk(path, file):
        try:
            # chunksize returns an iterator of DataFrames instead of loading
            # the whole file into memory at once
            chunk_csv = pd.read_csv(os.path.join(path, file), low_memory=False, chunksize=5000)
            return chunk_csv
        except Exception as e:
            print(e)
            sys.exit(-1)
            
    
    def read_partition_parquet():
        # read the whole folder of chunk files back as a single DataFrame
        dataset = pq.ParquetDataset(path + parquet + path_train_data)
        data = dataset.read().to_pandas()
        return data
    
    
    for file in os.listdir(path):
        if file.endswith(".csv"):
            print("begin process for: " + str(file) + "....")
            # load the data with the chunk method
            chunk_csv = loading_csv_with_chunk(path, file)
            # save each chunk in parquet format
            for i, df_chunk in enumerate(chunk_csv):
                print(df_chunk.shape)
                title_prefix = file.split(".")[0] + str(i)
                out_title = path + parquet + get_path_parquet(file) + f'{title_prefix}.parquet'
                df_chunk.to_parquet(out_title, engine='fastparquet')
            print("end process for: " + str(file) + "....")
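
  • Alternatively, pyarrow can stream the csv and write a single parquet file in one pass, with no pandas chunks and no intermediate folder. This is a minimal sketch, assuming the defaults of pyarrow.csv.open_csv infer your column types correctly; the file paths are placeholders, not your actual ones.

    import pyarrow as pa
    import pyarrow.csv as pv
    import pyarrow.parquet as pq
    
    csv_path = "train_data.csv"      # placeholder: the large csv to convert
    out_path = "train_data.parquet"  # placeholder: the parquet file to write
    
    # open_csv reads the file in blocks instead of loading it all at once
    reader = pv.open_csv(csv_path)
    writer = None
    try:
        for batch in reader:
            if writer is None:
                # create the writer lazily so it uses the schema
                # inferred from the first block
                writer = pq.ParquetWriter(out_path, batch.schema)
            writer.write_table(pa.Table.from_batches([batch]))
    finally:
        if writer is not None:
            writer.close()

Because the schema is inferred from the first block, a column whose values change type later in the file can still fail; if that happens, pass explicit types through pyarrow.csv.ConvertOptions(column_types=...).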