python pandas file directory concatenation

How to concatenate a newly added file to pandas dataframe?

I am trying to write a script that picks up each newly added csv file from a folder and appends it to one big file. Basically, I want all of the csv files that get added to a particular folder to be stored in a single resulting csv file. I have the code below, which builds the list of files and detects the newly added file:

def check_dir(fh,start_path='/Users/.../Desktop/files',new_cb=None,changed_cb=None):
    """Scan ``start_path`` recursively and report new or resized files.

    Args:
        fh: File-history dict, mutated in place. After the call it maps the
            full path of every regular file seen to its size in bytes.
        start_path: Root directory handed to ``os.walk``.
        new_cb: Optional callable, invoked as ``new_cb(path)`` for each file
            that has no entry in ``fh`` yet.
        changed_cb: Optional callable, invoked as ``changed_cb(path)`` for
            each file whose size differs from the one recorded in ``fh``.

    Returns:
        The summed size in bytes of all non-symlink files found.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            if os.path.islink(fp):
                continue
            try:
                fs = os.path.getsize(fp)
            except OSError:
                # The file vanished between the directory listing and the
                # size lookup; skip it instead of killing the polling loop.
                continue
            total_size += fs
            # Key on the full path, not the bare name: os.walk recurses, so
            # two files with the same name in different subfolders would
            # otherwise share (and keep overwriting) one history entry.
            if fp not in fh:
                if new_cb:
                    new_cb(fp)
            elif fh[fp] != fs:
                if changed_cb:
                    changed_cb(fp)
            fh[fp] = fs

    return total_size

def new_file(fp):
    """Callback for a newly discovered file: announce its path on stdout."""
    message = "New File {0}!".format(fp)
    print(message)

def changed_file(fp):
    """Callback for a file whose size changed: announce its path on stdout."""
    message = "File {0} changed!".format(fp)
    print(message)

if __name__ == '__main__':
    # Poll the folder forever: rescan, report a change in total size, sleep,
    # then print the accumulated history and its most recent entry.
    file_history={}
    # None means "no scan finished yet". A 0 sentinel combined with an
    # ``if total and ...`` guard never lets ``total`` leave 0, so the
    # size-change message could never be printed.
    total = None

    while True:
        nt = check_dir(file_history,'/Users/.../Desktop/files',new_file,changed_file)
        if total is not None and nt != total:
            print("Total size changed from {0} to {1}".format(total,nt))
        total = nt
        time.sleep(200)
        print("File list:\n{0}".format(file_history))
        # Dicts keep insertion order, so the last key is the entry that was
        # added most recently. Guard against an empty folder, where indexing
        # [-1] would raise IndexError.
        if file_history:
            print(list(file_history)[-1])

I don't really know how to create this empty pandas data frame to which this latest added file will be added on a regular basis (that's why I have a time.sleep there). In the end I want to have this big csv file with all the files added to it.

Please, help :(

P.S. I am new to Python, so please don't judge if it is super simple..

Solution

Are you going to be using Pandas to process the data in the csv or only to concatenate the files?

If you simply want to append each csv file to the big one, then why not use plain Python file I/O for speed and simplicity? That assumes, of course, that all of the csv files use the same formatting.

I have updated the new_file function to append to the big csv using plain file I/O. I have also added an append_pandas function, which is not used but should help you if you must use pandas to do the job. I haven't tested the pandas function, and there are more things to consider, such as the format of the csv files. Check out the pandas documentation for more details.

import os
import time


def check_dir(fh,start_path='/Users/.../Desktop/files',new_cb=None,changed_cb=None,**kwargs):
    """Scan ``start_path`` recursively and report new or resized files.

    Args:
        fh: File-history dict, mutated in place. After the call it maps the
            full path of every regular file seen to its size in bytes.
        start_path: Root directory handed to ``os.walk``.
        new_cb: Optional callable, invoked as ``new_cb(path, **kwargs)`` for
            each file that has no entry in ``fh`` yet.
        changed_cb: Optional callable, invoked as
            ``changed_cb(path, **kwargs)`` for each file whose size differs
            from the one recorded in ``fh``.
        **kwargs: Extra keyword arguments forwarded untouched to both
            callbacks.

    Returns:
        The summed size in bytes of all non-symlink files found.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            if os.path.islink(fp):
                continue
            try:
                fs = os.path.getsize(fp)
            except OSError:
                # The file vanished between the directory listing and the
                # size lookup; skip it instead of killing the polling loop.
                continue
            total_size += fs
            # Key on the full path, not the bare name: os.walk recurses, so
            # two files with the same name in different subfolders would
            # otherwise share (and keep overwriting) one history entry.
            if fp not in fh:
                if new_cb:
                    new_cb(fp, **kwargs)
            elif fh[fp] != fs:
                if changed_cb:
                    changed_cb(fp, **kwargs)
            fh[fp] = fs

    return total_size

def is_csv(f):
    """Return True when the path ``f`` ends with a ``.csv`` extension.

    The comparison ignores case. Only the end of the path is inspected, so
    a "csv" substring elsewhere (a folder called ``csv_dump``, a file named
    ``data.csv.bak``) no longer counts as a match.
    """
    # you can add more to check here
    return f.lower().endswith('.csv')

def append_csv(s,d,skip_header=1):
    """Append the lines of text file ``s`` to text file ``d``.

    Args:
        s: Path of the source csv file to read.
        d: Path of the destination file, opened in append mode (created if
            it does not exist).
        skip_header: Number of leading lines of ``s`` to leave out, so the
            source's header row is not repeated in the destination.

    If the last line copied has no trailing newline, one is added so that
    the next append starts on a fresh row.
    """
    remaining = skip_header
    last_written = None
    with open(s,'r') as readcsv, open(d,'a') as appendcsv:
        for line in readcsv:
            if remaining < 1:
                appendcsv.write(line)
                last_written = line
            else:
                remaining -= 1

        # Only terminate a row that was actually copied. Checking the loop
        # variable instead fails on an empty source (nothing was ever
        # assigned) and adds a blank row when only a header was skipped.
        if last_written is not None and not last_written.endswith("\n"):
            appendcsv.write("\n")

def append_pandas(s,d):
    """Append the rows of csv file ``s`` to csv file ``d`` using pandas.

    Args:
        s: Path of the csv file holding the new rows.
        d: Path of the accumulated csv file. It is rewritten with its
            existing rows followed by the rows of ``s``. If it is missing
            or completely empty, it is created from ``s`` alone.

    Both files are parsed with the ``pandas.read_csv`` defaults, so the
    first line of each is treated as the header row.
    """
    # Imported here so the rest of the script still runs without pandas.
    import pandas

    frames = [pandas.read_csv(s)]
    try:
        frames.insert(0, pandas.read_csv(d))
    except (FileNotFoundError, pandas.errors.EmptyDataError):
        # Nothing accumulated yet: start the big file from the new rows.
        pass

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    combined = pandas.concat(frames, ignore_index=True)
    # index=False keeps the row index out of the file; otherwise every
    # rewrite would add another unnamed index column.
    combined.to_csv(d, index=False)

def new_file(fp, **kwargs):
    """Callback for a newly discovered file: append csv files to the big csv.

    Files rejected by ``is_csv`` are ignored. Recognised keyword arguments:
    ``append_to_csv`` (destination path, default
    ``'/default/path/to/big.csv'``) and ``skip_header`` (number of leading
    lines to drop, default 1).
    """
    if not is_csv(fp):
        return

    print("Appending {0}!".format(fp))
    destination = kwargs.get('append_to_csv','/default/path/to/big.csv')
    header_lines = kwargs.get('skip_header',1)
    append_csv(fp, destination, header_lines)

def changed_file(fp, **kwargs):
    """Callback for a resized file: announce its path on stdout.

    Extra keyword arguments are accepted and ignored.
    """
    message = "File {0} changed!".format(fp)
    print(message)

if __name__ == '__main__':
    # Poll the folder forever: rescan (new csv files are appended through
    # the new_file callback), report a change in total size, sleep, then
    # print the accumulated history.
    file_history={}
    # None means "no scan finished yet". A 0 sentinel combined with an
    # ``if total and ...`` guard never lets ``total`` leave 0, so the
    # size-change message could never be printed.
    total = None

    while True:
        nt = check_dir(file_history,'/tmp/test/',new_file,changed_file, append_to_csv ='/tmp/big.csv', skip_header = 1)
        # Compare against ``nt``, the value returned by the scan above.
        if total is not None and nt != total:
            print("Total size changed from {0} to {1}".format(total,nt))
        total = nt
        time.sleep(10)
        print("File list:\n{0}".format(file_history))