Search code examples
pythonpandascsvsortingdatetime

How to read, format, sort, and save a csv file, without pandas


  • Given the following sample data in a file test.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
  • How can this file be parsed without using pandas?
    1. Open the file
    2. Format the date column into a datetime date formatted string
    3. Sort all rows by the column 0, the date column
    4. Save back to the same file, with a header for the date column
  • With pandas this can be accomplished with a single (long) line of code, not including the import.
    • It should be noted, using parse_date can be very slow, if date_parser isn't used.
import pandas as pd

(pd.read_csv('test.csv', header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
 .rename(columns={0: 'date'})
 .sort_values('date')
 .to_csv('test.csv', index=False))

Expected Form

date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
  • This question and answer are being written to fill a knowledge content gap on Stack Overflow.
  • It's very easy to use pandas for this task.
  • It was surprisingly difficult to come up with all the necessary pieces to create a complete solution, without pandas.
  • This should be beneficial for anyone curious about this task, and for students prohibited from using pandas.
  • I wouldn't mind seeing a solution with numpy, but the primary point of the question, is to accomplish this task with only packages from the standard library.

Solution

    • pandas is easier for parsing and cleaning files.
      • What requires 1 line of pandas, took 11 lines of code, and requires a for-loop.
    • This requires the following packages and functions
    • Initially, list() was used to unpack the csv.reader object, but that was removed, to update the date value, while iterating through the reader.
    • A custom key function can be supplied to sorted to customize the sort order, but I do not see a way to return a value from the lambda expression.
      • Originally key=lambda row: datetime.strptime(row[0], '%Y-%m-%d') was used, but has been removed, since the updated date column doesn't contain month names.
      • If the date column contains month names, it won't be properly sorted, without a custom sort key.
    import csv
    from datetime import datetime
    
    # open the file for reading and writing
    with open('test1.csv', mode='r+', newline='') as f:
    
       # create a reader and writer opbject
        reader, writer = csv.reader(f), csv.writer(f)
        
        data = list()
        # iterate through the reader and update column 0 to a datetime date string
        for row in reader:
    
            # update column 0 to a datetime date string
            row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
            
            # append the row to data
            data.append(row)
    
        # sort all of the rows, based on date, with a lambda expression
        data = sorted(data, key=lambda row: row[0])
    
        # change the stream position to the given byte offset
        f.seek(0)
    
        # truncate the file size
        f.truncate()
    
        # add a header to data
        data.insert(0, ['date', 1, 2, 3, 4, 5])
    
        # write data to the file
        writer.writerows(data)
    

    Updated test.csv

    date,1,2,3,4,5
    2012-03-23,8.05,8.12,7.95,8.09,8149170
    2012-03-26,8.16,8.25,8.12,8.24,8694416
    2012-03-27,8.25,8.35,8.17,8.19,9801989
    

    %time test

    import pandas
    import pandas_datareader as web
    
    # test data with 1M rows
    df = web.DataReader(ticker, data_source='yahoo', start='1980-01-01', end='2020-09-27').drop(columns=['Adj Close']).reset_index().sort_values('High', ascending=False)
    df.Date = df.Date.dt.strftime('%d-%b-%y')
    df = pd.concat([df]*100)
    df.to_csv('test.csv', index=False, header=False)
    

    Tests

    # pandas test with date_parser
    %time pandas_test('test.csv')
    [out]:
    Wall time: 17.9 s
    
    # pandas test without the date_parser parameter
    %time pandas_test('test.csv')
    [out]:
    Wall time: 1min 17s
    
    # from Paddy Alton
    %time paddy('test.csv')
    [out]:
    Wall time: 15.9 s
    
    # from Trenton
    %time trenton('test.csv')
    [out]:
    Wall time: 17.7 s
    
    # from sammywemmy with functions updated to return the correct date format
    %time sammy('test.csv')
    [out]:
    Wall time: 22.2 s
    %time sammy2('test.csv')
    [out]:
    Wall time: 22.2 s
    

    Test Functions

    from operator import itemgetter
    import csv
    import pandas as pd
    from datetime import datetime
    
    def pandas_test(file):
        (pd.read_csv(file, header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
         .rename(columns={0: 'date'})
         .sort_values('date')
         .to_csv(file, index=False))
    
    
    def trenton(file):
        with open(file, mode='r+', newline='') as f:
            reader, writer = csv.reader(f), csv.writer(f)
            data = list()
            for row in reader:
                row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
                data.append(row)
            data = sorted(data, key=lambda row: row[0])
            f.seek(0)
            f.truncate()
            data.insert(0, ['date', 1, 2, 3, 4, 5])
            writer.writerows(data)
    
    
    def paddy(file):
        def format_date(date: str) -> str:
            formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
            return formatted_date
    
    
        with open(file, "r") as f:
            lines = f.readlines()
            records = [[value for value in line.split(",")] for line in lines]
        for record in records:
            record[0] = format_date(record[0])
        sorted_records = sorted(records, key = lambda r: r[0])
        prepared_lines = [",".join(record).strip("\n") for record in sorted_records]
        field_names = "date,1,2,3,4,5"
        prepared_lines.insert(0, field_names)
        prepared_data = "\n".join(prepared_lines)
        with open(file, "w") as f:
            f.write(prepared_data)
    
    
    def sammy(file):
        # updated with .date().isoformat() to return the correct format
        with open(file) as csvfile:
            fieldnames = ["date", 1, 2, 3, 4, 5]
            reader = csv.DictReader(csvfile, fieldnames=fieldnames)
            mapping = list(reader)
            mapping = [
                {
                    key: datetime.strptime(value, ("%d-%b-%y")).date().isoformat()
                         if key == "date" else value
                    for key, value in entry.items()
                }
                for entry in mapping
            ]
    
            mapping = sorted(mapping, key=itemgetter("date"))
    
        with open(file, mode="w", newline="") as csvfile:
            fieldnames = mapping[0].keys()
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
            writer.writeheader()
            for row in mapping:
                writer.writerow(row)
    
    
    def sammy2(file):
        # updated with .date().isoformat() to return the correct format
        with open(file) as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            mapping = dict(enumerate(reader))
            num_of_cols = len(mapping[0])
            fieldnames = ["date" if n == 0 else n
                          for n in range(num_of_cols)]
            mapping = [
                       [ datetime.strptime(val, "%d-%b-%y").date().isoformat()
                         if ind == 0 else val
                         for ind, val in enumerate(value)
                       ]
                     for key, value in mapping.items()
                      ]
    
            mapping = sorted(mapping, key=itemgetter(0))
    
        with open(file, mode="w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=",")
            csvwriter.writerow(fieldnames)
            for row in mapping:
                csvwriter.writerow(row)