Search code examples
pythoncsvdefaultdict

Parsing a csv file, changing its columns into rows and rows into columns


I have one CSV file, in which first row is of dates, and first column is of sectors. something like this.

Date,7/2/2007,7/3/2007,7/5/2007,7/6/2007,7/9/2007
A,0,1,3,2,0
AA,23,423,2,0,0
AAL,34,23,5,0,234
AGCG,234,0,9,234,23
XL,0,65,34,34,34

So now i want to prepare another file, which is like

Date,Sector
7/2/2007,AA
7/2/2007,AAL
7/2/2007,AGCG
7/3/2007,A
7/3/2007,AA
7/3/2007,AAL
7/3/2007,XL
...

The logic behind is, i want sectors on each date which are not 0.

Code i have tried so far is: import csv,sys from collections import defaultdict

dd = defaultdict(list)
dateList = []
header = False

def createFile(di):
    ff = open("cum_file.csv","w")
    csvwriter = csv.writer(ff)
    row = []
    for d,t in di.iteritems():
        for tt in t:
            print tt,d
            row = [tt,d]
            csvwriter.writerow(row)
            del row[:]

#with open("./data/StrategyAcctValue-Daily.csv") as f:
with open("./try/test.csv") as f:
    reader = csv.reader(f,delimiter=",")
    for line in reader:
        col1 = True
        if header:
            #sys.exit()
            for eachCol in line:
                if col1:
                    col1 = False
                    tkr = eachCol
                elif eachCol != '0':
                    tkrIndex = line.index(eachCol)
                    tickerDate = dateList[tkrIndex - 1]
                    dd[tickerDate].append(tkr)
                else:
                    continue
            #print dd
            #createFile(dd)
            #sys.exit()
        else:
            header = True
            for eachCol in line:
            #   print line.index(eachCol)
            #   continue
                if col1:
                    col1 = False
                    tkr = eachCol
                else:
                    dd[eachCol] = []
                    dateList.append(eachCol)
            print dateList
    print dd
    createFile(dd)

This is giving the output like:

A 7/3/2007
AA 7/3/2007
AAL 7/3/2007
XL 7/3/2007
A 7/6/2007
AAL 7/9/2007
AGCG 7/9/2007
AA 7/2/2007
AAL 7/2/2007
AGCG 7/2/2007
AGCG 7/2/2007
A 7/5/2007
AA 7/5/2007
AAL 7/5/2007
AGCG 7/5/2007
XL 7/5/2007
XL 7/5/2007
XL 7/5/2007

I'm unable to find, where i'm making the mistakes.


Solution

  • import pandas as pd
    
    df = pd.read_csv("input.csv")
    df1 = pd.DataFrame(columns=["Date", "Sector"])
    
    for i, row in df.iterrows():
        dict_ = dict(row)
        days = [key for key, value in dict_.items() if value != 0]
        days.remove('Date')
        for day in days:
            df1.loc[len(df1)] = [day, dict_["Date"]]
    
    # df1.sort_values(by='Date') // to sort by date
    df1.to_csv("output.csv", index=False)
    

    Output.csv

    Date,Sector
    7/6/2007,A
    7/3/2007,A
    7/5/2007,A
    7/3/2007,AA
    7/2/2007,AA
    7/5/2007,AA
    7/9/2007,AAL
    7/3/2007,AAL
    7/2/2007,AAL
    7/5/2007,AAL
    7/9/2007,AGCG
    7/6/2007,AGCG
    7/2/2007,AGCG
    7/5/2007,AGCG
    7/9/2007,XL
    7/6/2007,XL
    7/3/2007,XL
    7/5/2007,XL