Search code examples
pythonpandasdataframecsvheader

Automatic Header Identification (Pandas / CSV)


I am working on a project that parses csv files. In its current state, the user has to enter a unique string from the known column names to set the header row, both for parse column selection and to populate the columns-to-keep field.

My goal is to remove that step from the user end and automate the process, regardless of where the header row actually lies. I made a basic, hard-coded, solution to this... but I am wondering how I might go about a more Pythonic / automated way that doesn't rely on hard-coded values and logic.

Here is a minimal version of my test dataframe:

0 A B C D E F
1 this is just a placeholder to act like the first cell is filled
2
3 this is also just a placeholder
4
5
6
7 Text Bool Type Active
8 A. Lorem ipsum dolor sit amet, consectetur adipiscing elit.... TRUE t1 yes
9 C.Nulla quis nisi vitae ante eleifend porta.D.Donec tempor felis nec lectus ultricies fringilla.... FALSE t5 no

The header row is row 7, which my basic solution identifies. But again, it is hard-coded specifically to the test dataframe.

Code below, TYIA.

import os, sys
import pandas as pd
import numpy as np

n=0
rownum=0
false_count=0
chklst=[]
stmt=[]
column_check = {}
row_number=0

docs = os.listdir() #set to where file is saved
os.chdir()

for file in docs:
    if file.endswith('csv'):
        if file == 'test_list1.csv':
            df = pd.read_csv('test_list1.csv', header = None)

            for row in df.iterrows():

                rownum +=1
                colnum=0

                for column in df.iloc[n]:

                    colnum += 1
                    rowindx = rownum-1
                    colindx = colnum-1
                    test=len(df.iloc[n])-1
                    
                    if df.isnull().loc[rowindx, colindx]:
                        colbool = 'False'
                        column_check[f'row {rownum} col {colnum}'] = colbool
                    else:
                        colbool = 'True'
                        column_check[f'row {rownum} col {colnum}'] = colbool
                    
                    if n < 3:
                        n += 1
                        
for key, value in column_check.items():
    false_count=0
    i=0
    chklst.append(f'{key} ')
    
    if key.endswith('col 1'):

        if value == 'False':
            col1val = 1
        elif value == 'True':
            col1val = 0

    if key.endswith('col 2'):

        if value == 'False':
            col2val = 1
        elif value == 'True':
            col2val = 0

    if key.endswith('col 3'):

        if value == 'False':
            col3val = 1
        elif value == 'True':
            col3val = 0   

    if key.endswith('col 4'):

        if value == 'False':
            col4val = 1
        elif value == 'True':
            col4val = 0

        colsum = col1val+col2val+col3val+col4val

        if colsum > 2:
            chklst.append("not header row")

        else:
            chklst.append('poss header')

        chklst.append("")

match = 'poss header'
indx_pos = chklst.index(match)
indx_pos = int((indx_pos / 5)-1) #adjusting for number of entries per row and -1 for index
found=f'The possible header is row {indx_pos}!!'
print(found)

This prints: "The possible header is row 7!!"


Solution

  • After toying with the code all day I think I found a solution that works. If anyone is willing to test it for me on their end, I would appreciate the feedback!

    import os, sys
    import pandas as pd
    import numpy as np
    
    docs = os.listdir() #folder path where files are saved
    os.chdir()
    
    for file in docs:
        rownum=0
        rowindx=0
        cols=[]
    
        if file.endswith('csv'):
            df = pd.read_csv(file, header = None)
            
            for row in df.iterrows():
                rownum+=1
                rowindx=rownum-1
                col_count=0
                i=0
    
                while i < len(df.columns):
    
                    if df.isnull().loc[rowindx, df.columns[i]]:
                        col_count=col_count
                    else:
                        col_count+=1
                    i+=1
               
                if col_count < len(df.columns):
                    cols.append(f'row {rownum} NOPE')
                elif col_count == len(df.columns):
                    cols.append(f'row matched')
    
            match = 'row matched'
            hdr_pos = cols.index(match)
            print(f'Header Index: {hdr_pos} \n Row: {hdr_pos+1}')
    

    For each csv in the target folder, this prints:
    Header Index: [index #]
    Row: [index # + 1]