Search code examples
pythonxlrd

Remove duplicate rows with certain column combination in excel using python


I have a python program which reads excel documents. I need to allow only the first appearance of certain column combinations. Eg:

    A     |  B
  -------------
  1.  200 | 201   
  2.  200 | 202
  3.  200 | 201
  4.  200 | 203
  5.  201 | 201
  6.  201 | 202
  .............

I want remove/skip the third row where the duplication found and write it to a CSV file. Here is the function I've been trying so far. But it's not working.

def validateExcel(filename):
   xls=xlrd.open_workbook(filename)  
   setcount = 0
   column = 0
   count = 0
   # sheetcount = 0
   for sheet in xls.sheets():
       header=""
       # sheetcount = sheetcount + 1
       number_of_rows = sheet.nrows
       number_of_columns = sheet.ncols
       sheetname = sheet.name          
       mylist = []
       for row in range (1, number_of_rows):  
           mylist = []
           for col in range(0, 2):      
               mylist.append(sheet.cell_value(row, col))

           print mylist

           myset = set(mylist)

           print myset

Solution

  • It is worked for me: In python 2.7

    def validateExcel(filename):
       xls=xlrd.open_workbook(filename)  
       setcount = 0
       column = 0
       count = 0
       # sheetcount = 0
       for sheet in xls.sheets():
           header=""
           # sheetcount = sheetcount + 1
           number_of_rows = sheet.nrows
           number_of_columns = sheet.ncols
           sheetname = sheet.name          
           mylist = []
           for row in range(1, number_of_rows):  
                mylist.append((sheet.cell_value(row, 0), sheet.cell_value(row, 1)))
           myset = sorted(set(mylist), key=mylist.index)
           return myset