Search code examples
apache-nificonvertersdataflowprocessor

Convert xls files to csv files in apache NIFI


Hello guys I'm trying to convert .xls files to .csv in apache NIFI data flow. I tried many solutions with no result I even tried creating a script like this sccript

Thank you in advance


Solution

  • Configure your ExecuteStreamCommand as

    xls-to-csv

    and try using below code in your python script,

    import csv
    import os
    import sys
    from io import StringIO, BytesIO
    import pandas as pd
    import xlrd
    from pandas import ExcelFile
    
    wb = xlrd.open_workbook(file_contents=sys.stdin.read(),logfile=open(os.devnull, 'w'))
    excel_file_df = pd.read_excel(wb, sheet_name='Sheet1', index=False, index_col=0, encoding='utf-8',engine='xlrd')
    
    #flowfile_content = ExcelFile(BytesIO(sys.stdin.read()))
    #excel_file_df = pd.read_excel(flowfile_content, sheet_name='Sheet1', index=False, index_col=0, encoding='utf-8')
    
    csv_data_rows = []
    header_list = list(excel_file_df.columns.values)
    temp_header_list = []
    
    for field in header_list:
        temp = '"' + field +  '"'
        temp_header_list.append(temp)
    
    header_row  = ','.join([str(elem) for elem in temp_header_list])
    csv_data_rows.append(header_row)
    is_header_row = True
    for index, row in excel_file_df.iterrows():
    
        if is_header_row :
            is_header_row = False
            continue
    
        temp_data_list = []
        for item in row :
            #item = item.encode('utf-8', 'ignore').decode('utf-8')
            if hasattr(item, 'encode'):
                item = item.encode('ascii', 'ignore').decode('ascii')
    
            item = str(item)
            item = item.replace('\n', '')
            item = item.replace('",', '" ')
            if item == 'nan':
                item=''
            temp = '"' + str(item) + '"'
            temp_data_list.append(temp)
    
        data_row = ','.join([str(elem) for elem in temp_data_list])
        data_row = data_row
        csv_data_rows.append(data_row)
    
    for item in csv_data_rows:
        sys.stdout.write("%s\r\n" % item)