Search code examples
pythonpython-3.xxmlsortingnodes

How to sorted xml node values within against a group?


I have input csv file:

col2  col3  col1
a1    b1    c1
a2    b2    c2
a3    b3    c3

Script that converts to an xml file:

import csv

csvFile = r'C:\Users\tkp\Desktop\Holdings_Download\testxml.csv'
xmlFile = r'C:\Users\tkp\Desktop\Holdings_Download\testxml.xml'

csvData = csv.reader(open(csvFile), delimiter=';')
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="utf-8"?>' + "\n" +'<TabularXml>' + "\n" )
xmlData.write('<Sheet>' + "\n" )

rowNum = 0
for row in csvData:
    if rowNum == 0:
        tags = row
        # replace spaces w/ underscores in tag names
        for i in range(len(tags)):
            tags[i] = tags[i].replace(' ', '_')
    else: 
        xmlData.write(' '+'<Row' + ' ' +'srcidx='+'"'+str(rowNum+1)+'"'+ '>' + "\n")
        for i in range (len(tags)):
            
            if tags[i]=='col1':
                if row[i] !='':
                    xmlData.write('    ' +'<' + 'Cell' + ' ' +'idx="1"'+' '+ 'name='+'"'+tags[i]+'"'+' '+'type="String"'+' '+'>' \
                      + row[i] + '</' + 'Cell' + '>' + "\n")
                else:
                    xmlData.write('    ' +'<' + 'Cell' + ' ' +'idx="1"'+' '+ 'name='+'"'+tags[i]+'"'+' '+'type="String"'+' '+'>' \
                      + "\n")
            if tags[i]=='col2':
                if row[i] !='':
                    xmlData.write('    ' +'<' + 'Cell' + ' ' +'idx="2"'+' '+ 'name='+'"'+tags[i]+'"'+' '+'type="DateTime"'+' '+'>' \
                      + row[i] + '</' + 'Cell' + '>' + "\n")
                else:
                    xmlData.write('    ' +'<' + 'Cell' + ' ' +'idx="2"'+' '+ 'name='+'"'+tags[i]+'"'+' '+'type="DateTime"'+' '+'>' \
                      + "\n")
            if tags[i]=='col3':
                if row[i] !='':
                    xmlData.write('    ' +'<' + 'Cell' + ' ' +'idx="3"'+' '+ 'name='+'"'+tags[i]+'"'+' '+'type="Int32"'+' '+'>' \
                      + row[i] + '</' + 'Cell' + '>' + "\n")
                else:
                    xmlData.write('    ' +'<' + 'Cell' + ' ' +'idx="3"'+' '+ 'name='+'"'+tags[i]+'"'+' '+'type="Int32"'+' '+'>' \
                      + "\n")
                
        xmlData.write(' '+'</Row>' + "\n")

    rowNum +=1

xmlData.write('</Sheet>' + "\n")
xmlData.write('</TabularXml>' + "\n")
xmlData.close()

Output xml file:

<?xml version="1.0" encoding="utf-8"?>
<TabularXml>
<Sheet>
 <Row srcidx="2">
    <Cell idx="2" name="col2" type="DateTime" >a1</Cell>
    <Cell idx="3" name="col3" type="Int32" >b1</Cell>
    <Cell idx="1" name="col1" type="String" >c1</Cell>
 </Row>
 <Row srcidx="3">
    <Cell idx="2" name="col2" type="DateTime" >a2</Cell>
    <Cell idx="3" name="col3" type="Int32" >b2</Cell>
    <Cell idx="1" name="col1" type="String" >c2</Cell>
 </Row>
 <Row srcidx="4">
    <Cell idx="2" name="col2" type="DateTime" >a3</Cell>
    <Cell idx="3" name="col3" type="Int32" >b3</Cell>
    <Cell idx="1" name="col1" type="String" >c3</Cell>
 </Row>
</Sheet>
</TabularXml>

How can I sorted data by 'Cell' nodes and 'idx' values?

Expected result:

<?xml version="1.0" encoding="utf-8"?>
<TabularXml>
<Sheet>
 <Row srcidx="2">
    <Cell idx="1" name="col1" type="String" >c1</Cell>
    <Cell idx="2" name="col2" type="DateTime" >a1</Cell>
    <Cell idx="3" name="col3" type="Int32" >b1</Cell>
 </Row>
 <Row srcidx="3">
    <Cell idx="1" name="col1" type="String" >c2</Cell>
    <Cell idx="2" name="col2" type="DateTime" >a2</Cell>
    <Cell idx="3" name="col3" type="Int32" >b2</Cell>
 </Row>
 <Row srcidx="4">
    <Cell idx="1" name="col1" type="String" >c3</Cell>
    <Cell idx="2" name="col2" type="DateTime" >a3</Cell>
    <Cell idx="3" name="col3" type="Int32" >b3</Cell>
 </Row>
</Sheet>
</TabularXml>

Solution

  • You have always the same column names so you can use them to create order

    order = [
        column_names.index('col1'), 
        column_names.index('col2'), 
        column_names.index('col3')
    ]
    

    and then you can use it to sort data

    column_names = [
        column_names[order[0]], 
        column_names[order[1]], 
        column_names[order[2]]
    ]
    
    row = [
        row[order[0]], 
        row[order[1]], 
        row[order[2]]
    ]
    

    For more columns you could even use for-loop but I skip this idea.


    Minimal working code.

    I use io only to simulate file in memory.

    I use next() to get first/single row from csv

    text = '''col2;col3;col1
    a1;b1;c1
    a2;b2;c2
    a3;b3;c3'''
    
    import csv
    import io
    
    
    csv_reader = csv.reader(io.StringIO(text), delimiter=';')
    
    column_names = next(csv_reader)
    
    order = [
        column_names.index('col1'), 
        column_names.index('col2'), 
        column_names.index('col3')
    ]
    print('order:', order)
    
    
    print('old column_names:', column_names)
    column_names = [
        column_names[order[0]], 
        column_names[order[1]], 
        column_names[order[2]]
    ]
    print('new column_names:', column_names)
    
    for row in csv_reader:
        print('---')
        print('old row:', row)
        row = [
            row[order[0]], 
            row[order[1]], 
            row[order[2]]
        ]
        print('new row:', row)
    

    Result:

    order: [2, 0, 1]
    old column_names: ['col2', 'col3', 'col1']
    new column_names: ['col1', 'col2', 'col3']
    ---
    old row: ['a1', 'b1', 'c1']
    new row: ['c1', 'a1', 'b1']
    ---
    old row: ['a2', 'b2', 'c2']
    new row: ['c2', 'a2', 'b2']
    ---
    old row: ['a3', 'b3', 'c3']
    new row: ['c3', 'a3', 'b3']