Search code examples
python xml dataframe beautifulsoup xmltodict

How to convert an XML file to pandas dataframe?


I cannot get XML to a python dataframe

Could you please help me parse XML into a pandas DataFrame? I can't seem to get it to work. This is how far I got:

import xmltodict 
import pandas as pd
import requests
from bs4 import BeautifulSoup
 def get_xml(timeout=30):
    """Call the GetInstantaneousFlowData SOAP operation and return the raw reply.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The undecoded body of the HTTP response (``response.content``).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    # SOAP 1.2 envelope whose body is the single, argument-less operation element.
    body = """<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
                <soap12:Body>
                <GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
                </soap12:Body>
                </soap12:Envelope>"""

    response = requests.post(url, data=body, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content

response = get_xml()
# Tag names are looked up in lower case throughout, as the original lookups
# ('EDPObjectName'.lower(), 'applicableat', 'flowrate') already did.
soup = BeautifulSoup(response, 'lxml')

# Collect the readings object by object so that every flow rate stays attached
# to its own name and timestamp; three flat, unrelated lists cannot be lined
# up into a table afterwards.
# NOTE(review): assumes each <EDPObjectBE> element wraps one <EDPObjectName>
# and that object's <EDPEnergyDataBE> readings - confirm against a real reply.
flows_by_name = {}
for obj in soup.find_all('edpobjectbe'):
    name_tag = obj.find('edpobjectname')
    if name_tag is None:
        continue  # nothing to label the column with

    readings = {}
    for reading in obj.find_all('edpenergydatabe'):
        time_tag = reading.find('applicableat')
        rate_tag = reading.find('flowrate')
        if time_tag is None or rate_tag is None:
            continue  # skip incomplete readings rather than crash on .text
        readings[time_tag.text.strip()] = rate_tag.text.strip()

    flows_by_name[name_tag.text.strip()] = pd.Series(readings, dtype=object)

# Rows are the timestamps and columns are the object names; pandas aligns the
# per-object Series on their timestamp index. Values are kept as the text found
# in the XML - apply pd.to_numeric / pd.to_datetime afterwards if needed.
df_final = pd.DataFrame(flows_by_name)

This is the result I am looking for:

                    ALDBROUGH   AVONMOUTH   BACTON BBL  …
    2019-08-08T13:00:00 0       1.23    5.1         …
    2019-08-08T13:02:00 0       1.23    5.1         …
    2019-08-08T13:04:00 0       3.23    5.1         …
    2019-08-08T13:06:00 0       3.23    5.1         …
    2019-08-08T13:08:00 0       3.23    5.23            …
    2019-08-08T13:10:00 0       4.23    5.204           …

Solution

  • Try using:

    from bs4 import BeautifulSoup
    import pandas as pd
    
    name_list = []

    # Start from a frame that only holds the join key so that every object,
    # including the first one, is merged in the same way.
    prev_df = pd.DataFrame(columns=['time'])
    response = BeautifulSoup(get_xml(), 'lxml')
    for x in response.find_all('edpobjectbe'):
        name = x.find('edpobjectname').text.strip()
        name_list.append(name)
        print(name)

        # One [timestamp, flow rate] row per reading of this object.
        list_small = [
            [y.find('applicableat').text.strip(), y.find('flowrate').text.strip()]
            for y in x.find_all('edpenergydatabe')
        ]
        df = pd.DataFrame(list_small, columns=['time', name])
        # An outer join keeps timestamps that only some objects report; a
        # right join would discard every row missing from the latest object
        # (and all rows if that object has no readings at all).
        prev_df = prev_df.merge(df, how='outer', on='time')
    print(prev_df)
    

    Check if this works for you!!!