Search code examples
pythonxml

Reading XML files into a pandas DataFrame


I have a folder that contains several other folders. Each of these folders contains XML files.

I have written code to extract the data into a DataFrame. Can I write it in such a way that I specify all of the directories in my code and, once the script is run, it extracts the XML from all of the folders?

import nltk
import os
import pandas as pd
from lxml import etree

# Folder whose .xml files are read (only this folder itself is listed;
# sub-folders are not descended into).
# NOTE: the name `dir` shadows the built-in dir(); it is kept so the variable
# names in this script stay unchanged.
dir = 'directory'

# Collects one DataFrame per XML file; replaced by a single combined
# DataFrame at the end of the script.
speakers_data = []


# Define namespaces

# Prefix -> URI map handed to xpath() so the 'tb:' prefix in the query
# below can be resolved.
ns = {
    'tb': 'http://www.talkbank.org/ns/talkbank',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

for name in os.listdir(dir):
    if name.endswith('.xml'):
        file_path = os.path.join(dir, name)

        tree = etree.parse(file_path)

        # Name of the folder that directly contains the file.
        # It is not used further in this script.
        folder_name = os.path.basename(os.path.dirname(file_path))

        # One dict per <participant> element found in this file.
        speakers = []
        for participant in tree.xpath("//tb:Participants/tb:participant", namespaces=ns):
            speaker_info = {
                'speaker_name': participant.get('name'),
                'role': participant.get('role'),
                'age': participant.get('age'),
                'sex': participant.get('sex'),
            }
            # Append the record to this file's list (the original appended
            # the list itself to an undefined name).
            speakers.append(speaker_info)

        df_speakers = pd.DataFrame(speakers)
        speakers_data.append(df_speakers)

if speakers_data:
    speakers_data = pd.concat(speakers_data, ignore_index=True)
else:
    # pd.concat() raises ValueError on an empty list, so when no .xml file
    # was found fall back to an empty frame with the expected columns.
    speakers_data = pd.DataFrame(columns=['speaker_name', 'role', 'age', 'sex'])

Solution

  • You can use os.walk() to find every .xml file in the directory and in all of its subdirectories (the topdown argument only controls the order in which folders are visited, so either value works here):

    import os
    
    # Visit every folder below the current directory (children before
    # parents, because topdown=False) and report each XML file found.
    for folder, _subfolders, filenames in os.walk(".", topdown=False):
        xml_names = [fname for fname in filenames if fname.endswith('.xml')]
        for fname in xml_names:
            # do something ...
            print(os.path.join(folder, fname))