Search code examples
python-3.x xml-parsing rss elementtree

Python: remove <br> and other tags from XML


I am splitting the XML feed at https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms into separate XML files, one per headline.

#Python code to illustrate parsing of XML files
# importing the required modules
import requests
import xml.tree.ElementTree as ET

def loadRSS(url="https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms",
            filename='topnewsfeed.xml', timeout=30):
    """Download the RSS feed and save the raw response body to disk.

    Args:
        url: Address of the feed to download.
        filename: Path of the file the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests.get() can wait forever on a stalled server.
    resp = requests.get(url, timeout=timeout)

    # Fail loudly instead of saving an HTTP error page as if it were the feed.
    resp.raise_for_status()

    # The body is written as bytes so the feed's own encoding is preserved.
    with open(filename, 'wb') as f:
        f.write(resp.content)
     

def wire_xml(filename):
    """Split a feed into one XML file per ``article`` element under ``./xml/``.

    Each file is named after the text of the article's ``headline`` child.
    Characters that are not allowed in file names are replaced with ``_``.
    Articles without a usable headline are skipped. Two articles with the
    same headline end up in the same file (the later one wins).

    Args:
        filename: Path (or file object) of the feed to split.
    """
    import os
    import re

    out_dir = './xml/'
    # The output directory is created on demand instead of being assumed.
    os.makedirs(out_dir, exist_ok=True)

    for _event, elem in ET.iterparse(filename, events=('end', )):
        if elem.tag != 'article':
            continue

        # findtext() returns None when there is no <headline> child.
        title = elem.findtext('headline') or ''
        # Path separators and other reserved characters would make open() fail
        # or write outside of out_dir.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip()
        if safe_title:
            with open(os.path.join(out_dir, safe_title + ".xml"), 'wb') as f:
                f.write(ET.tostring(elem))

        # The article has been handled; release its children to keep memory flat.
        elem.clear()
      
def main():
    """Refresh the saved feed, then split it into one file per article."""
    # Download first so the splitter works on the current feed.
    loadRSS()

    # Write every article of the saved feed to its own XML file.
    wire_xml('topnewsfeed.xml')
      
if __name__ == "__main__":
    # Run the download-and-split steps only when executed as a script.
    main()

The above code works, but it has two problems:

1. The content text in the XML contains unwanted tags. How can I remove these tags? Example:

 <content><div class="section1"><div class="Normal">HYDERABAD: Bharat Biotech on Friday said it has committed to supply over 500 million doses of its Covid-19 vaccine Covaxin to the Centre under the countrywide immunisation programme.<br/><br/>Speaking at a virtual conference organised by the Confederation of Indian Industry, Suchitra Ella, joint Managing Director of the city-based vaccine maker, said the company's facilities in four cities - Hyderabad, Bengaluru, Pune, and Ankaleshwar - are currently producing Covaxin.<br/><br/>"

2. How can I change the tags to match the format I need? Example:

<?xml version="1.0" encoding="UTF-8"?>

<nitf>


<head>

<title>Ukraine Black Sea ports resume grain operations</title>


<iim ver="3">

<ds value="" num="1:20"/>

<ds value="Reuter" num="1:30"/>

<ds value="" num="1:40"/>

<ds value="REU" num="1:50"/>

<ds value="20210723" num="1:70"/>

<ds value="055600+0000" num="1:80"/>

<ds value="Reuter.2021-07-23T055600Z_528892025_L1N2OZ07W_RTRMADT_0_GRAINS-UKRAINE-PORTS.XML" num="2:05"/>

<ds value="" num="2:07"/>

<ds value="3" num="2:10"/>

<ds value="OEC" num="2:15"/>

<ds value="" num="2:20"/>

<ds value="" num="2:22"/>

<ds value="GRAINS-UKRAINE/PORTS" num="2:25"/>

<ds value="" num="2:50"/>

<ds value="20210723" num="2:55"/>

<ds value="" num="2:80"/>

<ds value="" num="2:85"/>

<ds value="" num="2:90"/>

<ds value="" num="2:95"/>

<ds value="" num="2:101"/>

<ds value="Ukraine Black Sea ports resume grain operations" num="2:105"/>

<ds value="Reuter" num="2:110"/>

<ds value="Reuter" num="2:115"/>

<ds value="KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday." num="2:120"/>

</iim>

</head>


<body>


<body.content>

<p>KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday.</p>

<p>The restrictions of grain-loading operations had applied to the ports of Odesa, Chornomorsk, Mykolayiv, and Pivdeny.</p>

<p>Ukraine is among the world's biggest global grain exporters and plans to ship about 56 million tonnes of grain in the 2021/22 season. (Reporting by Pavel Polityuk)</p>

</body.content>

</body>

</nitf>

I want to save all the XML files in the above format.


Solution

  • Python code to illustrate parsing of XML files

    # importing the required modules
    import re as re
    import xml.etree.ElementTree as ET
    import pandas as pd
    from urllib.request import Request, urlopen
    import configparser
    import os
    
    def loadRSS(url="https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms", timeout=30):
        """Open the RSS feed and return the response object for reading.

        Args:
            url: Address of the feed to open.
            timeout: Seconds to wait for the connection before giving up.

        Returns:
            The file-like response returned by ``urlopen``; the caller is
            responsible for closing it.

        Raises:
            SystemExit: If the feed cannot be opened. The error is printed and
                the user is prompted first, so the message stays visible.
        """
        # The request carries a browser-like User-Agent header.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            # A timeout keeps a stalled connection from hanging the script.
            return urlopen(req, timeout=timeout)
        except OSError as e:
            print("Error in connecting TIL site :- ",e)
            input("Press Enter to Close")
            # Stop here: returning None would only crash later in the parser.
            raise SystemExit(1) from e
    
    def parseXML(xmlfile):
        """Parse the news feed and return one dict per child of the root.

        Args:
            xmlfile: A file name or file object accepted by ``ET.parse``.

        Returns:
            A list of dicts with the keys "ID", "headline", "imagename",
            "content", "summary", "caption" and "cats". "ID" is read from the
            node's ``ID`` attribute, every other value from the child element
            of the same name. A missing attribute or child yields ``None``; a
            child that is present but empty yields ``""``.
        """
        fields = ("headline", "imagename", "content", "summary", "caption", "cats")
        news = []
        root = ET.parse(xmlfile).getroot()
        for node in root:
            item = {"ID": node.attrib.get("ID")}
            for name in fields:
                # findtext() tolerates an absent child; find(name).text would
                # raise AttributeError and abort the whole feed.
                item[name] = node.findtext(name)
            news.append(item)
        return news
    
    def savetodf(newsitems):
        """Load the parsed news items into a DataFrame and strip markup from "content".

        Args:
            newsitems: List of dicts as produced by the feed parser.

        Returns:
            A DataFrame with the columns ID, headline, imagename, content,
            caption, summary and cats. Anything of the form ``<...>`` is
            removed from string values in "content"; non-string values
            (e.g. a missing content) are left unchanged.
        """
        df_cols = ['ID', 'headline', 'imagename', 'content', 'caption', 'summary', 'cats']
        out_df = pd.DataFrame(newsitems, columns=df_cols)

        tag_pattern = re.compile(r'<.*?>')

        def strip_tags(cw):
            # re.sub() raises TypeError on None/NaN, so only strings are cleaned.
            return tag_pattern.sub('', cw) if isinstance(cw, str) else cw

        out_df['content'] = out_df['content'].apply(strip_tags)
        return out_df
    
    def define_filename(filename):
        """Build the output path ``<default_path>/<filename>.xml``.

        ``default_path`` is read from ``path.ini`` in the current directory.
        The first section that defines the option wins; if no section does,
        the ``[DEFAULT]`` section is consulted, and finally the current
        directory is used.

        Args:
            filename: Base name of the file, without extension.

        Returns:
            The joined path as a string.
        """
        config = configparser.ConfigParser()
        # read() silently ignores a missing file; the fallback below covers that.
        config.read('path.ini')

        default_path = None
        # Every section is inspected; returning from inside the loop would stop
        # after the first section even when it has no default_path.
        for section_name in config.sections():
            if config.has_option(section_name, 'default_path'):
                default_path = config.get(section_name, 'default_path')
                break
        if default_path is None:
            default_path = config.defaults().get('default_path', os.curdir)

        file_formate = "xml"
        return os.path.join(default_path, filename + "." + file_formate)
    
    def build_item_xml(row):
        """Write one news item as a ``nitf`` XML file and return the row unchanged.

        The document has a ``head`` (title, cats and an ``iim`` block of ``ds``
        datasets) and a ``body`` whose ``body.content`` holds the article text.
        The target path comes from ``define_filename(row['ID'])``.

        Args:
            row: Mapping/Series with the keys "ID", "headline", "cats" and
                "content".

        Returns:
            The ``row`` that was passed in.
        """
        def as_text(value):
            # ElementTree cannot serialize None/NaN; treat them as empty text.
            return value if isinstance(value, str) else ""

        headline = as_text(row["headline"])

        items = ET.Element('nitf')

        head = ET.SubElement(items, 'head')
        ET.SubElement(head, 'title').text = headline
        ET.SubElement(head, 'cats').text = as_text(row["cats"])

        # "num" is a real attribute now; embedding it in the tag name only
        # happened to serialize correctly and made the elements unsearchable.
        iim = ET.SubElement(head, 'iim', ver='3')
        ET.SubElement(iim, 'ds', num="1:20", value="79")
        # Per the original author, this dataset is important for the CCI import.
        ET.SubElement(iim, 'ds', num="1:30", value="TOIOnline")
        # Datasets 1:80 ("113052+0000") and 2:10 ("3") are currently disabled.
        ET.SubElement(iim, 'ds', num="2:20", value="TOIOnline")
        ET.SubElement(iim, 'ds', num="2:25", value=headline)
        ET.SubElement(iim, 'ds', num="2:105", value=headline)

        body = ET.SubElement(items, 'body')
        ET.SubElement(body, 'body.content').text = as_text(row["content"])

        tree = ET.ElementTree(items)
        tree.write(define_filename(row['ID']), encoding='utf-8', xml_declaration=True)
        return row
    
    def main():
        """Download the feed, parse it, and write one XML file per news item."""
        lodrss = loadRSS()
        if lodrss is None:
            # Nothing was downloaded, so there is nothing to convert.
            return

        try:
            newsitems = parseXML(lodrss)
        finally:
            # The response is a network resource; release it once parsed.
            lodrss.close()

        df = savetodf(newsitems)

        # A plain loop states the intent (one file written per row) better than
        # DataFrame.apply, whose result was discarded anyway.
        for _, row in df.iterrows():
            build_item_xml(row)
             
    if __name__ == "__main__":
        # Debug aid (disabled): pd.set_option('display.max_colwidth', -1)
        # Run the conversion only when this file is executed as a script.
        main()
    

    1. The content text in the XML contains unwanted tags. How can I remove these tags? Example:

    A better approach is to load the input feed into a data frame first. You can then strip the tags from the content column:

    # remove unwanted characters (markup tags) from content
        out_df['content']=out_df['content'].apply(lambda cw :re.sub('<.*?>','',cw))
    

    2. How can I change the tags to match the format I need? Example:

    You need to build a new XML document from each row of the data frame, as in the code below, and save each one to its own XML file, named after the item's ID.

    def build_item_xml(row):
        """Write one news item as a ``nitf`` XML file and return the row unchanged.

        The document has a ``head`` (title, cats and an ``iim`` block of ``ds``
        datasets) and a ``body`` whose ``body.content`` holds the article text.
        The target path comes from ``define_filename(row['ID'])``.

        Args:
            row: Mapping/Series with the keys "ID", "headline", "cats" and
                "content".

        Returns:
            The ``row`` that was passed in.
        """
        def as_text(value):
            # ElementTree cannot serialize None/NaN; treat them as empty text.
            return value if isinstance(value, str) else ""

        headline = as_text(row["headline"])

        items = ET.Element('nitf')

        head = ET.SubElement(items, 'head')
        ET.SubElement(head, 'title').text = headline
        ET.SubElement(head, 'cats').text = as_text(row["cats"])

        # "num" is a real attribute now; embedding it in the tag name only
        # happened to serialize correctly and made the elements unsearchable.
        iim = ET.SubElement(head, 'iim', ver='3')
        ET.SubElement(iim, 'ds', num="1:20", value="79")
        # Per the original author, this dataset is important for the CCI import.
        ET.SubElement(iim, 'ds', num="1:30", value="TOIOnline")
        # Datasets 1:80 ("113052+0000") and 2:10 ("3") are currently disabled.
        ET.SubElement(iim, 'ds', num="2:20", value="TOIOnline")
        ET.SubElement(iim, 'ds', num="2:25", value=headline)
        ET.SubElement(iim, 'ds', num="2:105", value=headline)

        body = ET.SubElement(items, 'body')
        ET.SubElement(body, 'body.content').text = as_text(row["content"])

        tree = ET.ElementTree(items)
        tree.write(define_filename(row['ID']), encoding='utf-8', xml_declaration=True)
        return row