I am splitting the XML feed at https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms into many XML files, one per headline.
#Python code to illustrate parsing of XML files
# importing the required modules
import xml.etree.ElementTree as ET

import requests
def loadRSS():
    """Download the RSS feed and save it as ``topnewsfeed.xml``.

    The file is written to the current working directory and overwrites
    any previous copy.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    # url of rss feed
    url = "https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms"
    # requests has no default timeout; without one a stalled server would
    # block the script forever.
    resp = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as the feed.
    resp.raise_for_status()
    # saving the xml file
    with open('topnewsfeed.xml', 'wb') as f:
        f.write(resp.content)
def wire_xml(filename):
    """Split a feed file into one XML file per ``<article>`` element.

    Each article is serialised to ``./xml/<headline>.xml``. The ``./xml``
    directory is created when it does not exist yet. Characters that are
    not allowed in file names are replaced with ``_`` and the name is
    capped at 150 characters; an article without a usable headline is
    saved as ``article_<n>.xml``, where ``n`` is its 1-based position in
    the feed.

    Args:
        filename: path (or file object) of the feed XML to split.
    """
    # Local imports: this script's top-level imports do not provide them.
    import os
    import re

    out_dir = './xml'
    os.makedirs(out_dir, exist_ok=True)

    article_no = 0
    for _event, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'article':
            continue
        article_no += 1

        # findtext returns None for a missing <headline> and '' for an
        # empty one; treat both as "no headline".
        headline = (elem.findtext('headline') or '').strip()
        # Headlines are free text; strip path separators and other
        # characters that are illegal in file names on common systems.
        safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', headline)
        safe_name = safe_name[:150].strip(' .')
        if not safe_name:
            safe_name = 'article_%d' % article_no

        with open(os.path.join(out_dir, safe_name + '.xml'), 'wb') as f:
            f.write(ET.tostring(elem))

        # The article has been written out; release its subtree so memory
        # use does not grow with the size of the feed.
        elem.clear()
def main():
    """Refresh the local copy of the feed, then split it per article."""
    # Step 1: fetch the feed from the web (rewrites the saved XML file).
    loadRSS()
    # Step 2: write every news item of the saved feed to its own XML file.
    feed_file = 'topnewsfeed.xml'
    wire_xml(feed_file)


if __name__ == "__main__":
    # Run the pipeline only when executed as a script.
    main()
The above code works, but it has two problems:
1. The content (text) in the XML contains unwanted tags. How can I remove these tags? Example:
<content><div class="section1"><div class="Normal">HYDERABAD: Bharat Biotech on Friday said it has committed to supply over 500 million doses of its Covid-19 vaccine Covaxin to the Centre under the countrywide immunisation programme.<br/><br/>Speaking at a virtual conference organised by the Confederation of Indian Industry, Suchitra Ella, joint Managing Director of the city-based vaccine maker, said the company's facilities in four cities - Hyderabad, Bengaluru, Pune, and Ankaleshwar - are currently producing Covaxin.<br/><br/>"
2. How can I change the tags to match my required format? Example:
<?xml version="1.0" encoding="UTF-8"?>
<nitf>
<head>
<title>Ukraine Black Sea ports resume grain operations</title>
<iim ver="3">
<ds value="" num="1:20"/>
<ds value="Reuter" num="1:30"/>
<ds value="" num="1:40"/>
<ds value="REU" num="1:50"/>
<ds value="20210723" num="1:70"/>
<ds value="055600+0000" num="1:80"/>
<ds value="Reuter.2021-07-23T055600Z_528892025_L1N2OZ07W_RTRMADT_0_GRAINS-UKRAINE-PORTS.XML" num="2:05"/>
<ds value="" num="2:07"/>
<ds value="3" num="2:10"/>
<ds value="OEC" num="2:15"/>
<ds value="" num="2:20"/>
<ds value="" num="2:22"/>
<ds value="GRAINS-UKRAINE/PORTS" num="2:25"/>
<ds value="" num="2:50"/>
<ds value="20210723" num="2:55"/>
<ds value="" num="2:80"/>
<ds value="" num="2:85"/>
<ds value="" num="2:90"/>
<ds value="" num="2:95"/>
<ds value="" num="2:101"/>
<ds value="Ukraine Black Sea ports resume grain operations" num="2:105"/>
<ds value="Reuter" num="2:110"/>
<ds value="Reuter" num="2:115"/>
<ds value="KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday." num="2:120"/>
</iim>
</head>
<body>
<body.content>
<p>KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday.</p>
<p>The restrictions of grain-loading operations had applied to the ports of Odesa, Chornomorsk, Mykolayiv, and Pivdeny.</p>
<p>Ukraine is among the world's biggest global grain exporters and plans to ship about 56 million tonnes of grain in the 2021/22 season. (Reporting by Pavel Polityuk)</p>
</body.content>
</body>
</nitf>
I want to save all the XML files in the above format.
# importing the required modules
import re as re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.request import Request, urlopen
import configparser
import os
def loadRSS(url="https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms",
            timeout=30):
    """Download the RSS feed and return it as an in-memory byte stream.

    The whole payload is read and the connection is closed before
    returning, so the caller gets a self-contained file-like object that
    can be handed straight to ``ET.parse``.

    Args:
        url: address of the feed; defaults to the Times of India feed.
        timeout: seconds to wait on the network before giving up.

    Returns:
        An ``io.BytesIO`` positioned at the start of the feed, or ``None``
        when the download failed (the error is printed and the user is
        prompted before returning).
    """
    # Local import: io is not among this script's top-level imports.
    import io

    # The request is sent with a browser-like User-Agent header.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # Read inside the ``with`` so the response is always closed and
        # read errors are reported here rather than later during parsing.
        with urlopen(req, timeout=timeout) as resp:
            payload = resp.read()
    except OSError as e:
        print("Error in connecting TIL site :- ",e)
        input("Press andy to Close")
        return None
    return io.BytesIO(payload)
def parseXML(xmlfile):
    """Parse the feed and return one dict per child of the root element.

    Args:
        xmlfile: file name or file object accepted by ``ET.parse``.

    Returns:
        A list of dicts with the keys ``ID`` (the node's ``ID`` attribute),
        ``headline``, ``imagename``, ``content``, ``summary``, ``caption``
        and ``cats`` (the text of the child element of that name). A value
        is ``None`` when the attribute or child element is absent, or when
        the child element has no text.
    """
    child_tags = ("headline", "imagename", "content",
                  "summary", "caption", "cats")

    def child_text(node, tag):
        # node.find returns None for a missing child; guard so one
        # incomplete article does not abort the whole parse.
        child = node.find(tag)
        return None if child is None else child.text

    # create element tree object and get root element
    root = ET.parse(xmlfile).getroot()

    news = []
    # iterate through each node of the tree
    for node in root:
        item = {"ID": node.attrib.get("ID")}
        item.update((tag, child_text(node, tag)) for tag in child_tags)
        news.append(item)
    # return data in form of list
    return news
def savetodf(newsitems):
    """Build a DataFrame from the news items and strip markup from content.

    Args:
        newsitems: list of dicts as produced by ``parseXML``.

    Returns:
        A DataFrame with the columns ``ID``, ``headline``, ``imagename``,
        ``content``, ``caption``, ``summary`` and ``cats``. Anything that
        looks like a tag (``<...>``) is removed from ``content``; missing
        content (``None``/NaN) is left untouched.
    """
    # defining data frame columns
    df_cols = ['ID', 'headline', 'imagename', 'content', 'caption', 'summary', 'cats']
    out_df = pd.DataFrame(newsitems, columns=df_cols)

    # ``[^>]*`` also spans line breaks, so tags whose attributes wrap onto
    # the next line are removed too.
    tag_pattern = re.compile(r'<[^>]*>')

    def strip_tags(value):
        # Articles without content carry None/NaN here; re.sub would raise
        # TypeError on those, so pass non-strings through unchanged.
        if not isinstance(value, str):
            return value
        return tag_pattern.sub('', value)

    out_df['content'] = out_df['content'].apply(strip_tags)
    return out_df
def define_filename(filename):
    """Return the output path of the XML file for one news item.

    The target directory is the ``default_path`` option read from
    ``path.ini`` in the current working directory; when several sections
    define it, the last section wins. If the file is missing or no section
    defines the option, the path is relative to the current directory
    instead of failing on an unbound variable.

    Args:
        filename: base name of the file, without extension.

    Returns:
        ``<default_path>/<filename>.xml``.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read silently skips files it cannot open.
    config.read('path.ini')

    default_path = ''
    for section_name in config.sections():
        if config.has_option(section_name, 'default_path'):
            default_path = config.get(section_name, 'default_path')

    file_formate = "xml"
    return os.path.join(default_path, filename + "." + file_formate)
def build_item_xml(row):
    """Write one news item as a NITF-style XML file and return the row.

    The document has a ``<head>`` with ``<title>``, ``<cats>`` and an
    ``<iim ver="3">`` block of ``<ds num=... value=.../>`` records, and a
    ``<body>`` whose ``<body.content>`` holds the article text. The file
    path comes from ``define_filename(row['ID'])``.

    Args:
        row: mapping (e.g. a DataFrame row) with the keys ``ID``,
            ``headline``, ``cats`` and ``content``.

    Returns:
        ``row`` unchanged.
    """
    def as_text(value):
        # Missing values arrive as None or NaN (NaN != NaN); ElementTree
        # cannot serialise either as an attribute value.
        if value is None or value != value:
            return ""
        return str(value)

    headline = as_text(row["headline"])

    # defining new xml as per CCI structure
    nitf = ET.Element('nitf')

    # head and its attributes
    head = ET.SubElement(nitf, 'head')
    ET.SubElement(head, 'title').text = headline
    ET.SubElement(head, 'cats').text = as_text(row["cats"])

    iim = ET.SubElement(head, 'iim', ver='3')
    # ``num`` is a real attribute of <ds>; it must not be embedded in the
    # tag name, which only happened to serialise correctly for empty
    # elements.
    ds_records = (
        ("1:20", "79"),
        # This is an important attribute for the import into CCI
        ("1:30", "TOIOnline"),
        # ("1:80", "113052+0000"),
        # ("2:10", "3"),
        ("2:20", "TOIOnline"),
        ("2:25", headline),
        ("2:105", headline),
    )
    for num, value in ds_records:
        ET.SubElement(iim, 'ds', num=num, value=value)

    # saving content in body of xml
    body = ET.SubElement(nitf, 'body')
    ET.SubElement(body, 'body.content').text = as_text(row["content"])

    # writing the XML file
    ET.ElementTree(nitf).write(define_filename(row['ID']),
                               encoding='utf-8', xml_declaration=True)
    # returning in form of row
    return row
def main():
    """Download the feed, parse it, and write one XML file per news item."""
    # load rss from web
    feed = loadRSS()
    if feed is None:
        # loadRSS yields None when the download failed (it has already
        # reported the error); there is nothing to parse.
        return
    # parse xml feed
    newsitems = parseXML(feed)
    # store news items in a data frame
    df = savetodf(newsitems)
    # build_item_xml is called for its side effect (one file per row), so
    # use a plain loop rather than DataFrame.apply.
    for _index, row in df.iterrows():
        build_item_xml(row)


if __name__ == "__main__":
    # pd.set_option('display.max_colwidth', -1)
    # calling main function
    main()
1. The content (text) in the XML contains unwanted tags. How can I remove these tags?
A better way is to load the input feed into a data frame first. Then you can remove the tags:
# Remove anything that looks like a tag (<...>) from the content column
out_df['content']=out_df['content'].apply(lambda cw :re.sub('<.*?>','',cw))
2. How can I change the tags to match my required format?
You need to create a new XML document from each data frame row, as in the code below, and save each one to its own XML file, named after the article ID:
def build_item_xml(row):
    """Write one news item as a NITF-style XML file and return the row.

    The document has a ``<head>`` with ``<title>``, ``<cats>`` and an
    ``<iim ver="3">`` block of ``<ds num=... value=.../>`` records, and a
    ``<body>`` whose ``<body.content>`` holds the article text. The file
    path comes from ``define_filename(row['ID'])``.

    Args:
        row: mapping (e.g. a DataFrame row) with the keys ``ID``,
            ``headline``, ``cats`` and ``content``.

    Returns:
        ``row`` unchanged.
    """
    def as_text(value):
        # Missing values arrive as None or NaN (NaN != NaN); ElementTree
        # cannot serialise either as an attribute value.
        if value is None or value != value:
            return ""
        return str(value)

    headline = as_text(row["headline"])

    # defining new xml as per CCI structure
    nitf = ET.Element('nitf')

    # head and its attributes
    head = ET.SubElement(nitf, 'head')
    ET.SubElement(head, 'title').text = headline
    ET.SubElement(head, 'cats').text = as_text(row["cats"])

    iim = ET.SubElement(head, 'iim', ver='3')
    # ``num`` is a real attribute of <ds>; it must not be embedded in the
    # tag name, which only happened to serialise correctly for empty
    # elements.
    ds_records = (
        ("1:20", "79"),
        # This is an important attribute for the import into CCI
        ("1:30", "TOIOnline"),
        # ("1:80", "113052+0000"),
        # ("2:10", "3"),
        ("2:20", "TOIOnline"),
        ("2:25", headline),
        ("2:105", headline),
    )
    for num, value in ds_records:
        ET.SubElement(iim, 'ds', num=num, value=value)

    # saving content in body of xml
    body = ET.SubElement(nitf, 'body')
    ET.SubElement(body, 'body.content').text = as_text(row["content"])

    # writing the XML file
    ET.ElementTree(nitf).write(define_filename(row['ID']),
                               encoding='utf-8', xml_declaration=True)
    # returning in form of row
    return row