I'm trying to parse 3 different RSS sources, these are the sources.
https://www.nba.com/bucks/rss.xml
http://www.espn.com/espn/rss/ncb/news
http://rss.nytimes.com/services/xml/rss/nyt/ProBasketball.xml
For the most part the structure of all these three sources are similar, except for the url
I'm trying to parse these into the following Feed object,
class Feed(Base):
title = models.CharField(db_index=True, unique=True, max_length=255)
link = models.CharField(db_index=True, max_length=255, )
summary = models.TextField(null=True)
author = models.CharField(null=True, max_length=255)
url = models.CharField(max_length=512, null=True)
published = models.DateTimeField()
source = models.ForeignKey(Source, on_delete=models.CASCADE, null=True)
This is the source object,
class Source(Base):
name = models.CharField(db_index=True, max_length=255)
link = models.CharField(db_index=True, max_length=255, unique=True)
This is the code that I use to parse,
import logging
import xml.etree.ElementTree as ET
import requests
import maya
from django.utils import timezone
from aggregator.models import Feed
class ParseFeeds:
@staticmethod
def parse(source):
logger = logging.getLogger(__name__)
logger.info("Starting {}".format(source.name))
root = ET.fromstring(requests.get(source.link).text)
items = root.findall(".//item")
for item in items:
title = ''
if item.find('title'):
title = item.find('title').text
link = ''
if item.find('link'):
link = item.find('link').text
description = ''
if item.find('description'):
description = item.find('description').text
author = ''
if item.find('author'):
author = item.find('author').text
published = timezone.now()
if item.find('pubDate'):
published = maya.parse(item.find('pubDate').text).datetime()
url = ''
if item.find('enclosure'):
url = item.find('enclosure').attrib['url']
if item.find('image'):
url = item.find('image')
if not Feed.objects.filter(title=title).exists():
logger.info("Title:{} Link:{} Summary:{} Author:{} Published:{} Url:{}".format(title, link, description, author, published, url))
feed = Feed(title=title, link=link, summary=description, author=author, published=published, url=url,
source=source)
feed.save()
logger.info("Adding {} from {}".format(feed.title, feed.source.name))
logger.info("Finished {}".format(source.name))
While I can parse each of these sources on the python console, the feed object created here ends up with all None
or default fields.
What am I doing wrong here.
You should use
for item in items:
title = ''
if item.find('title') is not None: # The "is not None" part is critical here.
title = item.find('title').text
# And so on ...
If you try in your terminal
bool(item.find('title')) # This is False
item.find('title') is not None # while this is True
Every time you want to check if something is or not is None, use if something is None
construction.