Search code examples
pythonpython-2.7rssfeedparser

How to detect with feedparser if there are new items in an RSS channel?


I have the following code. When you understand the code, you can look at the two comments with the capital letters. I could test if there are new items in the channel with insert or ignore but I'm trying the better mechanism with utilization feed.updated_parsed attribute. Why doesn't it work as expected?

from __future__ import unicode_literals
import feedparser
from sqlite3  import dbapi2 as sqlite
import sys, os
from datetime import datetime
from time import mktime
from daeutils import *
import re
import random
import optparse
import curses
import socket

def getActiveChannels():
  """Returns a list of active RSS channels"""
  con = sqlite.connect(connectionString)
  cur = con.cursor()
  cur.execute("select id, title, xmlurl, updated from channels")
  channels = cur.fetchall()
  cur.close()
  con.close()
  return channels

def getItemsForChannel(xmlUrl, lastUpdate):   
  socket.setdefaulttimeout(60)
  feedparserDictionary = feedparser.parse(xmlUrl)
  updatedTime = datetime.fromtimestamp(mktime(feedparserDictionary.feed.updated_parsed))
  lst = datetime.strptime(lastUpdate, "%Y-%m-%dT%H:%M:%S.%f")
  if updatedTime < lst:
    return [] # HERE NOT BEHAVING CORRECTLY, WHEN I COMMENT THIS LINE, THERE MAY BE A FEW ITEMS

  items = feedparserDictionary.entries
  print "There are new %d items" % len(items)
  return items

def setChannelUpdateTime(xmlUrl, tm):
  con = sqlite.connect(connectionString)
  cur = con.cursor()
  cur.execute("update channels set updated = :tm where xmlurl = :xmlUrl", locals())
  con.commit()
  print "updated successfully"
  cur.close()
  con.close()

if __name__ == "_main__":
   con = sqlite.connect(connectionString)
   for channel in getActiveChannels():
     channelId, channelTitle, channelXmlUrl, lastChannelUpdate = channel
     countOfNewItems = 0
     items = getItemsForChannel(channelXmlUrl, lastChannelUpdate)

     for item in items:
       title, link, description, priority, updated = item
       cur = con.cursor()
       cur.execute("insert or ignore into feeds \
              (title, link, description, read, updated, channelid) \
              values (?, ?, ?, ?, ?, ?)", \
              (title, link, description, 0, updated, channelId))

       countOfNewItems += cur.rowcount # WHICH ARE INSERTED HERE
       con.commit()
       cur.close()


     if countOfNewItems:
       print "Found new items"
       now = datetime.now().isoformat()
       if "." not in now:
         now = now + ".000000"
       setChannelUpdateTime(channelXmlUrl, now)

Here are the two tables in sqlite:

CREATE TABLE channels (id integer primary key, title string, text string, description string, type string, xmlurl string unique, htmlurl string, priority integer, active integer, deactivated integer, updated text);
CREATE TABLE feeds (id integer primary key, title string, link string unique, description string, read integer, priority integer, updated string, channelid integer, foreign key (channelid) references channels(id));

Solution

  • I think the possible error is that you are trying to compare updated field on the feed, the feeds could be not well supported by the feed creator. Or timezone formatting because of using isoformat or etc.

    Anyway, I believe that it is much better to compare PER ENTRY updated properties rather than comparing the feed property which is mostly used for invalidating feed cache.

    Here is a working example, where I return only new entries from the function.

    import socket
    from datetime import datetime, timedelta
    from time import mktime
    
    import feedparser
    from pprint import pprint
    
    
    def getItemsForChannel(xmlUrl, lastUpdate):
        lst = datetime.fromisoformat(lastUpdate)
    
        socket.setdefaulttimeout(60)
    
        parsed = feedparser.parse(xmlUrl)
    
        items = [entry for entry in parsed.entries if
                 datetime.fromtimestamp(mktime(entry.updated_parsed)) > lst]
        print("There are new {} items".format(len(items)))
        return items
    
    
    pprint(getItemsForChannel(
        'http://serverfault.com/feeds/tag/+or+linux+or+ubuntu+or+vim+or+rsync+or+gnome',
        (datetime.now() - timedelta(hours=3)).isoformat()
    ))
    

    It uses from/to iso formatting for the last parsed date in your database value and compares entries per entry instead of global comparison based on the feed updated property.