Search code examples
python · unit-testing · rss

Python: best way to check for list of URLs


I have a file defining a list of RSS feeds:

# Feed URLs under test; each entry is expected to be a fetchable RSS endpoint.
RSS_FEEDS = [
    "https://www.fanpage.it/feed/",
    "https://www.ilfattoquotidiano.it/feed/",
    "https://forbes.it/feed/",
    "https://formiche.net/feed/",
]

I wrote the following test:

import requests

from feeds import RSS_FEEDS

# Sequential check: each requests.get downloads the ENTIRE response body
# just to read the status code, and the feeds are fetched one at a time.
for rssfeed in RSS_FEEDS:
    response = requests.get(rssfeed)
    assert response.status_code == 200

Are there more efficient ways (for example, ones that download less data)?

How would you handle a slow response vs a dead link?

The above would just tell me if the URL is fetchable, but how could I assess if it's a valid RSS stream?


Solution

  • You could solve it using the aiohttp library together with asyncio, like this:

    from aiohttp import ClientSession
    from asyncio import gather, create_task, run, set_event_loop, set_event_loop_policy
    from traceback import format_exc
    import sys
    
    # Workaround for an asyncio/aiohttp shutdown issue on Windows with
    # Python >= 3.8, where the default Proactor loop can raise
    # "RuntimeError: Event loop is closed" at interpreter exit.  The
    # selector event loop policy avoids it; the bug is not present on Linux.
    # Tuple comparison replaces the clunky major/minor field checks.
    if sys.version_info >= (3, 8) and sys.platform.startswith('win'):
        from asyncio import ProactorEventLoop, WindowsSelectorEventLoopPolicy
        set_event_loop(ProactorEventLoop())
        set_event_loop_policy(WindowsSelectorEventLoopPolicy())
    
    # Feed URLs to validate; each entry should resolve with HTTP 200.
    RSS_FEEDS = [
        "https://www.fanpage.it/feed/",
        "https://www.ilfattoquotidiano.it/feed/",
        "https://forbes.it/feed/",
        "https://formiche.net/feed/",
    ]
    
    async def GetRessource(url: str, session: ClientSession) -> "int | dict":
        """Fetch *url* with the shared *session* and report its HTTP status.

        Returns the integer status code (200) on success.  On a non-200
        response the body is printed for diagnostics and an empty dict is
        returned, matching the original contract callers rely on.
        (Original annotation said ``-> dict`` but the success path returns
        an ``int`` — the annotation is corrected here.)
        """
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    # Plain return: the parentheses in `return(...)` were redundant.
                    return response.status
                body: str = await response.text()
                print(f"Error, got response code: {response.status} message: {body}")
        except Exception:
            # Broad catch is deliberate in this diagnostic script: one dead
            # feed should not abort checking the rest; print the traceback.
            print(f"General Exception:\n{format_exc()}")
        return {}
    
    async def GetUrls() -> None:
        """Fetch every feed in ``RSS_FEEDS`` concurrently and assert HTTP 200.

        A single ``ClientSession`` is shared across all requests; each URL is
        scheduled as its own task so the checks overlap instead of running
        one after another.
        """
        async with ClientSession() as session:
            pending: list = [
                create_task(GetRessource(feed, session)) for feed in RSS_FEEDS
            ]
            statuses: list = await gather(*pending, return_exceptions=False)
            for status in statuses:
                assert status == 200
    
    async def main():
        """Entry coroutine: delegate to the concurrent URL checker."""
        await GetUrls()
    
    # Run the checks only when executed as a script, not on import.
    if __name__ == "__main__":
        run(main())
    

    Contents of Results:

    200
    200
    200
    200
    

    It checks the URLs concurrently rather than one at a time.