Search code examples
python · sitemap.xml · custom-error-pages

What changes should I make to my code to exclude URLs that return a 404 error from the sitemap?


The code below creates the sitemap, but it includes URLs that return a 404 error. How do I exclude them from the sitemap?

from usp.tree import sitemap_tree_for_homepage
import xml.etree.cElementTree as ET
import simplejson as json
from datetime import date

# Discover and parse every sitemap reachable from the homepage.
# NOTE: the homepage URL is blank in the original post; supply the real site URL.
tree = sitemap_tree_for_homepage('')

root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

# W3C date format YYYY-MM-DD, see: https://www.w3.org/TR/NOTE-datetime
# Computed once, outside the loop, so every entry carries the same date.
lm = date.today().strftime("%Y-%m-%d")

for page in tree.all_pages():
    url = page.url
    # simplejson serialises the Decimal priority as a plain number string.
    prio = json.dumps(page.priority, use_decimal=True)
    urlel = ET.SubElement(root, "url")
    ET.SubElement(urlel, "loc").text = url
    ET.SubElement(urlel, "lastmod").text = lm
    # <changefreq> is optional: a page parsed from a sitemap that did not
    # declare one has change_frequency set to None, so skip the element
    # instead of failing on `.value`.
    if page.change_frequency is not None:
        ET.SubElement(urlel, "changefreq").text = page.change_frequency.value
    ET.SubElement(urlel, "priority").text = prio

ET.indent(root, "  ")  # pretty print
xmltree = ET.ElementTree(root)
xmltree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)

Solution

  • You have to use the external module `requests` or the built-in module `urllib.request` to check whether a URL returns status 404.

    import urllib.request
    import urllib.error
    
    url = 'https://stackoverflow.com/fake_url'  # wrong url
    #url = 'https://stackoverflow.com/'         # correct url

    try:
        # urlopen raises HTTPError for 4xx/5xx responses such as 404.
        # The `with` block closes the connection as soon as the check is done,
        # and the timeout keeps an unresponsive server from blocking forever.
        with urllib.request.urlopen(url, timeout=10):
            pass
    except OSError as ex:
        # HTTPError and URLError are OSError subclasses, as are socket
        # timeouts, so unreachable hosts are rejected as well as 404 pages.
        print('ex:', ex)
        print('wrong url:', url)
    else:
        print('adding url:', url)
        # ... add `url` to sitemap ...
    

    or

    import requests
    
    url = 'https://stackoverflow.com/fake_url'
    #url = 'https://stackoverflow.com/'

    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=10)
    except requests.RequestException as ex:
        # Connection errors, timeouts, invalid URLs, ... - treat as a bad URL.
        print('ex:', ex)
        response = None

    # Use `response.status_code != 404` instead to reject only 404 pages.
    if response is not None and response.status_code == 200:
        print('adding url:', url)
        # ... add `url` to sitemap ...
    else:
        print('wrong url:', url)
    

    EDIT:

    You could wrap the check in a function that returns True/False (shown first with `urllib.request`, then with `requests`):

    import urllib.request
    import urllib.error
    
    def is_correct_url(url, timeout=10):
        """Return True when `url` can be opened successfully, False otherwise.

        Args:
            url: Address to check.
            timeout: Seconds to wait on blocking operations such as the
                connection attempt before giving up.

        Returns:
            True if ``urlopen`` succeeds. False if it raises an ``OSError``,
            which covers ``HTTPError`` (404 and other 4xx/5xx statuses),
            ``URLError`` (e.g. unreachable host) and socket timeouts.
        """
        try:
            # Only success/failure matters; the context manager makes sure
            # the response is closed instead of leaking the connection.
            with urllib.request.urlopen(url, timeout=timeout):
                return True
        except OSError as ex:
            print('ex:', ex)
            return False
    
    # ---
    
    # Walk every page found in the sitemap tree and keep only reachable URLs.
    for page in tree.all_pages():
        url = page.url
        if not is_correct_url(url):
            print('wrong url:', url)
            continue
        print('adding url:', url)
        # ... add `url` to sitemap ...
    
    import requests
    
    def is_correct_url(url, timeout=10):
        """Return True when a GET request to `url` ends with status 200.

        Args:
            url: Address to check.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            True if the response status code is 200. False for any other
            status, or when the request fails with a ``RequestException``
            (connection error, timeout, invalid URL, ...).
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as ex:
            print('ex:', ex)
            return False
        # Use `response.status_code != 404` instead to reject only 404 pages.
        return response.status_code == 200
    
    # ---
    
    # Walk every page found in the sitemap tree and keep only reachable URLs.
    for page in tree.all_pages():
        url = page.url
        if not is_correct_url(url):
            print('wrong url:', url)
            continue
        print('adding url:', url)
        # ... add `url` to sitemap ...