The code below creates the sitemap, but it includes URLs that return a 404 error. How do I exclude them from the sitemap?
from usp.tree import sitemap_tree_for_homepage
# xml.etree.cElementTree was removed in Python 3.9 -- the same release that
# introduced ET.indent(), used below -- so import the supported module.
import xml.etree.ElementTree as ET
import simplejson as json
from datetime import date

# Build the sitemap tree for the homepage (the URL is left blank here).
tree = sitemap_tree_for_homepage('')
root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
for page in tree.all_pages():
    url = page.url
    # Serialize the priority through simplejson so a Decimal is written as a
    # plain number string.
    prio = json.dumps(page.priority, use_decimal=True)
    # format YYYY-MM-DDThh:mmTZD see: https://www.w3.org/TR/NOTE-datetime
    # (the date-only form YYYY-MM-DD is used; every entry gets today's date)
    lm = date.today().strftime("%Y-%m-%d")
    urlel = ET.SubElement(root, "url")
    ET.SubElement(urlel, "loc").text = url
    ET.SubElement(urlel, "lastmod").text = lm
    # A page may carry no change frequency; calling `.value` on None would
    # raise AttributeError, so the <changefreq> element is simply omitted.
    if page.change_frequency is not None:
        cf = page.change_frequency.value
        ET.SubElement(urlel, "changefreq").text = cf
    ET.SubElement(urlel, "priority").text = prio
ET.indent(root, " ")  # pretty print
xmltree = ET.ElementTree(root)
xmltree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)
You have to use the external module `requests` or the built-in module `urllib.request` to check whether a URL returns status 404:
import urllib.request
import urllib.error
url = 'https://stackoverflow.com/fake_url' # wrong url
#url = 'https://stackoverflow.com/' # correct url
try:
    # urlopen() raises HTTPError for 4xx/5xx responses. The `with` block
    # closes the response, and the timeout keeps a dead host from hanging
    # the script forever.
    with urllib.request.urlopen(url, timeout=10):
        pass
except OSError as ex:
    # HTTPError/URLError, timeouts and connection resets are all OSError
    # subclasses, so unreachable hosts are reported instead of crashing.
    print('ex:', ex)
    print('wrong url:', url)
else:
    print('adding url:', url)
    # ... add `url` to sitemap ...
or
import requests
url = 'https://stackoverflow.com/fake_url'
#url = 'https://stackoverflow.com/'
try:
    # Without a timeout requests may wait indefinitely for a silent server.
    response = requests.get(url, timeout=10)
except requests.RequestException as ex:
    # Connection errors, timeouts, invalid URLs, ...: treat as a bad URL.
    print('ex:', ex)
    print('wrong url:', url)
else:
    #if response.status_code != 404:
    if response.status_code == 200:
        print('adding url:', url)
        # ... add `url` to sitemap ...
    else:
        print('wrong url:', url)
EDIT:
You could wrap the check in a function that returns `True`/`False`:
import urllib.request
import urllib.error
def is_correct_url(url, timeout=10):
    """Return True if `url` can be opened, False otherwise.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        True when ``urllib.request.urlopen`` succeeds; False when it raises
        (HTTP error status such as 404, unreachable host, timeout, or a
        malformed URL). The error is printed before returning False.
    """
    try:
        # The context manager closes the response; only the fact that the
        # request succeeded matters, the body is never read.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except (OSError, ValueError) as ex:
        # OSError covers HTTPError/URLError, timeouts and connection resets;
        # ValueError is raised for strings that are not URLs at all.
        print('ex:', ex)
        return False
# ---
# Report every page URL as accepted or rejected according to is_correct_url().
for page in tree.all_pages():
    url = page.url
    if not is_correct_url(url):
        print('wrong url:', url)
        continue
    print('adding url:', url)
    # ... add `url` to sitemap ...
import requests
def is_correct_url(url, timeout=10):
    """Return True if a GET request to `url` answers with status 200.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        True when the response status code is 200; False for any other
        status or when the request itself fails (the error is printed).
    """
    try:
        # Without a timeout requests may wait indefinitely for a silent server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as ex:
        print('ex:', ex)
        return False
    # Looser alternative: `return response.status_code != 404`
    return response.status_code == 200
# ---
# Report every page URL as accepted or rejected according to is_correct_url().
for page in tree.all_pages():
    url = page.url
    if not is_correct_url(url):
        print('wrong url:', url)
        continue
    print('adding url:', url)
    # ... add `url` to sitemap ...