Search code examples
pythonpython-requestsscrapyhttp-status-code-403

Scraping a page returns a 200, checking that page later returns a 403


I am scraping a number of websites for job postings using Scrapy. If a page on a site fits my requirements, I store a link to the page in a database. No issue there. I've also created a script that goes through each link in the database and pings the URL. If it returns a 404, it gets deleted. The issue I'm having is that some sites are returning 403 errors when I do the deletion check. What's weird is that they all allow scraping, but they are blocking the check. This is the script I'm using to do the deletion check:

from pymongo import MongoClient
import requests
import urllib3
from operator import itemgetter
import random
import time


client = MongoClient("path-to-mongo")
db = client["mongoDB"]
col = db['mongoCollection']
openings = list(col.find())
sorted_openings = sorted(openings, key=itemgetter('Company'))
del_counter = 0
user_agents = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
               "Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)"]
headers = {"User-Agent": user_agents[random.randint(0,11)]}
counter = 0
del_counter = 0
passed_counter = 0
deleted_links = []
passed_links = []
forbidden = []

for item in sorted_openings:
    try:
        if requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 200:
            print(str(requests.get(item['Link'])) + ' ' + item['Link'])
            counter += 1
            print(counter)
        elif requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 304:
            print(requests.get(item['Link']))
            counter += 1
            print(counter)
        elif requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 403:
            forbidden.append(item['Link'])
            print(requests.get(item['Link']))
            counter += 1
            print(counter)
        else:
            db.openings.remove(item)
            deleted_links.append(item['Link'])
            del_counter += 1
            counter += 1
            print('Deleted ' + item['Link'])
            print(counter)
    except:
        pass
        passed_links.append(item['Link'])
        passed_counter += 1
        counter += 1
        print('Passed link ' + item['Link'])
        print(counter)

Solution

  • You send the request in each condition, send one request and store the result in a value then check with the condition.

    from pymongo import MongoClient
    import requests
    import urllib3
    from operator import itemgetter
    import random
    import time
    
    
    client = MongoClient("path-to-mongo")
    db = client["mongoDB"]
    col = db['mongoCollection']
    openings = list(col.find())
    sorted_openings = sorted(openings, key=itemgetter('Company'))
    del_counter = 0
    user_agents = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
                   "Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
                   "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
                   "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)"]
    headers = {"User-Agent": user_agents[random.randint(0,11)]}
    counter = 0
    del_counter = 0
    passed_counter = 0
    deleted_links = []
    passed_links = []
    forbidden = []
    
    for item in sorted_openings:
        try:
            response = requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers)
            if response.status_code == 200:
                print(str(response) + ' ' + item['Link'])
                counter += 1
                print(counter)
            elif response.status_code == 304:
                print(response)
                counter += 1
                print(counter)
            elif response.status_code == 403:
                forbidden.append(item['Link'])
                print(response))
                counter += 1
                print(counter)
            else:
                db.openings.remove(item)
                deleted_links.append(item['Link'])
                del_counter += 1
                counter += 1
                print('Deleted ' + item['Link'])
                print(counter)
        except:
            passed_links.append(item['Link'])
            passed_counter += 1
            counter += 1
            print('Passed link ' + item['Link'])
            print(counter)