python, web-scraping, automation, python-requests, http-status-code-403

Access gets blocked (error 403) right after login using python requests


I want to scrape a website that requires a login first, using Beautiful Soup and python requests. I'm able to log in by sending my username and password via a POST request, but making a GET request within the same session after logging in yields error 403 (Forbidden). The last line in my code is what produces the 'forbidden' response. Is there a workaround?

import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}

payload = {
    'login': '#my_username', 'password': '#my_password', 'remember_me': 'false', 'fallback': 'false'
}

with requests.Session() as s:
    url = 'https://www.hackerrank.com/auth/login'
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html5lib')

    r = s.post(url, data=payload, headers=headers)
    print(r.content)
    # this is the request that comes back with 403 Forbidden
    s.get('Webpage_that_can_be_accessed_only_after_login', headers=headers)

Solution

  • I did almost the same thing; the only difference was that I passed the exact headers I saw Chrome sending and included the csrf_token.

    import requests
    import json
    import sys
    from bs4 import BeautifulSoup
    
    #header string picked from chrome
    headerString='''
    {
    "accept": "text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9',text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q':0.9",
    "cache-control": "max-age=0",
    "cookie": "hackerrank_mixpanel_token':7283187c-1f24-4134-a377-af6c994db2a0; hrc_l_i':F; _hrank_session':653fb605c88c81624c6d8f577c9094e4f8657136ca3487f07a3068c25080706db7178cc4deda978006ce9d0937c138b52271e3cd199fda638e8a0b8650e24bb7; _ga':GA1.2.397113208.1599678708; _gid':GA1.2.933726361.1599678708; user_type':hacker; session_id':h3xb3ljp-1599678763378; __utma':74197771.397113208.1599678708.1599678764.1599678764.1; __utmc':74197771; __utmz':74197771.1599678764.1.1.utmcsr':(direct)|utmccn':(direct)|utmcmd':(none); __utmt':1; __utmb':74197771.3.10.1599678764; _biz_uid':5969ac22487d4b0ff8d000621de4a30c; _biz_sid:79bd07; _biz_nA':1; _biz_pendingA':%5B%5D; _biz_flagsA':%7B%22Version%22%3A1%2C%22ViewThrough%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _gat_UA-45092266-28':1; _gat_UA-45092266-26':1; session_referrer':https%3A%2F%2Fwww.google.com%2F; session_referring_domain':www.google.com; session_landing_url':https%3A%2F%2Fwww.hackerrank.com%2Fprefetch_data%3Fcontest_slug%3Dmaster%26get_feature_feedback_list%3Dtrue",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }
    '''
    d=json.loads(headerString)
    
    #creating session
    s = requests.Session()
    url='https://www.hackerrank.com/auth/login'
    r=s.get(url, headers=d)
    
    #getting the csrf_token
    soup = BeautifulSoup(r.text, 'html.parser')
    csrf_token=soup.find('meta', id='csrf-token')['content']
    
    #using it in login post call
    request_header={
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "x-csrf-token": csrf_token
    }
    payload={"login":"<user-name>","password":"<password>","remember_me":False,"fallback":True}
    r=s.post(url, headers=request_header, data=payload)
    
    #then I tested if login is successful by going into dashboard page
    d=json.loads(r.text)
    csrf_token=d['csrf_token']
    url='https://www.hackerrank.com/dashboard'
    request_header={
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "x-csrf-token": csrf_token
    }
    r=s.get(url, headers=request_header)
    print(r.text)
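
As a follow-up, the same flow can be condensed by registering the extra headers once on the session with Session.headers.update(), so every request made through the session carries them automatically. This is only a minimal sketch under the assumptions already used above (the login URL, the csrf-token meta tag, and the x-csrf-token header); the dashboard URL and credential placeholders are illustrative, not guaranteed.

    # minimal sketch, not the answer author's exact code: set headers on the
    # session once, then reuse the CSRF token for the login and later requests
    import requests
    from bs4 import BeautifulSoup

    login_url = 'https://www.hackerrank.com/auth/login'

    with requests.Session() as s:
        # headers set here are sent with every request made through this session
        s.headers.update({
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
        })

        # fetch the login page and read the CSRF token from its <meta> tag
        soup = BeautifulSoup(s.get(login_url).text, 'html.parser')
        csrf_token = soup.find('meta', id='csrf-token')['content']

        # log in, passing the token in the x-csrf-token header
        r = s.post(login_url,
                   data={'login': '<user-name>', 'password': '<password>'},
                   headers={'x-csrf-token': csrf_token})
        r.raise_for_status()

        # pages behind the login can now be fetched with the same session;
        # the dashboard URL is just the example used above
        r = s.get('https://www.hackerrank.com/dashboard',
                  headers={'x-csrf-token': csrf_token})
        print(r.status_code)

Keeping the headers on the session avoids repeating them on every call, and raise_for_status() makes a failed login fail loudly instead of silently handing you another 403 page.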