I want to scrape a website that requires a login first, using Beautiful Soup and Python requests. I'm able to log in by sending my username and password via a POST request; however, making a GET request within the same session after login yields error 403 (FORBIDDEN). Is there a solution to this? The last line in my code produces a 'forbidden' message; is there a workaround?
import requests
from bs4 import BeautifulSoup
# Request headers sent with every call; only a Chrome-style User-Agent is set.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}

# Form fields posted to the login endpoint (placeholders for real credentials).
payload = {
    'login': '#my_username',
    'password': '#my_password',
    'remember_me': 'false',
    'fallback': 'false',
}

# One Session is reused for all requests so cookies set by earlier responses
# are sent with later ones; the context manager closes it at the end.
with requests.Session() as s:
    url = 'https://www.hackerrank.com/auth/login'

    # Fetch the login page first, then parse it.
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html5lib')

    # Submit the credentials as form data and show the raw response body.
    r = s.post(url, data=payload, headers=headers)
    print(r.content)

    # Request a page that is only available after login; the response is
    # not inspected here.
    s.get('Webpage_that_can_be_accessed_only_after_login', headers=headers)
I did almost the same thing; the only difference was that I passed the exact headers I saw being sent in Chrome, and I also passed the csrf_token.
import requests
import json
import sys
from bs4 import BeautifulSoup
# Request headers copied from Chrome, stored as JSON text and parsed into the
# dict `d` below; `d` is used as the headers of the first GET to the login page.
# NOTE(review): the `':` sequences inside several values (e.g. `q':0.9` in
# "accept"/"accept-language" and the name/value pairs in "cookie") look like a
# mangled `=` -- confirm against the headers the browser actually sends.
# NOTE(review): the "cookie" value embeds cookies captured from one browser
# session; presumably these must be replaced with your own -- verify.
headerString='''
{
"accept": "text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9',text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q':0.9",
"cache-control": "max-age=0",
"cookie": "hackerrank_mixpanel_token':7283187c-1f24-4134-a377-af6c994db2a0; hrc_l_i':F; _hrank_session':653fb605c88c81624c6d8f577c9094e4f8657136ca3487f07a3068c25080706db7178cc4deda978006ce9d0937c138b52271e3cd199fda638e8a0b8650e24bb7; _ga':GA1.2.397113208.1599678708; _gid':GA1.2.933726361.1599678708; user_type':hacker; session_id':h3xb3ljp-1599678763378; __utma':74197771.397113208.1599678708.1599678764.1599678764.1; __utmc':74197771; __utmz':74197771.1599678764.1.1.utmcsr':(direct)|utmccn':(direct)|utmcmd':(none); __utmt':1; __utmb':74197771.3.10.1599678764; _biz_uid':5969ac22487d4b0ff8d000621de4a30c; _biz_sid:79bd07; _biz_nA':1; _biz_pendingA':%5B%5D; _biz_flagsA':%7B%22Version%22%3A1%2C%22ViewThrough%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _gat_UA-45092266-28':1; _gat_UA-45092266-26':1; session_referrer':https%3A%2F%2Fwww.google.com%2F; session_referring_domain':www.google.com; session_landing_url':https%3A%2F%2Fwww.hackerrank.com%2Fprefetch_data%3Fcontest_slug%3Dmaster%26get_feature_feedback_list%3Dtrue",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}
'''
# Parse the JSON text above into a plain dict of header name -> value.
d=json.loads(headerString)
# Log in and then fetch the dashboard page to check that the session is
# authenticated. Uses `d` (the header dict parsed from `headerString`) for the
# first request and a reduced header set (user-agent + CSRF token) afterwards.
url = 'https://www.hackerrank.com/auth/login'

# The context manager closes the session's connections when the block ends.
with requests.Session() as s:
    # Load the login page; its HTML carries the CSRF token in a <meta> tag.
    r = s.get(url, headers=d)
    r.raise_for_status()

    # Extract the CSRF token, failing with a clear message if the tag is absent
    # (otherwise indexing None raises an opaque TypeError).
    soup = BeautifulSoup(r.text, 'html.parser')
    csrf_meta = soup.find('meta', id='csrf-token')
    if csrf_meta is None:
        raise RuntimeError("csrf-token meta tag not found on the login page")
    csrf_token = csrf_meta['content']

    # Post the credentials as form data together with the CSRF token.
    request_header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "x-csrf-token": csrf_token,
    }
    payload = {"login": "<user-name>", "password": "<password>", "remember_me": False, "fallback": True}
    r = s.post(url, headers=request_header, data=payload)
    r.raise_for_status()

    # The login response body is JSON and supplies the CSRF token used for the
    # following request. Kept in its own name instead of rebinding `d`.
    login_result = json.loads(r.text)
    csrf_token = login_result['csrf_token']

    # Test whether the login succeeded by requesting the dashboard page.
    url = 'https://www.hackerrank.com/dashboard'
    request_header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "x-csrf-token": csrf_token,
    }
    # Plain GET: the credentials payload is deliberately not sent again.
    r = s.get(url, headers=request_header)
    print(r.text)