python authentication beautifulsoup python-requests session-cookies

login with requests and BeautifulSoup to scrape pages

I need to scrape a page that requires login to access.

I tried to log in by replaying the saved login request (exported as cURL and converted to Python), using requests and BeautifulSoup, but it doesn't work.

I need to log in at 'https://www.seoprofiler.com/account/login' and then scrape pages like 'https://www.seoprofiler.com/lp/links?q=test.com'.

Here's my code:

from bs4 import BeautifulSoup 
import requests



# Attempt to log in by replaying a browser request, then scrape a page.
# NOTE(review): each call below is a standalone requests.post/requests.get,
# so nothing the server returns from the login is carried into the next call.

# Hard-coded cookie values sent with both requests below.
cookies = {
    'csrftoken': 'token123',
    'seoprofilersession': 'session123',
}

# Browser-like request headers; the same dict is reused for the POST and the GET.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'sec-ch-ua': '^\\^',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'https://www.seoprofiler.com',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://www.seoprofiler.com/account/login',
    'Accept-Language': 'en,en-US;q=0.9,it;q=0.8',
}

# Login form fields; 'csrfmiddlewaretoken' carries the same hard-coded value
# as the 'csrftoken' cookie above.
data = {
    'csrfmiddlewaretoken': 'token123',
    'username': 'email123@gmail.com',
    'password': 'pass123!',
    'button': ''
}



# Submit the login form. The result is stored in `response` but is never
# inspected before being overwritten by the GET below.
response = requests.post('https://www.seoprofiler.com/account/login',
                             headers=headers, cookies=cookies, data=data)


# Fetch the target page, sending the original hard-coded `cookies` dict again
# (not any cookies returned by the login POST).
url = 'https://www.seoprofiler.com/lp/links?q=test.com'
response = requests.get(url, headers= headers, cookies=cookies)
soup = BeautifulSoup(response.content, 'html.parser')
# The return value of encode() is discarded, so this line has no effect on `soup`.
soup.encode('utf-8')
print(soup.title)

I would rather not use Selenium, as I have to scrape a lot of data and it would take too long.

How can I log in so that I can scrape pages that are only available to logged-in users? Thank you!

Solution

You could use requests.Session!

After some trial and error I was able to log in and get the project page using the following script:

import requests

# Log in through a single requests.Session so that every cookie the server
# sets is stored on the session and re-sent automatically on later requests.
LOGIN_URL = "https://www.seoprofiler.com/account/login"
PROJECT_URL = "https://www.seoprofiler.com/project/google.com-fa1b9c855721f3d5"
TIMEOUT = 30  # seconds; without a timeout requests can wait indefinitely

# The context manager closes the session's pooled connections on exit.
with requests.Session() as session:
    # Load the login page first so the server can set its cookies
    # (seoprofilersession and csrftoken) on the session.
    session.get(LOGIN_URL, timeout=TIMEOUT)

    # Echo the CSRF cookie back as a form field only when the server actually
    # set one; a missing cookie no longer aborts the script with a KeyError.
    login_form = {}
    csrf_token = session.cookies.get("csrftoken")
    if csrf_token is not None:
        login_form["csrfmiddlewaretoken"] = csrf_token
    login_form["username"] = "your_email"
    login_form["password"] = "your_password"

    # Log in; the cookies set by this response are kept on the session.
    login_resp = session.post(LOGIN_URL, data=login_form, timeout=TIMEOUT)
    # Stop early on an HTTP-level failure (4xx/5xx) instead of scraping
    # pages with a session that never logged in.
    login_resp.raise_for_status()

    # Now use this session to get all the data you need!
    resp = session.get(PROJECT_URL, timeout=TIMEOUT)  # get main page content

    print(resp.status_code)  # my output: 200

Edited:

I just checked one more thing: it appears that retrieving the seoprofilersession and csrftoken cookies first is not mandatory. You can simply send the login POST with your credentials (without csrfmiddlewaretoken) and then keep using the same session.