python, html, web-scraping, beautifulsoup, html-parsing

How to scrape information from a website that requires login


I am working on a Python web scraping project. The website I am trying to scrape data from contains info about all the medicines sold in India. The website requires a user to log in before giving access to this information.

I want to access all the links at this URL https://mims.com/india/browse/alphabet/a?cat=drug&tab=brand and store them in a list.

Here is my code for logging into the website

##################################### Method 1
import mechanize
import http.cookiejar as cookielib
from bs4 import BeautifulSoup
import html2text

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()

br.set_cookiejar(cj)

br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.addheaders = [('User-agent', 'Chrome')]

br.open('https://sso.mims.com/Account/SignIn')

# View available forms
for f in br.forms():
    print(f)

br.select_form(nr=0)

# User credentials
br.form['EmailAddress'] = '<USERNAME>'  # replace with your credentials
br.form['Password'] = '<PASSWORD>'

# Login
br.submit()
print(br.open('https://mims.com/india/browse/alphabet/a?cat=drug&tab=brand').read())

But the problem is that when the credentials are submitted, an intermediate page comes up with the following message.

You will be redirected to your destination shortly.

This page submits a hidden form, and only then is the required end page shown. I want to access the end page, but br.open('https://mims.com/india/browse/alphabet/a?cat=drug&tab=brand').read() fetches the intermediate page and prints its contents.

How do I wait for the middle page to submit the hidden form and then access the contents of the end page?


Solution

  • I've posted a Selenium solution below, which works, but after understanding a bit more about the login process, it's possible to log in using BeautifulSoup and requests only. Please read the comments in the code.

    BeautifulSoup / requests solution

    import requests
    from bs4 import BeautifulSoup
    
    d = {
        "EmailAddress": "your@email.tld",
        "Password": "password",
        "RememberMe": True,
        "SubscriberId": "",
        "LicenseNumber": "",
        "CountryCode": "SG"
    }
    
    req = requests.Session()
    
    login_url = "https://sso.mims.com/"
    resp = req.post(login_url, data=d)
    
    products_url = "https://mims.com/india/browse/alphabet/a?cat=drug"
    resp = req.get(products_url)  # cookies from the login request are reused automatically because we use a Session
    
    # Here's the tricky part: the site uses two intermediate "re-login" pages that (theoretically) require JavaScript, but we can bypass them by submitting the hidden forms ourselves:
    
    def parse_openid_form(html_text):
        # Each intermediate page holds an auto-submitting form (id="openid_message");
        # grab its action URL and all of its named inputs so we can post it ourselves
        soup = BeautifulSoup(html_text, "html.parser")
        form = soup.find('form', {"id": "openid_message"})
        fields = {inp.get('name'): inp.get('value')
                  for inp in form.find_all('input') if inp.get('name')}
        return form['action'], fields
    
    # First intermediate page
    form_url, form_dict = parse_openid_form(resp.text)
    form_dict['submit_button'] = "Continue"
    relogin = req.post(form_url, data=form_dict)
    
    # Second intermediate page
    form_url, form_dict = parse_openid_form(relogin.text)
    products_a = req.post(form_url, data=form_dict)
    print(products_a.text)
    
    # You can now request any URL normally because the necessary cookies are already present in the current Session
    products_url = "https://mims.com/india/browse/alphabet/c?cat=drug"
    products_c = req.get(products_url)
    print(products_c.text)
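
    If the goal is to collect all the links from a listing page and store them in a list (as asked in the question), a minimal sketch continuing the session above could look like this; the catch-all anchor selector is an assumption, so narrow it once you inspect the page's real markup:

    from urllib.parse import urljoin

    soup = BeautifulSoup(products_c.text, "html.parser")
    # Resolve relative hrefs against the page URL and collect them in a list
    links = [urljoin(products_url, a["href"]) for a in soup.find_all("a", href=True)]
    print(links)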
    

    Selenium solution

    from selenium import webdriver
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    
    driver = webdriver.Firefox()
    wait = WebDriverWait(driver, 10)
    driver.maximize_window()
    
    driver.get("https://sso.mims.com/")
    el = wait.until(EC.element_to_be_clickable((By.ID, "EmailAddress")))
    el.send_keys("your@email.com")
    
    el = wait.until(EC.element_to_be_clickable((By.ID, "Password")))
    el.send_keys("password")
    
    el = wait.until(EC.element_to_be_clickable((By.ID, "btnSubmit")))
    el.click()
    
    wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "profile-section-header"))) # we logged in successfully
    
    driver.get("http://mims.com/india/browse/alphabet/a?cat=drug")
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "searchicon")))
    print(driver.page_source)
    
    # do what you need with the source code
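
    For the question's goal of storing all the links in a list, you can also read them straight from the driver instead of parsing page_source; a short sketch (the tag-name selector is deliberately broad, narrow it to the relevant container for real use):

    # Gather every href on the page into a list; filter as needed
    links = [a.get_attribute("href") for a in driver.find_elements(By.TAG_NAME, "a")]
    print(links)

    driver.quit()  # close the browser when finished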