Search code examples
pythonscreen-scraping

Scraping a website with Python when the page has buttons


I'm using BeautifulSoup, and scraping the website itself works fine. However, the page has three list buttons and four other buttons. Whenever I click one of them the page content changes, but I can't scrape what changed, only the original page. What I'm trying to do is scrape the page for every combination of these three list buttons. More specifically, I want to get the values of the table after each click.

Command below:

from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

#pip install selenium
#dbutils.library.restartPython()

# Download the raw bytes of the page ("xxxx" stands in for the real URL).
# NOTE(review): requests does not execute JavaScript, so only the markup
# that the server sends initially is available below.
html = requests.get("xxxx").content

# Parse the downloaded markup with Python's built-in html.parser backend.
soup = BeautifulSoup(html, 'html.parser')

# Dump the whole parsed document, indented, for inspection.
print(soup.prettify())

# First <table> carrying the CSS class "ajax-overlay"; find() yields None
# when nothing matches.
preco = soup.find("table", class_="ajax-overlay")

print(preco)

# Every <fieldset> element of the page.
# NOTE(review): presumably these wrap the buttons -- confirm against the page.
buttons = soup.findAll('fieldset')
print(buttons)

I tried using BeautifulSoup with the code below:

from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

#pip install selenium
#dbutils.library.restartPython()

# Download the raw bytes of the page ("xxxxx" stands in for the real URL).
# NOTE(review): requests does not execute JavaScript, so only the markup
# that the server sends initially is available below.
html = requests.get("xxxxx").content

# Parse the downloaded markup with Python's built-in html.parser backend.
soup = BeautifulSoup(html, 'html.parser')

# Dump the whole parsed document, indented, for inspection.
print(soup.prettify())

# First <table> carrying the CSS class "ajax-overlay"; find() yields None
# when nothing matches.
preco = soup.find("table", class_="ajax-overlay")

print(preco)

# Every <fieldset> element of the page.
# NOTE(review): presumably these wrap the buttons -- confirm against the page.
buttons = soup.findAll('fieldset')
print(buttons)

Solution

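  • The site loads its data dynamically, so you can first collect every value offered by the three list buttons (the snippets below assume `import json`, `import requests`, `import pandas as pd` and `from bs4 import BeautifulSoup`):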

    def get_options() -> dict:
        """Collect the selectable values of the three price-table filters.

        Downloads the price-table page and reads the ``value`` attribute of
        every ``<option>`` inside the ``<select>`` elements whose ids are
        ``estado``, ``compulsorio`` and ``numero_de_vidas_plano``. Options
        with a missing or empty ``value`` are skipped.

        Returns:
            dict: Maps ``'Estado'``, ``'Compulsorio'`` and
            ``'Numero_de_vidas_plano'`` to the list of option values found
            in the corresponding ``<select>``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer in time.
        """
        url = 'https://kitcorretoramil.com.br/linha-selecionada-pme/tabela-de-precos-pme/'
        # Result key -> id of the <select> element that holds its options.
        select_ids = {
            'Estado': 'estado',
            'Compulsorio': 'compulsorio',
            'Numero_de_vidas_plano': 'numero_de_vidas_plano',
        }

        # requests never times out on its own, so an explicit limit keeps a
        # stalled server from blocking the caller forever.
        response = requests.get(url, timeout=30)
        # Fail early instead of parsing the markup of an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        return {
            key: [
                option.get('value')
                for option in soup.find('select', {'id': select_id}).find_all('option')
                if option.get('value')
            ]
            for key, select_id in select_ids.items()
        }
    

    OUTPUT:

    {'Estado': ['BAHIA', 'CEARÁ', 'DISTRITO FEDERAL', 'GOIÁS', 'CAMPINAS', 'MARANHÃO', 'MINAS GERAIS', 'PARAÍBA', 'PARANÁ', 'PERNAMBUCO', 'RIO DE JANEIRO', 'RIO GRANDE DO NORTE', 'RIO GRANDE DO SUL', 'SANTA CATARINA', 'SÃO PAULO'], 'Compulsorio': ['Compulsório', 'Livre Adesão'], 'Numero_de_vidas_plano': ['2', '3 a 4', '5 a 29', '30 a 99']}
    

    Now you need to fetch the table itself; just pass the estado, compulsorio and numero_de_vidas_plano values you want:

    def get_table(estado: str, compulsorio: str, numero_de_vidas_plano: str) -> pd.DataFrame:
        """Fetch the price table for one combination of filter values.

        Sends the selected filters as a JSON body in a POST request to the
        site's ``admin-ajax.php`` endpoint and builds a DataFrame from the
        JSON answer.

        Args:
            estado: Value sent under the ``"Estado"`` key.
            compulsorio: Value sent under the ``"Compulsorio"`` key.
            numero_de_vidas_plano: Value sent under the
                ``"Numero_de_vidas_plano"`` key.

        Returns:
            pd.DataFrame: The decoded JSON answer without its
            ``legal_text`` entry.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer in time.
        """
        url = "https://kitcorretoramil.com.br/wp-admin/admin-ajax.php?action=ktc_get_price_table_values"
        payload = json.dumps({
          "pf": "false",
          "Estado": estado,
          "Compulsorio": compulsorio,
          "Numero_de_vidas_plano": numero_de_vidas_plano,
          "Linha": "Linha Selecionada",
          "Coparticipação": "Com coparticipação30"
        })
        headers = {
            'accept': '*/*',
            'content-type': 'application/json',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
        }

        # requests never times out on its own, so an explicit limit keeps a
        # stalled server from blocking the caller forever.
        response = requests.post(url, headers=headers, data=payload, timeout=30)
        # Fail early instead of trying to decode the body of an error page.
        response.raise_for_status()
        json_data = response.json()
        # Drop the legal_text entry before building the frame; pop() with a
        # default tolerates answers that do not include it.
        # NOTE(review): presumably legal_text is free text rather than table
        # data -- confirm against a real response.
        json_data.pop('legal_text', None)
        return pd.DataFrame(json_data)
    

    Then try it with some values:

    # Example: fetch the table for one combination of filter values and
    # print the full DataFrame as text.
    df = get_table('BAHIA', 'Compulsório', '2')
    print(df.to_string())
    

    OUTPUT: OUTPUT OF CODE