I am writing a Python script to input a word into Collatinus, the online Latin morphological analysis tool, and receive a full declension/conjugation of that word.
Here is what I have so far:
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import pandas
from bs4 import BeautifulSoup as bs
import sys
#x = input("Word:")
WORD = 'canis'  # word to inflect; swap in the input() line above when ready
ERROR_MARKER = "Une erreur s'est produite"  # Collatinus' French error banner

# NOTE(review): this is msedgedriver driving Edge, so the Edge Service class
# (selenium.webdriver.edge.service.Service) would be the natural import; the
# chrome Service works here because both only wrap an executable path.
# Raw string so the backslashes in the Windows path are not escape sequences.
chrome_service = Service(executable_path=r"C:\Program Files (x86)\msedgedriver.exe")
driver = webdriver.Edge(service=chrome_service)
driver.get("https://outils.biblissima.fr/en/collatinus-web/")
time.sleep(15)


def _refresh(wait):
    """Reload the page and wait.

    Uses CONTROL+r: Keys.COMMAND is the macOS modifier and does nothing
    on a Windows Edge session like this one.
    """
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.CONTROL + 'r')
    time.sleep(wait)


def _submit_word(word, wait):
    """Type *word* into the flexion box, click submit, return the page HTML."""
    driver.find_element(By.ID, "flexion_lemme").send_keys(word)
    time.sleep(7)
    # Button index 4 is the flexion submit button on this page.
    # TODO(review): fragile — prefer an id/CSS selector if the page has one.
    driver.find_elements(By.TAG_NAME, "button")[4].click()
    time.sleep(wait)
    return driver.page_source


_refresh(15)
_refresh(15)
html = _submit_word(WORD, 15)
if ERROR_MARKER in html:
    # One retry after a refresh before giving up.
    _refresh(20)
    html = _submit_word(WORD, 20)
    if ERROR_MARKER in html:
        driver.quit()
        # RuntimeError instead of the original ZeroDivisionError: nothing
        # here divides, so the exception type should describe the failure.
        raise RuntimeError("Collatinus kept returning an error page.")

# Both success paths (first try or retry) fall through to the extraction.
results = driver.find_element(By.ID, "results")
titlesh4 = results.find_elements(By.TAG_NAME, "h4")
titlesp = results.find_elements(By.TAG_NAME, "p")
titleshref = results.find_elements(By.XPATH, "//*[ text() = 'Formes composées' ]")
html = driver.page_source

with open('tables.html', "w", encoding="utf-8") as f:
    f.write(html)

with open("tables.html", "r", encoding="utf-8") as lh:
    soup = bs(lh, "html.parser")
prettyHTML = soup.prettify()

# Strip macron/breve marks so vowels compare equal regardless of quantity.
# A mapping loop rather than str.translate because some keys (e.g. "ā̆")
# are two code points: the vowel plus a combining breve.
_QUANTITY_MARKS = {
    "ā": "a", "ă": "a", "ā̆": "a",
    "ē": "e", "ĕ": "e", "ē̆": "e",
    "ī": "i", "ĭ": "i", "ī̆": "i",
    "ō": "o", "ō̆": "o", "ŏ": "o",
    "ŭ": "u", "ū̆": "u", "ū": "u",
}
for marked, plain in _QUANTITY_MARKS.items():
    prettyHTML = prettyHTML.replace(marked, plain)

with open('tables.html', "w", encoding="utf-8") as f:
    f.write(prettyHTML)
It's still in development.
The issue is when it comes to the titles of each table that I'm drawing from the function.
I wish to have each table of data that I draw from the HTML have a title that I can work with. But, the titles that the software gives are in differing forms: h4, p, a, etc. Also, some titles overarch others in a hierarchy, e.g. the table title of 'subjonctif' will come under another title, let's say, 'actif' or something.
The only way this could be done, in my opinion, is for the program to detect the title preceding each table and decide from there.
Also, those hierarchical ones; I wish that the parent name be included in each of the smaller titles, i.e. 'subjonctif' would become 'actif subjonctif'.
The last issue is that some titles house two or three(?) tables inside of them, so I wish that those could be labelled, e.g., as "subjonctif #1" and "subjonctif #2".
In my opinion (please correct me if I am wrong) all these problems could be easily fixed if the program knew what was before each table.
I have not really tried anything, as I'm not sure really where to start.
If anybody could help, it would be really appreciated.
The first thing I'll advise is to give up Selenium — you don't need it here. To successfully parse many sites, you need to stop thinking like a human and think like a browser. I wrote a simple scraper
that does without selenium
, just parses the necessary token
for POST
request to this endpoint and gets the data. The end result is that we make the initial request, cookies
are stored, we get the token
and can work with it. The spider looks like the following:
from typing import NamedTuple
import requests
from bs4 import BeautifulSoup
class FormData(NamedTuple):
    """The two hidden form fields Collatinus expects with every POST."""

    opera: str
    token: str

    def to_string(self) -> str:
        """Serialize the fields as a URL-encoded query fragment."""
        parts = (f'opera={self.opera}', f'token={self.token}')
        return '&'.join(parts)
class Scraper:
    """Session-scoped client for the Collatinus web form.

    On ``__enter__`` it fetches the start page, keeps the cookies, and
    scrapes the hidden token belonging to *form_option* (``'flexion'``
    by default); ``parse_word`` then POSTs a word with that token.
    """

    START_URL = 'https://outils.biblissima.fr/en/collatinus-web/'
    # NOTE(review): the doubled path segment is what the original code
    # POSTed to — confirm against the live site before "fixing" it.
    ENDPOINT = (
        'https://outils.biblissima.fr/collatinus-web/collatinus-web.php'
        '/collatinus-web.php'
    )

    def __init__(self, form_option: str = 'flexion') -> None:
        self.session = None
        self.form_data = None
        self.form_option = form_option

    def __enter__(self):
        # One Session from the start: the cookies set by the initial GET
        # are then stored automatically instead of being copied by hand.
        self.session = requests.Session()
        response = self.session.get(self.START_URL)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # The form is a pair of hidden inputs: name="opera" carries the
        # form id, and the next hidden input carries its token.
        prev_input_value = ''
        for inp in soup.find_all('input', attrs={'type': "hidden"}):
            if prev_input_value == self.form_option:
                self.form_data = FormData(
                    opera=prev_input_value,
                    token=inp['value'],
                )
                break
            # .get() — some hidden inputs may have no name attribute.
            if inp.get('name') == 'opera':
                prev_input_value = inp['value']
        if self.form_data is None:
            # Fail fast with a clear message instead of leaving the
            # scraper half-initialized for parse_word to trip over.
            raise RuntimeError(
                f'token for form option {self.form_option!r} not found'
            )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.session.close()

    def parse_word(self, target_word: str):
        """POST *target_word* to the flexion endpoint and parse the reply."""
        if self.form_data is None or self.session is None:
            raise RuntimeError('invalid initialization')
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        response = self.session.post(
            self.ENDPOINT,
            headers=headers,
            data=f'lemme={target_word}&{self.form_data.to_string()}',
        )
        response.raise_for_status()
        return _parse_html(response.text)
The token for your form can be found here:
To work you will need the following libraries: requests
, beautifulsoup4
, lxml
. Install them using pip
or another package manager. The spider
will do everything itself the first time you connect, it will parse the token
, set cookies
, in order for you to use it, do the following:
with Scraper() as scraper:
    # feed it any number of words, one per call
    words = ['canis']
    for word in words:
        scraper.parse_word(word)
Now let's get to the interesting part. I have written a parsing function for you as I understand your request, but the data on your site is very chaotic. Sometimes there are no elements that can be used for headers at all, sometimes there are just too many, and it is not clear what to use. Now, the logic of the parser is:
Run it, look at the output, and tell me which of the printed nodes you expect to serve as the table name — for now we only collect candidates for the current node, not parent names. From there I can advise you further.
def _parse_html(html):
    """Pair every results table with the heading-like nodes above it.

    For each ``table-responsive`` div, walk backwards through its
    preceding siblings (stopping at the previous table) and collect the
    <p>/<a>/<h4> nodes — the candidate titles — then return a tuple of
    ``(candidate_title_nodes, table)`` pairs, printing each pair for
    inspection along the way.
    """
    table_cls_name = 'table-responsive'
    valid_tags = {'p', 'a', 'h4'}  # add if there's anything else

    def _before_previous_table(node):
        # Keep walking while there is a sibling and it is not a table.
        if node is None:
            return False
        try:
            return node.get('class')[0] != table_cls_name
        except (TypeError, AttributeError):
            # Text nodes / nodes without classes: not a table, continue.
            return True

    soup = BeautifulSoup(html, 'lxml')
    pairs = []
    for table in soup.find_all(class_=table_cls_name):
        candidates = []
        node = table.previous_sibling
        while _before_previous_table(node):
            if node.name in valid_tags:
                # if you need the nodes' text,
                # append `node.text` instead of `node`
                candidates.append(node)
            node = node.previous_sibling
        # We walked backwards, so reverse to restore document order
        # (the original returned the titles nearest-first).
        candidates.reverse()
        pairs.append((candidates, table))
    for names, table in pairs:
        print('POSSIBLE TABLE NAMES', names)
        print('RESULT TABLE', table)
        print('#############################################################')
    return tuple(pairs)