Search code examples
pythonseleniumweb-scrapingmapsopenstreetmap

Get universities names from OpenStreetMap


I'm trying to get the names of universities from an OpenStreetMap map embedded in this website: https://collegecrisis.shinyapps.io/dashboard/.

I tried to automate this task with the Python Selenium library: the script hovers over the universities one by one and records their names. It seemed fine, but when I took a closer look I found some wrong data. I think this happened when the script tried to hover over a spot that was crowded with universities, which made it hover over a different university and take that one's name instead. I thought about zooming in, taking the name, and then zooming out, but this would take far too long and could cause runtime errors if a zoom in or a zoom out is missed.

I don't know much about maps, so I want to ask whether there is any way to get the names of all the marked universities on the map at once.

If anyone needs the code I tried, here it is:

from selenium import webdriver
from bs4 import BeautifulSoup
import lxml
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

from selenium.webdriver.common.by import By

# Start Chrome through a local chromedriver binary and load the dashboard.
PATH = "/Applications/chromedriver"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(10)  # implicit wait, in seconds
driver.get("https://collegecrisis.shinyapps.io/dashboard/")

# Every element carrying the "leaflet-interactive" class.
# find_elements(By.CLASS_NAME, ...) is used instead of
# find_elements_by_class_name, which no longer exists in Selenium >= 4.3.
nodes = driver.find_elements(By.CLASS_NAME, "leaflet-interactive")

# Lower-cased tooltip text for each element, in the same order as `nodes`;
# None where no tooltip was found after hovering.
nodelist = []

for node in nodes:
    # Hover (not click) over the element, then pause before reading the page.
    ActionChains(driver).move_to_element(node).perform()
    sleep(.5)

    # Re-parse the current page and take the first tag whose class matches.
    page = BeautifulSoup(driver.page_source, 'lxml')
    tooltip = page.find(class_=lambda value: value and 'leaflet-tooltip leaflet-zoom-animated' in value)

    # A missing tooltip used to raise AttributeError and abort the whole run;
    # record None instead so positions in `nodelist` still line up with `nodes`.
    nodelist.append(tooltip.text.lower() if tooltip is not None else None)

which was inspired by this one


Solution

  • This service uses HTTP streaming. It simply opens an HTTP connection to the following endpoint:

    POST https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_streaming
    

    And it will send commands using the following endpoint :

    POST https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_send
    

    You can check the result by looking for xhr_streaming in the Network tab of the Chrome developer console.

    The token is retrieved from another http call on :

    GET https://collegecrisis.shinyapps.io/dashboard/{workerIDFull}__token__
    

    The workerID, in turn, is present in the original page itself.

    Some parameters named singletons are necessary and are also located in the original page in a script tag like this :

    <script type="application/shiny-singletons">fafb5589cb5a9f24485f3df0511b50d5cd0c7497,603e796bcfc2ab3685167d58c426f64c15a95192</script>
    

    The following script:

    • scrapes the required elements from the original page
    • gets the token using the workerID
    • launches the POST /xhr_streaming request in a new thread
    • sends the "open channel" command on POST /xhr_send, which is '["0#0|o|"]'
    • sends the "init" command using the singletons value previously scraped and a large static JSON config

    The complete code :

    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    from random import choice
    from string import ascii_letters,digits
    from threading import Thread
    from time import sleep
    import json
    
    # Shared session, reused by every later request in this script.
    session = requests.Session()
    r = session.get("https://collegecrisis.shinyapps.io/dashboard/")
    soup = BeautifulSoup(r.content, "lxml")

    # Text of the <script type="application/shiny-singletons"> tag; sent back
    # unchanged in the init message.
    singletons = soup.find("script", {"type":"application/shiny-singletons"}).text

    # The <base href> holds the worker path; the id is the word after "_w_".
    workerIDFull = soup.find("base")["href"]
    # Raw string: "\w" inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    workerID = re.search(r'_w_(\w+)', workerIDFull).group(1)
    timestamp = int(round(time.time() * 1000))  # current time in milliseconds

    r = session.get(f"https://collegecrisis.shinyapps.io/dashboard/{workerIDFull}__token__",
        params = {
            "_": timestamp
    })
    token = r.text

    # Client-chosen random path components of the SockJS URLs used below.
    random_token = ''.join(choice(ascii_letters) for _ in range(18))
    random_token2 = ''.join(choice(ascii_letters) for _ in range(8))
    random_num = ''.join(choice(digits) for _ in range(3))
    
    def getData():
        """Read the xhr_streaming response and print the map labels it carries.

        Every non-empty line received is printed as-is.  Lines that are SockJS
        array frames (the letter ``a`` followed by a JSON array of message
        strings) are decoded; each message is expected to have the form
        ``<id>|<type>|<json payload>``.  When a payload has a ``values`` object
        containing ``homeMap``, ``args[8]`` of its first ``addCircles`` call is
        printed.

        The loop ends only when the server closes the stream.
        """
        r = requests.Request("POST", f"https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_streaming").prepare()

        # The context manager closes the streamed connection on exit or error.
        with session.send(r, stream=True) as resp:
            for line in resp.iter_lines():
                if not line:
                    continue
                print(line)

                # Only "a[...]" frames carry messages; anything else
                # (prelude, open and heartbeat lines) is skipped.
                if not line.startswith(b"a"):
                    continue
                try:
                    # Parsing the frame as JSON undoes the string escaping and
                    # decodes UTF-8 correctly (unicode_escape would not).
                    messages = json.loads(line[1:])
                except ValueError:
                    continue

                for message in messages:
                    if not isinstance(message, str):
                        continue
                    # maxsplit=2 keeps any "|" inside the JSON payload intact.
                    parts = message.split("|", 2)
                    if len(parts) < 3:
                        continue
                    try:
                        data = json.loads(parts[2])
                    except ValueError:
                        # Empty or non-JSON payload: nothing to extract.
                        continue
                    if not isinstance(data, dict):
                        continue

                    # Messages whose "values" is not an object, or that do not
                    # include the homeMap output, must not kill the reader.
                    values = data.get("values")
                    if not isinstance(values, dict) or "homeMap" not in values:
                        continue

                    labels = [
                        call["args"][8]
                        for call in values["homeMap"]["x"]["calls"]
                        if call["method"] == "addCircles"
                    ]
                    if labels:
                        print(labels[0])
    
    def openChannel():
        """POST the '["0#0|o|"]' frame to the xhr_send endpoint of this session."""
        send_url = f"https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_send"
        plain_text = {"Content-Type":"text/plain;charset=UTF-8"}
        session.post(send_url, data='["0#0|o|"]', headers=plain_text)
    
    def sendInit():
        """POST the "init" message to the xhr_send endpoint of this session.

        The message is a JSON object ``{"method": "init", "data": {...}}``.  Its
        ``data`` holds a few sidebar/reset inputs, one
        ``.clientdata_output_<name>_hidden`` flag per output, the page URL
        parts, and the ``singletons`` string scraped from the page.  The JSON
        text is embedded, escaped, in a ``["1#0|m|<message>"]`` frame.
        """
        # Outputs reported with hidden == False, in the order they are sent.
        visible_outputs = (
            "authModal", "homefullOnlineVB", "homepOnlineVB", "homeHybridVB",
            "homepPersonVB", "homePersonVB", "homeTBDVB", "homeOtherVB",
            "homeTotalShownVB", "homeMap",
        )
        # Outputs reported with hidden == True, in the order they are sent.
        hidden_outputs = (
            "graphStateFilter", "fallBarGraph", "covidAthleticGraph",
            "schoolCovidPlot", "intlFilter", "intlGraph", "facultyBarGraph",
            "stateTrendsGraph", "covidHeatmap", "announceHeatmap",
            "onlineHeatmap", "springBreak", "peerInstPicker", "statusFilter",
            "rankcatFilter", "hospitalFilter", "covidFilter",
            "campusTypeFilter", "sectorFilter", "ccbasicFilter",
            "divisionFilter", "conferenceFilter", "sizeSlider",
            "resHallSlider", "sportsRevenueSlider", "intlSlider", "onlineVB",
            "announcedVB", "noDecisionVB", "totalVB", "dateSlider",
            "springMap", "fallPeerInstPicker", "fallStatusFilter",
            "fallRankcatFilter", "fallFacultyFilter", "fallHospitalFilter",
            "fallCovidFilter", "fallCampusTypeFilter", "fallSectorFilter",
            "fallCcbasicFilter", "fallStaffFilter", "fallDivisionFilter",
            "fallConferenceFilter", "fallSizeSlider", "fallResHallSlider",
            "fallSportsRevenueSlider", "fallIntlSlider", "fallfullOnlineVB",
            "fallpOnlineVB", "fallHybridVB", "fallpPersonVB", "fallPersonVB",
            "fallTBDVB", "fallOtherVB", "fallTotalShownVB", "fallMap",
            "greFilter", "modelFilter", "planFilter", "videoPlatformFilter",
            "lawSectorFilter", "lawMinoritySlider", "lawLSATtwofiveSlider",
            "lawLSATmedianSlider", "lawLSATsevenfiveSlider",
            "lawAcceptanceSlider", "lawFYSlider", "lawFullOnlineVB",
            "lawPartialOnlineVB", "lawHybridVB", "lawPersonVB", "lawNDVB",
            "lawTotalVB", "lawMap", "intlOnlineVB", "intlHybridVB",
            "intlInPersonVB", "intlCovidVB", "intlTBDVB", "intlTotalVB",
            "intlMap",
        )

        # Keys are inserted in the same order as the original literal so the
        # serialized JSON is unchanged.
        client_state = {
            "sidebarItemExpanded": None,
            "sidebarCollapsed": True,
            "resetAll:shiny.action": 0,
            "fallResetAll:shiny.action": 0,
            "lawResetAll:shiny.action": 0,
        }
        for output in visible_outputs:
            client_state[f".clientdata_output_{output}_hidden"] = False
        for output in hidden_outputs:
            client_state[f".clientdata_output_{output}_hidden"] = True
        client_state.update({
            ".clientdata_pixelratio": 1,
            ".clientdata_url_protocol": "https:",
            ".clientdata_url_hostname": "collegecrisis.shinyapps.io",
            ".clientdata_url_port": "",
            ".clientdata_url_pathname": "/dashboard/",
            ".clientdata_url_search": "",
            ".clientdata_url_hash_initial": "",
            ".clientdata_url_hash": "",
            ".clientdata_singletons": singletons,
            ".clientdata_allowDataUriScheme": True,
        })

        message = json.dumps({"method": "init", "data": client_state})
        # Dumping the string a second time escapes it for use inside a JSON
        # string; [1:-1] drops the surrounding quotes added by that dump.
        escaped = json.dumps(message)[1:-1]

        send_url = f"https://collegecrisis.shinyapps.io/dashboard/__sockjs__/n={random_token}/t={token}/w={workerID}/s=0/{random_num}/{random_token2}/xhr_send"
        session.post(
            send_url,
            data=f'["1#0|m|{escaped}"]',
            headers={"Content-Type":"text/plain;charset=UTF-8"},
        )
    
    # Run the streaming reader in a background thread.
    thread = Thread(target=getData)
    thread.start()

    # Pause one second before sending commands
    # (presumably to let the streaming request connect first).
    sleep(1)

    openChannel()
    sendInit()

    # Block until the reader thread finishes.
    thread.join()
    

    Check out the field data["values"]["homeMap"]["x"]["calls"] if you need more data from the map.

    run this on repl.it