Search code examples
python · pandas · selenium-webdriver · web-scraping · selenium-chromedriver

Trying to scrape the Google Maps links embedded (iframed) in dynamic web pages: it succeeds on some pages but fails on others


I'm trying to get the time and location info from Timable, a website that lists public events. It uses Google Maps to show the detailed locations. Some events have only one location, and for those I've successfully obtained the map link; but for events with several locations, it seems no Google Maps link is present in the page at all. Why does this happen?

My code is shown below, along with four example event pages for testing. The number in each comment is the number of locations for that event.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException 
import time
import pandas as pd
import os
import re 

# Browser session shared by the whole script.
chrome_options = Options()
# Pass the options object to the driver so that any flag added to it
# (e.g. headless mode) actually takes effect instead of being silently ignored.
browser = webdriver.Chrome(options=chrome_options)


# Test examples. The leading number is how many locations the event lists.
# 1 location: Luen Wo Hui Community Hall (聯和墟社區會堂)
url = 'https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022'
# 3 locations: Mid-Autumn Lantern Carnival 2024 (甲辰年中秋綵燈會 2024)
# url = 'https://timable.com/hk/zh/event/66a32ffbea8f6c5f5431d608/%E7%94%B2%E8%BE%B0%E5%B9%B4%E4%B8%AD%E7%A7%8B%E7%B6%B5%E7%87%88%E6%9C%832024'
# 6 locations: herbal tea house community tour (避風涼茶館社區巡演)
# url = 'https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94'
# 1 location: Skechers Summer Waterpark
# url = 'https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92'

# Initial load of the page; extract_event_info() loads the URL again itself.
browser.get(url)

# Crude fixed wait so the dynamically rendered page has time to settle.
time.sleep(5)

# -----------------------------------------------------------------------------------------------------------------------
def sanitize_string(value):
    """Return `value` with control characters stripped.

    Removes every character in the ranges U+0000-U+001F and U+007F-U+009F;
    all other characters are kept unchanged.
    """
    control_chars = re.compile(r'[\x00-\x1f\x7f-\x9f]')
    return control_chars.sub('', value)
# -----------------------------------------------------------------------------------------------------------------------
def find_between(s, first, last):
    """Return the text of `s` between the first `first` and the next `last`.

    The search for `last` starts right after the end of `first`. An empty
    string is returned when either marker cannot be found.
    """
    first_pos = s.find(first)
    if first_pos == -1:
        return ""

    begin = first_pos + len(first)
    finish = s.find(last, begin)
    if finish == -1:
        return ""

    return s[begin:finish]
# -----------------------------------------------------------------------------------------------------------------------
def extract_event_info(url):
    """Load an event page and print the Google Maps links found on it.

    The page is loaded with the module-level `browser`. The whole attempt
    (load, wait, extraction) is retried up to three times when any step
    raises, with a two second pause between attempts.

    Args:
        url: Address of the event page to load.

    Returns:
        Always None. Findings are reported with `print`; the assembled
        time/location summary is not returned yet.
    """
    attempts_u = 3
    while attempts_u > 0:
        try:
            browser.get(url)
            time.sleep(5)  # fixed pause before waiting on specific elements

            parent_container = _wait_for_parent_container(url)
            if parent_container is None:
                # The helper has already reported the failure.
                return None

            _print_map_links()

            # TODO: hand this summary back to the caller once the event
            # record is returned instead of None.
            _collect_time_locations(parent_container)

            break  # Break if successful

        except Exception as e:
            # Covers StaleElementReferenceException, TimeoutException and any
            # other lookup failure: all are handled the same way (retry).
            attempts_u -= 1
            if attempts_u == 0:
                print(f"Error(getting url) for url: {url}: {e}")
                return None
            time.sleep(2)  # Wait before retrying

    return None


def _wait_for_parent_container(url, attempts=3):
    """Return the main page container once it is visible, or None after `attempts` failed waits."""
    for remaining in range(attempts - 1, -1, -1):
        try:
            return WebDriverWait(browser, 15).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@class='chakra-container cha-pmdu9d']"))
            )
        except Exception as e:
            if remaining == 0:
                print(f"Error(parent_container) for url: {url}: {e}")
                return None
            time.sleep(2)  # Wait before retrying
    return None


def _print_map_links():
    """Print the location titles and Google Maps hrefs found for the 'displayLocation' box."""
    # until() either returns the element(s) or raises TimeoutException, so
    # reaching the next statement means the lookup succeeded.
    event_box2 = WebDriverWait(browser, 15).until(
        EC.visibility_of_element_located((By.XPATH, "//div[@id='displayLocation']"))
    )
    print("event_box2 found!")

    box2_child_box_list = WebDriverWait(event_box2, 15).until(
        EC.presence_of_all_elements_located((By.XPATH, ".//div[contains(@class, 'cha-gq6fqh')]"))
    )
    print("box2_child_box_list found!")

    # NOTE(review): an XPath starting with `//` is evaluated against the whole
    # document even when called on an element. Kept as in the original; use
    # `.//a[...]` if only links inside the box are wanted.
    a_tags = event_box2.find_elements(By.XPATH, "//a[contains(@href,'https://maps.google.com/maps?ll=')]")
    if not a_tags:
        return

    print("a tags found!")
    location_div_titles = [
        box2_child_box.find_element(By.XPATH, ".//p[contains(@class, 'chakra-text cha-722v25')]").text
        for box2_child_box in box2_child_box_list
    ]
    print(f"location_div_titles: {location_div_titles}")
    for a_tag in a_tags:
        link = a_tag.get_attribute("href")
        print(f"link: {link}")


def _collect_time_locations(parent_container):
    """Return a '~'-joined summary with one 'times; locations' entry per schedule box."""
    # Box covering all the times and locations.
    event_box1 = parent_container.find_element(By.XPATH, ".//div[contains(@class, 'cha-nm882m')]")

    box1 = []
    for child_box in event_box1.find_elements(By.XPATH, "./div"):
        times = []
        locations = []
        for child_box_div in child_box.find_elements(By.XPATH, "./div"):
            element_class = child_box_div.get_attribute("class")
            if element_class == 'chakra-stack cha-16yidj1':  # time
                times.append(child_box_div.text)
            elif element_class == 'chakra-stack cha-1igwmid':  # location
                location = child_box_div.find_element(By.XPATH, ".//button").text
                # The second <p>, when present, holds the detailed address.
                address_paragraphs = child_box_div.find_elements(By.XPATH, ".//p[2]")
                if address_paragraphs:
                    location += ', ' + address_paragraphs[0].text
                locations.append(location)
        box1.append('; '.join((', '.join(times), ', '.join(locations))))

    return '~'.join(box1)

try:
    # event = extract_event_info(url)
    extract_event_info(url)
    # Disabled Excel export of the result, kept for reference:
    # print(event)
    # output_file = os.path.join('output', f'output_test_1018.xlsx')  # Output file name in the output folder
    # columns = ['Title', 'Type', 'Keyword', 'Location Num', 'Time & Location', 'Description']
    # events_df = pd.DataFrame(event, columns=columns)
    # events_df = pd.DataFrame([event], columns=columns)
    # events_df.to_excel(output_file, index=False)
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process), and the finally clause runs it even if extraction raised.
    browser.quit()

For example, when testing the first event, I got the link: [screenshot: enter image description here]. But for the second event, no link was obtained: [screenshot: enter image description here].


Solution

  • You can use the site's API route endpoint to get the desired output. Append the `_data` query parameter to the event URL: {url}?_data=routes%2F%24region.%24locale._main.event.%24id.%28%24slug%29

    Sample code:

    import requests
    
    def getDetails(url):
        """Print the name and a Google Maps URL for each location of an event.

        The event page's `_data` route endpoint is requested and every entry
        of `event.sections` in the JSON response is printed as a two-item
        list. Sections without a location or without two coordinates are
        skipped.

        Args:
            url: Address of the event page (without the `_data` parameter).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        api_url = f'{url}?_data=routes%2F%24region.%24locale._main.event.%24id.%28%24slug%29'
        # Without a timeout, requests can wait indefinitely on a stalled server.
        resp = requests.get(api_url, timeout=10)
        resp.raise_for_status()
        location_details = resp.json()['event']['sections']

        for section in location_details:
            location = section.get('location') or {}
            coordinate = location.get('coordinate') or []
            if len(coordinate) < 2:
                continue
            # NOTE(review): Google Maps' `ll` parameter is normally
            # latitude,longitude, while the values here are emitted in the
            # order the API returns them (which looks like longitude first).
            # Confirm and swap the two if so.
            data = [f"name: {location.get('name', '')}", f"map_url: https://maps.google.com/maps?ll={coordinate[0]},{coordinate[1]}&t=m&hl=en-US&gl=US&mapclient=apiv3"]
            print(data)
    
    # Event pages to query; each URL is announced before its locations are printed.
    urls = [
        'https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022',
        'https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94',
        'https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92',
    ]
    for event_url in urls:
        print(f"[+] Location of URL: {event_url} ---------")
        getDetails(event_url)
    

    Sample output:

    [+] Location of URL: https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022 ---------
    ['name: 聯和墟社區會堂', 'map_url: https://maps.google.com/maps?ll=114.140367,22.495468&t=m&hl=en-US&gl=US&mapclient=apiv3']
    [+] Location of URL: https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94 ---------
    ['name: 添馬公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.165768,22.281595&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 中山紀念公園', 'map_url: https://maps.google.com/maps?ll=114.144315,22.290303&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 上水花園 第一號', 'map_url: https://maps.google.com/maps?ll=114.130484,22.503538&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 粉嶺聯和趁墟 - 戶外活動空間(魚花園)', 'map_url: https://maps.google.com/maps?ll=114.1427568,22.4978873&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 荃灣公園 中央廣場', 'map_url: https://maps.google.com/maps?ll=114.113423,22.364039&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 黃大仙廣場', 'map_url: https://maps.google.com/maps?ll=114.193813,22.341969&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: JCCAC賽馬會創意藝術中心', 'map_url: https://maps.google.com/maps?ll=114.165719,22.334665&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 觀塘海濱花園', 'map_url: https://maps.google.com/maps?ll=114.216954,22.312821&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 牛棚藝術村 露天空地', 'map_url: https://maps.google.com/maps?ll=114.191326,22.320579&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 荔枝角公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.138414,22.339071&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 風之塔公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.1538531,22.2429592&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 香港仔海濱公園 露天廣場', 'map_url: https://maps.google.com/maps?ll=114.15334,22.247818&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 仁愛廣場', 'map_url: https://maps.google.com/maps?ll=113.975603,22.396653&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 天水圍嘉湖銀座廣場', 'map_url: https://maps.google.com/maps?ll=114.003357,22.457727&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 東區文化廣場', 'map_url: https://maps.google.com/maps?ll=114.23025,22.282962&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 西貢海濱公園', 'map_url: https://maps.google.com/maps?ll=114.274504,22.382432&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 九龍公園 拱廊', 'map_url: https://maps.google.com/maps?ll=114.170356,22.300241&t=m&hl=en-US&gl=US&mapclient=apiv3']
    ['name: 茂蘿街7號 公眾休憩空間', 'map_url: https://maps.google.com/maps?ll=114.176814,22.27736&t=m&hl=en-US&gl=US&mapclient=apiv3']
    [+] Location of URL: https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92 ---------
    ['name: Skechers Summer Waterpark', 'map_url: https://maps.google.com/maps?ll=114.1356492743805,22.507647999322668&t=m&hl=en-US&gl=US&mapclient=apiv3']
    

    Let me know if this solves your problem