Search code examples
Tags: python, python-3.x, selenium-webdriver, web-scraping

Extracting Owner’s Username from Nested Page on HuggingFace


I am scraping the HuggingFace research forum (https://discuss.huggingface.co/c/research/7/l/latest) using Selenium. I am able to successfully extract the following attributes from the main page of the forum:

  • Activity Date
  • View Count
  • Replies Count
  • Title
  • URL

However, I am encountering an issue when trying to extract the owner’s username from the individual topic pages. The owner’s username is located on a nested page that is accessible via the URL found in the main page’s topic link.

This is the script I am currently using (the HTML of a topic link on the main page is shown further down, under “Problem”):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

# Headless Chrome configuration (the script is meant to run in Colab).
chrome_options = Options()
for switch in (
    "--headless",  # no visible browser window
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-infobars",
    "--disable-popup-blocking",
    "--ignore-certificate-errors",
    "--incognito",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
):
    chrome_options.add_argument(switch)

for option_name, option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    chrome_options.add_experimental_option(option_name, option_value)

# Where apt installs chromedriver. NOTE: this value is only recorded here; it
# is not passed to webdriver.Chrome() below.
chrome_path = "/usr/bin/chromedriver"

# Start the browser with the options assembled above.
driver = webdriver.Chrome(options=chrome_options)

# Load the forum's "latest" listing for the research category.
url = "https://discuss.huggingface.co/c/research/7/l/latest"
driver.get(url)

# Fixed pause to give the listing time to load before it is scraped.
time.sleep(6)

def _first_match_text(parent, css_selector, default):
    """Return the stripped text of the first descendant matching a selector.

    Args:
        parent: Element (or driver) to search under.
        css_selector: CSS selector identifying the wanted descendant.
        default: Value returned when no descendant matches.

    Returns:
        The matched element's text with surrounding whitespace removed, or
        *default* when nothing matches.
    """
    # find_elements() yields an empty list when nothing matches, so a missing
    # cell needs no exception handling.
    matches = parent.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text.strip() if matches else default


def scrape_huggingface_issues():
    """Scrape the topic listing currently open in ``driver``.

    The work is done in two phases. First every topic row on the listing is
    read (scrolling and, if present, clicking a "Next" link to reveal more
    rows). Only afterwards are the individual topic pages opened to look up
    each owner. Doing the page visits inside the row loop would navigate the
    browser away from the listing and invalidate every row element that had
    not been read yet.

    Returns:
        A list of ``(title, link, owner, replies, views, activity)`` tuples,
        one per distinct ``data-topic-id``. ``replies`` and ``views`` default
        to ``"0"`` and ``activity``/``owner`` to ``"N/A"`` when the
        corresponding element cannot be found.
    """
    # Function-scope import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    listing_rows = []
    seen_topic_ids = set()

    # Phase 1: read the listing without ever leaving it.
    while True:
        found_new_topic = False

        for row in driver.find_elements(By.CSS_SELECTOR, 'tr.topic-list-item'):
            try:
                topic_id = row.get_attribute("data-topic-id")
                if topic_id in seen_topic_ids:
                    continue

                title_links = row.find_elements(By.CSS_SELECTOR, 'a.title.raw-link.raw-topic-link')
                if not title_links:
                    # Nothing to extract from this row; remember it so it is
                    # not inspected again on the next pass.
                    seen_topic_ids.add(topic_id)
                    continue
                title = title_links[0].text.strip()
                link = title_links[0].get_attribute('href')

                replies_count = _first_match_text(
                    row, 'button.btn-link.posts-map.badge-posts span.number', "0"
                )
                views_count = _first_match_text(
                    row, 'td.num.views.topic-list-data span.number', "0"
                )

                activity_cells = row.find_elements(By.CSS_SELECTOR, 'td.num.topic-list-data.age.activity')
                activity_title = activity_cells[0].get_attribute('title') if activity_cells else None
                activity_text = activity_title.strip() if activity_title is not None else "N/A"
            except WebDriverException as e:
                # Best effort: a row that cannot be read (for example because
                # the page re-rendered it) is reported and skipped. It is not
                # marked as seen, so a later pass may pick it up again.
                print(f"Error occurred: {e}")
                continue

            seen_topic_ids.add(topic_id)
            listing_rows.append((title, link, replies_count, views_count, activity_text))
            found_new_topic = True

        # Every pass must contribute at least one new topic; otherwise the
        # listing is exhausted. This also guarantees the loop terminates.
        if not found_new_topic:
            print("No more pages to scrape.")
            break

        try:
            # Scroll to the bottom so an infinite-scroll listing loads more rows.
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
            time.sleep(3)  # Adjust based on loading speed

            # Follow a classic "Next" pagination link when the page has one.
            next_buttons = driver.find_elements(By.CSS_SELECTOR, 'a.next.page-numbers')
            if next_buttons:
                next_buttons[0].click()
                time.sleep(3)  # Wait for the next page to load
        except WebDriverException as e:
            # Could not advance; keep what has been collected so far.
            print(f"Error occurred: {e}")
            break

    # Phase 2: visit each topic page for the owner. Navigating away from the
    # listing is safe now because no row elements are used any more.
    titles_and_links = []
    for title, link, replies_count, views_count, activity_text in listing_rows:
        try:
            owner_text = scrape_issue_details(link)
        except WebDriverException as e:
            print(f"Error occurred: {e}")
            owner_text = "N/A"
        titles_and_links.append((title, link, owner_text, replies_count, views_count, activity_text))

    return titles_and_links

def scrape_issue_details(url):
    """Open a topic page and return the username shown for its first poster.

    Args:
        url: Address of the topic page; it is loaded in the shared ``driver``,
            so the browser is left on that page afterwards.

    Returns:
        The username text, or ``"N/A"`` when no username element appears
        within the wait period or it has no text.
    """
    # Function-scope import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    # Go to the topic page
    driver.get(url)
    # Fixed pause before polling; it also spaces out successive page loads.
    time.sleep(3)

    try:
        # ``new-user`` is deliberately not part of the selector: requiring it
        # matches only authors whose span carries that extra class.
        # NOTE(review): presumably that class is added for new accounts only -
        # confirm against the live markup.
        owner_elem = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span.first.username'))
        )
        # Prefer the text of the profile link; fall back to the span's own
        # text when the link is absent or empty.
        profile_links = owner_elem.find_elements(By.CSS_SELECTOR, 'a')
        link_text = profile_links[0].text.strip() if profile_links else ""
        owner_username = link_text or owner_elem.text.strip()
    except WebDriverException:
        # Covers the wait timing out as well as the element going stale.
        return "N/A"

    return owner_username or "N/A"

# Scrape the HuggingFace issues across all pages and print them. The browser is
# closed in ``finally`` so that an error while scraping or printing does not
# leave the Chrome process running.
try:
    issues = scrape_huggingface_issues()

    # Print the titles, links, and additional data (owner, replies, views, activity)
    print("Scraped Titles, Links, Owner, Replies, Views, Activity:")
    for i, (title, link, owner_text, replies_count, views_count, activity_text) in enumerate(issues, 1):
        print(f"{i}: {title} - {link} - Owner: {owner_text} - Replies: {replies_count} - Views: {views_count} - Activity: {activity_text}")
finally:
    # Close the browser
    driver.quit()

Problem:

I cannot fetch the owner’s username from the individual topic page. After following the URL, I am unable to locate and extract the owner’s username even though I know its location in the HTML. For reference, a topic link on the main page looks like this:

<a href="/t/model-that-can-generate-both-text-and-image-as-output/132209" role="heading" aria-level="2" class="title raw-link raw-topic-link" data-topic-id="132209">Model that can generate both text and image as output</a>

The owner’s username is located on the topic’s individual page at the following HTML snippet:

<span class="first username new-user"><a href="/u/InsertOPUsername" data-user-card="InsertOPUsername" class="">InsertOPUsername</a></span>

What I’ve Tried:

  • I used driver.get(url) to navigate to the individual topic pages.
  • I attempted to locate the username using WebDriverWait and the correct CSS selector (span.first.username.new-user a).
  • I am successfully scraping other details like Activity, Views, and Replies from the main page but unable to retrieve the owner’s username from the topic page.

Solution

  • All the data you're after comes from two API endpoints. Most of what you already have can be fetched from the first one. If you then follow an individual post, you'll get even more data, including the posters section; there you can find your owner, a.k.a. the Original Poster.

    This is just to push you in the right direction (and no selenium needed!). Once you know the endpoints you can massage the data to whatever you like it to be.

    import requests
    from tabulate import tabulate
    
    # JSON version of the category's "latest" listing; the script reads the
    # topics from its "topic_list" -> "topics" entry.
    API_ENDPOINT = "https://discuss.huggingface.co/c/research/7/l/latest.json?filter=latest"
    # JSON for a single topic; the "{}" placeholder is filled with the topic id
    # via str.format() in get_posters().
    TRACK_ENDPOINT = "https://discuss.huggingface.co/t/{}.json?track_visit=true&forceLoad=true"
    
    # Headers sent with every request made below: a browser-like User-Agent,
    # a JSON Accept header, and the XMLHttpRequest marker.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
        "Accept": "application/json",
        "X-Requested-With": "XMLHttpRequest"
    }
    
    def get_posters(track_id: str, current_session: requests.Session, timeout: float = 30) -> dict:
        """Fetch one topic's JSON and summarise who posted in it.

        Args:
            track_id: Topic id substituted into ``TRACK_ENDPOINT`` with
                ``str.format``, so any value that formats to the id works.
            current_session: Session used to send the request.
            timeout: Seconds to wait for the server before giving up, passed
                straight to ``Session.get``.

        Returns:
            A dict with these keys, all taken from ``post_stream.posts``:
            ``owner`` (``username`` of the first post), ``owner_name``
            (``name`` of the first post), ``owner_id`` (``id`` of the first
            post) and ``posters`` (``name`` of every returned post).
            NOTE(review): ``owner_id`` looks like the id of the first *post*
            rather than of its author - confirm before relying on it.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.Timeout: No answer arrived within *timeout* seconds.
        """
        track = current_session.get(
            TRACK_ENDPOINT.format(track_id),
            headers=HEADERS,
            timeout=timeout,
        )
        # Surface an error response (e.g. rate limiting) as an HTTP error
        # instead of a confusing JSON/KeyError on the lines below.
        track.raise_for_status()

        posts = track.json()["post_stream"]["posts"]
        first_post = posts[0]
        return {
            "owner": first_post["username"],
            "owner_name": first_post["name"],
            "owner_id": first_post["id"],
            "posters": [p["name"] for p in posts],
        }
    
    
    # One session is shared by the listing request and every per-topic request.
    with requests.Session() as session:
        response = session.get(API_ENDPOINT, headers=HEADERS)
        topics_data = response.json()["topic_list"]["topics"]

        # Build one table row per topic, in the order the API returned them.
        topics = []
        for topic in topics_data:
            posters = get_posters(topic["id"], session)
            title = topic["title"]
            topic_url = f"https://discuss.huggingface.co/t/{topic['slug']}/{topic['id']}"
            stats = [topic[key] for key in ("posts_count", "views", "like_count", "id")]
            topics.append([title, topic_url, *stats, posters["owner_name"], posters["owner_id"]])

        # Render the rows as a left-aligned text table and show it.
        columns = ["Title", "URL", "Posts", "Views", "Likes", "ID", "Owner", "Owner ID"]
        table = tabulate(topics, headers=columns, tablefmt="pretty", stralign="left")
        print(table)
    
    

    You should get this table:

    +----------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------+-------+-------+-------+--------+------------------------+----------+
    | Title                                                                                  | URL                                                                                                                        | Posts | Views | Likes | ID     | Owner                  | Owner ID |
    +----------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------+-------+-------+-------+--------+------------------------+----------+
    | Merry Christmas & We have released "Awesome-Neuro-Symbolic-Learning-with-LLM"          | https://discuss.huggingface.co/t/merry-christmas-we-have-released-awesome-neuro-symbolic-learning-with-llm/133045          | 1     | 36    | 4     | 133045 | Lan-Zhe Guo            | 191786   |
    | Why do some commits have zero insertions and zero deletions?                           | https://discuss.huggingface.co/t/why-do-some-commits-have-zero-insertions-and-zero-deletions/132603                        | 1     | 12    | 0     | 132603 | Sandra                 | 191238   |
    | Model that can generate both text and image as output                                  | https://discuss.huggingface.co/t/model-that-can-generate-both-text-and-image-as-output/132209                              | 5     | 73    | 7     | 132209 | Bibhuti Bhusan Padhi   | 190689   |
    | Using mixup on RoBERTa                                                                 | https://discuss.huggingface.co/t/using-mixup-on-roberta/306                                                                | 8     | 2228  | 8     | 306    | FRAN Valero            | 576      |
    | Seeking Guidance on Training a Model for Generating Gregorian Chant Music              | https://discuss.huggingface.co/t/seeking-guidance-on-training-a-model-for-generating-gregorian-chant-music/131700          | 3     | 21    | 4     | 131700 | Martim Ramos           | 189949   |
    | Interest in Contributing PEFT Educational Resources - Seeking Community Input          | https://discuss.huggingface.co/t/interest-in-contributing-peft-educational-resources-seeking-community-input/131143        | 3     | 30    | 6     | 131143 | Jen Wei                | 188941   |
    | LLM for analysing JSON data                                                            | https://discuss.huggingface.co/t/llm-for-analysing-json-data/130407                                                        | 2     | 67    | 2     | 130407 | S. Gow                 | 188022   |
    | Models for Document Image Annotation Without OCR                                       | https://discuss.huggingface.co/t/models-for-document-image-annotation-without-ocr/129604                                   | 2     | 109   | 3     | 129604 | Pavel Spirin           | 186986   |
    | Get gaierror when trying to access HF Token for login                                  | https://discuss.huggingface.co/t/get-gaierror-when-trying-to-access-hf-token-for-login/128870                              | 3     | 36    | 3     | 128870 | S. Gow                 | 186043   |
    | Evaluation metrics for BERT-like LMs                                                   | https://discuss.huggingface.co/t/evaluation-metrics-for-bert-like-lms/1256                                                 | 5     | 4455  | 1     | 1256   | Vladimir Blagojevic    | 3083     |
    | Introducing ClearerVoice-Studio: Your One-Stop Speech Processing Platform!             | https://discuss.huggingface.co/t/introducing-clearervoice-studio-your-one-stop-speech-processing-platform/129193           | 3     | 92    | 0     | 129193 | Alibaba_Speech_Lab_SG  | 186434   |
    | Seeking Advice on Building a Custom Virtual Try-On Model Using Pre-Existing Models     | https://discuss.huggingface.co/t/seeking-advice-on-building-a-custom-virtual-try-on-model-using-pre-existing-models/128946 | 1     | 44    | 1     | 128946 | Abeer Ilyas            | 186127   |
    | LLM Hackathon in Ecology                                                               | https://discuss.huggingface.co/t/llm-hackathon-in-ecology/128906                                                           | 1     | 35    | 0     | 128906 | Jennifer D'Souza       | 186080   |
    | Retrieving Meta Data on Models for Innovation Research                                 | https://discuss.huggingface.co/t/retrieving-meta-data-on-models-for-innovation-research/128646                             | 1     | 33    | 1     | 128646 | Fabian F               | 185762   |
    | (Research/Personal) Projects Ideas                                                     | https://discuss.huggingface.co/t/research-personal-projects-ideas/71651                                                    | 3     | 1410  | 0     | 71651  | HeHugging              | 111782   |
    | Understanding Technical Drawings                                                       | https://discuss.huggingface.co/t/understanding-technical-drawings/78903                                                    | 2     | 287   | 1     | 78903  | Yakoi                  | 121186   |
    | Ionic vs. React Native vs. Flutter                                                     | https://discuss.huggingface.co/t/ionic-vs-react-native-vs-flutter/128132                                                   | 1     | 97    | 0     | 128132 | yaw                    | 185084   |
    | Choosing Benchmarks for Fine-Tuned Models in Emotion Analysis                          | https://discuss.huggingface.co/t/choosing-benchmarks-for-fine-tuned-models-in-emotion-analysis/127106                      | 1     | 38    | 1     | 127106 | Pavol                  | 183654   |
    | I have a project Skin Lens Please can you fill the form                                | https://discuss.huggingface.co/t/i-have-a-project-skin-lens-please-can-you-fill-the-form/108980                            | 2     | 48    | 2     | 108980 | Soopramanien           | 158453   |
    | How does an API work?                                                                  | https://discuss.huggingface.co/t/how-does-an-api-work/121828                                                               | 5     | 102   | 2     | 121828 | riddhi patel           | 176354   |
    | More expressive attention with negative weights                                        | https://discuss.huggingface.co/t/more-expressive-attention-with-negative-weights/119667                                    | 2     | 252   | 4     | 119667 | AngLv                  | 173243   |
    | Biases in AI Hallucinations Based on Context                                           | https://discuss.huggingface.co/t/biases-in-ai-hallucinations-based-on-context/117082                                       | 1     | 28    | 1     | 117082 | That Prommolmard       | 169443   |
    | RAG performance                                                                        | https://discuss.huggingface.co/t/rag-performance/116048                                                                    | 1     | 59    | 1     | 116048 | Salah Ghalyon          | 168143   |
    | Gangstalkers AI harassment voice to skull                                              | https://discuss.huggingface.co/t/gangstalkers-ai-harassment-voice-to-skull/115897                                          | 1     | 87    | 0     | 115897 | Andrew Cruz AKA OmegaT | 167944   |
    | How Pika Effects works? 🤔                                                              | https://discuss.huggingface.co/t/how-pika-effects-works/115760                                                             | 1     | 45    | 0     | 115760 | JiananZHU              | 167769   |
    | An idea about LLMs                                                                     | https://discuss.huggingface.co/t/an-idea-about-llms/115462                                                                 | 1     | 56    | 1     | 115462 | Garrett Johnson        | 167279   |
    | Different response from different UI's                                                 | https://discuss.huggingface.co/t/different-response-from-different-uis/115192                                              | 3     | 49    | 2     | 115192 | Marvin Snell           | 166941   |
    | Gradio is more than UI?                                                                | https://discuss.huggingface.co/t/gradio-is-more-than-ui/114715                                                             | 5     | 62    | 4     | 114715 | Zebra                  | 166264   |
    | Narrative text generation                                                              | https://discuss.huggingface.co/t/narrative-text-generation/114869                                                          | 2     | 43    | 1     | 114869 | QUANGDUC               | 166472   |
    | Say goodbye to manual testing of your LLM-based apps – automate with EvalMy.AI beta! 🚀 | https://discuss.huggingface.co/t/say-goodbye-to-manual-testing-of-your-llm-based-apps-automate-with-evalmy-ai-beta/114533  | 1     | 38    | 1     | 114533 | Petr Pascenko          | 166007   |
    +----------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------+-------+-------+-------+--------+------------------------+----------+
    

    Bonus:

    To get more than the first page of the latest topics, you can paginate the API by adding the page=<PAGE_VALUE> parameter to the first endpoint. For example, latest.json?page=2