Search code examples
pythonselenium-webdriver

Cannot scrape XPath using Selenium


I am trying to scrape reviews from Glassdoor. I can scrape the text reviews, but I have trouble scraping the recommendation (Yes/No). An example URL is https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm. Here is a screenshot of what I am working on.

enter image description here

Here, I scraped the pros and cons, and I also want to get the recommendation. I inspected the recommendation icons: the d attribute of the "Yes" icon contains 8.835, and that of the "No" icon contains 18.299. Every other part works; the only problem is the "# 3. Scrape Recommendations" part. An example of the target XPath is

//*[@id="empReview_9142916"]/div[2]/div[2]/div[1]/svg/path

Ten reviews are gathered from each page, but zero recommendations are found when I run

print(svg_elements)

It shows an empty list. Below is my current code, with my ID and password removed. Thank you in advance for your help.

import csv
import time
from seleniumbase import SB
from selenium.webdriver.common.by import By

def _classify_recommendation(d_attribute):
    """Translate the ``d`` attribute of a recommendation icon into a label.

    Returns 'Yes' or 'No' when the path data contains the fragment that
    identifies that icon, and 'Unknown' otherwise (missing attribute, or an
    icon that is neither of the two, e.g. a neutral / no-data rating).
    """
    if d_attribute:
        if '8.835 17.64' in d_attribute:  # Unique part of the "Yes" SVG
            return 'Yes'
        if '18.299 5.327' in d_attribute:  # Unique part of the "No" SVG
            return 'No'
    return 'Unknown'


def scrape_stackoverflow_cloudflare_and_save_csv(csv_filename="cloudflare_questions.csv"):
    """
    Scrape Glassdoor review pages 1 to 5 (pros, cons, recommendation) and save them to a CSV file.

    Despite its name and default filename, this function scrapes the Amazon
    reviews on Glassdoor; both are kept unchanged so existing callers keep
    working.

    Args:
        csv_filename: Path of the CSV file to write. The file is overwritten
            and gets the header ``pros_text, cons_text, recommendation``.

    Returns:
        None. Any exception raised while scraping or saving is caught and
        reported with ``print`` instead of being propagated.
    """
    # Inline <svg> and <path> elements live in the SVG namespace, so the plain
    # XPath name tests `svg/path` never match them and the lookup returns an
    # empty list. Matching on local-name() selects them regardless of namespace.
    recommendation_xpath = (
        '//div[contains(@id, "empReview")]/div[2]/div[2]/div[1]'
        '/*[local-name()="svg"]/*[local-name()="path"]'
    )
    try:
        with SB(uc=True) as sb:
            base_url = "https://www.glassdoor.com"
            start_url = "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm"

            all_pros = []
            all_cons = []
            all_recommendations = []

            # Loop through pages 1 to 5
            for page_num in range(1, 6):
                print(f"Scraping page {page_num}...")
                if page_num == 1:
                    sb.uc_open_with_reconnect(start_url, 6)
                else:
                    next_page_link = f"/Reviews/Amazon-Reviews-E6036_P{page_num}.htm"
                    sb.open(base_url + next_page_link)
                if page_num == 2:
                    # The sign-in form is expected when page 2 is opened.
                    email_input = sb.find_element('input[data-test="emailInput-input"]')
                    email_input.send_keys("my id")
                    sb.sleep(2)  # Wait for the email to be entered
                    continue_button = sb.find_element('button[data-test="email-form-button"]')
                    continue_button.click()
                    sb.sleep(2)  # Wait for the next page to load

                    password_input = sb.find_element('input[data-test="passwordInput-input"]')
                    password_input.send_keys("my password")
                    sb.sleep(2)  # Wait for the password to be entered
                    sign_in_button = sb.find_element('button[data-role-variant="primary"][type="submit"]')
                    sign_in_button.click()
                    sb.sleep(2)  # Wait for the sign-in process to complete
                sb.uc_gui_click_captcha()
                sb.sleep(4)  # Wait for the page to load

                # 1. Scrape PROS
                pros_elements = sb.find_elements('span[data-test="review-text-PROS"]')
                pros_texts = [elem.text.strip() for elem in pros_elements if elem.text.strip()]

                # 2. Scrape CONS
                cons_elements = sb.find_elements('span[data-test="review-text-CONS"]')
                cons_texts = [elem.text.strip() for elem in cons_elements if elem.text.strip()]

                # 3. Scrape Recommendations (Yes/No)
                # Every matched icon yields exactly one entry ('Unknown' when it
                # is neither "Yes" nor "No"), so a neutral / no-data rating does
                # not shift the recommendations of the reviews that follow it.
                # NOTE(review): assumes one <path> per review at this location.
                svg_elements = sb.find_elements(By.XPATH, recommendation_xpath)
                recommendations = [
                    _classify_recommendation(svg.get_attribute('d'))
                    for svg in svg_elements
                ]

                # Debug: Print collected data for this page
                print(f"Page {page_num} - Pros: {len(pros_texts)}, Cons: {len(cons_texts)}, Recommendations: {len(recommendations)}")

                # zip() stops at the shortest list, so one short column would
                # silently discard scraped rows. Pad each page to a common
                # length and make the mismatch visible instead.
                page_rows = max(len(pros_texts), len(cons_texts), len(recommendations))
                if not (len(pros_texts) == len(cons_texts) == len(recommendations)):
                    print(f"Warning: page {page_num} column counts differ; missing cells are left blank and rows may be misaligned.")
                for column in (pros_texts, cons_texts, recommendations):
                    column.extend([""] * (page_rows - len(column)))

                # Collect data from this page
                all_pros.extend(pros_texts)
                all_cons.extend(cons_texts)
                all_recommendations.extend(recommendations)

            # Save all collected data to CSV
            print("Saving data to CSV...")
            with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["pros_text", "cons_text", "recommendation"])
                # The three lists have equal length thanks to the per-page padding.
                writer.writerows(zip(all_pros, all_cons, all_recommendations))

            print("Scraping completed successfully!")

    except Exception as e:
        # Top-level boundary of the script: report the failure and return.
        print(f"An error occurred: {e}")
    finally:
        print("Exiting function (finally block).")


# Example usage: run the scraper with its default CSV filename, but only when
# this file is executed directly (not when it is imported as a module).
if __name__ == "__main__":
    scrape_stackoverflow_cloudflare_and_save_csv()

Solution

  • Instead of using the SVG path values to check whether a review is recommended, check the class of the div, which clearly states whether the rating is positive, negative, or neutral. There are neutral and no-data ratings as well.

    I have modified only the recommendation-checking part. You still have to add the checks for the neutral and no-data cases yourself.

    Try this:

    # Each review's "Recommend" label sits in a <span>; its parent <div>
    # carries a class name that encodes the rating.
    recommend_divs = sb.find_elements(By.XPATH, '//span[text()="Recommend"]/parent::div')
    recommendations = []
    for recommend_div in recommend_divs:
        class_names = recommend_div.get_attribute('class')
        if 'positiveStyles' in class_names:
            recommendations.append('Yes')
            continue
        if 'negativeStyles' in class_names:
            recommendations.append('No')
        # Any other rating (e.g. neutral or no data) is not recorded here.
    

    It should give you the recommended and not recommended reviews.