Search code examples
python python-3.x selenium selenium-webdriver selenium-chromedriver

Unable to scrape Google Images with Selenium


I have the following script, which I want to use to scrape Google Images. It first clicks on an image and then clicks the next (>) button to switch to the next image.

It downloads the first image, but when it is the second image's turn, it throws an error:

Traceback (most recent call last):
  File "c:/Users/intel/Desktop/Scrappr/image_scrape.pyw", line 40, in <module>
    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'n3VNCb'))).get_attribute("src")
  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:

My code :

import requests
import shutil
import time
import urllib
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

# Script from the question: open a Google Images results page in Chrome, click
# the first result, then ten times over read the displayed image's "src",
# download it to the desktop and click the "next" control.

# User-agent string sent by the browser instead of ChromeDriver's default.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/80.0.3987.132 Safari/537.36'

# Chrome command-line switches passed to the browser at start-up.
options = Options()
#options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")

# Start Chrome through the local chromedriver binary and load the search results.
driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://www.google.com/search?q=mac+beautiful+ui&tbm=isch&ved=2ahUKEwiL3ILMveToAhWGCHIKHVPNAScQ2-cCegQIABAA&oq=mac+beautiful+ui&gs_lcp=CgNpbWcQAzoECAAQQzoCCAA6BQgAEIMBOgYIABAFEB46BggAEAgQHlDPI1iEUWCgU2gAcAB4AIAByAKIAd8dkgEHMC40LjkuM5gBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ei=Q9-TXsuuMoaRyAPTmoe4Ag&bih=657&biw=1360")

# Click the first element with class "rg_i" (presumably a result thumbnail).
driver.find_element_by_class_name("rg_i").click()

# Ten iterations; i runs from 1 to 10 and is used in the output file name.
i = 0
while i < 10:
    i += 1
    # Fixed pause before looking for the image element.
    time.sleep(5)
    # Wait up to 5 seconds for an <img class="n3VNCb"> to become visible and read
    # its src; WebDriverWait.until raises TimeoutException when that never happens.
    # NOTE(review): presumably the statement the reported TimeoutException comes
    # from (the traceback shows a By.CLASS_NAME variant of it) - confirm.
    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb'))).get_attribute("src")
    print(attribute_value)
    # The src value is handed to requests unchanged.
    # NOTE(review): this only works when src is a fetchable URL - confirm what
    # the page actually puts into src.
    resp = requests.get(attribute_value, stream=True)
    # NOTE(review): local_file is opened here but never closed in this loop.
    local_file = open(r'C:/users/intel/desktop/local_image'+ str(i) + '.jpg', 'wb')
    # Let urllib3 undo any content-encoding while the raw stream is copied.
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    # Click the "next" control, located by an absolute XPath below #Sva75c.
    # NOTE(review): this path depends on the exact page layout.
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()




Solution

  • I've tidied up and refactored your code a bit. The final result can grab n images for search keywords of your choice (see SEARCH_TERMS):

    
    import base64
    import os
    import requests
    import time
    
    from io import BytesIO
    from PIL import Image
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.options import Options
    from selenium import webdriver
    
    # Path of the ChromeDriver binary handed to Selenium.
    CHROME_DRIVER_LOCATION = r'C:\Users\intel\Downloads\setups\chromedriver.exe'
    # Keywords of the image search; also used to name the output folder.
    SEARCH_TERMS = ['very', 'hot', 'chicks']
    # File-name template: <base folder>\<Capitalized_Terms>\<index>.<extension>.
    # The two '{}' placeholders are filled in later with str.format().
    TARGET_SAVE_LOCATION = os.path.join(r'c:\test', '_'.join(term.capitalize() for term in SEARCH_TERMS), r'{}.{}')
    # exist_ok=True creates the folder only when it is missing, without the
    # check-then-create race of a separate os.path.isdir() test.
    os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION), exist_ok=True)
    
    def check_if_result_b64(source):
        """Return the image type of a base64 data URI, or False for anything else.

        Args:
            source: value of an element's "src" attribute. May be None or empty
                when the attribute is missing.

        Returns:
            The text between 'data:image/' and ';base64' in the URI header
            (e.g. 'jpeg'), or False when source is not a base64 data URI.
        """
        # A missing attribute is reported as None; treat it like "not base64"
        # instead of failing on None.split().
        if not source:
            return False
        # Only the header in front of the first comma matters; partition avoids
        # splitting the (potentially very long) payload as well.
        possible_header = source.partition(',')[0]
        if not (possible_header.startswith('data') and ';base64' in possible_header):
            return False
        return possible_header.replace('data:image/', '').replace(';base64', '')
    
    def get_driver():
        """Start Chrome and open the Google Images results page for SEARCH_TERMS.

        Returns:
            The Chrome WebDriver instance, already navigated to the results page.
        """
        # Local import keeps this function self-contained.
        from urllib.parse import quote_plus

        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     'Chrome/80.0.3987.132 Safari/537.36'
        options = Options()
        #options.add_argument("--headless")
        options.add_argument(f'user-agent={user_agent}')
        options.add_argument("--disable-web-security")
        options.add_argument("--allow-running-insecure-content")
        options.add_argument("--allow-cross-origin-auth-prompt")

        new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
        # quote_plus turns the spaces between terms into '+' and percent-encodes
        # characters such as '+', '&' or '#' that would otherwise corrupt the
        # query string; plain words produce the same URL as '+'.join().
        query = quote_plus(' '.join(SEARCH_TERMS))
        new_driver.get(f"https://www.google.com/search?q={query}&source=lnms&tbm=isch&sa=X")
        return new_driver
    
    
    
    driver = get_driver()

    # Click the first <img> found under an <a>/<div> pair to open its preview.
    driver.find_elements_by_xpath('//a/div/img')[0].click()

    # Wait up to 10 seconds for the element whose data-query attribute equals
    # the search text to become clickable.
    panel_locator = (By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')
    preview_panel = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(panel_locator))

    # The class of the first data-noaft="1" element is reused as the locator
    # for every image element from here on.
    noaft_elements = preview_panel.find_elements_by_xpath('//*[@data-noaft="1"]')
    magic_class = noaft_elements[0].get_attribute('class')
    image_finder_xp = f'//*[@class="{magic_class}"]'

    # Fixed pause so the first image can load; no better readiness signal was
    # found for this step.
    time.sleep(3)

    # Remember the src of the last matching element: the thumbnail of the image
    # that is still to be loaded.
    matching_images = driver.find_elements_by_xpath(image_finder_xp)
    thumbnail_src = matching_images[-1].get_attribute("src")
    
    # issue 3: this XPath is bad """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div"""
    # if the page layout changes, such a path breaks instantly. Locate the arrow icons by their attributes instead.
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'

    # Download ten images: save the one currently displayed, then click "next".
    for i in range(10):

        # issue 4: all image elements share the same class. Assuming that you always click "next":
        # the last element is the base64 encoded thumbnail version of the "next image",
        # the [-2] element is the element currently displayed.
        target = driver.find_elements_by_xpath(image_finder_xp)[-2]

        # You need to wait until the image is completely loaded:
        # first the base64 encoded thumbnail is displayed, so we check whether the
        # displayed element's src still matches the cached thumbnail src.
        # However, sometimes the final result is the base64 content itself, so the
        # wait is capped at 5 seconds.
        wait_time_start = time.time()
        while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
            time.sleep(0.2)
        # Cache the thumbnail of the upcoming image for the next iteration's wait.
        thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")
        attribute_value = target.get_attribute("src")
        print(attribute_value)

        # issue 1: if the image is base64, requests.get won't work because the src is not a URL
        is_b64 = check_if_result_b64(attribute_value)
        if is_b64:
            image_format = is_b64
            # Everything after the first comma of a data URI is the payload.
            content = base64.b64decode(attribute_value.partition(',')[2])
        else:
            # The whole body is read below, so streaming brings nothing; the
            # timeout keeps a stalled server from hanging the script forever.
            resp = requests.get(attribute_value, timeout=30)
            # Fail with a clear HTTP error instead of trying to parse an error page as an image.
            resp.raise_for_status()
            content = resp.content
            # Let Pillow detect the real image format; it becomes the file extension.
            image_format = Image.open(BytesIO(content)).format
        # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
        with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
            f.write(content)
        # The third-from-last matching icon is used as the "next" arrow.
        next_arrow = driver.find_elements_by_xpath(svg_arrows_xpath)[-3]
        next_arrow.click()