Search code examples
pythonselenium-webdrivergetattribute

python selenium getting urls from google search results


I am trying to get the first 10 URLs from Google search results with Selenium. I know there must be something other than innerHTML that will give me only the text inside the cite tags.

here is code

#open google
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys

# Configure Chrome for a visible, maximized, less "automation-looking" session.
chrome_options = Options()
# NOTE(review): the `headless` attribute is deprecated in recent Selenium 4
# releases - confirm against the installed version.
chrome_options.headless = False
chrome_options.add_argument("start-maximized")
# options.add_experimental_option("detach", True)
chrome_options.add_argument("--no-sandbox")
# Pass both switches in ONE call: an experimental option is stored under its
# name, so a second "excludeSwitches" call would replace the first list and
# silently drop "enable-automation".
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')

# Start Chrome with a driver binary provided by webdriver_manager and open Google.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
driver.get('https://www.google.com/')

# Search term (hard-coded; the interactive prompt is left disabled).
#var_inp=input('Write the name to search:')
var_inp='python google search'
# Wait up to 10 s for the search box (name="q") to be clickable, then type the
# query and press Enter to submit it.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys(var_inp+Keys.RETURN)
# Placeholder for collected results (not filled in yet).
res_lst=[]
# Wait up to 10 s for the <cite> elements of the result page to be present.
res=WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.TAG_NAME,'cite')))
print(len(res))
for r in res:
    # innerHTML returns the raw markup of the element, nested tags included.
    print(r.get_attribute('innerHTML'))

#take email addresses from company
#send email

the result is below

https://github.com<span class="dyjrff qzEoUe" role="text"> › opsdisk</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>

I want to get rid of the <span ...> part, as I only need the URLs. I could strip it with a regex, but I would prefer something like get_attribute('TEXT') that gives the result directly.


Solution

  • The best way to get the value of the node is to use the JavaScript executor and read the first child (firstChild) of the node.

    # Start Chrome with a driver binary provided by webdriver_manager, then open Google.
    chrome_service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    driver.get('https://www.google.com/')

    # Search term (hard-coded; the interactive prompt is left disabled).
    #var_inp=input('Write the name to search:')
    var_inp = 'python google search'

    # Wait up to 10 s for the search box to be clickable, then type the query
    # followed by Enter to submit it.
    search_box = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, "q"))
    )
    search_box.send_keys(var_inp + Keys.RETURN)

    # Wait up to 10 s for the <cite> elements to be present and report their count.
    res_lst = []
    cite_elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, 'cite'))
    )
    print(len(cite_elements))

    # Read only the first child node of each <cite> through JavaScript; in the
    # markup shown in the question that node is the URL text, ahead of the <span>.
    first_child_text_js = "return arguments[0].firstChild.textContent;"
    for cite in cite_elements:
        print(driver.execute_script(first_child_text_js, cite))
    

    Output:

    27
    https://pypi.org
    https://pypi.org
    https://www.geeksforgeeks.org
    https://www.geeksforgeeks.org
    https://stackoverflow.com
    https://stackoverflow.com
    https://www.geeksforgeeks.org
    https://www.geeksforgeeks.org
    https://www.geeksforgeeks.org
    https://www.geeksforgeeks.org
    https://www.jcchouinard.com
    https://www.jcchouinard.com
    https://www.educative.io
    https://www.educative.io
    https://python-googlesearch.readthedocs.io
    https://python-googlesearch.readthedocs.io
    https://medium.com
    https://medium.com
    https://medium.com
    https://medium.com
    https://github.com
    https://github.com
    https://github.com
    https://github.com