I am trying to get the first 10 URLs from Google search results with Selenium. I know there is some attribute other than innerHTML
that will give me only the text inside the cite
tags.
Here is the code:
#open google
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
# Build the Chrome options used to launch the browser.
chrome_options = Options()
# Keep the browser window visible.
# NOTE(review): the `headless` setter is deprecated in recent Selenium 4 releases - confirm against the installed version.
chrome_options.headless = False
chrome_options.add_argument("start-maximized")
# options.add_experimental_option("detach", True)
chrome_options.add_argument("--no-sandbox")
# Experimental options are stored by key, so a second "excludeSwitches" call would
# replace the first one; pass both switches in a single list instead.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
driver.get('https://www.google.com/')
#paste - write name
#var_inp=input('Write the name to search:')
var_inp='python google search'
# Wait for the search box, type the query and submit it with RETURN.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys(var_inp+Keys.RETURN)
# Collect every <cite> element present on the results page.
res_lst=[]
res=WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.TAG_NAME,'cite')))
print(len(res))
for r in res:
    # innerHTML includes nested markup such as the <span> breadcrumb.
    print(r.get_attribute('innerHTML'))
#take email addresses from company
#send email
The result is below:
https://github.com<span class="dyjrff qzEoUe" role="text"> › opsdisk</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>
I want to get rid of the <span...
part, as I need only the URLs. I could strip it with a regex, but I would prefer get_attribute('TEXT')
or something similar that gives the result directly.
The best way to get the value of the node
is to use the JavaScript executor
and read the firstChild
of the node to get the value.
# Relies on the imports and `chrome_options` set up earlier in the script.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
driver.get('https://www.google.com/')
#paste - write name
#var_inp=input('Write the name to search:')
var_inp='python google search'
# Wait for the search box, type the query and submit it with RETURN.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys(var_inp+Keys.RETURN)
# Collect every <cite> element present on the results page.
res_lst=[]
res=WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.TAG_NAME,'cite')))
print(len(res))
for r in res:
    # Read only the first child node of the <cite>, which skips the nested <span>.
    print(driver.execute_script("return arguments[0].firstChild.textContent;", r))
Output:
27
https://pypi.org
https://pypi.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://stackoverflow.com
https://stackoverflow.com
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.jcchouinard.com
https://www.jcchouinard.com
https://www.educative.io
https://www.educative.io
https://python-googlesearch.readthedocs.io
https://python-googlesearch.readthedocs.io
https://medium.com
https://medium.com
https://medium.com
https://medium.com
https://github.com
https://github.com
https://github.com
https://github.com