I am trying to get the first 10 URLs from Google search results with Selenium. I know there is a term other than innerHTML that will give me the text inside the cite tags.
Here is the code:
#open google
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.headless = False
chrome_options.add_argument("start-maximized")
# options.add_experimental_option("detach", True)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
driver.get('https://www.google.com/')
#paste - write name
#var_inp=input('Write the name to search:')
var_inp='python google search'
#search for image
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys(var_inp+Keys.RETURN)
#find first 10 companies
res_lst=[]
res=WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.TAG_NAME,'cite')))
print(len(res))
for r in res:
    print(r.get_attribute('innerHTML'))
#take email addresses from company
#send email
The result is below:
https://github.com<span class="dyjrff qzEoUe" role="text"> › opsdisk</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>
https://blog.apilayer.com<span class="dyjrff qzEoUe" role="text"> › h...</span>
I want to get rid of the <span...> parts as I need only the URLs. I could remove them with a regex, but I am looking for get_attribute('TEXT') or something else that will easily give the result.
The best way to get the value of the node is to use the JavaScript executor and read the firstChild of the node:
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
driver.get('https://www.google.com/')
#paste - write name
#var_inp=input('Write the name to search:')
var_inp='python google search'
#search for image
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys(var_inp+Keys.RETURN)
#find first 10 companies
res_lst=[]
res=WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.TAG_NAME,'cite')))
print(len(res))
for r in res:
    print(driver.execute_script("return arguments[0].firstChild.textContent;", r))
Output:
27
https://pypi.org
https://pypi.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://stackoverflow.com
https://stackoverflow.com
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.geeksforgeeks.org
https://www.jcchouinard.com
https://www.jcchouinard.com
https://www.educative.io
https://www.educative.io
https://python-googlesearch.readthedocs.io
https://python-googlesearch.readthedocs.io
https://medium.com
https://medium.com
https://medium.com
https://medium.com
https://github.com
https://github.com
https://github.com
https://github.com
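The output above still contains duplicates and more than 10 entries, while the question asks for the first 10 URLs and declares an unused res_lst. A minimal follow-up sketch (assuming the same res list of cite elements and driver from the snippet above) that fills res_lst with the first 10 unique URLs:
# collect unique URLs in order of appearance, then keep the first 10
seen = []
for r in res:
    url = driver.execute_script("return arguments[0].firstChild.textContent;", r)
    if url not in seen:
        seen.append(url)
res_lst = seen[:10]
print(res_lst)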
This is for this specific case:
def remove_span(string):
    start = string.find("<span")
    end = string.find("</span>") + len("</span>")
    return string[:start] + string[end:]
The function manipulates the string and removes the span from it.
for r in res:
    print(remove_span(r.get_attribute('innerHTML'))) # returns https://github.com
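Note that the helper strips only the first span. Since the question mentions regular expressions, a hedged variant (a sketch, not the answer's method) that removes every span using the re module:
import re

def remove_spans(string):
    # non-greedy match drops each <span ...>...</span> block
    return re.sub(r'<span.*?</span>', '', string, flags=re.DOTALL)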
Related
I am trying to extract the links of all applications from a particular developer on the Play Store.
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver.get("https://play.google.com/store/apps/dev?id=5305197572942248936")
l1 = driver.find_elements(By.CLASS_NAME, 'ULeU3b')
You are close to the solution.
Inside the elements you located there are anchor (a) elements containing the links.
All you need here is to wait for all those elements to become visible, get the list of those elements, iterate over the list and extract the links.
The following code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument('--disable-notifications')
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 20)
url = "https://play.google.com/store/apps/dev?id=5305197572942248936"
driver.get(url)
links = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".ULeU3b a")))
for link in links:
    print(link.get_attribute("href"))
The result is:
https://play.google.com/store/apps/details?id=com.tatamotors.eguruibcrm
https://play.google.com/store/apps/details?id=com.T1.Primarun
https://play.google.com/store/apps/details?id=com.tata.skoolman
https://play.google.com/store/apps/details?id=com.ttl.tatafleetman
Using this URL I want to locate div tags which have the attribute data-asin. When I use //div[@data-asin] in Chrome Inspect mode it gives 21 elements, but trying to get these elements via Selenium both ways, with an explicit wait and with a direct length check, gives 0. My guess is that the Selenium remote browser is unable to get any of these elements from the DOM tree. The code is below:
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#reading from csv file url-s
def readCSV(path_csv):
    df=pd.read_csv(path_csv)
    return df
fileCSV=readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls=fileCSV['linkamazon'].last_valid_index()
def create_driver():
    chrome_options = Options()
    chrome_options.headless = True
    chrome_options.add_argument("start-maximized")
    # options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    webdriver_service = Service(r'C:\Users\Admin\Downloads\chromedriver107v\chromedriver.exe')
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    return driver
#going to urls 1-by-1
def goToUrl_Se(driver):
    global counter
    counter = 0
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to the url (Amazon) via Selenium WebDriver
        driver.get(xUrl)
        parse_data()
        counter += 1
    driver.quit()
#fetch-parse the data from url page
def parse_data():
    global asin, title, bookform, priceNewProd, author
    wait = WebDriverWait(driver, timeout=77)
    try:
        x_index = wait.until(EC.visibility_of_all_elements_located((By.TAG_NAME, '//div[@data-asin]')))  ### Attention here
        print(len(x_index))
    except:
        y_index = driver.find_elements(By.TAG_NAME, '//div[@data-asin]')  ### And attention here
        print(len(y_index))
driver=create_driver()
goToUrl_Se(driver)
You have to use By.XPATH, not By.TAG_NAME: By.TAG_NAME expects a bare tag name such as div, while //div[@data-asin] is an XPath expression:
try:
    x_index = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@data-asin]')))  ### Attention here
    print(len(x_index))
except:
    y_index = driver.find_elements(By.XPATH, '//div[@data-asin]')  ### And attention here
    print(len(y_index))
I'm trying to get a full-length screenshot and haven't been able to make it work. Here's the code I'm using:
from Screenshot import Screenshot
from selenium import webdriver
import time
ob = Screenshot.Screenshot()
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)
url = "https://stackoverflow.com/questions/73298355/how-to-remove-duplicate-values-in-one-column-but-keep-the-rows-pandas"
driver.get(url)
img_url = ob.full_Screenshot(driver, save_path=r'.', image_name='example.png')
print(img_url)
driver.quit()
But this gives us a clipped screenshot:
So as you can see that's just what the driver window is showing, not a full-length screenshot. How can I tweak this code to get what I'm looking for?
Here is an example of how you can take a full <body> screenshot of a page:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://stackoverflow.com/questions/7263824/get-html-source-of-webelement-in-selenium-webdriver-using-python?rq=1'
browser.get(url)
required_width = browser.execute_script('return document.body.parentNode.scrollWidth')
required_height = browser.execute_script('return document.body.parentNode.scrollHeight')
browser.set_window_size(required_width, required_height)
t.sleep(5)
browser.execute_script("window.scrollTo(0,document.body.scrollHeight);")
required_width = browser.execute_script('return document.body.parentNode.scrollWidth')
required_height = browser.execute_script('return document.body.parentNode.scrollHeight')
browser.set_window_size(required_width, required_height)
t.sleep(1)
body_el = WebDriverWait(browser,10).until(EC.element_to_be_clickable((By.TAG_NAME, "body")))
body_el.screenshot('full_page_screenshot.png')
print('took full screenshot!')
t.sleep(1)
browser.quit()
The Selenium setup here is for Linux, but note the imports and the part after the browser is defined. The code starts from a small window, resizes it to fit the full page body, then waits a bit and computes the body size again to account for scripts kicking in on user input. Then it takes the screenshot. Tested and working on a really long page.
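The same resize-then-shoot technique can be wrapped in a helper; a minimal sketch (the function name is hypothetical), assuming a driver that allows set_window_size beyond the physical screen:
from selenium.webdriver.common.by import By

def full_body_screenshot(driver, path):
    # grow the window to the full document size, then screenshot <body>
    width = driver.execute_script('return document.body.parentNode.scrollWidth')
    height = driver.execute_script('return document.body.parentNode.scrollHeight')
    driver.set_window_size(width, height)
    driver.find_element(By.TAG_NAME, 'body').screenshot(path)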
To get a full-page screenshot using the Selenium Python client you can use GeckoDriver and the Firefox-based save_full_page_screenshot() method as follows:
Code:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
driver = webdriver.Firefox(options=options)  # assumes geckodriver is resolvable
driver.get('https://stackoverflow.com/questions/73298355/how-to-remove-duplicate-values-in-one-column-but-keep-the-rows-pandas')
driver.save_full_page_screenshot('fullpage_gecko_firefox.png')
driver.quit()
tl;dr: [py] Adding full page screenshot feature for Firefox
''' Trying to fetch Equity Derivatives data from NSE
https://www.nseindia.com/ -> Market Data -> Derivatives Market
Works until the click action; the browser navigates to Derivatives Market but then throws an access
denied error as below:
<h1>
Access Denied
</h1>
You don't have permission to access "http://www.nseindia.com/market-data/equity-derivatives-watch" on this server.
<p>
Reference #18.386dcc17.1603823463.54b06d7
</p>
'''
from selenium import webdriver
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import time
# Tried all possible options below
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=options)
driver.get("https://www.nseindia.com/")
marketdata = driver.find_element_by_xpath("//*[#id='main_navbar']/ul/li[3]/a")
derivativesmarket = driver.find_element_by_xpath("//*[#id='main_navbar']/ul/li[3]/div/div[1]/div/div[1]/ul/li[3]/a")
actions = ActionChains(driver)
actions.move_to_element(marketdata).move_to_element(derivativesmarket).click().perform()
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# soup = BeautifulSoup(html,'lxml')
time.sleep(7)
print(soup.prettify())
Add
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
and use
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Then access the element like so:
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "h1.h1")))
Reference: Access Denied You don't have permission to access "site" on this server using ChromeDriver and Chrome through Selenium Python
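Putting those pieces together, a minimal sketch (assuming ChromeDriver is resolvable and that the h1.h1 selector from the referenced answer still matches the page):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
driver.get("https://www.nseindia.com/market-data/equity-derivatives-watch")
heading = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "h1.h1")))
print(heading.text)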
I want to click on the second link in the results area of a Google search using Selenium WebDriver.
(I need a method that works for any Google search result.)
Example page.
This is my code. How can I modify the if statement?
import speech_recognition as sr
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
global driver
driver = webdriver.Chrome('C:\Windows\chromedriver.exe', options=chrome_options)
chrome_options.add_argument("--start-maximized")
wait = WebDriverWait(driver, 10)
def google(text):
    if "first link" in text:
        RESULTS_LOCATOR = "//div/h3/a"
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, RESULTS_LOCATOR)))
        page1_results = driver.find_elements(By.XPATH, RESULTS_LOCATOR)
        for item in page1_results:
            print(item.text)
        # driver.find_element_by_class_name().click()
    else:
        ggwp = text.replace(" ", "")
        driver.get("https://www.google.com")
        driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input').send_keys(ggwp)
        driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[3]/center/input[1]').send_keys(Keys.ENTER)
The second link can be placed in different divs.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
browser = webdriver.Chrome(executable_path=os.path.abspath(os.getcwd()) + "/chromedriver")
link = 'http://www.google.com'
browser.get(link)
# search keys
search = browser.find_element_by_name('q')
search.send_keys("python")
search.send_keys(Keys.RETURN)
# click second link
for i in range(10):
    try:
        browser.find_element_by_xpath('//*[@id="rso"]/div[' + str(i) + ']/div/div[2]/div/div/div[1]/a').click()
        break
    except:
        pass
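A less brittle variant, sketched under the assumption that Google still renders organic result titles as h3 headings inside anchors under the search container, would be to collect the result links once and click the second:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# anchors that contain an h3 are the organic result titles (assumed markup)
links = WebDriverWait(browser, 10).until(
    EC.visibility_of_all_elements_located((By.XPATH, "//div[@id='search']//a[h3]")))
if len(links) > 1:
    links[1].click()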