Locating elements via Selenium with explicit wait and without gives 0 - Python

Using this url I want to locate div tags which have the attribute data-asin. When I use //div[@data-asin] in Chrome Inspect mode it gives 21 elements. But trying to get these elements via Selenium both ways, with an explicit wait and by taking the length directly, gives 0. My guess is that the Selenium remote browser is unable to get any of these elements from the DOM tree. The code is below:
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# reading the urls from the csv file
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

def create_driver():
    chrome_options = Options()
    chrome_options.headless = True
    chrome_options.add_argument("start-maximized")
    # options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    webdriver_service = Service(r'C:\Users\Admin\Downloads\chromedriver107v\chromedriver.exe')
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    return driver

# going to the urls one by one
def goToUrl_Se(driver):
    global counter
    counter = 0
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to the url (amazon) via Selenium WebDriver
        driver.get(xUrl)
        parse_data()
        counter += 1
    driver.quit()

# fetch/parse the data from the url page
def parse_data():
    global asin, title, bookform, priceNewProd, author
    wait = WebDriverWait(driver, timeout=77)
    try:
        x_index = wait.until(EC.visibility_of_all_elements_located((By.TAG_NAME, '//div[@data-asin]')))  ### Attention here
        print(len(x_index))
    except:
        y_index = driver.find_elements(By.TAG_NAME, '//div[@data-asin]')  ### And attention here
        print(len(y_index))

driver = create_driver()
goToUrl_Se(driver)

You have to use By.XPATH, not By.TAG_NAME:
try:
    x_index = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@data-asin]')))  ### Attention here
    print(len(x_index))
except:
    y_index = driver.find_elements(By.XPATH, '//div[@data-asin]')  ### And attention here
    print(len(y_index))
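For context on why the original attempts returned 0: By.TAG_NAME takes a bare tag name and simply matches nothing when handed an XPath expression. A short sketch of the locator strategies that do apply here (the CSS form is an equivalent alternative, not from the original answer):
all_divs = driver.find_elements(By.TAG_NAME, 'div')  # every <div> on the page, no attribute filter
by_xpath = driver.find_elements(By.XPATH, '//div[@data-asin]')
by_css = driver.find_elements(By.CSS_SELECTOR, 'div[data-asin]')  # equivalent CSS attribute selector
print(len(by_xpath), len(by_css))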

Related

Can't find element by name using selenium

I'm using Selenium 4.7.2 and can't find the element by its name. The following code raises a NoSuchElementException error:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Get the website using the Chrome webdriver
browser = webdriver.Chrome()
browser.get('https://www.woofshack.com/en/cloud-chaser-waterproof-softshell-dog-jacket-ruffwear-rw-5102.html')
# Print out the result
price = browser.find_element(By.NAME, 'data-price-665')
print("Price: " + price.text)
# Close the browser
time.sleep(3)
browser.close()
What's wrong with my use of the find_element method?
Looks like you are using a wrong locator here. I see no element with name attribute value 'data-price-665' on that page.
The following code is working:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
url = "https://www.woofshack.com/en/cloud-chaser-waterproof-softshell-dog-jacket-ruffwear-rw-5102.html"
driver.get(url)
price = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#product-price-665 .price")))
print("Price: " + price.text)
The output is:
Price: €112.95
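As a side note, By.NAME matches only the HTML name attribute, which typically appears on form fields; it never matches ids or data-* attributes. A minimal sketch of the distinction (the form field is hypothetical, not from this page):
field = driver.find_element(By.NAME, 'email')  # hypothetical <input name="email">
by_id = driver.find_element(By.CSS_SELECTOR, '#product-price-665')  # id-based CSS locator
by_attr = driver.find_elements(By.XPATH, '//*[@data-price-665]')  # if such an attribute existed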

Web scraping with Selenium and BeautifulSoup can't find anything

I am trying to extract all the descriptions in the links in the class="publication u-padding-xs-ver js-publication" of this website: https://www.sciencedirect.com/browse/journals-and-books?accessType=openAccess&accessType=containsOpenAccess
I tried both with BeautifulSoup and Selenium, but I can't extract anything.
Here is the code I am using:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://www.sciencedirect.com/browse/journals-and-books?accessType=openAccess&accessType=containsOpenAccess")
ul = driver.find_element(By.ID, "publication-list")
print("Links")
allLi = ul.find_elements(By.TAG_NAME, "li")
count = 0
for li in allLi:
    count += 1
    print("Links " + str(count) + " " + li.text)
You are missing waits.
You have to wait for elements to become visible before accessing them.
The best approach is to use WebDriverWait with expected_conditions explicit waits.
The following code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 20)
url = "https://www.sciencedirect.com/browse/journals-and-books?accessType=openAccess&accessType=containsOpenAccess"
driver.get(url)
ul = wait.until(EC.visibility_of_element_located((By.ID, "publication-list")))
allLi = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, "li")))
print(len(allLi))
The output is:
167
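One caveat: EC.presence_of_all_elements_located searches the whole document, so the count of 167 may include li elements outside the publication list. A sketch that scopes the search to the ul already located (same locators as above; the scoping assumption is untested):
ul = wait.until(EC.visibility_of_element_located((By.ID, "publication-list")))
scoped_li = ul.find_elements(By.TAG_NAME, "li")  # only <li> nested under #publication-list
print(len(scoped_li))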

Python Selenium: taking an instance of the webdriver

I defined two separate functions: one for opening a url with Selenium, and one for fetching data with Selenium.
In my second function the driver variable is unassignable because it stays local inside the first function.
I don't know whether it is logical to split the Selenium activity into two separate functions; I'm using this approach for the first time.
Any suggestions on how to take the instance of the webdriver and use it inside the second function?
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# reading the urls from the csv file
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

# going to the urls one by one
def goToUrl_Se():
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to the url (amazon) via Selenium WebDriver
        chrome_options = Options()
        chrome_options.headless = False
        chrome_options.add_argument("start-maximized")
        # options.add_experimental_option("detach", True)
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
        driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
        driver.get(xUrl)
    driver.quit()

# fetch/parse the data from the url page
def parse_data():
    x_title = driver.find_element(By.XPATH, '//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[2]/div/div/div/div/div/div[2]/div/div/div[1]/h2/a/span')

goToUrl_Se()
As I see it, you are trying to parse data from each URL you open in goToUrl_Se(). If so, the better way is to put the data-parsing code inside the loop used in the goToUrl_Se() method.
Also, there is no need to define and create the driver each time.
And you definitely have to improve your locators. Very long absolute XPaths are extremely fragile and breakable.
The following flow seems better to me:
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = None

# reading the urls from the csv file
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

def create_driver():
    chrome_options = Options()
    chrome_options.headless = False
    chrome_options.add_argument("start-maximized")
    # options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
    global driver
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# going to the urls one by one
def goToUrl_Se():
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to the url (amazon) via Selenium WebDriver
        driver.get(xUrl)
        x_title = driver.find_element(By.XPATH, '//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[2]/div/div/div/div/div/div[2]/div/div/div[1]/h2/a/span')
    driver.quit()

create_driver()
goToUrl_Se()
You should return the driver from your create_driver() function:
def create_driver():
    # ...
    return driver
and change your function to accept a parameter:
def parse_data(driver):
    # ...
Now you can get the driver with an assignment and pass it to your function:
driver = create_driver()
parse_data(driver)
I suggest you read more about return values and function parameters to understand this better.
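Putting the two changes together, a minimal sketch of the resulting flow (the short locator is for illustration only; the paths, fileCSV and length_of_column_urls are taken from the question):
# (imports as in the question above)
def create_driver():
    chrome_options = Options()
    chrome_options.add_argument("start-maximized")
    webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
    return webdriver.Chrome(service=webdriver_service, options=chrome_options)

def parse_data(driver):
    # illustrative short locator instead of the long absolute XPath
    return driver.find_element(By.XPATH, '//h2/a/span').text

driver = create_driver()
for i in range(length_of_column_urls + 1):
    driver.get(fileCSV.iloc[i, 1])
    print(parse_data(driver))
driver.quit()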
In this structure you can call your second function parse_data() only from within your first function goToUrl_Se(),
like:
driver.get(xUrl)
something = parse_data()
and change parse_data so that it returns something.
If you want to call them both outside themselves, then you need to do two things:
parse_data should take the driver as an argument: def parse_data(driver)
you should not quit Selenium within goToUrl_Se()
And if you want to do it as it really should be done, then just use OOP (a minimal sketch follows below). If you still don't want to, then you'd better initiate the driver name outside any functions and use functions to change it. For instance, you can have a function that changes only the driver's options. But it is bad practice when one function does multiple things, like your goToUrl_Se() one.
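A minimal sketch of that OOP approach, keeping the driver as instance state so that no globals are needed (class and method names are illustrative, not from the original answer):
# (imports and fileCSV as defined in the question above)
class AmazonScraper:
    def __init__(self, driver_path):
        webdriver_service = Service(driver_path)
        self.driver = webdriver.Chrome(service=webdriver_service, options=Options())

    def parse_data(self):
        # self.driver is shared between methods without globals
        return self.driver.find_elements(By.XPATH, '//div[@data-asin]')

    def go_to_urls(self, urls):
        for url in urls:
            self.driver.get(url)
            print(url, len(self.parse_data()))
        self.driver.quit()

scraper = AmazonScraper(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
scraper.go_to_urls(fileCSV['linkamazon'].dropna())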

Cannot click element in Python via Selenium as a variable

I want to save element data to an Excel file via Python. I have the code below; I need some help with why the element_to_click.click() line gives an error. The click() works when chained on the line above, but I need it to be on the line below.
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(r"C:\Users\Admin\Downloads\chromedriver_win32 (1)\chromedriver.exe")
driver.get("https://www.nba.com/schedule?pd=false&region=1")
driver.implicitly_wait(30)
element_to_click = driver.find_element(By.ID, "onetrust-accept-btn-handler").click()
element_to_click.click()  # error
element_to_save=driver.find_element(By.XPATH,"//div/div/div/div/h4")
#element_to_save.to_excel("3row,3column)")
driver.quit()
The immediate issue is that find_element(...).click() returns None, so element_to_click holds None and calling .click() on it fails; drop the chained .click() if you want to keep the element in a variable. Beyond that, this is one way to reject/accept cookies on that website:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)
url = 'https://www.nba.com/schedule?pd=false&region=1'
browser.get(url)
try:
    wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))).click()
    print('accepted cookies')
except Exception as e:
    print('no cookie button!')
Setup is selenium/chrome on linux - just observe the imports and the part after defining the browser/driver.
Selenium documentation can be found at https://www.selenium.dev/documentation/
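The question also asked about saving the element data to an Excel file. A minimal sketch with pandas, reusing the h4 XPath from the question (assumes the openpyxl package is installed, since pandas uses it to write .xlsx):
import pandas as pd
# reusing the browser set up in the answer above
headings = browser.find_elements(By.XPATH, "//div/div/div/div/h4")
df = pd.DataFrame({"heading": [h.text for h in headings]})
df.to_excel("schedule.xlsx", index=False)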

How would I loop through elements with Python Selenium

I have a website from which I want to download Excel files (https://www.rivm.nl/media/smap/eenzaamheid.html).
First I want to click on the region and then perform the download. This I have working:
wijk_keuze = WebDriverWait(driver2nd,20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g']//*[name()='path']")))
wijk_keuze.click()
download = WebDriverWait(driver2nd, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g' and @aria-label='View export menu']//*[name()='rect']")))
download.click()
WebDriverWait(download, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[#class='highcharts-menu']//*[contains(text(),'XLS downloaden')]"))).click()
time.sleep(2)
The above code selects the first region in the parent element and then downloads the Excel file.
What I want to do is loop through each element in the parent element. How would I go about doing so?
The parent element looks as follows:
<g transform="transform(0,0), scale(1,1)" animator="1">
My entire code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time

# defining URL
url = 'https://www.rivm.nl/media/smap/eenzaamheid.html'
# defining driver
driver = webdriver.PhantomJS(r'./phantomjs-2.1.1-windows/bin/phantomjs')
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options, executable_path=r'./chromedriver_win32/chromedriver')
driver.get(url)

# Gemeentes
Detail_keuze = Select(driver.find_element_by_id("detail"))
options = Detail_keuze.options
Indicator_keuze = Select(driver.find_element_by_id("indicator"))
indicator_options = Indicator_keuze.options
for index in range(0, len(indicator_options)):
    # defining URL
    url = 'https://www.rivm.nl/media/smap/eenzaamheid.html'
    # defining driver
    driver2nd = webdriver.PhantomJS(r'./phantomjs-2.1.1-windows/bin/phantomjs')
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_experimental_option("prefs", {
        "download.default_directory": r"MY_PATH",
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })
    driver2nd = webdriver.Chrome(options=options, executable_path=r'./chromedriver_win32/chromedriver')
    driver2nd.get(url)
    # Gemeentes
    Detail_keuze = Select(driver2nd.find_element_by_id("detail"))
    options = Detail_keuze.options
    Indicator_keuze = Select(driver2nd.find_element_by_id("indicator"))
    indicator_options = Indicator_keuze.options
    time.sleep(1)
    Indicator_keuze.select_by_index(index)
    wijk_keuze = WebDriverWait(driver2nd, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g']//*[name()='path']")))
    wijk_keuze.click()
    download = WebDriverWait(driver2nd, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g' and @aria-label='View export menu']//*[name()='rect']")))
    download.click()
    WebDriverWait(download, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='highcharts-menu']//*[contains(text(),'XLS downloaden')]"))).click()
    time.sleep(2)
    ######## HERE I WANT TO LOOP THROUGH EACH AND EVERY REGION
    driver2nd.close()
As you can see, I also want to loop through each and every indicator. This works. Now I want to add a loop through each and every region. I have it working so far that I can click on the first region.
You don't have to click on the options. You can get the details by changing the url to 'https://www.rivm.nl/media/smap/{indicator}?detail={detail}'.
Just add the logic for downloading it.
Try:
import time
from itertools import product

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install())  # , chrome_options=chrome_options)
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://www.rivm.nl/media/smap/eenzaamheid.html')
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
ind = soup.find('select', attrs={'name': 'indicator'})
indicators = [i['value'] for i in ind.findAll('option')]
det = soup.find('select', attrs={'name': 'detail'})
details = [i['value'] for i in det.findAll('option')]
for detail, indicator in list(product(details, indicators)):
    print(indicator, detail)
    new_url = f'https://www.rivm.nl/media/smap/{indicator}?detail={detail}'
    driver.get(new_url)
    # Write code for downloading it
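For the elided download step, a sketch that reuses the export-menu clicks from the question inside this loop (the Highcharts XPaths are copied from the question and assumed to still match the page):
    # inside the for loop, after driver.get(new_url):
    download = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g' and @aria-label='View export menu']//*[name()='rect']")))
    download.click()
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
        (By.XPATH, "//div[@class='highcharts-menu']//*[contains(text(),'XLS downloaden')]"))).click()
    time.sleep(2)  # give the download time to start before the next navigation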
