I have a list, which is dynamically loaded by AJAX.
At first, while loading, its code looks like this:
<ul><li class="last"><a class="loading" href="#"><ins> </ins>Загрузка...</a></li></ul>
When the list is loaded, all of its li and a elements are replaced, and there is always more than one li.
Like this:
<ul class="ltr">
<li id="t_b_68" class="closed" rel="simple">
<a id="t_a_68" href="javascript:void(0)">Category 1</a>
</li>
<li id="t_b_64" class="closed" rel="simple">
<a id="t_a_64" href="javascript:void(0)">Category 2</a>
</li>
...
I need to check whether the list is loaded, so I check whether it has several li elements.
So far I have tried:
1) Custom waiting condition
class more_than_one(object):
    def __init__(self, selector):
        self.selector = selector

    def __call__(self, driver):
        elements = driver.find_elements_by_css_selector(self.selector)
        if len(elements) > 1:
            return True
        return False
...
try:
    query = WebDriverWait(driver, 30).until(more_than_one('li'))
except:
    print "Bad crap"
else:
    # Then load ready list
2) Custom function based on find_elements_by
def wait_for_several_elements(driver, selector, min_amount, limit=60):
    """
    This function waits for more than <min_amount> elements found by <selector>,
    with a time limit of <limit> seconds.
    """
    step = 1  # in seconds
    current_wait = 0
    while current_wait < limit:
        try:
            print "Waiting... " + str(current_wait)
            query = driver.find_elements_by_css_selector(selector)
            if len(query) > min_amount:
                print "Found!"
                return True
            else:
                time.sleep(step)
                current_wait += step
        except:
            time.sleep(step)
            current_wait += step
    return False
This doesn't work, because driver (the current element passed to this function) gets lost in the DOM. The UL isn't changed, but Selenium can't find it anymore for some reason.
3) Explicit wait. This just sucks, because some lists load instantly and some take 10+ seconds to load. If I use this technique I have to wait the maximum time on every occurrence, which is very bad for my case.
4) Also, I can't wait for a child element with XPATH correctly. This one just expects the ul to appear.
try:
    print "Going to nested list..."
    #time.sleep(WAIT_TIME)
    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, './/ul')))
    nested_list = child.find_element_by_css_selector('ul')
Please tell me the right way to make sure that several child elements are loaded for a specified element.
P.S. All these checks and searches should be relative to the current element.
First and foremost, the elements are AJAX elements.
Now, as per the requirement to locate all the desired elements and create a list, the simplest approach is to induce WebDriverWait with visibility_of_all_elements_located(), using either of the following locator strategies:
Using CSS_SELECTOR:
elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "ul.ltr li[id^='t_b_'] > a[id^='t_a_'][href]")))
Using XPATH:
elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//ul[@class='ltr']//li[starts-with(@id, 't_b_')]/a[starts-with(@id, 't_a_') and starts-with(., 'Category')]")))
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
In case your use case is to wait for a certain number of elements to be loaded, e.g. 10 elements, you can use a lambda function as follows:
Using >:
myLength = 9
WebDriverWait(driver, 20).until(lambda driver: len(driver.find_elements_by_xpath("//ul[@class='ltr']//li[starts-with(@id, 't_b_')]/a[starts-with(@id, 't_a_') and starts-with(., 'Category')]")) > int(myLength))
Using ==:
myLength = 10
WebDriverWait(driver, 20).until(lambda driver: len(driver.find_elements_by_xpath("//ul[@class='ltr']//li[starts-with(@id, 't_b_')]/a[starts-with(@id, 't_a_') and starts-with(., 'Category')]")) == int(myLength))
You can find a relevant discussion in How to wait for number of elements to be loaded using Selenium and Python
References
You can find a couple of relevant detailed discussions in:
Getting specific elements in selenium
Cannot find table element from div element in selenium python
Extract text from an aria-label selenium webdriver (python)
I created AllEc, which basically piggybacks on the WebDriverWait.until logic.
This will wait until the timeout occurs or until all of the elements have been found.
from typing import Callable
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
class AllEc(object):
    def __init__(self, *args: Callable, description: str = None):
        self.ecs = args
        self.description = description

    def __call__(self, driver):
        try:
            for fn in self.ecs:
                if not fn(driver):
                    return False
            return True
        except StaleElementReferenceException:
            return False
# usage example:
wait = WebDriverWait(driver, timeout)
ec1 = EC.invisibility_of_element_located(locator1)
ec2 = EC.invisibility_of_element_located(locator2)
ec3 = EC.invisibility_of_element_located(locator3)
all_ec = AllEc(ec1, ec2, ec3, description="Required elements to show page has loaded.")
found_elements = wait.until(all_ec, "Could not find all expected elements")
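Applied to the original question, a hedged sketch could combine a built-in condition with a simple count check (the locators and the lambda below are my own illustration, not part of AllEc itself):
loading_gone = EC.invisibility_of_element_located((By.CSS_SELECTOR, "ul > li > a.loading"))
several_li = lambda d: len(d.find_elements_by_css_selector("ul.ltr > li")) > 1

list_ready = AllEc(loading_gone, several_li, description="Loading placeholder gone and several <li> present.")
WebDriverWait(driver, 30).until(list_ready, "List did not finish loading")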
Alternatively I created AnyEc to look for multiple elements but returns on the first one found.
class AnyEc(object):
    """
    Use with WebDriverWait to combine expected_conditions in an OR.

    Example usage:
    >>> wait = WebDriverWait(driver, 30)
    >>> either = AnyEc(expectedcondition1, expectedcondition2, expectedcondition3, etc...)
    >>> found = wait.until(either, "Cannot find any of the expected conditions")
    """

    def __init__(self, *args: Callable, description: str = None):
        self.ecs = args
        self.description = description

    def __iter__(self):
        return self.ecs.__iter__()

    def __call__(self, driver):
        for fn in self.ecs:
            try:
                rt = fn(driver)
                if rt:
                    return rt
            except TypeError as exc:
                raise exc
            except Exception as exc:
                # print(exc)
                pass

    def __repr__(self):
        return " ".join(f"{e!r}," for e in self.ecs)

    def __str__(self):
        return f"{self.description!s}"
either = AnyEc(ec1, ec2, ec3)
found_element = wait.until(either, "Could not find any of the expected elements")
Lastly, if it's possible to do so, you could try waiting for Ajax to finish.
This is not useful in all cases -- e.g. when Ajax is always active. In the cases where Ajax runs and finishes it can work. There are also some Ajax libraries that do not set the active attribute, so double-check that you can rely on this.
def is_ajax_complete(driver):
    rt = driver.execute_script("return jQuery.active")
    return rt == 0

wait.until(lambda driver: is_ajax_complete(driver), "Ajax did not finish")
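If the page might not expose jQuery at all, a slightly more defensive variant (my own sketch; is_jquery_idle is a hypothetical name) treats a missing jQuery as "nothing pending":
def is_jquery_idle(driver):
    # Pages without jQuery are treated as having no pending jQuery requests;
    # otherwise wait for jQuery.active to drop back to 0.
    return driver.execute_script(
        "return (typeof jQuery === 'undefined') || jQuery.active === 0"
    )

wait.until(lambda d: is_jquery_idle(d), "Ajax did not finish")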
(1) You did not mention the error you get with it.
(2) Since you mention
...because driver (current element passed to this function)...
I'll assume this is actually a WebElement. In this case, instead of passing the object itself to your method, simply pass the selector that finds that WebElement (in your case, the ul). If the "driver gets lost in DOM", re-creating it inside the while current_wait < limit: loop could mitigate the problem; see the sketch after this list.
(3) Yep, time.sleep() will only get you so far.
(4) Since the li elements loaded dynamically contain class=closed, instead of (By.XPATH, './/ul') you could try (By.CSS_SELECTOR, 'ul > li.closed') (more details on CSS Selectors here).
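A minimal sketch of point (2): a reworked wait_for_several_elements that takes a parent selector (parent_selector is my own parameter name) and re-finds the <ul> on every polling pass, so a stale reference never survives a DOM update:
import time

def wait_for_several_elements(driver, parent_selector, child_selector, min_amount, limit=60):
    """Poll until the parent (re-found on each pass) holds more than <min_amount> children."""
    step = 1  # seconds between polls
    waited = 0
    while waited < limit:
        try:
            # Re-locate the parent every time instead of holding a WebElement reference
            parent = driver.find_element_by_css_selector(parent_selector)
            children = parent.find_elements_by_css_selector(child_selector)
            if len(children) > min_amount:
                return True
        except Exception:
            pass  # parent not present yet, or went stale mid-check; just retry
        time.sleep(step)
        waited += step
    return False

# e.g. wait_for_several_elements(driver, 'ul.ltr', 'li', 1)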
Keeping in mind the comments of Mr.E. and Arran, I made my list traversal entirely with CSS selectors. The tricky part was my own list structure and markers (changing classes, etc.), as well as creating the required selectors on the fly and keeping them in memory during traversal.
I got rid of waiting for several elements by searching for anything that is not in the loading state. You may use the ":nth-child" selector as well, like here:
# in a for loop with enumerate for i
selector.append(' > li:nth-child(%i)' % (i + 1))  # identify child <li> by its order pos
Here is my heavily commented code solution as an example:
def parse_crippled_shifted_list(driver, frame, selector, level=1, parent_id=0, path=None):
    """
    Traversal of an HTML list of special structure (you can't know whether an element has a sub-list unless you enter it).
    Supports starting from a remembered list element.
    Nested lists have classes "closed" and "last closed" when closed and "open" and "last open" when opened (on <li>).
    Leaf elements have classes "leaf" and "last leaf" in both cases.
    Nested lists sit inside an <li> element as a <ul> list. Each <ul> appears after clicking the <a> in its <li>.
    If you click the <a> of a leaf, a page loads in another frame.
    driver - WebDriver; frame - frame of the list; selector - selector to the current list (<ul>);
    level - level of depth, just for console output formatting; parent_id - id of the parent category (in DB);
    path - remaining path of categories (ORM objects) down to the target category to start with.
    """
    # Add current level list elements
    # This selector matches everything except the loading placeholder. Just what is needed to exclude.
    selector.append(' > li > a:not([class=loading])')
    # Wait for child list to load
    try:
        query = WebDriverWait(driver, WAIT_LONG_TIME).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ''.join(selector))))
    except TimeoutException:
        print "%s timed out" % ''.join(selector)
    else:
        # List is loaded
        del selector[-1]  # selector correction: delete last part aimed at the loaded content
        selector.append(' > li')
        children = driver.find_elements_by_css_selector(''.join(selector))  # fetch list elements
        # Walk the whole list
        for i, child in enumerate(children):
            del selector[-1]  # delete non-unique li tag selector
            if selector[-1] != ' > ul' and selector[-1] != 'ul.ltr':
                del selector[-1]
            selector.append(' > li:nth-child(%i)' % (i + 1))  # identify child <li> by its order position
            selector.append(' > a')  # add 'li > a' reference to click
            child_link = driver.find_element_by_css_selector(''.join(selector))
            # If we parse freely further (no need to start from a remembered position)
            if not path:
                # Open child
                try:
                    double_click(driver, child_link)
                except InvalidElementStateException:
                    print "\n\nERROR\n", InvalidElementStateException.message(), '\n\n'
                else:
                    # Determine its type
                    del selector[-1]  # delete changed and now useless link reference
                    # If <li> is a category, it now has a <ul> child and class="open".
                    # Checking by class takes priority, because the <li> exists for sure.
                    current_li = driver.find_element_by_css_selector(''.join(selector))
                    # Category case - BRANCH
                    if current_li.get_attribute('class') == 'open' or current_li.get_attribute('class') == 'last open':
                        new_parent_id = process_category_case(child_link, parent_id, level)  # add category to DB
                        selector.append(' > ul')  # forward to nested list
                        # Wait for nested list to load
                        try:
                            query = WebDriverWait(driver, WAIT_LONG_TIME).until(
                                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ''.join(selector))))
                        except TimeoutException:
                            print "\t" * level, "%s timed out (%i secs). Failed to load nested list." % \
                                (''.join(selector), WAIT_LONG_TIME)
                        # Parse nested list
                        else:
                            parse_crippled_shifted_list(driver, frame, selector, level + 1, new_parent_id)
                    # Page case - LEAF
                    elif current_li.get_attribute('class') == 'leaf' or current_li.get_attribute('class') == 'last leaf':
                        process_page_case(driver, child_link, level)
                    else:
                        raise Exception('Damn! Alien class: %s' % current_li.get_attribute('class'))
            # If it's required to continue from a specified category
            else:
                # Check whether this is the required category
                if child_link.text == path[0].name:
                    # Open required category
                    try:
                        double_click(driver, child_link)
                    except InvalidElementStateException:
                        print "\n\nERROR\n", InvalidElementStateException.msg, '\n\n'
                    else:
                        # This element of the list must always be a category (have a nested list)
                        del selector[-1]  # delete changed and now useless link reference
                        # If <li> is a category, it now has a <ul> child and class="open".
                        # Checking by class takes priority, because the <li> exists for sure.
                        current_li = driver.find_element_by_css_selector(''.join(selector))
                        # Category case - BRANCH
                        if current_li.get_attribute('class') == 'open' or current_li.get_attribute('class') == 'last open':
                            selector.append(' > ul')  # forward to nested list
                            # Wait for nested list to load
                            try:
                                query = WebDriverWait(driver, WAIT_LONG_TIME).until(
                                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ''.join(selector))))
                            except TimeoutException:
                                print "\t" * level, "%s timed out (%i secs). Failed to load nested list." % \
                                    (''.join(selector), WAIT_LONG_TIME)
                            # Process this nested list
                            else:
                                last = path.pop(0)
                                if len(path) > 0:  # If there is more to parse
                                    print "\t" * level, "Going deeper to: %s" % ''.join(selector)
                                    parse_crippled_shifted_list(driver, frame, selector, level + 1,
                                                                parent_id=last.id, path=path)
                                else:  # Current is the required one
                                    print "\t" * level, "Returning target category: ", ''.join(selector)
                                    path = None
                                    parse_crippled_shifted_list(driver, frame, selector, level + 1, last.id, path=None)
                        # Page case - LEAF
                        elif current_li.get_attribute('class') == 'leaf':
                            pass
                        else:
                            print "dummy"
    del selector[-2:]
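For reference, a hypothetical entry point (my assumption, based on the selector[-1] != 'ul.ltr' check above, is that traversal starts from the root <ul class="ltr"> and that selector is a mutable list of fragments joined with ''.join(selector)):
# Hypothetical initial call; selector is a list of selector fragments
selector = ['ul.ltr']
parse_crippled_shifted_list(driver, frame, selector, level=1, parent_id=0, path=None)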
This is how I solved the problem of waiting until a certain number of posts had completely loaded through AJAX:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# create a new Chrome session
driver = webdriver.Chrome()
# navigate to your web app.
driver.get("http://my.local.web")
# get the search button
seemore_button = driver.find_element_by_id("seemoreID")
# Count the cant of post
seemore_button.click()
# Wait for 30 sec, until AJAX search load the content
WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "post")))
# Get the list of post
listpost = driver.find_elements_by_class_name("post")
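If you really need a specific number of posts rather than just visibility, a small sketch (expected_count is a hypothetical variable; it assumes the posts keep the class name "post") is to wait on the element count with a lambda:
expected_count = 20  # hypothetical number of posts expected after the click

WebDriverWait(driver, 30).until(
    lambda d: len(d.find_elements_by_class_name("post")) >= expected_count
)
listpost = driver.find_elements_by_class_name("post")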
Related
I need to scrape all the Google reviews. There are 90,564 reviews on my page. However, the code I wrote can scrape only the top 9 reviews; the other reviews are not scraped.
The code is given below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# specify the url of the business page on Google
url = 'https://www.google.com/maps/place/ISKCON+temple+Bangalore/@13.0098328,77.5510964,15z/data=!4m7!3m6!1s0x0:0x7a7fb24a41a6b2b3!8m2!3d13.0098328!4d77.5510964!9m1!1b1'
# create an instance of the Chrome driver
driver = webdriver.Chrome()
# navigate to the specified url
driver.get(url)
# Wait for the reviews to load
wait = WebDriverWait(driver, 20) # increased the waiting time
review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'wiI7pd')))
# extract the text of each review
reviews = [element.text for element in review_elements]
# print the reviews
print(reviews)
# close the browser
driver.quit()
What should I edit/modify in the code to extract all the reviews?
Here is working code for you to run after launching the URL:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time

totalRev = "div div.fontBodySmall"
username = ".d4r55"
reviews = "wiI7pd"

wait = WebDriverWait(driver, 20)
totalRevCount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).get_attribute("textContent").split(' ')[0].replace(',', '').replace('.', '')
print("totalRevCount - ", totalRevCount)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).click()

mydict = {}
found = 0
while found < int(totalRevCount):
    review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, reviews)))
    reviewer_names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, username)))
    found = len(mydict)
    for rev, name in zip(review_elements, reviewer_names):
        mydict[name.text] = rev.text
        if len(rev.text) == 0:
            found = int(totalRevCount) + 1
            break
    # Scroll down to trigger loading of more reviews
    for i in range(8):
        ActionChains(driver).key_down(Keys.ARROW_DOWN).perform()
    print("found - ", found)
    print(mydict)
    time.sleep(2)
Explanation -
Get the locators for user name and review, since we are going to create a key-value pair, which helps produce a non-duplicated result.
You first need to get the total number of reviews/ratings present for the given location.
Get the username and review for the "visible" part of the webpage and store them in the dictionary.
Scroll down the page and wait a few seconds.
Get the username and review again and add them to the dictionary; only new ones will be added.
As soon as a review with no text (only a rating) is reached, the loop ends and you have your results.
NOTE - If you want all reviews irrespective of whether review text is present or not, you can remove the "if" block.
I think you'll need to scroll down first, and then get all the reviews.
scroll_value = 230
driver.execute_script( 'window.scrollBy( 0, '+str(scroll_value)+ ' )' ) # to scroll by value
# to get the current scroll value on the y axis
scroll_Y = driver.execute_script( 'return window.scrollY' )
That might be because the elements don't get loaded otherwise.
Since there are over 90,000, you might consider scrolling down a little, then getting the reviews, and repeating; see the sketch below.
Resource: https://stackoverflow.com/a/74508235/20443541
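A minimal sketch of that scroll-collect-repeat idea (assuming window scrolling actually triggers loading here; on Google Maps you may need to scroll the reviews pane itself instead):
import time

collected = {}
scroll_value = 230

for _ in range(50):  # arbitrary number of scroll steps, for illustration only
    # Collect whatever reviews are currently rendered, keyed by text to avoid duplicates
    for element in driver.find_elements_by_class_name("wiI7pd"):
        collected[element.text] = True

    # Scroll a little further and give the page time to load more reviews
    driver.execute_script("window.scrollBy(0, arguments[0])", scroll_value)
    time.sleep(2)

print(len(collected), "reviews collected")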
I am trying to scrape products listed on https://www.ethicon.com/. My approach is to start with
scraping product links from product list page
find all variant link pages from category pages (From Product Specifications in sample)
extract relevant details from the variant page
I am testing with moving from 2 -> 3 as of now. I am trying with
this code (full code)
def fetch_productlinks(self, url):
    self.driver.get(url)
    elems = self.driver.find_elements_by_xpath("//a[@href]")
    # print(elems[0].get_attribute("href"))
    counter = 0
    for elem in elems:
        elem_url = elem.get_attribute("href")
        if re.match(".*/code/.*", elem_url):
            # print(elem_url)
            if counter <= 1:
                self.extract_imagesandmetadata(elem_url, self.driver)
            counter += 1

def extract_imagesandmetadata(self, url, driver):
    # driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't')
    before_window = driver.window_handles[0]
    driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't')
    driver.get(url)
    after_window = driver.window_handles[1]
    driver.switch_to.window(after_window)
    print("Crawling ..." + url)
    html = driver.page_source
    if html:
        self.soup = BeautifulSoup(html, 'html.parser')
        pimg = self.soup.find('img', {'class': 'img-responsive'})
        if pimg:
            print(pimg["src"])
            self.tempdict["pimg"] = self.img_base_path + pimg["src"]
        else:
            self.tempdict["pimg"] = ""
        ptitle = self.soup.find('h1', {'class': 'eprc-title'})
        if ptitle:
            print(self.sanitize_text(ptitle.text))
            self.tempdict["ptitle"] = self.sanitize_text(ptitle.text)
        else:
            self.tempdict["ptitle"] = ""
        self.tempdict["purl"] = url
        self.outdict.append(self.tempdict)
    driver.switch_to.window(before_window)
and getting below error
selenium.common.exceptions.StaleElementReferenceException: Message:
stale element reference: element is not attached to the page document
which I believe I am getting because the webdriver is losing the reference after the function call. I am calling fetch_productlinks from the main function.
What can I do to resolve this?
The basics of Selenium are that you locate an element with one of Selenium's locators. One assumes that once you have the element you can interact with it and life is easy. Typically it's not, because any interaction with the page can cause the DOM to re-render, making your element something that no longer exists. That is a simplified explanation; it can get way worse, but we will stick with this simple concept for now.
You need to accept that the DOM has the potential to re-render at the most inopportune times. Once you accept this, you start to think in terms of how to recover and what the acceptable limits of recovery are.
Below I have some pseudo code to give you an idea of how to recover.
In the pseudo code I have two classes which semi-mimic your troubled code section.
The first class is called PseudoClass1:
What PseudoClass1 does is simply save your data to memory and, if we receive no errors, move that data to its desired location. This assumes memory usage is not a problem.
The second class is called PseudoClass2:
What PseudoClass2 does is simply save your data to a temporary directory and, if we receive no errors, move that data to its desired location. This assumes memory usage is a problem, so we write to a temp location and then move it to the final destination.
import os
import re
import tempfile

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException

# We are assuming the webdriver is right next to this file
CHROME_DRIVER_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chromedriver')


class PseudoClass1:
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)
        self.cache = {}

    def save_images_plus_meta_into_memory(self, url):
        """Does some work and saves whatever you want into self.cache"""

    def move_images_plus_meta_into_desired_location(self):
        """Take whatever has been safely processed in self.cache and move it to the desired location"""

    def fetch_productlinks_saved_in_memory(self, url):
        """If you have no memory constraints, save the values into memory"""
        self.driver.get(url)
        retries = 1
        max_retries = 10
        while retries <= max_retries:
            self.cache = {}
            retries += 1
            try:
                elems = self.driver.find_elements_by_xpath("//a[@href]")
                counter = 0
                for elem in elems:
                    elem_url = elem.get_attribute("href")
                    if re.match(".*/code/.*", elem_url):
                        if counter <= 1:
                            self.save_images_plus_meta_into_memory(elem_url)
                        counter += 1
                # If we are here, we did not get a stale element reference.
                # You can now transfer the data to the desired place.
                self.move_images_plus_meta_into_desired_location()
                return  # Exit out of the retry loop
            except StaleElementReferenceException:
                print(f'Retry count: {retries}')
                continue
        raise Exception('Failed to fetch product links')


class PseudoClass2:
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)

    def save_images_plus_meta_into_a_temp_dir(self, url, temp_dir):
        """Does some work and saves whatever you want into the temp_dir location"""

    def move_images_plus_meta_into_desired_location(self, temp_dir):
        """Take whatever has been safely processed in temp_dir and move it to the desired location"""

    def fetch_productlinks_saved_in_a_temp_directory(self, url):
        self.driver.get(url)
        retries = 1
        max_retries = 10
        while retries <= max_retries:
            retries += 1
            try:
                # Creates a temporary directory to write stuff to
                with tempfile.TemporaryDirectory() as temp_dir:
                    elems = self.driver.find_elements_by_xpath("//a[@href]")
                    counter = 0
                    for elem in elems:
                        elem_url = elem.get_attribute("href")
                        if re.match(".*/code/.*", elem_url):
                            if counter <= 1:
                                self.save_images_plus_meta_into_a_temp_dir(elem_url, temp_dir)
                            counter += 1
                    # If we are here, we did not get a stale element reference.
                    # You can now transfer the data to the desired place.
                    self.move_images_plus_meta_into_desired_location(temp_dir)
                    return  # Exit out of the retry loop
            except StaleElementReferenceException:
                print(f'Retry count: {retries}')
                continue
        raise Exception('Failed to fetch product links')
With Python 3 and Selenium I want to automate searching on a public information site. On this site it is necessary to enter the name of a person, then select the spelling chosen for that name (with or without accents or name variations), access a page with the list of lawsuits found, and from this list access the page of each case.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import re
Name that will be searched
name = 'JOSE ROBERTO ARRUDA'
Create path, search start link, and empty list to store information
firefoxPath="/home/abraji/Documentos/Code/geckodriver"
link = 'https://ww2.stj.jus.br/processo/pesquisa/?aplicacao=processos.ea'
processos = []
Call driver and go to first search page
driver = webdriver.Firefox(executable_path=firefoxPath)
driver.get(link)
Position cursor, fill and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idParteNome'))).click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="idParteNome"]').send_keys(name)
time.sleep(6)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarFormularioExtendido'))).click()
Mark all spelling possibilities for searching
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoMarcarTodos'))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarMarcados'))).click()
time.sleep(1)
Check how many pages of data there are - to be used in "for range"
capta = driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]').text
print(capta)
paginas = int(re.search(r'\d+', capta).group(0))
paginas = int(paginas) + 1
print(paginas)
Capture routine
for acumula in range(1, paginas):
    # Fill the field with the page number and press enter
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(acumula)
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(Keys.RETURN)
    time.sleep(2)

    # Captures the number of processes found on the current page - qt
    qt = driver.find_element_by_xpath('//*[@id="idDivBlocoMensagem"]/div/b').text
    qt = int(qt) + 2
    print(qt)

    # Iterate from found number of processes
    for item in range(2, qt):
        # Find the XPATH of each process link - start at number 2
        vez = '//*[@id="idBlocoInternoLinhasProcesso"]/div[' + str(item) + ']/span[1]/span[1]/span[1]/span[2]/a'
        print(vez)

        # Access the direct link and click
        element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, vez)))
        element.click()

        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"

        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu
                      }
        processos.append(dicionario)

        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")

# Close driver
driver.quit()
In this case I captured the number of link pages (3) and the total number of links (84). So my initial idea was to run the "for" three times and, within each run, split the 84 links.
The direct address of each link is in an XPATH (//*[@id="idBlocoInternoLinhasProcesso"]/div[41]/span[1]/span[1]/span[1]/span[2]/a) in which I replace the div index with "item" to click.
For example, when it arrives at number 42 I get an error, because the first page only goes up to 41.
My problem is how to go to the second page and then restart only the inner "for" loop.
I think the ideal would be to know the exact number of links on each of the three pages
Anyone have any ideas?
Code below is "Capture routine":
wait = WebDriverWait(driver, 20)
#...
while True:
    links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[contains(@class,'classSpanNumeroRegistro')]")))
    print("links len", len(links))

    for i in range(1, len(links) + 1):
        # Access the direct link and click
        wait.until(EC.element_to_be_clickable((By.XPATH, f"(//span[contains(@class,'classSpanNumeroRegistro')])[{i}]//a"))).click()

        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"

        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu
                      }
        processos.append(dicionario)

        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")

    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "classSpanPaginacaoImagensDireita")))
    next_page = driver.find_elements_by_css_selector(".classSpanPaginacaoProximaPagina")
    if len(next_page) == 0:
        break
    next_page[0].click()
You can try running the loop until the next button is present on the screen. The logic will look like this:
try:
    next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
    if next_page.is_displayed():
        next_page.click()
except NoSuchElementException:
    print('next page does not exist')
I am trying to iterate through a list that refreshes every 10 seconds.
This is what I have tried:
driver.get("https://www.winmasters.ro/ro/live-betting/")
events = driver.find_elements_by_css_selector('.event-wrapper.v1.event-live.odds-hidden.event-sport-1')
for i in range(len(events)):
try:
event = events[i]
name = event.find_element_by_css_selector('.event-details-team-name.event-details-team-a')# the error occurs here
except: # NoSuchElementException or StaleElementReferenceException
time.sleep(3) # i have tried up to 20 sec
event = events[i]
name = event.find_element_by_css_selecto('.event-details-team-name.event-details-team-a')
This did not work, so I tried another except:
    except:  # second try that also did not work
        element = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.event-details-team-name.event-details-team-a'))
        )
        name = event.find_element_by_css_selector('.event-details-team-name.event-details-team-a')
Now I am assigning something to name that I will never use, like:
try:
    event = events[i]
    name = event.find_element_by_css_selector('.event-details-team-name.event-details-team-a')
except:
    name = "blablabla"
With this code, when the page refreshes I get about 7 or 8 "blablabla" entries until it finds my selector again on the webpage.
You can get all required data using JavaScript.
The code below will give you a list of event maps with all details instantly and without NoSuchElementException or StaleElementReferenceException errors:
me_id : unique identifier
href : href with details which you can use to get details
team_a : name of the first team
team_a_score : score of the first team
team_b : name of the second team
team_b_score : score of the second team
event_status : status of the event
event_clock : time of the event
events = driver.execute_script('return [...document.querySelectorAll(\'[data-uat="live-betting-overview-leagues"] .events-for-league .event-live\')].map(e=>{return {me_id:e.getAttribute("me_id"), href:e.querySelector("a.event-details-live").href, team_a:e.querySelector(".event-details-team-a").textContent, team_a_score:e.querySelector(".event-details-score-1").textContent, team_b:e.querySelector(".event-details-team-b").textContent, team_b_score:e.querySelector(".event-details-score-2").textContent, event_status:e.querySelector(\'[data-uat="event-status"]\').textContent, event_clock:e.querySelector(\'[data-uat="event-clock"]\').textContent}})')
for event in events:
    print(event.get('me_id'))
    print(event.get('href'))  # using href you can open event details with: driver.get(event.get('href'))
    print(event.get('team_a'))
    print(event.get('team_a_score'))
    print(event.get('team_b'))
    print(event.get('team_b_score'))
    print(event.get('event_status'))
    print(event.get('event_clock'))
One primary problem is that you are acquiring all of the elements up front and then iterating through that list. As the page itself updates frequently, the elements you've already acquired have gone "stale", meaning they are no longer associated with current DOM objects. When you try to use those stale elements, Selenium throws StaleElementReferenceException because it has no way of doing anything with those now out-of-date objects.
One way to overcome this is to only acquire and use an element right as you need it, rather than fetching them all up front. I personally feel the cleanest approach is to use the CSS :nth-child() approach:
from selenium import webdriver
def main():
    base_css = '.event-wrapper.v1.event-live.odds-hidden.event-sport-1'
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.winmasters.ro/ro/live-betting/")
        # Get a list of all elements
        events = driver.find_elements_by_css_selector(base_css)
        print("Found {} events".format(len(events)))
        # Iterate through the list, keeping track of the index
        # note that nth-child referencing begins at index 1, not 0
        for index, _ in enumerate(events, 1):
            name = driver.find_element_by_css_selector("{}:nth-child({}) {}".format(
                base_css,
                index,
                '.event-details-team-name.event-details-team-a'
            ))
            print(name.text)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
If I run the above script, I get this output:
$ python script.py
Found 2 events
Hapoel Haifa
FC Ashdod
Now, as the underlying webpage really does update a lot, there is still a decent chance you can get a SERE (StaleElementReferenceException) error. To overcome that, you can use a retry decorator (pip install retry to get the package) to handle the SERE and reacquire the element:
import retry
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException


@retry.retry(StaleElementReferenceException, tries=3)
def get_name(driver, selector):
    elem = driver.find_element_by_css_selector(selector)
    return elem.text


def main():
    base_css = '.event-wrapper.v1.event-live.odds-hidden.event-sport-1'
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.winmasters.ro/ro/live-betting/")
        events = driver.find_elements_by_css_selector(base_css)
        print("Found {} events".format(len(events)))
        for index, _ in enumerate(events, 1):
            name = get_name(
                driver,
                "{}:nth-child({}) {}".format(
                    base_css,
                    index,
                    '.event-details-team-name.event-details-team-a'
                )
            )
            print(name)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
Now, despite the above examples, I think you still have issues with your CSS selectors, which is the primary reason for the NoSuchElement exceptions. I can't help with that without a better description of what you are actually trying to accomplish with this script.
I've been working on a research project that is looking to obtain a list of reference articles from the Brazil Hemeroteca (The desired page reference: http://memoria.bn.br/DocReader/720887x/839, needs to be collected from two hidden elements on the following page: http://memoria.bn.br/DocReader/docreader.aspx?bib=720887x&pasta=ano%20189&pesq=Milho). I asked a question a few weeks back that was answered and I was able to get things running well in regards to that, but now I've hit a new snag and I'm not exactly sure how to get around it.
The problem is that after the first form is filled in, the page redirects to a second page, a JavaScript/AJAX enabled page, through which I need to spool through all of the matches by clicking a button at the top of the page. The problem I'm encountering is that when clicking the next page button I'm dealing with elements on the page that are updating, which leads to stale elements. I've tried to implement a few pieces of code to detect when this "stale" effect occurs, to indicate the page has changed, but this has not provided much luck. Here is the code I've implemented:
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
saveDir = "C:/tmp"
print("Opening Page...")
browser = webdriver.Chrome()
url = "http://bndigital.bn.gov.br/hemeroteca-digital/"
browser.get(url)
print("Searching for elements")
fLink = ""
fails = 0
frame_ref = browser.find_elements_by_tag_name("iframe")[0]
iframe = browser.switch_to.frame(frame_ref)
journal = browser.find_element_by_id("PeriodicoCmb1_Input")
search_journal = "Relatorios dos Presidentes dos Estados Brasileiros (BA)"
search_timeRange = "1890 - 1899"
search_text = "Milho"
xpath_form = "//input[@name=\'PesquisarBtn1\']"
xpath_journal = "//li[text()=\'"+search_journal+"\']"
xpath_timeRange = "//input[@name=\'PeriodoCmb1\' and not(@disabled)]"
xpath_timeSelect = "//li[text()=\'"+search_timeRange+"\']"
xpath_searchTerm = "//input[@name=\'PesquisaTxt1\']"
print("Locating Journal/Periodical")
journal.click()
dropDownJournal = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.XPATH, xpath_journal)))
dropDownJournal.click()
print("Waiting for Time Selection")
try:
    timeRange = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_timeRange)))
    timeRange.click()
    time.sleep(1)
    print("Locating Time Range")
    dropDownTime = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_timeSelect)))
    dropDownTime.click()
    time.sleep(1)
except:
    print("Failed...")
print("Adding Search Term")
searchTerm = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_searchTerm)))
searchTerm.clear()
searchTerm.send_keys(search_text)
time.sleep(5)
print("Perform search")
submitButton = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_form)))
submitButton.click()
# Wait for the second page to load, pull what we need from it.
download_list = []
browser.switch_to_window(browser.window_handles[-1])
print("Waiting for next page to load...")
matches = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//span[@id=\'OcorNroLbl\']")))
print("Next page ready, found match element... counting")
countText = matches.text
countTotal = int(countText[countText.find("/")+1:])
print("A total of " + str(countTotal) + " matches have been found, standing by for page load.")
for i in range(1, countTotal+2):
    print("Waiting for page " + str(i-1) + " to load...")
    while(fLink in download_list):
        try:
            jIDElement = browser.find_element_by_xpath("//input[@name=\'HiddenBibAlias\']")
            jPageElement = browser.find_element_by_xpath("//input[@name=\'hPagFis\']")
            fLink = "http://memoria.bn.br/DocReader/" + jIDElement.get_attribute('value') + "/" + jPageElement.get_attribute('value') + "&pesq=" + search_text
        except:
            fails += 1
            time.sleep(1)
            if(fails == 10):
                print("Locked on a page, attempting to push to next.")
                nextPageButton = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
                nextPageButton.click()
                #raise
    while(fLink == ""):
        jIDElement = browser.find_element_by_xpath("//input[@name=\'HiddenBibAlias\']")
        jPageElement = browser.find_element_by_xpath("//input[@name=\'hPagFis\']")
        fLink = "http://memoria.bn.br/DocReader/" + jIDElement.get_attribute('value') + "/" + jPageElement.get_attribute('value') + "&pesq=" + search_text
    fails = 0
    print("Link obtained: " + fLink)
    download_list.append(fLink)
    if(i != countTotal):
        print("Moving to next page...")
        nextPageButton = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
        nextPageButton.click()
There are two "bugs" I'm trying to solve with this block. First, the very first page is always skipped in the loop (IE: fLink = ""), even though there is a test in there for it, I'm not sure why this occurs. The other bug is that the code will hang on specific pages completely randomly and the only way out is to break the code execution.
This block has been modified a few times so I know it's not the most "elegant" of solutions, but I'm starting to run out of time.
After taking a day off from this to think about it (And get some more sleep), I was able to figure out what was going on. The above code has three "big faults". This first is that it does not handle the StaleElementException versus the NoSuchElementException, which can occur while the page is shifting. Secondly, the loop condition was checking iteratively that a page wasn't in the list, which when entering the first run allowed the blank condition to load in directly as the loop was never executed on the first run (Should have used a do-while there, but I made more modifications). Finally, I made the silly error of only checking if the first hidden element was changing, when in fact that is the journal ID, and is pretty much constant through all.
The revisions began with an adaptation of code from another SO article to implement a "hold" condition until either one of the hidden elements changed:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
def hold_until_element_changed(driver, element1_xpath, element2_xpath, old_element1_text, old_element2_text):
    while True:
        try:
            element1 = driver.find_element_by_xpath(element1_xpath)
            element2 = driver.find_element_by_xpath(element2_xpath)
            if (element1.get_attribute('value') != old_element1_text) or (element2.get_attribute('value') != old_element2_text):
                break
        except StaleElementReferenceException:
            break
        except NoSuchElementException:
            return False
        time.sleep(1)
    return True
I then modified the original looping condition, going back to the original "for loop" counter I had created, without an internal loop, instead calling the above function to create the "hold" until the page had flipped, and voila, it worked like a charm. (NOTE: I also upped the timeout on the next page button, as the short timeout is what caused the locking condition.)
for i in range(1, countTotal+1):
    print("Waiting for page " + str(i) + " to load...")
    bibxpath = "//input[@name=\'HiddenBibAlias\']"
    pagexpath = "//input[@name=\'hPagFis\']"
    jIDElement = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, bibxpath)))
    jPageElement = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, pagexpath)))
    jidtext = jIDElement.get_attribute('value')
    jpagetext = jPageElement.get_attribute('value')
    fLink = "http://memoria.bn.br/DocReader/" + jidtext + "/" + jpagetext + "&pesq=" + search_text
    print("Link obtained: " + fLink)
    download_list.append(fLink)
    if(i != countTotal):
        print("Moving to next page...")
        nextPageButton = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
        nextPageButton.click()
        # Wait for next page to be ready
        change = hold_until_element_changed(browser, bibxpath, pagexpath, jidtext, jpagetext)
        if(change == False):
            print("Something went wrong.")
All in all, a good exercise in thought and some helpful links for me to consider when posting future questions. Thanks!