Randomly clicking all the tabs an infinite number of times - Python

I am trying to randomly click all unopened tabs (elements) on this page.
The code below mostly works; unfortunately, with this approach it does not seem to click all of the elements. I assume it does not like for index in indexes for some reason (or does not load all of the elements correctly). Adding while True: tends to fix this, though I am unsure how to randomise the clicks for an infinite number of iterations. Any ideas?
options = driver.find_elements_by_xpath('//*[@class="KambiBC-mod-event-group-header__event-count"]')
indexes = [index for index in range(len(options))]
shuffle(indexes)
for index in indexes:
    time.sleep(1)
    clickMe = wait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="KambiBC-mod-event-group-header__event-count"])[%s]' % str(index + 1))))
    driver.execute_script("arguments[0].scrollIntoView();", clickMe)
    clickMe.click()
    time.sleep(1)
Adding while True:
while True:
    time.sleep(0)
    clickMe = wait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="KambiBC-mod-event-group-header__event-count"])[%s]' % str(index + 1))))
    driver.execute_script("arguments[0].scrollIntoView();", clickMe)
    clickMe.click()
    time.sleep(0)

Not all options appear simultaneously; they are added in groups, one after another.
You need to wait until all options are available.
Try the code below:
def get_options_length():
    list_length = len(wait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@class="KambiBC-collapsible-header KambiBC-mod-event-group-header"]'))))
    while True:
        time.sleep(1)
        if list_length == len(driver.find_elements_by_xpath('//*[@class="KambiBC-collapsible-header KambiBC-mod-event-group-header"]')):
            break
        else:
            list_length = len(driver.find_elements_by_xpath('//*[@class="KambiBC-collapsible-header KambiBC-mod-event-group-header"]'))
    return list_length

indexes = [index for index in range(get_options_length())]
shuffle(indexes)
for index in indexes:
    time.sleep(1)
    clickMe = wait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="KambiBC-collapsible-header KambiBC-mod-event-group-header"])[%s]' % str(index + 1))))
    clickMe = clickMe.find_element_by_xpath('.//span')
    driver.execute_script("arguments[0].scrollIntoView();", clickMe)
    clickMe.click()
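That handles waiting for every group to load. For the "infinite" part of the original question, one option is to re-collect and re-shuffle the indexes on every pass of an outer while True loop. This is only a sketch, assuming the same driver, wait, EC, By, shuffle and time names as above; the locator and helper are copied from the answer:

while True:  # run forever; stop the script manually when done
    indexes = list(range(get_options_length()))
    shuffle(indexes)  # a fresh random order on every pass
    for index in indexes:
        time.sleep(1)
        clickMe = wait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="KambiBC-collapsible-header KambiBC-mod-event-group-header"])[%s]' % str(index + 1))))
        clickMe = clickMe.find_element_by_xpath('.//span')
        driver.execute_script("arguments[0].scrollIntoView();", clickMe)
        clickMe.click()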

Related

Stale Element - Selenium - Python

I'll start by saying that this has become quite a mess while I've been trying to solve the issue; in the past I have been able to resolve stale element problems.
The problem starts after the first player's stats are stored (everything works as it should up to that point): once the code loops back to find the next player, the issue appears.
I'm not sure if it's caused by the nested loops or something else.
I have tried reassigning the variable that I assume is giving me the issues throughout the code:
player_stats
The thing is, I previously had it going through 5 players, and I'm not sure what happened or when the bug first crept in, as I was working on getting the rounds won and rounds played sorted out.
(We aren't even able to print("Found playerCol element") on the second go-around.)
All print statements work until it hangs in the while loop after the first iteration.
Here is the full code (with comments):
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import re
import time
# Initialize the webdriver
driver = webdriver.Firefox()

# Navigate to the website
url = "https://www.hltv.org/stats/players"
driver.get(url)
WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()

# Find the elements containing the player statistics
player_stats = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".playerCol, .statsDetail"))
)
# Extract the relevant data from the elements
players = []
for i, player_stat in enumerate(player_stats):
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
        while True:
            player_stats = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
            try:
                if "playerCol" in player_stat.get_attribute("class"):
                    print("Found playerCol element")
                    name = player_stat.find_element(By.CSS_SELECTOR, "a").text if player_stat.find_elements(By.CSS_SELECTOR, "a") else player_stat.text
                    print(f"Name: {name}")
                elif "statsDetail" in player_stat.get_attribute("class"):
                    stats = player_stat.text.split()
                    if len(stats) >= 1 and re.search(r"\d+\.\d+", stats[0]):
                        kd_ratio = stats[0]
                break
            except StaleElementReferenceException as e:
                player_stats = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
                player_stats = driver.find_elements(By.CSS_SELECTOR, ".playerCol, .statsDetail")
                print(f"An error occurred while processing match stats: {e}")
                break
        # Extract the player stats
        if "statsDetail" in player_stat.get_attribute("class"):
            stats = player_stat.text.split()
            if len(stats) >= 1 and re.search(r"\d+\.\d+", stats[0]):
                kd_ratio = stats[0]
                # Process match stats for the player
                try:
                    time.sleep(1)
                    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
                    player_link = driver.find_element(By.XPATH, f"//a[contains(text(), '{name}')]")
                    print(player_link.get_attribute('outerHTML'))
                    driver.execute_script("arguments[0].click();", player_link)
                    time.sleep(1)
                    player_stats = driver.find_elements(By.CSS_SELECTOR, ".playerCol, .statsDetail")
                    player = [name, kd_ratio]
                    # Extract additional player stats
                    headshot_percentage = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Headshot %')]/following-sibling::span"))).text
                    player.append(headshot_percentage)
                    kpr = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Kills / round')]/following-sibling::span"))).text
                    player.append(kpr)
                    dpr = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Deaths / round')]/following-sibling::span"))).text
                    player.append(dpr)
                    # Extract match stats for the player
                    matches_link = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/stats/players/matches/'][data-link-tracking-destination='Click on Matches -> Individual -> Overview [subnavigation]']")))
                    driver.execute_script("arguments[0].click();", matches_link)
                    match_stats = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.group-2, tr.group-1")))
                    match_scores = []
                    num_of_matches = 0
                    rounds_won = 0
                    rounds_played = 0
                    # Process match stats for the player
                    for i, match_stat in enumerate(match_stats):
                        player_name = player[0]
                        player_team = driver.find_element(By.CSS_SELECTOR, ".gtSmartphone-only span:last-of-type").text
                        try:
                            team_name = ""
                            score = ""
                            while team_name == "" or score == "":
                                try:
                                    team = match_stat.find_element(By.CSS_SELECTOR, ".gtSmartphone-only span:last-of-type").text
                                    team_name = team.strip()
                                    score_span = match_stat.find_element(By.XPATH, ".//div[contains(@class, 'gtSmartphone-only')]//*[contains(text(), '(')]")
                                    score_text = score_span.text.strip()
                                    score = re.search(r'\((\d+)\)', score_text).group(1)
                                except:
                                    time.sleep(1)
                                    match_stats = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.group-2, tr.group-1")))
                                    match_stat = match_stats[i]
                            team_data = match_stat.find_elements(By.CSS_SELECTOR, ".gtSmartphone-only span")
                            print("Team data:", team_data[3].text)
                            if team_name.lower() == player_team.lower():
                                player_score = score
                                opposing_team_name = team_data[2].text.strip()
                                print(opposing_team_name)
                                opposing_team_score = team_data[3].text.strip('()')
                                print("Score strip: ", opposing_team_score)
                                rounds_won += int(player_score)
                                rounds_played += int(player_score) + int(opposing_team_score)
                            else:
                                player_score = team_data[1].text.strip('()')
                                print(player_score)
                                opposing_team_score = score
                                print(opposing_team_score)
                                opposing_team_name = team_data[0].text.strip()
                                print(opposing_team_name)
                                rounds_won += int(opposing_team_score)
                                rounds_played += int(player_score) + int(opposing_team_score)
                            match_scores.append((team_name, opposing_team_name, player_score, opposing_team_score))
                            num_of_matches += 1
                            if num_of_matches == 5:  # exit loop after 5 iterations
                                break
                        except:
                            # Refresh the page if the element can't be found
                            driver.back()
                            player_stats = driver.find_elements(By.CSS_SELECTOR, ".playerCol, .statsDetail")
                            time.sleep(1)
                            match_stats = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.group-2, tr.group-1")))
                except Exception as e:
                    print(f"An error occurred while processing data for player {name}: {e}")
                    continue
                players.append([name, kd_ratio, headshot_percentage, kpr, dpr, rounds_won, rounds_played])
                print(players)
                print(f"{player_name}: {rounds_won} rounds won out of {rounds_played} rounds played in {num_of_matches} matches")
                driver.get(url)
                time.sleep(1)
    except StaleElementReferenceException as e:
        # handle the exception here
        print(f"An error occurred while processing match stats: {e}")
        break
# Close the webdriver
driver.quit()

# Store the data in a Pandas dataframe
df = pd.DataFrame(players, columns=["Name", "K/D", "HS %", "KPR", "DPR", "RW", "RP"])

# Clean the data
df["K/D"] = df["K/D"].str.extract(r"(\d+\.\d+)").astype(float)
df["HS %"] = df["HS %"].str.extract(r"(\d+\.\d+)").astype(float)
df["KPR"] = df["KPR"].str.extract(r"(\d+\.\d+)").astype(float)
df["DPR"] = df["DPR"].str.extract(r"(\d+\.\d+)").astype(float)

# Drop any rows that have missing or invalid data
df.dropna(subset=["Name", "K/D", "HS %", "KPR", "DPR"], inplace=True)

# Save the data to a CSV file
df.to_csv("player_stats.csv", index=False, sep='\t')

# Close the webdriver
driver.quit()
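One pattern that often helps with stale references in loops like this is to re-locate the element by position immediately before reading it, retrying a few times, instead of reusing a reference captured before the page changed. The sketch below is not the poster's code; read_text_with_retry and its arguments are illustrative names, and the selector is only an example taken from the question:

import time
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By

def read_text_with_retry(driver, css_selector, index, attempts=3):
    # Re-find the elements on every attempt so a stale reference from a
    # previous page state is simply discarded and replaced.
    for _ in range(attempts):
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, css_selector)
            return elements[index].text
        except (StaleElementReferenceException, IndexError):
            time.sleep(1)
    return None

# Example (hypothetical): read the i-th stats cell even if the table re-renders.
# kd_text = read_text_with_retry(driver, ".playerCol, .statsDetail", i)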

How to create a for-loop based on a value obtained via Selenium

Variables
chrome_path = 'chromedriver'
driver = webdriver.Chrome(chrome_path)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-popup-blocking")
driver.get("https://gibiru.com/")
driver.find_element(By.CSS_SELECTOR, '.form-control.has-feedback.has-clear').click()
driver.find_element(By.CSS_SELECTOR, '.form-control.has-feedback.has-clear').send_keys("lfc")
driver.find_element(By.CSS_SELECTOR, '.form-control.has-feedback.has-clear').send_keys(Keys.RETURN)
driver.find_element(By.XPATH, "/html/body/div[1]/main/div[1]/div/div/div/div[2]").click()
time.sleep(2)
I have this try-statement, which works perfectly but needs to be looped according to the value of page_length, which is equal to 10 in this situation.
try:
    # 1st page
    page_length = len(driver.find_elements(By.CSS_SELECTOR, "div.gsc-resultsRoot.gsc-tabData.gsc-tabdActive div.gsc-cursor-box.gs-bidi-start-align div.gsc-cursor div.gsc-cursor-page"))
    index_count = 0
    current_page = int(page_length) - int(index_count)
    print("Number of availables pages : " + str(current_page))  # Prints 10
    find_href = driver.find_elements(By.CSS_SELECTOR, 'img.gs-image.gs-image-scalable')
    with open("txt.txt", "a") as textFile:
        for my_href in find_href:
            textFile.write(str(my_href.get_attribute("src")) + "\n")
            print(my_href.get_attribute("src"))
    index_count += 1
    driver.execute_script("window.scrollTo(100,document.body.scrollHeight);")
    driver.find_element(By.XPATH, '/html/body/div[1]/main/div[2]/div[2]/div/div[1]/div/div/div/div/div[5]/div[2]/div[2]/div/div[2]/div/div[' + str(index_count) + ']').click()
    time.sleep(2)
    # 2nd page
    current_page = int(page_length) - int(index_count)
    print("Number of availables pages : " + str(current_page))  # Prints 10
    find_href = driver.find_elements(By.CSS_SELECTOR, 'img.gs-image.gs-image-scalable')
    with open("txt.txt", "a") as textFile:
        for my_href in find_href:
            textFile.write(str(my_href.get_attribute("src")) + "\n")
            print(my_href.get_attribute("src"))
    index_count += 1
    driver.execute_script("window.scrollTo(100,document.body.scrollHeight);")
    driver.find_element(By.XPATH, '/html/body/div[1]/main/div[2]/div[2]/div/div[1]/div/div/div/div/div[5]/div[2]/div[2]/div/div[2]/div/div[' + str(index_count) + ']').click()
    time.sleep(2)
except Exception as e:
    print(e)
    driver.quit()
But I would like help creating a for-loop that can do what the try-statement does, but in fewer lines of code. This is what I'm thinking of:
for x in page_array_number:
    index_count = 0
    current_page = int(page_length) - int(index_count)
    print("Number of availables pages : " + str(current_page))
    find_href = driver.find_elements(By.CSS_SELECTOR, 'img.gs-image.gs-image-scalable')
    with open("txt.txt", "a") as textFile:
        for my_href in find_href:
            textFile.write(str(my_href.get_attribute("src")) + "\n")
            print(my_href.get_attribute("src"))
    print("Counter is before : " + str(index_count))
    index_count += 1
    print("Counter is after : " + str(index_count))
    driver.execute_script("window.scrollTo(100,document.body.scrollHeight);")
    time.sleep(2)
    driver.find_element(By.XPATH, '/html/body/div[1]/main/div[2]/div[2]/div/div[1]/div/div/div/div/div[5]/div[2]/div[2]/div/div[2]/div/div[' + str(index_count) + ']').click()
    time.sleep(2)
    if index_count == page_length:
        print("Done scraping urls from " + str(page_length) + " pages")
        break
The output I'm getting suggests that the counter is the problem: it doesn't add 1 on every loop iteration (index_count is reset to 0 at the top of the loop, so it never gets past 1).
len() returns an integer, which is not an iterable object. I would use the enumerate() method, which returns the index and value of the next item in the iterable. enumerate() is also faster in many cases.
pages = driver.find_elements()
page_length = len(pages)
for index, value in enumerate(pages):
    current_page = page_length - index
    ...
Also, the last two lines of code are redundant: if index_count == page_length, then that is the last iteration of the loop and it will exit anyway.
Some other notes: if you are looping and don't need the loop variable, replace it with an underscore. In the above code, since we don't need the variable value:
for index, _ in enumerate(pages):
    # This makes it clear that we don't use the values contained in pages
    current_page = page_length - index
    ...
Lastly, you can often get errors like NoSuchAttributeException and ElementNotInteractableException due to variations in page load and JS execution times. I would suggest encapsulating Selenium code that interacts with the web page in try/except statements, as in the sketch below.
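Putting the enumerate() suggestion and the try/except advice together, the repeated per-page blocks from the question might collapse into something like this. Only a sketch: the image selector is copied from the question, while the pager selectors ("div.gsc-cursor div.gsc-cursor-page" and the "//div[@class='gsc-cursor']/div[n]" XPath) are assumptions about the Google CSE pager markup, not verified against the site:

pages = driver.find_elements(By.CSS_SELECTOR, "div.gsc-cursor div.gsc-cursor-page")  # assumed pager selector
page_length = len(pages)

for index, _ in enumerate(pages):
    print("Number of available pages : " + str(page_length - index))
    find_href = driver.find_elements(By.CSS_SELECTOR, 'img.gs-image.gs-image-scalable')
    with open("txt.txt", "a") as textFile:
        for my_href in find_href:
            textFile.write(str(my_href.get_attribute("src")) + "\n")
    if index + 1 < page_length:
        try:
            driver.execute_script("window.scrollTo(100,document.body.scrollHeight);")
            # Pager entries are 1-based, so entry index + 2 is the next page.
            driver.find_element(By.XPATH, "//div[@class='gsc-cursor']/div[" + str(index + 2) + "]").click()
            time.sleep(2)
        except Exception as e:
            print(e)
            break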
I got it working with this for loop :
for index, item in enumerate(page_array_number):
    print(index)
    current_page = int(page_length) - int(index)
    print("Number of availables pages : " + str(current_page))
    index = index + 1
    print("Counter is after : " + str(index))
    find_href = driver.find_elements(By.CSS_SELECTOR, 'img.gs-image.gs-image-scalable')
    with open("txt.txt", "a") as textFile:
        for my_href in find_href:
            textFile.write(str(my_href.get_attribute("src")) + "\n")
            print(my_href.get_attribute("src"))
    driver.execute_script("window.scrollTo(100,document.body.scrollHeight);")
    time.sleep(2)
    if index == 1:
        driver.find_element(By.XPATH, '/html/body/div[1]/main/div[2]/div[2]/div/div[1]/div/div/div/div/div[5]/div[2]/div[2]/div/div[2]/div/div[' + str(index + 1) + ']').click()
        time.sleep(2)
    elif index > 1:
        driver.find_element(By.XPATH, '/html/body/div[1]/main/div[2]/div[2]/div/div[1]/div/div/div/div/div[5]/div[2]/div[2]/div/div[2]/div/div[' + str(index) + ']').click()
        time.sleep(2)
    elif index == page_length:
        print("Done scraping urls from " + str(page_length) + " pages")
        break

How to scroll several times in order to find an element?

I want to create a script that keeps scrolling until the desired element is found, using:
from appium.webdriver.common.touch_action import TouchAction
The page is an Events page with different game titles.
All I have so far is a single scroll:
# swipe down
time.sleep(1)
print('Scrolling the page..')
time.sleep(4)
touch = TouchAction(driver)
touch.long_press(x=500, y=1800).move_to(x=500, y=400).release().perform()
time.sleep(2)
print('PASS - Step 1. Page was scrolled')
The element is searched for with:
from selenium.webdriver.support import expected_conditions as EC
# Game name
time.sleep(1)
wait = WebDriverWait(driver, 15)
Event_Name = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@text="Event name"]//ancestor::*[contains(@class, "android.widget.LinearLayout")]')))
Event_Name.click()
time.sleep(2)
Game_title = driver.find_element_by_id('com.project.projectname:id/eventName')
print('PASS - Step 2. ' + Game_title.text + ' game card was clicked')
time.sleep(2)
wait = WebDriverWait(driver, 15)
Game_size_GameName = wait.until(EC.visibility_of_all_elements_located((By.ID, 'com.project.projectname:id/size')))
# noinspection PyTypeChecker
for size in Game_size_GameName:
    if '55.68 MB' == size.text:
        print('PASS - Step 3. User is on the ' + Game_title.text + ' product page')
    else:
        print('FAIL - Step 3. Failed to redirect user')
time.sleep(2)
Events_tab_title = wait.until(EC.visibility_of_element_located((By.ID, 'com.project.projectname:id/screenTitle')))
back_button.click()  # back_button is located elsewhere in the test
print('PASS - Step 11. User is back on the ' + Events_tab_title.text + ' tab')
time.sleep(2)
Can anyone help me with that? Thanks in advance.
Note: not all the elements are visible, which is why I need to scroll the page through the script.
I've tried a few methods for scrolling a webpage to an element. Some of them work but scroll the page down to a place just above the element I needed. The best working one was the JavaScript approach:
element = driver.find_element_by_id('element_id')
driver.execute_script("arguments[0].scrollIntoView();", element)
element.click()
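If a single scroll is not enough, the same swipe can be wrapped in a loop that stops as soon as the element is found. A rough sketch, reusing the TouchAction coordinates from the question; the locator passed in and the maximum swipe count are assumptions:

import time
from appium.webdriver.common.touch_action import TouchAction
from selenium.common.exceptions import NoSuchElementException

def scroll_until_found(driver, by, locator, max_swipes=10):
    # Swipe up through the list until the element appears or we give up.
    for _ in range(max_swipes):
        try:
            return driver.find_element(by, locator)
        except NoSuchElementException:
            TouchAction(driver).long_press(x=500, y=1800).move_to(x=500, y=400).release().perform()
            time.sleep(1)
    raise NoSuchElementException("Element %s not found after %d swipes" % (locator, max_swipes))

# Example (id taken from the question):
# event_card = scroll_until_found(driver, By.ID, 'com.project.projectname:id/eventName')
# event_card.click()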

Saving images while crawling website in Selenium

I would like to download images like those that can be found on this page.
I need to download all of the images, each one once.
Here's the code I'm using:
links = []
wait = WebDriverWait(driver, 5)
all_images = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class,'swiper-button-next swiper-button-white')]")))
for image in all_images:
    a = image.get_attribute('style')
    b = a.split("(")[1].split(")")[0].replace('"', '')
    links.append(b)
all_images = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class,'swiper-slide swiper-slide-visible swiper-slide-active swiper-slide-thumb-active')]")))
for image in all_images:
    a = image.get_attribute('style')
    b = a.split("(")[1].split(")")[0].replace('"', '')
    links.append(b)
all_images = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class,'swiper-slide swiper-slide-visible')]")))
for image in all_images:
    a = image.get_attribute('style')
    b = a.split("(")[1].split(")")[0].replace('"', '')
    links.append(b)
index = 1
for i in range(len(links)//2 + 1):
    with open(title.replace(' ', '-') + str(index) + '.jpg', 'wb') as file:
        im = requests.get(links[i])
        file.write(im.content)
        print('Saving image.. ', title + str(index))
    index += 1
The problem is that this saves some images repeatedly while skipping others, and I don't know how to fix it.
You are using the wrong locator.
Additionally, presence_of_all_elements_located doesn't wait for ALL the elements; it waits for the presence of at least one element.
Also, presence_of_element_located only waits for the element's presence, which may not be enough. It is recommended to use visibility_of_element_located instead.
I think the following code will work better:
links = []
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//div[contains(@class,'swiper-slide')]")))
time.sleep(0.5)
all_images = driver.find_elements_by_xpath("//div[contains(@class,'swiper-slide')]")
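From there, the background-image URL can be pulled out of each slide's style attribute and the list deduplicated before downloading, which avoids the repeated saves. A sketch under the same assumptions as the question (the style attribute contains url(...), and title already holds the page title):

import requests

for image in all_images:
    style = image.get_attribute('style')
    if 'url(' in style:
        links.append(style.split('(')[1].split(')')[0].replace('"', ''))

# Drop duplicate URLs while keeping their original order.
unique_links = list(dict.fromkeys(links))

for index, link in enumerate(unique_links, start=1):
    with open(title.replace(' ', '-') + str(index) + '.jpg', 'wb') as file:
        file.write(requests.get(link).content)
        print('Saving image.. ', title + str(index))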

Scraping an updating JavaScript page in Python

I've been working on a research project that aims to obtain a list of reference articles from the Brazil Hemeroteca (the desired page reference, http://memoria.bn.br/DocReader/720887x/839, needs to be collected from two hidden elements on the following page: http://memoria.bn.br/DocReader/docreader.aspx?bib=720887x&pasta=ano%20189&pesq=Milho). I asked a question a few weeks back that was answered, and I was able to get things running well in that regard, but now I've hit a new snag and I'm not exactly sure how to get around it.
The problem is that after the first form is filled in, the page redirects to a second, JavaScript/AJAX-enabled page, on which I need to spool through all of the matches by clicking a button at the top of the page. The problem I'm encountering is that when clicking the next-page button I'm dealing with elements on the page that are updating, which leads to stale elements. I've tried to implement a few pieces of code to detect when this "stale" effect occurs, to indicate that the page has changed, but this has not provided much luck. Here is the code I've implemented:
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time

saveDir = "C:/tmp"
print("Opening Page...")
browser = webdriver.Chrome()
url = "http://bndigital.bn.gov.br/hemeroteca-digital/"
browser.get(url)
print("Searching for elements")
fLink = ""
fails = 0
frame_ref = browser.find_elements_by_tag_name("iframe")[0]
iframe = browser.switch_to.frame(frame_ref)
journal = browser.find_element_by_id("PeriodicoCmb1_Input")
search_journal = "Relatorios dos Presidentes dos Estados Brasileiros (BA)"
search_timeRange = "1890 - 1899"
search_text = "Milho"
xpath_form = "//input[@name=\'PesquisarBtn1\']"
xpath_journal = "//li[text()=\'" + search_journal + "\']"
xpath_timeRange = "//input[@name=\'PeriodoCmb1\' and not(@disabled)]"
xpath_timeSelect = "//li[text()=\'" + search_timeRange + "\']"
xpath_searchTerm = "//input[@name=\'PesquisaTxt1\']"
print("Locating Journal/Periodical")
journal.click()
dropDownJournal = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.XPATH, xpath_journal)))
dropDownJournal.click()
print("Waiting for Time Selection")
try:
timeRange = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_timeRange)))
timeRange.click()
time.sleep(1)
print("Locating Time Range")
dropDownTime = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_timeSelect)))
dropDownTime.click()
time.sleep(1)
except:
print("Failed...")
print("Adding Search Term")
searchTerm = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_searchTerm)))
searchTerm.clear()
searchTerm.send_keys(search_text)
time.sleep(5)
print("Perform search")
submitButton = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_form)))
submitButton.click()
# Wait for the second page to load, pull what we need from it.
download_list = []
browser.switch_to_window(browser.window_handles[-1])
print("Waiting for next page to load...")
matches = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//span[#id=\'OcorNroLbl\']")))
print("Next page ready, found match element... counting")
countText = matches.text
countTotal = int(countText[countText.find("/")+1:])
print("A total of " + str(countTotal) + " matches have been found, standing by for page load.")
for i in range(1, countTotal+2):
    print("Waiting for page " + str(i-1) + " to load...")
    while(fLink in download_list):
        try:
            jIDElement = browser.find_element_by_xpath("//input[@name=\'HiddenBibAlias\']")
            jPageElement = browser.find_element_by_xpath("//input[@name=\'hPagFis\']")
            fLink = "http://memoria.bn.br/DocReader/" + jIDElement.get_attribute('value') + "/" + jPageElement.get_attribute('value') + "&pesq=" + search_text
        except:
            fails += 1
            time.sleep(1)
            if(fails == 10):
                print("Locked on a page, attempting to push to next.")
                nextPageButton = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
                nextPageButton.click()
                #raise
    while(fLink == ""):
        jIDElement = browser.find_element_by_xpath("//input[@name=\'HiddenBibAlias\']")
        jPageElement = browser.find_element_by_xpath("//input[@name=\'hPagFis\']")
        fLink = "http://memoria.bn.br/DocReader/" + jIDElement.get_attribute('value') + "/" + jPageElement.get_attribute('value') + "&pesq=" + search_text
    fails = 0
    print("Link obtained: " + fLink)
    download_list.append(fLink)
    if(i != countTotal):
        print("Moving to next page...")
        nextPageButton = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
        nextPageButton.click()
There are two "bugs" I'm trying to solve with this block. First, the very first page is always skipped in the loop (IE: fLink = ""), even though there is a test in there for it, I'm not sure why this occurs. The other bug is that the code will hang on specific pages completely randomly and the only way out is to break the code execution.
This block has been modified a few times so I know it's not the most "elegant" of solutions, but I'm starting to run out of time.
After taking a day off from this to think about it (And get some more sleep), I was able to figure out what was going on. The above code has three "big faults". This first is that it does not handle the StaleElementException versus the NoSuchElementException, which can occur while the page is shifting. Secondly, the loop condition was checking iteratively that a page wasn't in the list, which when entering the first run allowed the blank condition to load in directly as the loop was never executed on the first run (Should have used a do-while there, but I made more modifications). Finally, I made the silly error of only checking if the first hidden element was changing, when in fact that is the journal ID, and is pretty much constant through all.
The revisions began with an adaptation of a code on this other SO article to implement a "hold" condition until either one of the hidden elements changed:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException

def hold_until_element_changed(driver, element1_xpath, element2_xpath, old_element1_text, old_element2_text):
    while True:
        try:
            element1 = driver.find_element_by_xpath(element1_xpath)
            element2 = driver.find_element_by_xpath(element2_xpath)
            if (element1.get_attribute('value') != old_element1_text) or (element2.get_attribute('value') != old_element2_text):
                break
        except StaleElementReferenceException:
            break
        except NoSuchElementException:
            return False
        time.sleep(1)
    return True
I then modified the original looping condition, going back to the original "for loop" counter I had created, without an internal loop, and instead calling the above function to create the "hold" until the page had flipped, and voila, it worked like a charm. (NOTE: I also increased the timeout on the next-page button, as this was what caused the locking condition.)
for i in range(1, countTotal+1):
    print("Waiting for page " + str(i) + " to load...")
    bibxpath = "//input[@name=\'HiddenBibAlias\']"
    pagexpath = "//input[@name=\'hPagFis\']"
    jIDElement = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, bibxpath)))
    jPageElement = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, pagexpath)))
    jidtext = jIDElement.get_attribute('value')
    jpagetext = jPageElement.get_attribute('value')
    fLink = "http://memoria.bn.br/DocReader/" + jidtext + "/" + jpagetext + "&pesq=" + search_text
    print("Link obtained: " + fLink)
    download_list.append(fLink)
    if(i != countTotal):
        print("Moving to next page...")
        nextPageButton = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
        nextPageButton.click()
        # Wait for next page to be ready
        change = hold_until_element_changed(browser, bibxpath, pagexpath, jidtext, jpagetext)
        if(change == False):
            print("Something went wrong.")
All in all, a good exercise in thought and some helpful links for me to consider when posting future questions. Thanks!
