In Selenium, how to find out the exact number of XPath links with different ids? - Python

With Python 3 and Selenium I want to automate a search on a public information site. On this site you enter a person's name, select the spelling chosen for that name (with or without accents, or name variations), reach a page listing the lawsuits found, and from that list you can open the page of each case.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import re
# Name that will be searched
name = 'JOSE ROBERTO ARRUDA'
# Create path, search start link, and empty list to store information
firefoxPath="/home/abraji/Documentos/Code/geckodriver"
link = 'https://ww2.stj.jus.br/processo/pesquisa/?aplicacao=processos.ea'
processos = []
# Call driver and go to first search page
driver = webdriver.Firefox(executable_path=firefoxPath)
driver.get(link)
# Position cursor, fill and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idParteNome'))).click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="idParteNome"]').send_keys(name)
time.sleep(6)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarFormularioExtendido'))).click()
# Mark all spelling possibilities for searching
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoMarcarTodos'))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarMarcados'))).click()
time.sleep(1)
# Check how many pages of data there are - to be used in "for range"
capta = driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]').text
print(capta)
paginas = int(re.search(r'\d+', capta).group(0))
paginas = int(paginas) + 1
print(paginas)
# Capture routine
for acumula in range(1, paginas):
    # Fill the field with the page number and press enter
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(acumula)
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(Keys.RETURN)
    time.sleep(2)
    # Captures the number of processes found on the current page - qt
    qt = driver.find_element_by_xpath('//*[@id="idDivBlocoMensagem"]/div/b').text
    qt = int(qt) + 2
    print(qt)
    # Iterate from found number of processes
    for item in range(2, qt):
        # Find the XPATH of each process link - start at number 2
        vez = '//*[@id="idBlocoInternoLinhasProcesso"]/div[' + str(item) + ']/span[1]/span[1]/span[1]/span[2]/a'
        print(vez)
        # Access the direct link and click
        element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, vez)))
        element.click()
        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"
        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu
                      }
        processos.append(dicionario)
        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")
# Close driver
driver.quit()
In this case I captured the number of link pages (3) and the total number of links (84). So my initial idea was to run the "for" loop three times and split the 84 links across them.
The direct address of each link is in an XPath (//*[@id="idBlocoInternoLinhasProcesso"]/div[41]/span[1]/span[1]/span[1]/span[2]/a) where I substitute the div index with "item" before clicking.
For example, when it reaches number 42 I get an error, because the first page only goes up to 41.
My problem is how to go to the second page and then restart only the inner "for" loop.
I think the ideal would be to know the exact number of links on each of the three pages.
Anyone have any ideas?

The code below replaces your "Capture routine":
wait = WebDriverWait(driver, 20)
# ...
while True:
    links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[contains(@class,'classSpanNumeroRegistro')]")))
    print("links len", len(links))
    for i in range(1, len(links) + 1):
        # Access the direct link and click
        wait.until(EC.element_to_be_clickable((By.XPATH, f"(//span[contains(@class,'classSpanNumeroRegistro')])[{i}]//a"))).click()
        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"
        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu
                      }
        processos.append(dicionario)
        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "classSpanPaginacaoImagensDireita")))
    next_page = driver.find_elements_by_css_selector(".classSpanPaginacaoProximaPagina")
    if len(next_page) == 0:
        break
    next_page[0].click()

You can try running the loop for as long as the next-page button is present on the screen. The logic will look like this:
try:
    next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
    if next_page.is_displayed():
        next_page.click()
except NoSuchElementException:
    print('next page does not exist')
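For instance, a minimal sketch of that idea (assuming the driver is already on the first results page, and using a hypothetical scrape_current_page helper that holds the per-page scraping code shown above) could be:
from selenium.common.exceptions import NoSuchElementException

while True:
    scrape_current_page()  # hypothetical helper: click each process link on this page and collect its data
    try:
        next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
        if next_page.is_displayed():
            next_page.click()
        else:
            break
    except NoSuchElementException:
        print('next page does not exist')
        break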


How to extract all the google reviews from google map

I need to scrape all the Google reviews. There are 90,564 reviews on my page. However, the code I wrote can only scrape the top 9 reviews. The other reviews are not scraped.
The code is given below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# specify the url of the business page on Google
url = 'https://www.google.com/maps/place/ISKCON+temple+Bangalore/@13.0098328,77.5510964,15z/data=!4m7!3m6!1s0x0:0x7a7fb24a41a6b2b3!8m2!3d13.0098328!4d77.5510964!9m1!1b1'
# create an instance of the Chrome driver
driver = webdriver.Chrome()
# navigate to the specified url
driver.get(url)
# Wait for the reviews to load
wait = WebDriverWait(driver, 20) # increased the waiting time
review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'wiI7pd')))
# extract the text of each review
reviews = [element.text for element in review_elements]
# print the reviews
print(reviews)
# close the browser
driver.quit()
What should I edit/modify in the code to extract all the reviews?
Here is the working code for you, after launching the URL:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time

totalRev = "div div.fontBodySmall"
username = ".d4r55"
reviews = "wiI7pd"
wait = WebDriverWait(driver, 20)
totalRevCount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).get_attribute("textContent").split(' ')[0].replace(',','').replace('.','')
print("totalRevCount - ", totalRevCount)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).click()
mydict = {}
found = 0
while found < int(totalRevCount):
    review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, reviews)))
    reviewer_names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, username)))
    found = len(mydict)
    for rev, name in zip(review_elements, reviewer_names):
        mydict[name.text] = rev.text
        if len(rev.text) == 0:
            found = int(totalRevCount) + 1
            break
    for i in range(8):
        ActionChains(driver).key_down(Keys.ARROW_DOWN).perform()
    print("found - ", found)
    print(mydict)
    time.sleep(2)
Explanation -
Get the locators for the user name and the review, since we are going to create a key-value pair, which helps produce a duplicate-free result.
You first need to get the total number of reviews/ratings present for that given location.
Get the username and review for the "visible" part of the webpage and store them in the dictionary.
Scroll down the page and wait a few seconds.
Get the username and review again and add them to the dictionary; only new ones will be added.
As soon as a review with no text (only a rating) is reached, the loop closes and you have your results.
NOTE - If you want all reviews irrespective of whether review text is present or not, you can remove the "if" block.
I think you'll need to scroll down first, and then get all the reviews.
scroll_value = 230
driver.execute_script( 'window.scrollBy( 0, '+str(scroll_value)+ ' )' ) # to scroll by value
# to get the current scroll value on the y axis
scroll_Y = driver.execute_script( 'return window.scrollY' )
That might be because the elements don't get loaded otherwise.
Since there are over 90,000, you might consider scrolling down a little, then getting the reviews, and repeating.
Resource: https://stackoverflow.com/a/74508235/20443541
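Putting those hints together, a rough sketch of such a scroll-then-collect loop (assuming the wiI7pd review class from the question and a hypothetical max_rounds cap, since the real stop condition depends on the total review count) might look like:
import time

collected = {}
max_rounds = 50  # hypothetical safety cap so the sketch cannot loop forever
for _ in range(max_rounds):
    # grab whatever reviews are currently rendered
    for el in driver.find_elements_by_class_name('wiI7pd'):
        collected[el.id] = el.text  # keyed by the element's internal id to skip duplicates
    # scroll a little, as suggested above, so more reviews get loaded
    driver.execute_script('window.scrollBy(0, 230)')
    time.sleep(2)
print(len(collected), "reviews collected")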

Using selenium click() to change pages but getting an error

I'm trying to click on a div to get to the next page of a table (the URL does not change when the page changes). The go-to-next-page div has the same class as the go-to-previous-page one.
I've used:
elem = driver.find_element_by_class_name('cXQSjq')
elem.click()
timeout = 30
try:
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
except TimeoutException:
    driver.quit()
names = driver.find_elements_by_class_name('iBSZGH')
for company in names[1:]:
    name.append(company.text)
mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
for most in mostdata:
    most = most.text
    most = most.replace(',','')
    data.append(most)
last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
for last in last7dsales:
    last = last.text
    last = last.replace(',','')
    last7day.append(last)
#loop for the other pages:
for i in range(6):
    elem.click()
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
    except TimeoutException:
        driver.quit()
    names = driver.find_elements_by_class_name('iBSZGH')
    for company in names[1:]:
        name.append(company.text)
    mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
    for most in mostdata:
        most = most.text.replace(',','')
        data.append(most)
    last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
    for last in last7dsales:
        last = last.text.replace(',','')
        last7day.append(last)
and it worked to get me to page 2, but after page 2 it gives me the error:
selenium.common.exceptions.ElementClickInterceptedException: Message: element click
intercepted: Element <div class="styles__Chevron-sc-1buchb9-1 cXQSjq">...</div> is
not clickable at point (702, 656). Other element would receive the click: <div
id="hs-eu-cookie-confirmation-inner">...</div>
(Session info: chrome=92.0.4515.107)
Do you know if there is an issue with calling elem.click() after using Selenium to find other parts of the page? I'm scraping data from
https://nonfungible.com/market/history
I guess the issue here is that the next-page button appears at the bottom of the page, so to click on it you should first scroll this element into view.
See if this will work now:
from selenium.webdriver.common.action_chains import ActionChains
import time

actions = ActionChains(driver)
elem = driver.find_element_by_class_name('cXQSjq')
actions.move_to_element(elem).perform()
time.sleep(0.3)
elem.click()
timeout = 30
try:
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
except TimeoutException:
    driver.quit()
names = driver.find_elements_by_class_name('iBSZGH')
for company in names[1:]:
    name.append(company.text)
mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
for most in mostdata:
    most = most.text
    most = most.replace(',','')
    data.append(most)
last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
for last in last7dsales:
    last = last.text
    last = last.replace(',','')
    last7day.append(last)
#loop for the other pages:
for i in range(6):
    actions.move_to_element(elem).perform()
    time.sleep(0.3)
    elem.click()
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
    except TimeoutException:
        driver.quit()
    names = driver.find_elements_by_class_name('iBSZGH')
    for company in names[1:]:
        name.append(company.text)
    mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
    for most in mostdata:
        most = most.text.replace(',','')
        data.append(most)
    last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
    for last in last7dsales:
        last = last.text.replace(',','')
        last7day.append(last)
I didn't check the correctness of the rest of your code, but clicking the next button should work now.
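If the click is still intercepted, an alternative to the ActionChains move (just a sketch, not part of the original answer) is to scroll the element into view with JavaScript before clicking:
elem = driver.find_element_by_class_name('cXQSjq')
# scroll the "next page" chevron to the middle of the viewport, then click it
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
time.sleep(0.3)
elem.click()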

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way to instruct a Selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
and keeps clicking on the show-more button until there is none left (at the end of the page), by which point it should have collected the links of all the products listed on the page it scrolled through, and then visit each one in turn.
What happens instead is that it successfully clicks on show more until the end of the page, but then visits a weird promotion page of the same website instead of following each of the gathered links and scraping further data points from each of those newly opened pages.
In a nutshell, I would incredibly appreciate it if someone could explain how to avoid this automated redirection! And here is the code, in case someone can kindly nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95@gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(2)
    except exception.StaleElementReferenceException:
        print('stale element')
        break
print('no more elements to show')
try:
    company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
    for link in company_links:
        links_list.append(link.get_attribute('href'))
except:
    pass
try:
    with open("links_list.json", "w") as f:
        json.dump(links_list, f)
    with open("links_list.json", "r") as f:
        links_list = json.load(f)
except:
    pass
try:
    for link in links_list:
        driver.get(link)
        name = driver.find_element_by_xpath("//div[@class='title']/h1").text
        try:
            show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
            show_more_coins.click()
            time.sleep(1)
        except:
            pass
        try:
            categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
            categories_list = []
            for category in categories:
                categories_list.append(category.text)
        except:
            pass
        try:
            top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
            top_page_categories_list = []
            for category in top_page_categories:
                top_page_categories_list.append(category.text)
        except:
            pass
        coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
        all_coins = []
        for coin in coins_links:
            all_coins.append(coin.get_attribute('href'))
        try:
            location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
        except:
            pass
        try:
            twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
        except:
            pass
        try:
            print('-----------')
            print('Company name is: {}'.format(name))
            print('Potential Categories are: {}'.format(categories_list))
            print('Potential top page categories are: {}'.format(top_page_categories_list))
            print('Supporting Crypto is:{}'.format(all_coins))
            print('Registered location is: {}'.format(location))
            print('Company twitter profile is: {}'.format(twitter))
            time.sleep(1)
        except:
            pass
        all_names.append(name)
        all_categories.append(categories_list)
        all_categories2.append(top_page_categories_list)
        all_cryptos.append(all_coins)
        all_twitter.append(twitter)
        all_locations.append(location)
except:
    pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirects happen for two reasons: in your case, either some JavaScript code is executed when you click the load-more button for the last time, or an HTTP 3xx code is received, which is the least likely in your case.
So you need to identify when this JavaScript code is executed, send an ESC key before it loads, and then execute the rest of your script.
You could also scrape the links and append them to your list before clicking the load-more button, and each time it is clicked, use an if statement to verify the link of the page you're on: if it is that of the promotion page, execute the rest of your code; otherwise, click load more.
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    verify_current_page_link()
    if current_link_is_same != link_of_scraped_page:
        page_is_same = False
# rest of the code here
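A more concrete sketch of that second idea, using driver.current_url and only the selectors already present in the question (the structure here is illustrative, not a drop-in replacement):
links_list = []
while driver.current_url == url:  # url is the companies listing page opened earlier
    # collect the company links that are currently visible
    for link in driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a"):
        href = link.get_attribute('href')
        if href not in links_list:
            links_list.append(href)
    # click "load more"; if it is gone, stop instead of letting the site redirect us
    try:
        driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']").click()
        time.sleep(2)
    except NoSuchElementException:
        break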

How to scroll down in an instagram pop-up frame with Selenium

I have a python script using selenium to go to a given Instagram profile and iterate over the user's followers. On the instagram website when one clicks to see the list of followers, a pop-up opens with the accounts listed (here's a screenshot of the site)
However both visually and in the html, only 12 accounts are shown. In order to see more one has to scroll down, so I tried doing this with the Keys.PAGE_DOWN input.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
...
username = 'Username'
password = 'Password'
message = 'blahblah'
tryTime = 2
#create driver and log in
driver = webdriver.Chrome()
logIn(driver, username, password, tryTime)
#gets rid of preference pop-up
a = driver.find_elements_by_class_name("HoLwm")
a[0].click()
#go to profile
driver.get("https://www.instagram.com/{}/".format(username))
#go to followers list
followers = driver.find_element_by_xpath("//a[@href='/{}/followers/']".format(username))
followers.click()
time.sleep(tryTime)
#find all li elements in list
fBody = driver.find_element_by_xpath("//div[@role='dialog']")
fBody.send_keys(Keys.PAGE_DOWN)
fList = fBody.find_elements_by_tag_name("li")
print("fList len is {}".format(len(fList)))
time.sleep(tryTime)
print("ended")
driver.quit()
When I try to run this I get the following error:
Message: unknown error: cannot focus element
I know this is probably because I'm using the wrong element for fBody, but I don't know which would be the right one. Does anybody know which element I should send the PAGE_DOWN key to, or if there is another way to load the accounts?
Any help is much appreciated!
The element you're looking for is //div[@class='isgrP'], and Keys.PAGE_DOWN does not work for a scrollable div.
Your variable fList holds the old value; you need to find the elements again after the scroll.
#find all li elements in list
fBody = driver.find_element_by_xpath("//div[@class='isgrP']")
scroll = 0
while scroll < 5:  # scroll 5 times
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', fBody)
    time.sleep(tryTime)
    scroll += 1
fList = driver.find_elements_by_xpath("//div[@class='isgrP']//li")
print("fList len is {}".format(len(fList)))
print("ended")
#driver.quit()
The above code works fine if you add an iteration (a for loop) with range:
for i in range(1, 4):
    try:
        #find all li elements in list
        fBody = self.driver.find_element_by_xpath("//div[@class='isgrP']")
        scroll = 0
        while scroll < 5:  # scroll 5 times
            self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', fBody)
            time.sleep(2)
            scroll += 1
        fList = self.driver.find_elements_by_xpath("//div[@class='isgrP']//li")
        print("fList len is {}".format(len(fList)))
    except Exception as e:
        print(e, "cannot scroll")
    try:
        #get tags with a
        hrefs_in_view = self.driver.find_elements_by_tag_name('a')
        # finding relevant hrefs
        hrefs_in_view = [elem.get_attribute('title') for elem in hrefs_in_view]
        [pic_hrefs.append(title) for title in hrefs_in_view if title not in pic_hrefs]
        print("Check: pic href length " + str(len(pic_hrefs)))
    except Exception as tag:
        print(tag, "can not find tag")
So the for loop makes it possible to keep scrolling even if the while loop misses.
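If you don't want to hard-code the number of scrolls, a small variation (a sketch, assuming the same isgrP dialog and the tryTime delay from the question) is to keep scrolling until the count of li elements stops growing:
fBody = driver.find_element_by_xpath("//div[@class='isgrP']")
prev_count = -1
while True:
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', fBody)
    time.sleep(tryTime)
    fList = driver.find_elements_by_xpath("//div[@class='isgrP']//li")
    if len(fList) == prev_count:  # nothing new was loaded, assume we reached the end
        break
    prev_count = len(fList)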

Scraping an updating JavaScript page in Python

I've been working on a research project that is looking to obtain a list of reference articles from the Brazil Hemeroteca (The desired page reference: http://memoria.bn.br/DocReader/720887x/839, needs to be collected from two hidden elements on the following page: http://memoria.bn.br/DocReader/docreader.aspx?bib=720887x&pasta=ano%20189&pesq=Milho). I asked a question a few weeks back that was answered and I was able to get things running well in regards to that, but now I've hit a new snag and I'm not exactly sure how to get around it.
The problem is that after the first form is filled in, the page redirects to a second, JavaScript/AJAX-enabled page, through which I need to spool through all of the matches by clicking a button at the top of the page. The problem I'm encountering is that when clicking the next-page button I'm dealing with elements on the page that are updating, which leads to stale elements. I've tried to implement a few pieces of code to detect when this "stale" effect occurs, to indicate the page has changed, but this has not provided much luck. Here is the code I've implemented:
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
saveDir = "C:/tmp"
print("Opening Page...")
browser = webdriver.Chrome()
url = "http://bndigital.bn.gov.br/hemeroteca-digital/"
browser.get(url)
print("Searching for elements")
fLink = ""
fails = 0
frame_ref = browser.find_elements_by_tag_name("iframe")[0]
iframe = browser.switch_to.frame(frame_ref)
journal = browser.find_element_by_id("PeriodicoCmb1_Input")
search_journal = "Relatorios dos Presidentes dos Estados Brasileiros (BA)"
search_timeRange = "1890 - 1899"
search_text = "Milho"
xpath_form = "//input[@name=\'PesquisarBtn1\']"
xpath_journal = "//li[text()=\'"+search_journal+"\']"
xpath_timeRange = "//input[@name=\'PeriodoCmb1\' and not(@disabled)]"
xpath_timeSelect = "//li[text()=\'"+search_timeRange+"\']"
xpath_searchTerm = "//input[@name=\'PesquisaTxt1\']"
print("Locating Journal/Periodical")
journal.click()
dropDownJournal = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.XPATH, xpath_journal)))
dropDownJournal.click()
print("Waiting for Time Selection")
try:
    timeRange = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_timeRange)))
    timeRange.click()
    time.sleep(1)
    print("Locating Time Range")
    dropDownTime = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_timeSelect)))
    dropDownTime.click()
    time.sleep(1)
except:
    print("Failed...")
print("Adding Search Term")
searchTerm = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_searchTerm)))
searchTerm.clear()
searchTerm.send_keys(search_text)
time.sleep(5)
print("Perform search")
submitButton = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, xpath_form)))
submitButton.click()
# Wait for the second page to load, pull what we need from it.
download_list = []
browser.switch_to_window(browser.window_handles[-1])
print("Waiting for next page to load...")
matches = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//span[@id=\'OcorNroLbl\']")))
print("Next page ready, found match element... counting")
countText = matches.text
countTotal = int(countText[countText.find("/")+1:])
print("A total of " + str(countTotal) + " matches have been found, standing by for page load.")
for i in range(1, countTotal+2):
    print("Waiting for page " + str(i-1) + " to load...")
    while(fLink in download_list):
        try:
            jIDElement = browser.find_element_by_xpath("//input[@name=\'HiddenBibAlias\']")
            jPageElement = browser.find_element_by_xpath("//input[@name=\'hPagFis\']")
            fLink = "http://memoria.bn.br/DocReader/" + jIDElement.get_attribute('value') + "/" + jPageElement.get_attribute('value') + "&pesq=" + search_text
        except:
            fails += 1
            time.sleep(1)
            if(fails == 10):
                print("Locked on a page, attempting to push to next.")
                nextPageButton = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
                nextPageButton.click()
                #raise
    while(fLink == ""):
        jIDElement = browser.find_element_by_xpath("//input[@name=\'HiddenBibAlias\']")
        jPageElement = browser.find_element_by_xpath("//input[@name=\'hPagFis\']")
        fLink = "http://memoria.bn.br/DocReader/" + jIDElement.get_attribute('value') + "/" + jPageElement.get_attribute('value') + "&pesq=" + search_text
    fails = 0
    print("Link obtained: " + fLink)
    download_list.append(fLink)
    if(i != countTotal):
        print("Moving to next page...")
        nextPageButton = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
        nextPageButton.click()
There are two "bugs" I'm trying to solve with this block. First, the very first page is always skipped in the loop (i.e. fLink = ""), even though there is a test in there for it; I'm not sure why this occurs. The other bug is that the code will hang on specific pages completely at random, and the only way out is to break the code execution.
This block has been modified a few times so I know it's not the most "elegant" of solutions, but I'm starting to run out of time.
After taking a day off from this to think about it (and get some more sleep), I was able to figure out what was going on. The above code has three big faults. The first is that it does not handle the StaleElementReferenceException versus the NoSuchElementException, which can occur while the page is shifting. Secondly, the loop condition was checking iteratively that a page wasn't in the list, which, on the first run, allowed the blank condition to load in directly, as the loop was never executed on that run (I should have used a do-while there, but I made more modifications). Finally, I made the silly error of only checking whether the first hidden element was changing, when in fact that is the journal ID, which is essentially constant throughout.
The revisions began with an adaptation of a code on this other SO article to implement a "hold" condition until either one of the hidden elements changed:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException

def hold_until_element_changed(driver, element1_xpath, element2_xpath, old_element1_text, old_element2_text):
    while True:
        try:
            element1 = driver.find_element_by_xpath(element1_xpath)
            element2 = driver.find_element_by_xpath(element2_xpath)
            if (element1.get_attribute('value') != old_element1_text) or (element2.get_attribute('value') != old_element2_text):
                break
        except StaleElementReferenceException:
            break
        except NoSuchElementException:
            return False
        time.sleep(1)
    return True
I then modified the original looping condition, going back to the original "for loop" counter I had created without an internal loop, instead shooting a call to the above function to create the "hold" until the page had flipped, and voila, worked like a charm. (NOTE: I also upped the timeout on the next page button as this is what caused the locking condition)
for i in range(1, countTotal+1):
    print("Waiting for page " + str(i) + " to load...")
    bibxpath = "//input[@name=\'HiddenBibAlias\']"
    pagexpath = "//input[@name=\'hPagFis\']"
    jIDElement = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, bibxpath)))
    jPageElement = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, pagexpath)))
    jidtext = jIDElement.get_attribute('value')
    jpagetext = jPageElement.get_attribute('value')
    fLink = "http://memoria.bn.br/DocReader/" + jidtext + "/" + jpagetext + "&pesq=" + search_text
    print("Link obtained: " + fLink)
    download_list.append(fLink)
    if(i != countTotal):
        print("Moving to next page...")
        nextPageButton = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//input[@id=\'OcorPosBtn\']")))
        nextPageButton.click()
        # Wait for next page to be ready
        change = hold_until_element_changed(browser, bibxpath, pagexpath, jidtext, jpagetext)
        if(change == False):
            print("Something went wrong.")
All in all, a good exercise in thought and some helpful links for me to consider when posting future questions. Thanks!
