I'm trying to loop over a list of web elements matching a div locator. The first iteration goes well, but the second one throws a StaleElementReferenceException. Here is a minimal example of my code:
for div in driver.find_elements_by_xpath("//div[@class='class_name']"):
    print(div.text)
    print(f"Current url 1: {driver.current_url}")  # url
    new_url = url + "/page/"
    time.sleep(2)
    driver.get(new_url)
    print(f"Current url 2: {driver.current_url}")  # new_url
    time.sleep(2)
    # Then get info from the new url
    # Go back
    # driver.execute_script("window.history.go(-1)")
    driver.back()
    print(f"Current url 3: {driver.current_url}")  # url
    print("Sleeping for 3 seconds from now...")
    time.sleep(3)
Thank you!
You are getting a StaleElementReferenceException because the reference to the web element you are trying to use is no longer valid, i.e. stale.
See any resource about the Stale Element Reference Exception.
Since you navigated to another page, all the web elements you located on the initial page became stale, even though you navigate back to it.
To overcome this problem you have to locate those elements again.
So instead of your current code I'd suggest something like the following:
divs = driver.find_elements_by_xpath("//div[@class='class_name']")
for i in range(len(divs)):
    # re-locate the elements on every iteration so the references are fresh
    divs = driver.find_elements_by_xpath("//div[@class='class_name']")
    div = divs[i]
    print(div.text)
    print(f"Current url 1: {driver.current_url}")  # url
    new_url = url + "/page/"
    time.sleep(2)
    driver.get(new_url)
    print(f"Current url 2: {driver.current_url}")  # new_url
    time.sleep(2)
    # Then get info from the new url
    # Go back
    # driver.execute_script("window.history.go(-1)")
    driver.back()
    print(f"Current url 3: {driver.current_url}")  # url
    print("Sleeping for 3 seconds from now...")
    time.sleep(3)
Alternatively, you can locate the specific div inside the loop, as follows:
divs = driver.find_elements_by_xpath("//div[@class='class_name']")
for i in range(len(divs)):
    # XPath indexing is 1-based, hence the i + 1
    div = driver.find_element_by_xpath("(//div[@class='class_name'])[" + str(i + 1) + "]")
    print(div.text)
    print(f"Current url 1: {driver.current_url}")  # url
    new_url = url + "/page/"
    time.sleep(2)
    driver.get(new_url)
    print(f"Current url 2: {driver.current_url}")  # new_url
    time.sleep(2)
    # Then get info from the new url
    # Go back
    # driver.execute_script("window.history.go(-1)")
    driver.back()
    print(f"Current url 3: {driver.current_url}")  # url
    print("Sleeping for 3 seconds from now...")
    time.sleep(3)
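If the div text is all you actually need, a third option is to collect it up front, before any navigation, so staleness never comes into play. A minimal sketch, assuming no further interaction with the elements themselves is required:
# plain strings can't go stale, so it is safe to navigate after collecting them
texts = [div.text for div in driver.find_elements_by_xpath("//div[@class='class_name']")]
for text in texts:
    print(text)
    driver.get(url + "/page/")  # navigate away
    # Then get info from the new url
    driver.back()  # the strings in texts are unaffected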
I scrape data from this url, then click the Next button and wait 10 seconds before using requests and bs4 to scrape the next page, but the url doesn't change, so I just end up scraping the original page's data twice. I've tried WebDriverWait until elements on the first page become stale, as well as trying to use requests to hit the XHR log API call directly (I am not well-versed in AJAX, however), and can't find a solution. Here is the code as it stands:
loop = True
while loop:
    try:
        current_url = driver.current_url
        next_btn = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Next"]')))
        actions = ActionChains(driver)
        actions.move_to_element(next_btn).perform()
        if next_btn:
            next_btn.click()
    except Exception as e:
        current_url = driver.current_url
        loop = False
        print(e, f"somewhere in {current_url} while loop")
    else:
        time.sleep(10)
        next_page = driver.current_url
        get_page_content(next_page)
        break
Here is the URL of the first page: https://www.hunterdouglas.com/locator/results?address=San%20Ramon&country=US&source=
Any direction would be appreciated! Thank you!
For anyone who is interested, I got this to work using just selenium. Here is the code; the argument data is just the name of the city I'm submitting to master_function(data):
def get_links(page):
    for p in page:
        for l in p.find_elements_by_tag_name("a"):
            link = l.get_attribute('href')
            if link != None:
                # an href value may contain several newline-separated links
                for part in link.split('\n'):
                    if "http" in part:
                        test_list.append(part)

def master_function(data):
    for d in data:
        base_url = "https://www.hunterdouglas.com/locator"
        driver.get(base_url)
        url = pop_up_one(driver)
        submit(url, driver, d)
        loop = True
        while loop:
            try:
                current_url = driver.current_url
                next_btn = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Next"]')))
                actions = ActionChains(driver)
                actions.move_to_element(next_btn).perform()
                if next_btn:
                    next_btn.click()
            except Exception as e:
                current_url = driver.current_url
                loop = False
                print(e, f"somewhere in {current_url} while loop")
            else:
                time.sleep(1)
                page = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@id="loc-results"]')))
                get_links(page)
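As a side note, the fixed sleeps could be replaced by waiting for the old results container to go stale after clicking Next. A sketch, assuming the #loc-results container is re-rendered on every page change:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

old_results = driver.find_element_by_id("loc-results")
next_btn.click()
# blocks until the old container is detached from the DOM,
# i.e. the next page of results has actually rendered
WebDriverWait(driver, 10).until(EC.staleness_of(old_results))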
(Image: the CSS selector and XPath for the pagination element.)
I also wanted to use a regex to separate a product name into parts like Apple, iPhone 12, Neo Galactic Silver, printing each part on a new line.
After finishing the product list on the current page, I want to be able to click Next and perform the same procedure with the products on the next page.
This is the problem: when it reaches the 10 items of the current page, I have no idea how to move to the next page and start all over again.
import xlwt
from selenium import webdriver
import re
import time

class cometmobiles:
    def __init__(self):
        self.url = 'https://www.mediaworld.it/catalogo/telefonia/smartphone-e-cellulari/smartphone'

    def comet(self):
        try:
            driver = webdriver.Chrome()
            driver.get(self.url)
            time.sleep(5)
            cookies = driver.find_element_by_id("onetrust-accept-btn-handler")
            cookies.click()
            print("accepted cookies")
            driver.maximize_window()
            print("window maximized")
            mylist = []
            hasNextPage = True
            while hasNextPage:
                containers = driver.find_elements_by_css_selector('article[class="product clearfix p-list-js"]')
                for container in containers:
                    # Title
                    try:
                        title = container.find_element_by_css_selector('h3[class="product-name"]').text
                        print(title)
                    except:
                        pass
                    # Price
                    try:
                        price = container.find_element_by_css_selector('span[class="price mw-price enhanced"]').text
                        print(price)
                    except:
                        pass
                try:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(5)
                    # note: find_elements returns a list, so nxt.click() below raises
                    # and the bare except breaks out of the loop
                    nxt = driver.find_elements_by_css_selector('span[class="pages"] a')
                    time.sleep(5)
                    nxt.click()
                except:
                    break
        except:
            pass

comets = cometmobiles()
comets.comet()
Instead of this part:
try:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    nxt = driver.find_elements_by_css_selector('span[class="pages"] a')
    time.sleep(5)
    nxt.click()
except:
    break
you can use the following. Also, if a requested page number doesn't exist, the website returns the main page, so you can use that to detect the last page:
try:
    x = 0
    while True:
        x += 1
        driver.get(url + "?pageNumber=" + str(x))  # get the next page
        if driver.current_url == url:  # no next page: the site returned the main page
            break
except:
    pass
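Putting the two together, the whole loop might look like the sketch below (the selectors are the ones from your code; whether the site also returns the main page for pageNumber=1 is an assumption you should verify):
x = 0
while True:
    x += 1
    driver.get(url + "?pageNumber=" + str(x))  # request page x directly
    if x > 1 and driver.current_url == url:  # redirected to the main page: there is no page x
        break
    for container in driver.find_elements_by_css_selector('article[class="product clearfix p-list-js"]'):
        try:
            print(container.find_element_by_css_selector('h3[class="product-name"]').text)
            print(container.find_element_by_css_selector('span[class="price mw-price enhanced"]').text)
        except:
            pass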
I am performing a next-page test. I use a loop to capture new data while clicking through to the next page. The click succeeds, but it seems that driver.page_source is not updated after the next-page action. Since it is still within the loop, the code has to run about two more times before the source updates; only on rare occasions does it succeed right away.
It works if I use it like this:
NextPage()
time.sleep(2)
data=driver.page_source
currentpage = GetCurrentPage(data)
I understand that driver.page_source takes time to load completely. However, for a large number of pages, using time.sleep() is time-consuming. I then tried to use WebDriverWait to wait for the image class to load (this website has a lot of images on each page), but it does not help.
page = int(input("Please input page number:"))
if 1 < page < 100:
    data = driver.page_source
    currentpage = GetCurrentPage(data)
    while True:
        if currentpage < page:
            try:
                CaptureData(data, file)
                print(currentpage)
                time.sleep(0.5)
                NextPage()
                # time.sleep(1)
                WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'J_ItemPicA')))
                data = driver.page_source
                currentpage = GetCurrentPage(data)
                print(currentpage)
            except TimeoutException:
                print("Timeout!")
            except Exception as e:
                print("Unexpected error!", e)
                break
        else:
            print('testa')
            CaptureData(data, file)
            break
elif page == 1:
    CaptureData(driver.page_source, file)
Most of the time, the output of the above code looks like this:
Please input page number: 2
1
1
1
2
testa
In case you need the GetCurrentPage code:
def GetCurrentPage(data):
    soup = BeautifulSoup(data, 'lxml')
    comments = soup.find_all("li", class_="item active")
    cp = re.findall(r'\d', comments[0].text)
    currentpage = int(''.join(cp))
    return currentpage
The link is there, but the site is in Chinese.
Any other suggestions please?
Thanks a million.
Solved the problem myself.
Use WebDriverWait: get the current page number, then wait until the pager shows the former page number plus 1. (Waiting for the image class probably didn't help because that class is already present in the old DOM, so the wait returns immediately; the incremented page number only appears once the next page has actually rendered.)
Here is the code:
try:
    CaptureData(data, file)
    print(currentpage)
    time.sleep(0.5)
    NextPage()
    # time.sleep(2)
    element = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active'),
        str(int(currentpage) + 1)))
    if element:
        data = driver.page_source
        currentpage = GetCurrentPage(data)
        print(currentpage)
I have built a script using Selenium that loops through a page, prints the data and goes to the next page and does the same.
Now I am trying to save the data to a CSV file and thus need to create a nested loop; currently I am repeating the loop multiple times (as below).
How do I create a nested loop and then save to the CSV file?
Also, will the script fail when it gets to the last page and there isn't a Next button?
Thanks. This is the code I am using:
from selenium import webdriver
import time

browser = webdriver.Firefox(executable_path="/Users/path/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

meci = browser.find_elements_by_class_name('property_title')
for items in meci:
    title = items.text
    href = items.get_attribute('href')
    print(title)
    print(href)

time.sleep(3)
browser.find_element_by_css_selector('.next').click()
time.sleep(3)

meci = browser.find_elements_by_class_name('property_title')
for items in meci:
    title = items.text
    href = items.get_attribute('href')
    print(title)
    print(href)

time.sleep(3)
browser.find_element_by_css_selector('.next').click()
time.sleep(3)

meci = browser.find_elements_by_class_name('property_title')
for items in meci:
    title = items.text
    href = items.get_attribute('href')
    print(title)
    print(href)

browser.quit()
I have used try/except so the program will exit when there isn't a Next button.
Instead of printing, you can write the results to a CSV file (see the sketch after the loop below).
while True:
    try:
        meci = browser.find_elements_by_class_name('property_title')
        for items in meci:
            title = items.text
            href = items.get_attribute('href')
            print(title)
            print(href)
        time.sleep(3)
        browser.find_element_by_css_selector('.next').click()
        time.sleep(3)
    except:
        break
browser.quit()
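A sketch of the CSV version (the file name results.csv and the two columns are assumptions; adjust them as needed):
import csv
import time

with open('results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'href'])  # header row
    while True:
        try:
            for items in browser.find_elements_by_class_name('property_title'):
                writer.writerow([items.text, items.get_attribute('href')])
            time.sleep(3)
            browser.find_element_by_css_selector('.next').click()
            time.sleep(3)
        except:
            break
browser.quit()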
I was doing web scraping for a website with multiple pages inside one web page.
When I click page 2, the url shown is http://www.worldhospitaldirectory.com/Germany/hospitals#page-2.
I used this url as the next navigation location, but it goes directly to http://www.worldhospitaldirectory.com/Germany/hospitals#page-1, which is the default page.
I don't know how to navigate to these sub-pages.
Any suggestions or code?
My code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.get('http://www.worldhospitaldirectory.com/Germany/hospitals')
url = []
pagenbr = 1
while pagenbr <= 43:
    current = driver.current_url
    driver.get(current)
    lks = driver.find_elements_by_xpath('//*[@href]')
    for ii in lks:
        link = ii.get_attribute('href')
        if '/info' in link:
            url.append(link)  # append, not extend: extend would add the string character by character
            print(link)
    print('page ' + str(pagenbr) + ' is done.')
    elm = driver.find_element_by_link_text('Next')
    driver.implicitly_wait(10)
    elm.click()
    pagenbr += 1
Try simply clicking the appropriate pagination button:
driver.find_element_by_link_text('Next')  # to get the next page
or
driver.find_element_by_link_text('2')  # to get the second page
Get the button element:
button_next = driver.find_element_by_xpath("//a[@class='page-link next']")
button_next.click()
I've made the algorithm iterate over all the pages for you.
This worked for me
while pagenbr <= 3:
    current = driver.current_url
    print(current)
    driver.get(current)
    lks = driver.find_elements_by_xpath('//*[@href]')
    for ii in lks:
        link = ii.get_attribute('href')
        if '/info' in link:
            url.append(link)
            print(link)
    print('page ' + str(pagenbr) + ' is done.')
    elm = driver.find_element_by_link_text('Next')
    driver.implicitly_wait(10)
    elm.click()
    driver.implicitly_wait(10)
    lks = driver.find_elements_by_xpath('//*[@href]')
    for ii in lks:
        link = ii.get_attribute('href')
        if '/info' in link:
            url.append(link)
            print(link)
    pagenbr += 1
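A slightly tidier variant of the same idea, as a sketch (assuming 43 pages, as in the question; the fixed sleep is a crude stand-in for a proper explicit wait on the AJAX pagination):
import time

def collect_info_links():
    # gather every href on the current page that points at an /info detail page
    for el in driver.find_elements_by_xpath('//*[@href]'):
        link = el.get_attribute('href')
        if link and '/info' in link:
            url.append(link)
            print(link)

pagenbr = 1
while pagenbr <= 43:
    collect_info_links()
    print('page ' + str(pagenbr) + ' is done.')
    if pagenbr < 43:
        driver.find_element_by_link_text('Next').click()
        time.sleep(2)  # crude wait for the paginated list to update
    pagenbr += 1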