Url doesn't change after hitting the "Next" button with selenium/python - python

I scrape data from this url then click the Next button and wait 10 seconds before using requests and bs4 to scrape the next page but the url doesn't change so I just end up scraping the original page data twice. I've tried WebDriverWait until elements on the first page become stale as well as trying to use requests to get the xhr log api call directly (I am not well-versed in ajax however) and can't find a solution. Here is the code as it stands:
# Click "Next" repeatedly; the loop ends when the button can no longer be
# found/clicked (WebDriverWait raises and the except branch flips the flag).
loop = True
while loop:
    try:
        current_url = driver.current_url
        next_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Next"]')))
        actions = ActionChains(driver)
        actions.move_to_element(next_btn).perform()
        if next_btn:
            next_btn.click()
    except Exception as e:
        current_url = driver.current_url
        loop = False
        print(e, f"somewhere in {current_url} while loop")
    else:
        # Give the page time to update before scraping the "next" page.
        time.sleep(10)
        next_page = driver.current_url
        get_page_content(next_page)
        break
Here is the URL of the first page: https://www.hunterdouglas.com/locator/results?address=San%20Ramon&country=US&source=
Any direction would be appreciated! Thank you!

For anyone who is interested, I got this to work by just using Selenium. Here is the code; the argument `data` is just the name of the city I'm submitting to `master_function(data)`.
def get_links(page):
    """Append every http(s) href found under the given result containers
    to the module-level ``test_list``.

    ``page`` is an iterable of Selenium WebElements (the result divs).
    """
    for p in page:
        for anchor in p.find_elements_by_tag_name("a"):
            link = anchor.get_attribute('href')
            if link is not None:
                # An href may hold several newline-separated URLs; check each
                # piece. (The original tested/appended the unsplit `link`,
                # which made the split loop pointless and duplicated entries.)
                for part in link.split('\n'):
                    if "http" in part:
                        test_list.append(part)
def master_function(data):
    """For each city name in ``data``, run the dealer-locator search and
    paginate through all result pages, harvesting links via get_links().
    """
    for d in data:
        base_url = "https://www.hunterdouglas.com/locator"
        driver.get(base_url)
        url = pop_up_one(driver)
        submit(url, driver, d)
        loop = True
        while loop:
            try:
                current_url = driver.current_url
                next_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[text()="Next"]')))
                actions = ActionChains(driver)
                actions.move_to_element(next_btn).perform()
                if next_btn:
                    next_btn.click()
            except Exception as e:
                # No clickable "Next" button left: we are on the last page.
                current_url = driver.current_url
                loop = False
                print(e, f"somewhere in {current_url} while loop")
            else:
                time.sleep(1)
                # XPath attribute syntax is @id (the scrape mangled it to #id).
                page = WebDriverWait(driver, 5).until(
                    EC.presence_of_all_elements_located((By.XPATH, '//div[@id="loc-results"]')))
                get_links(page)

Related

Selenium - Why NoSuchElementException happens in the second for loop iteration?

I'm trying to loop over a list of web elements matching a div tag. The first loop goes well, but the second one throws a NoSuchElementException. Here is a minimal example of my code:
# XPath attribute syntax is @class (the scrape mangled it to #class).
for div in driver.find_elements_by_xpath("//div[@class='class_name']"):
    print(div.text)
    print(f"Current url 1: {driver.current_url}")  # url
    new_url = url + "/page/"
    time.sleep(2)
    driver.get(new_url)
    print(f"Current url 2: {driver.current_url}")  # new_url
    time.sleep(2)
    # Then get info from the new url
    # Go back
    # driver.execute_script("window.history.go(-1)")
    driver.back()
    print(f"Current url 3: {driver.current_url}")  # url
    print("Sleeping for 3 seconds from now...")
    time.sleep(3)
Thank you!
You are getting StaleElementReferenceException because the reference to a web element you are trying to use is no more valid AKA stale.
See here or on any other resource about the Stale Element Reference Exception.
Since you went to some other web page, even if you get back to the initial web page all the web elements you got there become stale elements.
To overcome this problem you have to get those elements again.
So instead of your current code I'd suggest using something like the following:
divs = driver.find_elements_by_xpath("//div[@class='class_name']")
for i in range(len(divs)):
    # Re-locate after every navigation: the previous references are stale
    # once the browser has left and returned to this page.
    divs = driver.find_elements_by_xpath("//div[@class='class_name']")
    div = divs[i]
    print(div.text)
    print(f"Current url 1: {driver.current_url}")  # url
    new_url = url + "/page/"
    time.sleep(2)
    driver.get(new_url)
    print(f"Current url 2: {driver.current_url}")  # new_url
    time.sleep(2)
    # Then get info from the new url
    # Go back
    # driver.execute_script("window.history.go(-1)")
    driver.back()
    print(f"Current url 3: {driver.current_url}")  # url
    print("Sleeping for 3 seconds from now...")
    time.sleep(3)
You can try to get the specific div inside the loop as following:
divs = driver.find_elements_by_xpath("//div[@class='class_name']")
for i in range(len(divs)):
    # Fetch the i-th div fresh by an indexed XPath. XPath positions are
    # 1-based, so the 0-based loop index needs +1; the original also used
    # the invalid C-style cast `(str)i` instead of str(...).
    div = driver.find_element_by_xpath(
        "(//div[@class='class_name'])[" + str(i + 1) + "]")
    print(div.text)
    print(f"Current url 1: {driver.current_url}")  # url
    new_url = url + "/page/"
    time.sleep(2)
    driver.get(new_url)
    print(f"Current url 2: {driver.current_url}")  # new_url
    time.sleep(2)
    # Then get info from the new url
    # Go back
    # driver.execute_script("window.history.go(-1)")
    driver.back()
    print(f"Current url 3: {driver.current_url}")  # url
    print("Sleeping for 3 seconds from now...")
    time.sleep(3)

How to go to next page until the last page in Python Selenium when scraping website?

Image is for CSS selector and xpath for pagination.
I also wanted to apply a regex to separate the parts of each product name (e.g. "Apple", "iPhone 12", "Neo Galactic Silver") and print each part on a new line.
After finishing the product list of this current page, I want to be able to click next and perform the same procedure with the products on the next page.
This is the problem: when it reaches the 10 items of the current page, I have no idea how to change to another page and start all over again.
import xlwt
from selenium import webdriver
import re
import time


class cometmobiles:
    """Scrapes product titles and prices from the MediaWorld smartphone listing."""

    def __init__(self):
        self.url = 'https://www.mediaworld.it/catalogo/telefonia/smartphone-e-cellulari/smartphone'

    def comet(self):
        try:
            driver = webdriver.Chrome()
            driver.get(self.url)
            time.sleep(5)
            cookies = driver.find_element_by_id("onetrust-accept-btn-handler")
            cookies.click()
            print("accepted cookies")
            driver.maximize_window()
            print("window maximized")
            mylist = []
            has_next_page = True
            while has_next_page:
                containers = driver.find_elements_by_css_selector('article[class="product clearfix p-list-js"]')
                for container in containers:
                    # Title
                    try:
                        title = container.find_element_by_css_selector('h3[class="product-name"]').text
                        print(title)
                    except:
                        pass
                    # Price
                    try:
                        price = container.find_element_by_css_selector('span[class="price mw-price enhanced"]').text
                        print(price)
                    except:
                        pass
                try:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(5)
                    # Must be find_element (singular): find_elements returns a
                    # list, and a list has no .click() — the original crash.
                    nxt = driver.find_element_by_css_selector('span[class="pages"] a')
                    time.sleep(5)
                    nxt.click()
                except:
                    break
        except:
            pass


comets = cometmobiles()
comets.comet()
Instead of this part
try:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
nxt=driver.find_elements_by_css_selector('span[class="pages"] a')
time.sleep(5)
nxt.click()
except:
break
You can use this instead. Also, if the page number doesn't exist, the website returns the main page, so you should add:
try:
    x = 0
    while True:
        x += 1
        driver.get(url + "?pageNumber=" + str(x))  # Get the next page
        # If there is no next page the site redirects to the main page,
        # which we detect by comparing URLs.
        if driver.current_url == url:
            break
except:
    pass

Get LinkedIn user href on list (Selenium)

I'm beginner to Python & Selenium, I am just trying to get all LinkedIn profile hrefs on specific page with Selenium for adding to a list but I don't know why it returns all same URL 10 times:
This is my code:
# Log in, wait for the nav bar, open a people-search page and collect hrefs.
# XPath/attribute syntax restored to @class (the scrape mangled it to #class).
try:
    browser.find_element_by_id("username").send_keys(email_address)
    sleep(1)
    browser.find_element_by_id("password").send_keys(password)
    sleep(1)
    browser.find_element_by_xpath("//button[@class='btn__primary--large from__button--floating']").click()
    sleep(1)
    element = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "global-nav")))
    sleep(0.5)
    browser.get('https://www.linkedin.com/search/results/people/?geoUrn=%5B%22104246759%22%2C%2290009659%22%2C%22106383538%22%5D&keywords=mark%20johnson&origin=GLOBAL_SEARCH_HEADER')
    user = []
    url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
    for i in range(10):
        href = url[i].get_attribute('href')
        user.append(href)
    print(user)
except Exception as e:
    traceback.print_exc()
It looks like the xpath is matching multiple elements with the same href. You could make a list of unique hrefs:
user = []
url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
# Deduplicate while preserving order. A plain loop (not a comprehension used
# for side effects) and a single get_attribute() call per element.
unique_hrefs = []
for element in url:
    href = element.get_attribute('href')
    if href not in unique_hrefs:
        unique_hrefs.append(href)
for i in unique_hrefs:
    print(i)

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way for instructing a selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking on show more button until there's none (at end of page) - which by then - it should have collected the links of all the products listed on the page it scrolled through till the end, then visit each one respectively.
What happens instead, it successfully clicks on show more till the end of the page, but then it visits this weird promotion page of the same website instead of following each of the gathered links respectively and then scraping further data points located off each of those newly opened ones.
In a nutshell, would incredibly appreciate it if someone can explain how to avoid this automated redirection on its own! And this is the code in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

# Path to the chromedriver binary. (Originally assigned to the name
# `webdriver`, shadowing the imported module — renamed to avoid that.)
chromedriver_path = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(chromedriver_path)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)

# Accumulators for the final DataFrame.
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []

wait = WebDriverWait(driver, 2)

# --- Sign in ---------------------------------------------------------------
# XPath attribute syntax restored to @... (the scrape mangled '@' to '#';
# the '@' in the credential strings was restored the same way — verify).
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95@gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW@2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)

# --- Expand the infinite list ----------------------------------------------
# Click "load more" until the button reference goes stale (end of list).
while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(2)
    except exception.StaleElementReferenceException:
        print('stale element')
        break
print('no more elements to show')

# --- Collect company links --------------------------------------------------
try:
    company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
    for link in company_links:
        links_list.append(link.get_attribute('href'))
except:
    pass

# Persist the links (round-trip through JSON).
try:
    with open("links_list.json", "w") as f:
        json.dump(links_list, f)
    with open("links_list.json", "r") as f:
        links_list = json.load(f)
except:
    pass

# --- Visit each company page and scrape its details ------------------------
try:
    for link in links_list:
        driver.get(link)
        name = driver.find_element_by_xpath("//div[@class='title']/h1").text
        try:
            show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
            show_more_coins.click()
            time.sleep(1)
        except:
            pass
        try:
            categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
            categories_list = []
            for category in categories:
                categories_list.append(category.text)
        except:
            pass
        try:
            top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
            top_page_categories_list = []
            for category in top_page_categories:
                top_page_categories_list.append(category.text)
        except:
            pass
        coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
        all_coins = []
        for coin in coins_links:
            all_coins.append(coin.get_attribute('href'))
        try:
            location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
        except:
            pass
        try:
            twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
        except:
            pass
        try:
            print('-----------')
            print('Company name is: {}'.format(name))
            print('Potential Categories are: {}'.format(categories_list))
            print('Potential top page categories are: {}'.format(top_page_categories_list))
            print('Supporting Crypto is:{}'.format(all_coins))
            print('Registered location is: {}'.format(location))
            print('Company twitter profile is: {}'.format(twitter))
            time.sleep(1)
        except:
            pass
        all_names.append(name)
        all_categories.append(categories_list)
        all_categories2.append(top_page_categories_list)
        all_cryptos.append(all_coins)
        all_twitter.append(twitter)
        all_locations.append(location)
except:
    pass

# --- Export ----------------------------------------------------------------
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirect calls happen for two reasons, in your case either by executing some javascript code when clicking the last time on the load more button or by receiving an HTTP 3xx code, which is the least likely in your case.
So you need to identify when this javascript code is executed and send an ESC_KEY before it loads and then executing the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button and each time it is clicked, make an if statement the verify the link of the page you're in, if it is that of the promotion page then execute the rest of your code, else click load more.
# Pseudocode: keep scraping while we remain on the listing page.
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    verify_current_page_link()
    if current_link_is_same != link_of_scraped_page:
        page_is_same = False
        # rest of the code here

Using WebDriverWait still cannot get the new page_source while performing next_page test

I am performing a next_page test. I use a loop to capture new data while clicking to the next page. The click function is successful, but it seems that driver.page_source is not updated after the next_page function is performed. As it is still within the loop, the code will run about two more times before it is updated. Only in rare cases does it succeed.
It will be ok if you use like this:
# Advance to the next page, pause so the page can load, then re-read the
# page source and recompute which page we are on.
NextPage()
time.sleep(2)
data=driver.page_source
currentpage = GetCurrentPage(data)
I understand that driver.page_source will take time to load completely. However, for a large total of page, using time.sleep() is time-consuming. Then I tried to use the WebDriverWait to wait for the Image class to load(This website has a lot of images each page), but it does not help.
# Capture data page by page until the requested page number is reached.
page = int(input("Please input page number:"))
if 1 < page < 100:
    data = driver.page_source
    currentpage = GetCurrentPage(data)
    while True:
        if currentpage < page:
            try:
                CaptureData(data, file)
                print(currentpage)
                time.sleep(0.5)
                NextPage()
                # time.sleep(1)
                # Wait for the product images of the next page to appear.
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'J_ItemPicA')))
                data = driver.page_source
                currentpage = GetCurrentPage(data)
                print(currentpage)
            except TimeoutException:
                print("Timeout!")
            except Exception as e:
                print("Unexpected error!", e)
                break
        else:
            print('testa')
            CaptureData(data, file)
            break
elif page == 1:
    CaptureData(driver.page_source, file)
At most times, the output of above code will be like:
Please input page number: 2
1
1
1
2
testa
In case you may need the GetCurrentPage code:
def GetCurrentPage(data):
    """Return the active pagination page number parsed from page HTML.

    ``data`` is the raw page source; the active page sits in the first
    ``<li class="item active">`` element. Raises IndexError if no such
    element exists.
    """
    soup = BeautifulSoup(data, 'lxml')
    comments = soup.find_all("li", class_="item active")
    # Raw string for the regex (plain '\d' relies on a deprecation-warned
    # escape); join the digits in case the label contains other characters.
    cp = re.findall(r'\d', comments[0].text)
    currentpage = int(''.join(cp))
    return currentpage
The link is included above, but the site is in Chinese.
Any other suggestions please?
Thanks a million.
Solved the problem myself.
Use WebDriverWait. Get the current page, then decide if is equal to the former page plus 1.
Here is the code:
try:
    CaptureData(data, file)
    print(currentpage)
    time.sleep(0.5)
    NextPage()
    # time.sleep(2)
    # Wait until the pager's active item shows the next page number —
    # this is what actually guarantees page_source has been refreshed.
    element = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active'),
        str(int(currentpage) + 1)))
    if element:
        data = driver.page_source
        currentpage = GetCurrentPage(data)
        print(currentpage)
except TimeoutException:
    # NOTE(review): the posted excerpt ends without a handler; TimeoutException
    # is what WebDriverWait raises on failure, matching the earlier loop.
    print("Timeout!")

Categories

Resources