Element not found on page while scrolling down (StaleElementReferenceException) - python

I am currently scrolling through a TripAdivsor page to scrape images and need to scroll until the bottom of the page but keep getting the error of:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document.
I am assuming it is because it is trying to go very fast through the page but even when I change the implicit wait time to be larger, it does not solve the issue. I also tried making sure the new location is visible first before parsing through to get the url but that also did not do any good. Any help would be greatly appreciated!
# import dependencies
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.headless=False
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
#open up website
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not(end):
#wait until element is found and then store all webelements into list
images = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located(
(By.XPATH, '//*[#class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
#iterate through visible images and acquire their url based on background image style
for index, image in enumerate(images):
image_url.append(images[index].value_of_css_property("background-image"))
#if you are at the end of the page then leave loop
# if(length == end_length):
# end = True
#move to next visible images in the array
driver.execute_script("arguments[0].scrollIntoView();", images[-1])
#time.sleep(1)
#wait until the new web element is visible
driver.implicitly_wait(10)
#WebDriverWait(driver, 20).until(EC.visibility_of_element_located(images[-1]))
#clean the list to provide clear links
for i in range(len(image_url)):
start = image_url[i].find("url(\"") + len("url(\"")
end = image_url[i].find("\")")
print(image_url[i][start:end])

Related

Getting an empty list when scraping with Selenium

I am trying to create a python function that can scrape the article titles of a search result on Popular Science's website.
I have written this code, which has worked for a similar science-related website but when I run it specifically for Popular Science, it returns an empty list.
Code:
from selenium import webdriver
import pandas as pd
def scraper(text):
driver = webdriver.Chrome(executable_path='chromedriver.exe')
wired_dict = []
driver.get("https://www.popsci.com/search-results/" + text + "/")
search = driver.find_elements_by_class_name("siq-partner-result")
for words in search:
wired_dict.append(words.text)
return wired_dict
print(scraper("science"))
You can use driver.implicitly_wait(10) for wait while page is loaded.
from selenium import webdriver
def scrapper(text):
driver = webdriver.Chrome('./chromedriver')
driver.get(f"https://www.popsci.com/search-results/{text}/")
driver.implicitly_wait(10)
search = driver.find_elements_by_class_name("siq-partner-result")
wired_dict = [word.text for word in search]
print(wired_dict)
scrapper('sample')
This page takes a while to load. You are using driver.find_elements_by_class_name before the page has finished loading, so it's not finding those elements.
You can test this theory by import time and time.sleep(5) just before the search code.
The best solution is to keep checking until the elements are loaded with WebDriverWait() wait until the elements have loaded.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
def scraper(text):
driver = webdriver.Chrome(executable_path='chromedriver.exe')
wired_dict = []
driver.get("https://www.popsci.com/search-results/" + text + "/")
delay = 3
WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'siq-partner-result')))
search = driver.find_elements_by_class_name("siq-partner-result")
for words in search:
wired_dict.append(words.text)
return wired_dict
You can use WebDriverWait for the desired element to visible and then try to find the elements.
Using XPATH :
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//*[#class='siq-partner-result']")))
search = driver.find_elements_by_class_name("siq-partner-result")
Note : You have to add the following imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

Value of CSS Property Selenium returning None for all images

I'm trying to scrape all of the images on this site. However, when I run my script and try to get the CSS attribute of 'background-image' to extract the url of each web element, the result is printing out "None". I have no idea why it would be returning None as I print out the web element and the attribute does exist. Any help would be greatly appreciated!
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
def parse_style_attribute(style_string):
if 'background-image' in style_string:
style_string = style_string.split(' url("')[1].replace('");', '')
return style_string
return None
#setup opening url window of website to be scraped
options = webdriver.ChromeOptions()
options.headless=False
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
#driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3") #possible issue by not including the file extension
# driver.maximize_window()
# time.sleep(5)
# driver.get("""https://www.tripadvisor.com/""") #get the information from the page
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
driver.get("https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
time.sleep(1)
#waits for that amount of time
driver.implicitly_wait(12)
#find the searchbar and then plug in the key
#driver.find_element_by_xpath('//*[#class="typeahead_input"]').send_keys("Washington D.C.", Keys.ENTER)
#wait
time.sleep(1)
#list all of the hotels in that page
images = driver.find_elements_by_xpath('//*[#class="media-viewer-tile-gallery-v2-TileGallery__entryInner--JaADY "]')
image_url = []
for i in range(len(images)):
image_url.append(images[i].value_of_css_property("background-image"))
print("Total Number of images: ", len(images))
# print(images)
firstimage = images[0].get_attribute("innerHTML")
print(firstimage)
for i in range(len(image_url)):
print(image_url[i])
try this. it works for me.
# attach your code as set browser option
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
images = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located(
(By.XPATH, '//*[#class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
image_url = []
for index, image in enumerate(images):
image_url.append(images[index].value_of_css_property("background-image"))
print(image_url)

How to check if popup page is scrolled down fully Selenium [duplicate]

This question already exists:
Condition to check if Selenium is done scrolling based on web element?
Closed 2 years ago.
I wrote a script to scrape images on TripAdvisor of hotels and I am able to iterate through all of them, my concern is whether to know I am finished scrolling within the popup window. I am unable to create a condition to break outside of my loop to then parse through all of the image urls and stays inside the loop infinitely. What should my if condition be in order to leave out of the loop? Any help is greatly appreciated!
# import dependencies
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.headless=False
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
#open up website
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not(end):
old_image_length = len(image_url)
#wait until element is found and then store all webelements into list
images = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located(
(By.XPATH, '//*[#class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
#iterate through visible images and add their url to list
for index, image in enumerate(images):
image_url.append(images[index].value_of_css_property("background-image"))
new_image_length = len(image_url)
#move to next visible images in the array which is the last one
driver.execute_script("arguments[0].scrollIntoView();", images[-1])
#wait one second
time.sleep(1)
#if the first and last image in the arrays are the same for visibility then get out
if(old_image_length == new_image_length):
end = True
#clean the list to provide clear links
for i in range(len(image_url)):
start = image_url[i].find("url(\"") + len("url(\"")
end = image_url[i].find("\")")
print(image_url[i][start:end])
#print(image_url)
#Rishiraj Kanugo You can check the visibility of last element in the popup to make sure that the popup is fully scrolled to the bottom.if(element.isVisible()){Syso("popup is fully scrolled down");}

Multiple clicking loop on URL issue with Python Selenium

how do I fix this code?
As I'm trying to create a multiple clicking on a multiple URL loop but it just kept at the same link over and over.
if the url contains dr.macio
and contains this div class ('_3ao649')
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import csv
import time
url = 'https://shopee.com.my/search?keyword=mattress'
driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)
time.sleep(0.8)
# select language
driver.find_element_by_xpath('//div[#class="language-selection__list"]/button').click()
time.sleep(3)
# scroll few times to load all items
for x in range(10):
driver.execute_script("window.scrollBy(0,300)")
time.sleep(0.1)
# get all links (without clicking)
all_items = driver.find_elements_by_xpath('//a[#data-sqe="link"]')
print('len:', len(all_items))
all_urls = []
for item in all_items:
url = item.get_attribute('href')
all_urls.append(url)
print(url)
# now use links
for item in all_urls:
a = item.splitlines("\n")
if url.contains("dr.macio"):
continue
else:
driver.get(chr(a))
driver.back()
If I understood your use case that you would like to visit each product url except which contains dr.macio.
Induce WebdriverWait and visibility_of_all_elements_located() and get all the links href value and then during iteration varify the links contains.
Try below code.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import csv
import time
url = 'https://shopee.com.my/search?keyword=mattress'
driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)
# select language
WebDriverWait(driver,5).until(EC.element_to_be_clickable((By.XPATH,'//div[#class="language-selection__list"]/button'))).click()
# scroll few times to load all items
for x in range(10):
driver.execute_script("window.scrollBy(0,300)")
time.sleep(0.1)
# get all links (without clicking)
all_items=[item.get_attribute('href') for item in WebDriverWait(driver,15).until(EC.visibility_of_all_elements_located((By.XPATH,'//a[#data-sqe="link"]')))]
print(all_items)
for item in all_items:
#Checking here link contains `dr.macio`
if "dr.macio" in item:
continue
else:
driver.get(item)
driver.back()

How to use mulitple try except in selenium python

this code when given a list of cities goes and searches on google and extract data then covert it into a dataframe
In some cases have to use different xpaths to extract the data. there are three xpaths in total.
Trying to do this :
if
1 doesnt work go to 2
2 doesnt work go to 3
3 doesnt work.
use driver.quit ()
tried this code used NoSuchElementException
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
df_output = pd.DataFrame(columns=["City", "pincode"])
url = "https://www.google.com/"
chromedriver = ('/home/me/chromedriver/chromedriver.exe')
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)
search = driver.find_element_by_name('q')
mlist1=['polasa']
for i in mlist1:
try:
search.send_keys(i,' pincode')
search.send_keys(Keys.RETURN)
WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, '//div[#class="IAznY"]//div[#class="title"]')))
elmts = driver.find_elements_by_xpath('//div[#class="IAznY"]//div[#class="title"]')
df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i,elmts[0].text]]))
driver.quit()
except NoSuchElementException:
try:
elements=driver.find_element_by_xpath("//div[#class='Z0LcW']")
df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i,elements.text]]))
driver.quit()
except NoSuchElementException:
try:
elements=driver.find_element_by_xpath("//div[#class='Z0LcW AZCkJd']")
df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i,elements.text]]))
driver.quit()
except:
driver.quit()
this code works used one of the 3 tags here
need to combine 3 tags in a single code.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import html5lib
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import pandas as pd
url = "https://www.google.com/"
chromedriver = ('/home/me/chromedriver/chromedriver.exe')
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)
search = driver.find_element_by_name('q')
search.send_keys('polasa',' pincode')
search.send_keys(Keys.RETURN)
elements=driver.find_element_by_xpath("//div[#class='Z0LcW']")
elements.text
``
You don't really need 3 try-catchs. You can do this without throwing exceptions by locating elements (plural) given a locator and then check the length of the collection returned. If length = 0, no elements were found.
The locators you are using don't require XPath so you can instead use a CSS selector and combine all three with an OR and avoid the three checks. (Note: you can do the same thing with XPath but the results are messier and harder to read)
Here are your 3 locators combined into one using OR (the comma) in CSS selector syntax
div.IAznY div.title, div.Z0LcW, div.Z0LcW.AZCkJd
...and the updated code using the combined locator and without the nested try-catch.
...
locator = (By.CSS_SELECTOR, 'div.IAznY div.title, div.Z0LcW, div.Z0LcW.AZCkJd')
for i in mlist1:
search.send_keys(i,' pincode')
search.send_keys(Keys.RETURN)
WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located(*locator)
elements = driver.find_elements_by_css_selector(*locator)
df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i,elements[0].text]]))
driver.quit()
NOTE: I used your original locators and wasn't returning any results with any of the three. Are you sure they are correct?
Also note... I pulled the driver.quit() out of the loop. I'm not sure if you intended it to be inside or not but from the code provided, if the try succeeds in the first iteration, the browser will quit. You only have one item so you probably didn't notice this yet but would have been confused when you added another item to the iteration.

Categories

Resources