How to check if an element exists in the HTML using Selenium - python

I have a question. I find an element on the page by its class and read its text, which I then break apart with split(). However, when the element is not present, an error is raised and nothing gets parsed.
Code:
# For every matching span, open the link it contains and print the image
# sources found on each of that link's pages.
spans = driver.find_elements(By.XPATH, "//span[@class='ipsContained ipsType_break']")
for span in spans:
    atag = span.find_element(By.XPATH, ".//a")
    print(atag.get_attribute('href'))
    urlik = atag.get_attribute('href')
    driver.get(url=urlik)
    time.sleep(2)
    # This lookup raises NoSuchElementException when the page has no
    # "ipsPagination_pageJump" element.
    urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump").text
    # The fourth whitespace-separated token of the text is used as the page count.
    for page_number in range(int(urla.split()[3])):
        page_number = page_number + 1
        driver.get(url=urlik + f"page/{page_number}")
        time.sleep(2)
        imgs = driver.find_elements(By.CLASS_NAME, "cGalleryPatchwork_image")
        for i in imgs:
            # Scroll each image into view before reading its src attribute.
            driver.execute_script("arguments[0].scrollIntoView(true);", i)
            time.sleep(0.2)
            print(i.get_attribute("src"))
I need to check this:
urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump").text

To find an element by its class name and read its text regardless of whether the element is present, wrap the code in a try-except block that handles NoSuchElementException, as follows:
# Requires: from selenium.common.exceptions import NoSuchElementException
driver.get(url=urlik)
time.sleep(2)
try:
    # Only this lookup can raise NoSuchElementException, so it is the only
    # statement kept inside the try body.
    urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump").text
except NoSuchElementException:
    print("Element is not present")
else:
    # The fourth whitespace-separated token of the text is used as the page count.
    page_count = int(urla.split()[3])
    for page_number in range(1, page_count + 1):
        driver.get(url=urlik + f"page/{page_number}")
        time.sleep(2)
        imgs = driver.find_elements(By.CLASS_NAME, "cGalleryPatchwork_image")
        for i in imgs:
            # Scroll each image into view before reading its src attribute.
            driver.execute_script("arguments[0].scrollIntoView(true);", i)
            time.sleep(0.2)
            print(i.get_attribute("src"))

Instead of
urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump")
Use
# find_elements returns an empty list (instead of raising) when nothing matches.
urla = driver.find_elements(By.CLASS_NAME, "ipsPagination_pageJump")
if urla:
    # Keep the text of the first match so it can be used afterwards.
    text = urla[0].text
The find_elements method returns a list of web elements matching the passed locator.
So, if such element(s) exist, urla will be a non-empty list, and a non-empty list is interpreted in Python as a Boolean True.
If no matching elements are found, urla will be an empty list, and an empty list is interpreted in Python as a Boolean False.

Related

Selenium webdriver loops through all pages, but only scraping data for first page

I am attempting to scrape data through multiple pages (36) from a website to gather the document number and the revision number for each available document and save it to two different lists. If I run the code block below for each individual page, it works perfectly. However, when I added the while loop to loop through all 36 pages, it will loop, but only the data from the first page is saved.
# sam.gov search results, starting at page 1
url = 'https://sam.gov/search/?index=sca&page=1&sort=-modifiedDate&pageSize=25&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5BwdPreviouslyPerformedWrapper%5D%5BpreviouslyPeformed%5D=prevPerfNo%2F'
# webdriver
driver = webdriver.Chrome(options = options_, executable_path = r'C:/Users/439528/Python Scripts/Spyder/chromedriver.exe' )
driver.get(url)
# get rid of the pop-up window
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#sds-dialog-0 > button > usa-icon > i-bs > svg'))).click()

# revision numbers collected across all pages
revision_num = []
# WD link texts collected across all pages
WD_num = []
substring = '2015'

for current_page in range(1, 37):
    # Find every "field name" element and every link on the current page.
    elements = driver.find_elements_by_class_name('sds-field__name')
    wd_links = driver.find_elements_by_class_name('usa-link')

    # For each field labelled 'Revision Number', the following sibling divs
    # hold the actual revision number text.
    for i in elements:
        if i.text == 'Revision Number':
            for x in i.find_elements_by_xpath("./following-sibling::div"):
                revision_num.append(x.text)

    # Keep the text of every link that contains the substring '2015'.
    for link in wd_links:
        wd = link.text
        if substring in wd:
            WD_num.append(wd)

    if current_page == 36:
        print('Last Page Complete!')
        break

    # click on next page
    click_icon = WebDriverWait(driver, 5, 0.25).until(EC.visibility_of_element_located((By.ID, 'bottomPagination-nextPage')))
    click_icon.click()
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'main-container')))
Things I've tried:
I added the WebDriverWait in order to slow the script down for the page to load and/or elements to be clickable/located
I declared the empty lists outside the loop so it does not overwrite over each iteration
I have edited the while loop multiple times to either count up to 36 (while current_page <37) or moved the counter to the top or bottom of the loop)
Any ideas? TIA.
EDIT: added screenshot of 'field name'
I have refactored your code and made things very simple.
driver = webdriver.Chrome(options = options_, executable_path = r'C:/Users/439528/Python Scripts/Spyder/chromedriver.exe' )
revision_num = []
WD_num = []

# The page number is substituted into the query string for each request.
url_template = 'https://sam.gov/search/?index=sca&page={}&sort=-modifiedDate&pageSize=25&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5BwdPreviouslyPerformedWrapper%5D%5BpreviouslyPeformed%5D=prevPerfNo%2F'
wait = WebDriverWait(driver, 10)

for page in range(1, 37):
    driver.get(url_template.format(page))
    if page == 1:
        # The pop-up dialog is only dismissed on the first page load.
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#sds-dialog-0 > button > usa-icon > i-bs > svg'))).click()

    # Links whose text contains '2015' are the WD numbers.
    wd_links = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(@class,'usa-link') and contains(.,'2015')]")))
    # The div following each 'Revision Number' label holds the revision number.
    revisions = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='sds-field__name' and text()='Revision Number']/following-sibling::div")))

    WD_num.extend(wd_link.text for wd_link in wd_links)
    revision_num.extend(revision.text for revision in revisions)

print(revision_num)
print(WD_num)
If you know there are only 36 pages to iterate over, you can pass the page number in the URL.
Wait for the elements to be visible using WebDriverWait.
Construct your XPath so that it identifies the elements uniquely, without needing if checks in Python.
console output on my terminal:

selenium stale element reference: element is not attached to the page document error

I have an e-commerce page and there are multiple products on a page. I need to click the link of a product then return on the main page and click the link of the next product, but when I return, the elements can't be found anymore.
Path = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(Path)
driver.get("https://www.emag.ro/")

# Type the query into the search box and submit it.
search_bar = driver.find_element_by_id("searchboxTrigger")
search_bar.send_keys("laptopuri")
search_bar.send_keys(Keys.RETURN)

# Wait for the results container; quit the browser if it never shows up.
main = None
try:
    results_ready = EC.presence_of_element_located((By.ID, "main-container"))
    main = WebDriverWait(driver, 10).until(results_ready)
    print("Page loaded,main retrived succesfully")
    print(" ")
except:
    driver.quit()

# The product cards are located once, before any navigation happens.
products = main.find_elements_by_css_selector("div.card-item.js-product-data")
for product in products:
    raw_name = product.text
    raw_price = product.find_element_by_css_selector("p.product-new-price").text
    link = product.find_element_by_tag_name("a")
    # clicking the link
    link.click()

    # Wait for the product page, print the located element, then go back.
    spec_page = None
    try:
        spec_ready = EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
        spec_page = WebDriverWait(driver, 10).until(spec_ready)
    except:
        driver.quit()
    print(spec_page)
    driver.back()
After the first iteration, I get the following error :
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document on line raw_name = product.text,basically at the beginning of the loop.
I assume the page is not loading properly or something like that, I tried using time.sleep before going through the loop but nothing
When you call driver.back(), the browser returns to the previous page, and by the time the original page has loaded again all previously located elements have become stale. You need to locate them again, like below:
This should handle the exception.
from selenium.common.exceptions import TimeoutException

# Product cards inside the results container. The container is addressed by
# its id in the selector because a previously located `main` element goes
# stale after navigation, just like the cards do.
PRODUCT_CARDS = (By.CSS_SELECTOR, "#main-container div.card-item.js-product-data")
wait = WebDriverWait(driver, 10)

product_count = len(driver.find_elements(*PRODUCT_CARDS))
for index in range(product_count):
    # Locate the cards again on every pass: references obtained before
    # click()/back() are no longer attached to the page document.
    cards = wait.until(EC.presence_of_all_elements_located(PRODUCT_CARDS))
    if index >= len(cards):
        # The reloaded page shows fewer cards than the first count.
        break
    card = cards[index]

    raw_name = card.text
    raw_price = card.find_element(By.CSS_SELECTOR, "p.product-new-price").text
    link = card.find_element(By.TAG_NAME, "a")
    # clicking the link
    link.click()

    try:
        spec_page = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
        )
    except TimeoutException:
        # The driver is unusable after quit(), so stop iterating.
        driver.quit()
        break
    print(spec_page)
    driver.back()

using selenium.click() to change pages but gets error

I'm trying to click on a div to get to the next page of a table (the url does not change when the page changes). The go to the next page div has the same class as the go to the previous page's.
ive used:
# The paging control is located once and clicked before each scrape:
# one initial click plus the six further pages of the original loop.
elem = driver.find_element_by_class_name('cXQSjq')
timeout = 30
for i in range(7):
    # This is the click that raises ElementClickInterceptedException.
    elem.click()
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
    except TimeoutException:
        # The driver cannot be used after quit(), so stop paging.
        driver.quit()
        break

    names = driver.find_elements_by_class_name('iBSZGH')
    # The first matched element is skipped.
    for company in names[1:]:
        name.append(company.text)

    mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
    for most in mostdata:
        # Read .text once; the result is a str and has no .text of its own.
        data.append(most.text.replace(',',''))

    last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
    for last in last7dsales:
        last7day.append(last.text.replace(',',''))
and it worked to get me to page 2, but after page 2 it gives me the error:
selenium.common.exceptions.ElementClickInterceptedException: Message: element click
intercepted: Element <div class="styles__Chevron-sc-1buchb9-1 cXQSjq">...</div> is
not clickable at point (702, 656). Other element would receive the click: <div
id="hs-eu-cookie-confirmation-inner">...</div>
(Session info: chrome=92.0.4515.107)
Do you know if there is an issue that i am able to call elem.click() after using selenium to find other parts of the page. I'm scraping data from
https://nonfungible.com/market/history
I guess the issue here is that the next page button appears on the bottom of the page, so to click on it you should first scroll this element into the view.
See if this will work now:
from selenium.webdriver.common.action_chains import ActionChains

# The paging control is located once: the question states that the
# "previous" control shares this class, so looking it up again on later
# pages could return a different element.
elem = driver.find_element_by_class_name('cXQSjq')
timeout = 30
# One initial click plus the six further pages of the original loop.
for i in range(7):
    # Bring the control into view before clicking. A fresh ActionChains is
    # built each time so earlier queued actions are never replayed.
    ActionChains(driver).move_to_element(elem).perform()
    time.sleep(0.3)
    elem.click()
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
    except TimeoutException:
        # The driver cannot be used after quit(), so stop paging.
        driver.quit()
        break

    names = driver.find_elements_by_class_name('iBSZGH')
    # The first matched element is skipped.
    for company in names[1:]:
        name.append(company.text)

    mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
    for most in mostdata:
        # Read .text once; the result is a str and has no .text of its own.
        data.append(most.text.replace(',',''))

    last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
    for last in last7dsales:
        last7day.append(last.text.replace(',',''))
I didn't check correctness of the rest of your code, but clicking the next button should work now

Get LinkedIn user href on list (Selenium)

I'm beginner to Python & Selenium, I am just trying to get all LinkedIn profile hrefs on specific page with Selenium for adding to a list but I don't know why it returns all same URL 10 times:
This is my code:
try:
    # Log in with the supplied credentials.
    browser.find_element_by_id("username").send_keys(email_address)
    sleep(1)
    browser.find_element_by_id("password").send_keys(password)
    sleep(1)
    browser.find_element_by_xpath("//button[@class='btn__primary--large from__button--floating']").click()
    sleep(1)
    # Wait until the element with id "global-nav" is present before searching.
    element = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "global-nav")))
    sleep(0.5)
    browser.get('https://www.linkedin.com/search/results/people/?geoUrn=%5B%22104246759%22%2C%2290009659%22%2C%22106383538%22%5D&keywords=mark%20johnson&origin=GLOBAL_SEARCH_HEADER')
    user = []
    url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
    # Take the href of the first ten matching anchors.
    for i in range(10):
        href = url[i].get_attribute('href')
        user.append(href)
    print(user)
except Exception as e:
    traceback.print_exc()
It looks like the xpath is matching multiple elements with the same href. You could make a list of unique hrefs:
url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
# dict.fromkeys drops repeated hrefs while keeping first-seen order, and
# each element's href attribute is read only once.
unique_hrefs = list(dict.fromkeys(link.get_attribute('href') for link in url))
for href in unique_hrefs:
    print(href)

Remove an element in a container using selenium

I only want to scrape the required information contained in the black box, and delete/remove/exclude the information contained in the red box
I am doing this because class names "entry" and "partial entry" exist in both boxes. Only the first "partial entry" contains the information that I need, so I plan to delete/remove/exclude the classname "mgrRspnInLine".
My code is:
# Scrape every review container on the current page, then move to the next
# page until the "next" button can no longer be clicked.
while True:
    container = driver.find_elements_by_xpath('.//*[contains(@class,"review-container")]')
    for item in container:
        try:
            element = item.find_element_by_class_name('mgrRspnInline')
            # The script removes the first "mgrRspnInline" element in the whole document.
            driver.execute_script("""var element = document.getElementsByClassName("mgrRspnInline")[0];element.parentNode.removeChild(element);""", element)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"taLnk ulBlueLinks")]')))
            element = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"taLnk ulBlueLinks")]')))
            element.click()
            time.sleep(2)
            rating = item.find_elements_by_xpath('.//*[contains(@class,"ui_bubble_rating bubble_")]')
            for rate in rating:
                # The last two characters of the class attribute are kept as the score.
                rate = rate.get_attribute("class")
                rate = str(rate)
                rate = rate[-2:]
                score_list.append(rate)
            time.sleep(2)
            stay = item.find_elements_by_xpath('.//*[contains(@class,"recommend-titleInline noRatings")]')
            for stayed in stay:
                stayed = stayed.text
                stayed = stayed.split(', ')
                stayed.append(stayed[0])
                travel_type.append(stayed[1])
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"noQuotes")]')))
            summary = item.find_elements_by_xpath('.//*[contains(@class,"noQuotes")]')
            for comment in summary:
                comment = comment.text
                comments.append(comment)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"ratingDate")]')))
            rating_date = item.find_elements_by_xpath('.//*[contains(@class,"ratingDate")]')
            for date in rating_date:
                date = date.get_attribute("title")
                date = str(date)
                review_date.append(date)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"partial_entry")]')))
            review = item.find_elements_by_xpath('.//*[contains(@class,"partial_entry")]')
            for comment in review:
                comment = comment.text
                print(comment)
                reviews.append(comment)
        except (NoSuchElementException) as e:
            # Skip containers where one of the lookups above finds nothing.
            continue
    try:
        element = WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"nav next taLnk ui_button primary")]')))
        element.click()
        time.sleep(2)
    except (ElementClickInterceptedException,NoSuchElementException) as e:
        print(e)
        break
Basically within the "review-container" I searched first for the class name "mgrRspnInLine", then tried to delete it using the execute_script.
but unfortunately, the output still shows the contents contained in the"mgrRspnInLine".
If you want to avoid matching second element by your XPath you can just modify XPath as below:
.//*[contains(@class,"partial_entry") and not(ancestor::*[@class="mgrRspnInLine"])]
This will match element with class name "partial_entry" only if it doesn't have ancestor with class name "mgrRspnInLine"
If you want the first occurrence you could use css class selector instead of:
.partial_entry
and retrieve with find_element_by_css_selector:
find_element_by_css_selector(".partial_entry")
You can delete all the .mgrRspnInLine elements with:
driver.execute_script("[...document.querySelectorAll('.mgrRspnInLine')].map(el => el.parentNode.removeChild(el))")
By stitching together the comment by Andersson and the two answers provided by QHarr and pguardiario, I finally solved the problem.
The key is to target a container within the container: all the information is contained in the class name "ui_column is-9", which is itself contained in the class name "review-container", hence addressing Andersson's comment about multiple mgrRspnInLine elements.
Within the nested loop, I used pguardiario's suggestion to delete the existing mgrRspnInLine elements, then added QHarr's answer on .partial_entry
# Scrape every review container on the current page, then move to the next
# page until the "next" button can no longer be clicked.
while True:
    container = driver.find_elements_by_xpath('.//*[contains(@class,"review-container")]')
    for items in container:
        element = WebDriverWait(driver, 1000).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"taLnk ulBlueLinks")]')))
        element.click()
        time.sleep(10)
        # Work on the inner "ui_column is-9" containers of this review container.
        contained = items.find_elements_by_xpath('.//*[contains(@class,"ui_column is-9")]')
        for item in contained:
            try:
                # Remove every ".mgrRspnInLine" element from the document.
                driver.execute_script("[...document.querySelectorAll('.mgrRspnInLine')].map(el => el.parentNode.removeChild(el))")
                # The leading "." keeps the search inside `item`; without it
                # the XPath is evaluated against the whole document.
                rating = item.find_element_by_xpath('.//*[contains(@class,"ui_bubble_rating bubble_")]')
                # The last two characters of the class attribute are kept as the score.
                rate = rating.get_attribute("class")
                rate = str(rate)
                rate = rate[-2:]
                score_list.append(rate)
                time.sleep(2)
                stay = item.find_element_by_xpath('.//*[contains(@class,"recommend-titleInline")]')
                stayed = stay.text
                stayed = stayed.split(', ')
                stayed.append(stayed[0])
                travel_type.append(stayed[1])
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"noQuotes")]')))
                summary = item.find_element_by_xpath('.//*[contains(@class,"noQuotes")]')
                comment = summary.text
                comments.append(comment)
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"ratingDate")]')))
                rating_date = item.find_element_by_xpath('.//*[contains(@class,"ratingDate")]')
                date = rating_date.get_attribute("title")
                date = str(date)
                review_date.append(date)
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"partial_entry")]')))
                # The CSS selector returns the first ".partial_entry" inside `item`.
                review = item.find_element_by_css_selector(".partial_entry")
                comment = review.text
                print(comment)
            except (NoSuchElementException) as e:
                # Skip inner containers where one of the lookups finds nothing.
                continue
    try:
        element = WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"nav next taLnk ui_button primary")]')))
        element.click()
        time.sleep(2)
    except (ElementClickInterceptedException,NoSuchElementException) as e:
        print(e)
        break

Categories

Resources