Unable to fetch all the necessary links during iteration - Selenium Python

I am a newbie to Selenium with Python. I am trying to fetch the profile URLs, of which there are 10 per page. Without using a while loop, I am able to fetch all 10 URLs, but only for the first page. When I use while, it iterates through the pages but fetches only 3 or 4 URLs per page.
I need to fetch all 10 links and keep iterating through the pages. I think I must do something about StaleElementReferenceException.
Kindly help me solve this problem.
The code is given below.
def test_connect_fetch_profiles(self):
    driver = self.driver
    search_data = driver.find_element_by_id("main-search-box")
    search_data.clear()
    search_data.send_keys("Selenium Python")
    search_submit = driver.find_element_by_name("search")
    search_submit.click()
    noprofile = driver.find_elements_by_xpath("//*[text() = 'Sorry, no results containing all your search terms were found.']")
    self.assertFalse(noprofile)
    while True:
        wait = WebDriverWait(driver, 150)
        try:
            profile_links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[contains(@href,'www.linkedin.com/profile/view?id=')][text()='LinkedIn Member' or contains(@href,'Type=NAME_SEARCH')][contains(@class,'main-headline')]")))
            for each_link in profile_links:
                page_links = each_link.get_attribute('href')
                print(page_links)
                driver.implicitly_wait(15)
                appendFile = open("C:\\Users\\jayaramb\\Documents\\profile-links.csv", 'a')
                appendFile.write(page_links + "\n")
                appendFile.close()
                driver.implicitly_wait(15)
            next = wait.until(EC.visibility_of(driver.find_element_by_partial_link_text("Next")))
            if next.is_displayed():
                next.click()
            else:
                print("End of Page")
                break
        except ValueError:
            print("It seems there are no values to fetch")
        except NoSuchElementException:
            print("No Elements to Fetch")
        except StaleElementReferenceException:
            print("No Change in Element Location")
        else:
            break
Please let me know if there are any other effective ways to fetch the required profile URLs and keep iterating through the pages.

I created a similar setup which works all right for me. I've had some problems with Selenium trying to click the next button and throwing a WebDriverException instead, likely because the next button is not in view. Hence, instead of clicking the next button, I get its href attribute and load the new page with driver.get(), thus avoiding an actual click and making the test more stable.
def test_fetch_google_links():
    links = []
    # Setup driver
    driver = webdriver.Firefox()
    driver.implicitly_wait(10)
    driver.maximize_window()
    # Visit google
    driver.get("https://www.google.com")
    # Enter search query
    search_data = driver.find_element_by_name("q")
    search_data.send_keys("test")
    # Submit search query
    search_button = driver.find_element_by_xpath("//button[@type='submit']")
    search_button.click()
    while True:
        # Find and collect all anchors
        anchors = driver.find_elements_by_xpath("//h3//a")
        links += [a.get_attribute("href") for a in anchors]
        try:
            # Find the next page button
            next_button = driver.find_element_by_xpath("//a[@id='pnnext']")
            location = next_button.get_attribute("href")
            driver.get(location)
        except NoSuchElementException:
            break
    # Do something with the links
    for l in links:
        print(l)
    print("Found {} links".format(len(links)))
    driver.quit()
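If you also want the links in a CSV file, as in the original code, one option is to write them out once after the loop rather than reopening the file for every link. A minimal sketch, reusing the file path from the question:

# Write all collected links to the CSV once, instead of opening the file per link.
with open("C:\\Users\\jayaramb\\Documents\\profile-links.csv", "a") as append_file:
    for link in links:
        append_file.write(link + "\n")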

Related

Web scraping with Selenium: multiple pages and product issue

I am scraping with Selenium but am not able to get the hrefs of all 25 pages and all 626 listed products. I want to collect the href of every product, plus multiple features from each product, across all 25 pages.
While extracting the page hrefs, it only gives pages 1 to 7 and then jumps directly to 25, so I am not able to get the links of all 25 pages and the products listed there.
I then click on each product link and store the hrefs of all the products found on the pages.
import selenium
import pandas as pd
from selenium import webdriver
import getpass, time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException,StaleElementReferenceException
#First we will connect to webdriver
driver=webdriver.Chrome(r'/Users/ankit/chromedriver')
#Open the webpage with webdriver
driver.get('https://www.getapp.com/hr-employee-management-software/human-resources/')
URL2 = []  # for product pages
URL = []   # for storing all the pages
URL3 = []  # for storing all video links
for i in range(1, 28):
    URL.append(
        f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/")
# visiting all the pages and scraping the products/Read More About... links
for p in URL:
    driver.get(p)
    for i in driver.find_elements_by_xpath(
            '//a[@data-testid="listing-item_text-link_read-more-about-product"]'):
        URL2.append(i.get_attribute("href"))
for i in URL2:
    try:
        wait = WebDriverWait(
            driver, 5
        )  # time to wait for an element to be found or accessible [wait variable used below]
        driver.get(i)  # going through each page
        elements = driver.find_elements_by_xpath("//img[contains(@src,'ytimg')]")
        for element in elements[0:1]:
            while True:  # make the video clickable by pressing the right arrow
                try:
                    element.click()
                    break
                except Exception as e:
                    elemt = wait.until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//button[@data-evac="slide-to_right"]/div')
                        )
                    )
                    elemt.click()
                    time.sleep(0.7)
            driver.implicitly_wait(3)
            try:
                URL3.append(
                    driver.find_element_by_xpath(
                        '//iframe[contains(@id,"yt-player")]'
                    ).get_attribute("src")
                )  # collecting and adding it up
            except NoSuchElementException:
                URL3.append('--')
            elemt = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//div[@title="Close"]'))
            )
            elemt.click()  # finally closing
    except Exception as e:
        print("failed", e, i)
# we will open the 1st product link to get all the necessary paths.
click = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div[2]/div[2]/a/p").click()
NAME = []
OVERVIEW = []
Image_url1 = []
Image_url2 = []
Image_url3 = []
Image_url4 = []
Image_url5 = []
# extracting and storing the features of the product
FEATURE1 = []
FEATURE2 = []
FEATURE3 = []
FEATURE4 = []
FEATURE5 = []
PRICING = []
for i in URL2:
    driver.get(i)
    try:
        name = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/h2/span")
        NAME.append(name.text.replace('product overview', '-'))
    except NoSuchElementException:
        NAME.append('--')
    try:
        overview = driver.find_element_by_xpath('//*[@id="__next"]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[2]/p')
        OVERVIEW.append(overview.text)
    except NoSuchElementException:
        OVERVIEW.append('--')
    try:
        i = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[1]/img")
        Image_url1.append(i.get_attribute("src"))
    except NoSuchElementException:
        Image_url1.append('--')
    try:
        i = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[1]/img")
        Image_url2.append(i.get_attribute("src"))
    except NoSuchElementException:
        Image_url2.append('--')
    try:
        i = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[2]/img")
        Image_url3.append(i.get_attribute("src"))
    except NoSuchElementException:
        Image_url3.append('--')
    try:
        i = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[3]/img")
        Image_url4.append(i.get_attribute("src"))
    except NoSuchElementException:
        Image_url4.append('--')
    try:
        i = driver.find_element_by_tag_name("img")
        Image_url5.append(i.get_attribute("src"))
    except NoSuchElementException:
        Image_url5.append('--')
    try:
        feature1 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[1]/div")
        FEATURE1.append(feature1.text)
    except NoSuchElementException:
        FEATURE1.append('--')
    try:
        feature2 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div")
        FEATURE2.append(feature2.text)
    except NoSuchElementException:
        FEATURE2.append('--')
    try:
        feature3 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[3]/div")
        FEATURE3.append(feature3.text)
    except NoSuchElementException:
        FEATURE3.append('--')
    try:
        feature4 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[4]/div")
        FEATURE4.append(feature4.text)
    except NoSuchElementException:
        FEATURE4.append('--')
    try:
        feature5 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[2]/div[1]/div")
        FEATURE5.append(feature5.text)  # fixed: was appending feature4.text
    except NoSuchElementException:
        FEATURE5.append('--')
    try:
        Pricing = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[1]/div/div[1]/div[2]/div[1]/div/p[1]")
        PRICING.append(Pricing.text)
    except NoSuchElementException:
        PRICING.append('--')
You are not getting all the pages because the pagination is loaded dynamically on the website. You would need to click through the pagination to load the other pages (and the hrefs/links of those pages).
A smarter way, though, is to construct the URLs manually rather than scraping them, because they all follow the same pattern,
like this:
URL = []
for i in range(1, 27):
    URL.append(f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/")
I understood that your next goal is to click on the Read More About... links. But here is where your approach becomes inefficient: after entering the first page, you immediately clicked on Read More About....
Instead, scrape all the Read More About... links PER PAGE. Then visit these scraped links one by one for the features.
Here is my complete approach:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

URL2 = []  # for product pages
URL = []   # for storing all the pages
for i in range(1, 27):
    URL.append(
        f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/"
    )
# visiting all the pages and scraping the products/Read More About... links
for p in URL:
    driver.get(p)
    for i in driver.find_elements_by_xpath(
            '//a[@data-testid="listing-item_text-link_read-more-about-product"]'):
        URL2.append(i.get_attribute("href"))
# then collect the features by visiting the URL2 list
It seems the videos are at the end of the preview section, and the links to the videos are not directly visible. They only become available once the videos are clicked, because they are embedded.
To achieve our goal, we can take these steps:
Make the videos properly visible for clicking.
Click on the videos (some products have multiple).
Extract the links from the iframe.
Close the video preview panel (because, for products with multiple videos, the next video needs to be properly visible before it can be clicked).
Code for this approach (steps explained in comments):
for ul in URL2:
    try:
        wait = WebDriverWait(
            driver, 5
        )  # time to wait for an element to be found or accessible [wait variable used below]
        driver.get(ul)  # going through each page
        elements = driver.find_elements_by_xpath("//img[contains(@src,'ytimg')]")
        for element in elements[0:1]:  # use a limit here for the number of video links
            while True:  # make the video clickable by pressing the right arrow
                try:
                    element.click()
                    break
                except Exception as e:
                    elemt = wait.until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//button[@data-evac="slide-to_right"]/div')
                        )
                    )
                    elemt.click()
                    time.sleep(0.7)
            driver.implicitly_wait(10)
            URL3.append(
                driver.find_element_by_xpath(
                    '//iframe[contains(@id,"yt-player")]'
                ).get_attribute("src")
            )  # collecting and adding it up
            elemt = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//div[@title="Close"]'))
            )
            elemt.click()  # finally closing
    except Exception as e:
        print("failed", e, ul)
NOTE: In the case of an iframe, Selenium normally requires switching to the iframe or handling it in a different way. But luckily for you, the video links are available outside the iframe.
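For reference, if the src had only been readable from inside the iframe, a minimal sketch of switching into the frame and back out (reusing the iframe XPath from the code above) would be:

# Switch into the embedded player iframe, read what you need inside it,
# then switch back to the main document so later lookups still work.
frame = driver.find_element_by_xpath('//iframe[contains(@id,"yt-player")]')
driver.switch_to.frame(frame)
# ... locate elements that live inside the iframe here ...
driver.switch_to.default_content()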

Handling website errors with Selenium in Python

I am scraping a website with Selenium and send an alert if something specific happens. Generally, my code works fine, but sometimes the website doesn't load the elements, or the website shows an error message like: "Sorry, something went wrong! Please refresh the page and try again!" In both cases, my script waits for the elements to load, but they never do, and then my program doesn't do anything. I usually use requests and BeautifulSoup for web scraping, so I am not that familiar with Selenium, and I am not sure how to handle these errors, because my code doesn't raise an error and just keeps waiting for the elements to load, which will likely never happen. If I manually refresh the page, the program continues to work. My idea would be something like: if it takes more than 10 seconds to load, refresh the page and try again.
My code looks somewhat like this:
def get_data():
    data_list = []
    while len(data_list) < 3:
        try:
            data = driver.find_elements_by_class_name('text-color-main-secondary.text-sm.font-bold.text-left')
            count = len(data)
            data_list.append(data)
            driver.implicitly_wait(2)
            time.sleep(.05)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,
                'text-color-main-secondary.text-sm.font-bold.text-left'.format(str(count + 1)))))
        except TimeoutException:
            break
    text = []
    elements = []
    for i in range(len(data_list)):
        for j in range(len(data_list[i])):
            t = data_list[i][j].text
            elements.append(data_list[i][j])
            for word in t.split():
                if '#' in word:
                    text.append(word)
    return text, elements

option = webdriver.ChromeOptions()
option.add_extension('')
path = ''
driver = webdriver.Chrome(executable_path=path, options=option)
driver.get('')
login(passphrase)
driver.switch_to.window(driver.window_handles[0])
while True:
    try:
        infos, elements = get_data()
        data, message = check_data(infos, elements)
        if data:
            send_alert(message)
        time.sleep(600)
        driver.refresh()
    except Exception as e:
        exception_type, exception_object, exception_traceback = sys.exc_info()
        line_number = exception_traceback.tb_lineno
        print("an exception occurred - {}".format(e) + " in line: " + str(line_number))
option = webdriver.ChromeOptions()
option.add_extension('')
path = ''
driver = webdriver.Chrome(executable_path=path, options=option)
driver.get('')
login(passphrase)
driver.switch_to.window(driver.window_handles[0])
while True:
try:
infos, elements = get_data()
data, message = check_data(infos, elements)
if data:
send_alert(message)
time.sleep(600)
driver.refresh()
except Exception as e:
exception_type, exception_object, exception_traceback = sys.exc_info()
line_number = exception_traceback.tb_lineno
print("an exception occured - {}".format(e) + " in line: " + str(line_number))
You can use try and except to overcome this problem. First, locate the element with a 10-second wait; if the element is not present, refresh the page. Here is a basic version of the code:
try:
    # wait up to 10 s for the element to load; if it does not, control jumps to the except block
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'text-color-main-secondary.text-sm.font-bold.text-left'.format(str(count + 1)))))
except TimeoutException:
    driver.refresh()
    # locate the element here again
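Building on that, a small retry helper matching the question's "if it takes more than 10 seconds to load, refresh and try again" idea could look like this (a sketch; the function name, the locator argument, and the retry count are assumptions, not part of the original code):

from selenium.common.exceptions import TimeoutException

def wait_or_refresh(driver, locator, timeout=10, retries=3):
    # Wait for the element; on timeout, refresh the page and try again,
    # up to `retries` times, before giving up.
    for _ in range(retries):
        try:
            return WebDriverWait(driver, timeout).until(
                EC.visibility_of_element_located(locator))
        except TimeoutException:
            driver.refresh()
    raise TimeoutException("Element did not appear after {} refreshes".format(retries))

It would be called with the locator already used above, e.g. wait_or_refresh(driver, (By.CLASS_NAME, 'text-color-main-secondary.text-sm.font-bold.text-left')).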

ElementClickInterceptedException: element click intercepted. Other element would receive the click: Selenium Python

I have seen other questions about this error, but in my case the other element should receive the click. In detail: the webdriver is scrolling through a Google search and it must click every website it finds, but the program is preventing that. How can I make it NOT revisit the previous site it clicked?
This is the function. The program loops over it; after the first loop it scrolls down and the error occurs:
def get_info():
    browser.switch_to.window(browser.window_handles[2])
    description = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "h3"))
    ).text
    site = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "cite"))
    )
    site.click()
    url = browser.current_url
    # removes the https:// and the / of the url
    # to get just the domain of the website
    try:
        link = url.split("https://")
        link1 = link[1].split("/")
        link2 = link1[0]
        link3 = link2.split("www.")
        real_link = link3[1]
    except IndexError:
        link = url.split("https://")
        link1 = link[1].split("/")
        real_link = link1[0]
    time.sleep(3)
    screenshot = browser.save_screenshot("photos/" + "(" + real_link + ")" + ".png")
    global content
    content = []
    content.append(real_link)
    content.append(description)
    print(content)
    browser.back()
    time.sleep(5)
    browser.execute_script("window.scrollBy(0,400)", "")
    time.sleep(5)
You can create a list of clicked websites and check each time whether that link has already been clicked. Here's the demo code:
clicked_website = []
url = browser.current_url
clicked_website.append(url)
# Now while clicking
if <new_url> not in clicked_website:
    <>.click()
This is just an idea of how to implement it. Your code is messy and I couldn't follow it clearly, so adapt this into your code yourself.
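A slightly fuller sketch of the same idea applied to get_info(): keep a set of domains already processed and skip the rest of the function when the domain has been seen before (the set name and the early return are assumptions, not the asker's code):

visited = set()  # real_link values (domains) that have already been processed

# inside get_info(), right after real_link is computed:
if real_link in visited:
    browser.back()   # this site was already clicked; go back and skip it
    return
visited.add(real_link)
# ...then continue with the screenshot and content code as before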

I need to click unordered links in the below URL using Selenium and Python

In the URL below, I need to click the message icon links which contain 'svg' tags inside them.
https://www.sciencedirect.com/science/article/pii/S0898656817301687
For that I am using the code below:
lenoftags = driver.find_elements_by_xpath('//a[@class="author size-m workspace-trigger"]//*[local-name()="svg"]')
tagcount = len(lenoftags)
newcount = range(1, tagcount)
if len(lenoftags) == 0:
    driver.back()
elif len(lenoftags) >= 1:
    for jj in newcount:
        try:
            driver.find_element_by_xpath('//a[@class="author size-m workspace-trigger"][%d]//*[local-name()="svg"]' % jj).click()
        except (NoSuchElementException, TimeoutException, WebDriverException):
            try:
                driver.find_element_by_xpath('//a[@class="author size-m workspace-trigger"]//*[local-name()="svg"]').click()
            except (NoSuchElementException, TimeoutException, WebDriverException):
                continue
            driver.back()
        driver.back()
else:
    driver.back()
The code works when the links are in order, but on the above URL it clicks only the first link.
Can anyone please resolve this?
You can avoid implementing the extra logic. Try this instead:
tags = driver.find_elements_by_xpath('//a[contains(@class, "author")]//*[local-name()="svg"]')
if tags:
    # Links found
    for tag in tags:
        tag.click()
else:
    # Links not found
    # Do something else
    pass
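If clicking the first icon changes the page (for example by opening a panel), the remaining references can go stale or be covered, which matches the error in the question. A hedged variant that re-finds each icon on every pass and waits until it is clickable (the indexed XPath is an assumption about the page structure, not something confirmed by the question):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

count = len(driver.find_elements_by_xpath('//a[contains(@class, "author")]//*[local-name()="svg"]'))
for idx in range(1, count + 1):
    # Re-locate the icon each time so the reference is never stale,
    # and wait until it is actually clickable before clicking.
    icon = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(
            (By.XPATH, '(//a[contains(@class, "author")]//*[local-name()="svg"])[%d]' % idx)))
    icon.click()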

Web scraping a contact list with Selenium in Python

How can I loop through the contacts in a group in Discord using Selenium in Python?
I tried the code below, and I get this error:
selenium.common.exceptions.StaleElementReferenceException: Message: The element reference of is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed
The problem is that the scroller and the contacts are constantly updating...
This is the code I tried:
while True:
    num = 0
    try:
        users_list = driver.find_elements_by_css_selector("div.memberOnline-1CIh-0.member-3W1lQa")
        for user in users_list:
            num += 1
            user.click()
            driver.execute_script("arguments[0].scrollIntoView();", user)
            print('User number {}'.format(num))
    except StaleElementReferenceException and ElementClickInterceptedException:
        print('bad')
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", users_list)
From your given code, you only scroll the element, so the reason for the stale exception is that you do not wait for the page to load completely, or at least do not wait for the contacts to finish loading.
For debugging purposes, you can simply add a long sleep before the loop, like sleep(15), and replace it with an explicit wait in production code, like:
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "myDynamicElement"))
)
Details of explicit waits are documented here.
If you call click() in the loop, you need to find the elements again inside the loop:
while True:
    num = 0
    try:
        time.sleep(15)
        users_list = driver.find_elements_by_css_selector(
            "div.memberOnline-1CIh-0.member-3W1lQa")
        length = len(users_list)
        for num in range(0, length):
            user = users_list[num]
            user.click()
            time.sleep(15)
            driver.execute_script("arguments[0].scrollIntoView();", user)
            print('User number {}'.format(num + 1))
            # because the above `click` makes the page change,
            # selenium will treat it as a new page;
            # element references found on the `old` page cannot be used on the `new` page,
            # so you need to find the elements that belonged to the `old` page again on the `new` page.
            # find users_list again from the `new` page
            users_list = driver.find_elements_by_css_selector(
                "div.memberOnline-1CIh-0.member-3W1lQa")
    except (StaleElementReferenceException, ElementClickInterceptedException):  # catch either exception
        print('bad')
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight",
                              users_list)
