I am scraping with Selenium but I am not able to get the hrefs of all 25 pages and all 626 products listed on them. I want to collect the href of every product and then scrape multiple features from each product page, for all products across the 25 pages.
But while extracting the page hrefs it only gives me pages 1 to 7 and then jumps directly to 25, so I never get all 25 page links or the products listed on them.
I then open each product link and store the hrefs of all the products found on those pages.
import selenium
import pandas as pd
from selenium import webdriver
import getpass, time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException,StaleElementReferenceException
#First we will connect to webdriver
driver=webdriver.Chrome(r'/Users/ankit/chromedriver')
#Open the webpage with webdriver
driver.get('https://www.getapp.com/hr-employee-management-software/human-resources/')
URL2 = [] # for product pages
URL = [] # for storing all the pages
URL3=[] # for storing all video links
for i in range(1, 28):
    URL.append(
        f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/")

# visiting all the pages and scraping the products / "Read More About..." links
for p in URL:
    driver.get(p)
    for i in driver.find_elements_by_xpath(
        '//a[@data-testid="listing-item_text-link_read-more-about-product"]'
    ):
        URL2.append(i.get_attribute("href"))
for i in URL2:
    try:
        wait = WebDriverWait(
            driver, 5
        )  # time waiting for the element to be found or clickable [wait variable used below]
        driver.get(i)  # going through each page
        elements = driver.find_elements_by_xpath("//img[contains(@src,'ytimg')]")
        for element in elements[0:1]:
            while True:  # making videos properly available for clicking via the right arrow
                try:
                    element.click()
                    break
                except Exception as e:
                    elemt = wait.until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//button[@data-evac="slide-to_right"]/div')
                        )
                    )
                    elemt.click()
                    time.sleep(0.7)
            driver.implicitly_wait(3)
            try:
                URL3.append(
                    driver.find_element_by_xpath(
                        '//iframe[contains(@id,"yt-player")]'
                    ).get_attribute("src")
                )  # collecting and adding it up
            except NoSuchElementException:
                URL3.append('--')
            elemt = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//div[@title="Close"]'))
            )
            elemt.click()  # finally closing
    except Exception as e:
        print("failed", e, i)
#we will open 1st product link to get all the necessary paths.
click=driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div[2]/div[2]/a/p").click()
NAME=[]
OVERVIEW=[]
Image_url1=[]
Image_url2=[]
Image_url3=[]
Image_url4=[]
Image_url5=[]
#extracting and storing the Features of the product
FEATURE1=[]
FEATURE2=[]
FEATURE3=[]
FEATURE4=[]
FEATURE5=[]
PRICING=[]
for i in URL2:
    driver.get(i)
    try:
        name = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/h2/span")
        NAME.append(name.text.replace('product overview', '-'))
    except NoSuchElementException:
        NAME.append('--')
    try:
        overview = driver.find_element_by_xpath('//*[@id="__next"]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[2]/p')
        OVERVIEW.append(overview.text)
    except NoSuchElementException:
        OVERVIEW.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[1]/img")
        Image_url1.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url1.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[1]/img")
        Image_url2.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url2.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[2]/img")
        Image_url3.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url3.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[3]/img")
        Image_url4.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url4.append('--')
    try:
        img = driver.find_element_by_tag_name("img")
        Image_url5.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url5.append('--')
    try:
        feature1 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[1]/div")
        FEATURE1.append(feature1.text)
    except NoSuchElementException:
        FEATURE1.append('--')
    try:
        feature2 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div")
        FEATURE2.append(feature2.text)
    except NoSuchElementException:
        FEATURE2.append('--')
    try:
        feature3 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[3]/div")
        FEATURE3.append(feature3.text)
    except NoSuchElementException:
        FEATURE3.append('--')
    try:
        feature4 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[4]/div")
        FEATURE4.append(feature4.text)
    except NoSuchElementException:
        FEATURE4.append('--')
    try:
        feature5 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[2]/div[1]/div")
        FEATURE5.append(feature5.text)
    except NoSuchElementException:
        FEATURE5.append('--')
    try:
        Pricing = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[1]/div/div[1]/div[2]/div[1]/div/p[1]")
        PRICING.append(Pricing.text)
    except NoSuchElementException:
        PRICING.append('--')
You are not getting all the pages because the pagination is dynamically loaded on the website. You would need to click through the pagination to load the other pages (and the href/link of those pages).
But a smarter way is to build the URLs manually rather than scraping them, because they all follow the same pattern.
Like this:
URL = []
for i in range(1, 27):
    URL.append(f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/")
I understand that your next goal is to click on Read More About.... But here is where your approach is wrong/inefficient: after entering the first page, you immediately click on Read More About....
Instead, scrape all the Read More About... links per page first. Then visit these scraped links one by one for the features.
Here is my complete approach:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

URL2 = []  # for product pages
URL = []   # for storing all the pages

for i in range(1, 27):
    URL.append(
        f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/"
    )

# visiting all the pages and scraping the products / "Read More About..." links
for p in URL:
    driver.get(p)
    for i in driver.find_elements_by_xpath(
        '//a[@data-testid="listing-item_text-link_read-more-about-product"]'
    ):
        URL2.append(i.get_attribute("href"))

# then collect the features by visiting the URL2 list
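For the feature collection itself, a small helper that falls back to a placeholder when an element is missing keeps the loop short. This is only a sketch: the XPaths are copied from the question's code and are assumptions about the page layout, so adjust them as needed (absolute paths break easily when the markup changes).

from selenium.common.exceptions import NoSuchElementException

def safe_text(driver, xpath, default='--'):
    # return the text of the first matching element, or the default if it is missing
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default

NAME = []
FEATURE1 = []
for url in URL2:
    driver.get(url)
    NAME.append(safe_text(driver, "/html/body/div[1]/div[2]/div[2]/section[1]/h2/span"))
    FEATURE1.append(safe_text(driver, "/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[1]/div"))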
It seems like the videos are at the end of the preview section, and the links for the videos are not directly visible. They only become available once clicked, because they are embedded.
To achieve our goal, we can take these steps:
Make the videos properly visible for clicking.
Click on the videos (some products have multiple).
Extract the link from the iframe.
Close the video preview panel (because products with multiple videos need the next video to be properly visible before it can be clicked).
Code for this approach (steps explained with comments):
for ul in URL2:
    try:
        wait = WebDriverWait(
            driver, 5
        )  # time waiting for the element to be found or clickable [wait variable used below]
        driver.get(ul)  # going through each page
        elements = driver.find_elements_by_xpath("//img[contains(@src,'ytimg')]")
        for element in elements[0:1]:  # use the slice here to limit the number of video links
            while True:  # making videos properly available for clicking via the right arrow
                try:
                    element.click()
                    break
                except Exception as e:
                    elemt = wait.until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//button[@data-evac="slide-to_right"]/div')
                        )
                    )
                    elemt.click()
                    time.sleep(0.7)
            driver.implicitly_wait(10)
            URL3.append(
                driver.find_element_by_xpath(
                    '//iframe[contains(@id,"yt-player")]'
                ).get_attribute("src")
            )  # collecting and adding it up
            elemt = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//div[@title="Close"]'))
            )
            elemt.click()  # finally closing
    except Exception as e:
        print("failed", e, ul)
NOTE: In the case of an iframe, Selenium needs to switch into the iframe (or handle it in some other way). But luckily for you the video links are available outside the iframe.
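If the src had only been reachable inside the iframe, this is roughly how you would switch into it and back. A minimal sketch: the iframe locator is taken from the code above, and the tag lookup inside the frame is just an illustration.

frame = driver.find_element_by_xpath('//iframe[contains(@id,"yt-player")]')
driver.switch_to.frame(frame)                       # enter the iframe
inner = driver.find_elements_by_tag_name("video")   # locate elements that live inside the frame
driver.switch_to.default_content()                  # switch back before touching the outer page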
Related
I noticed that the beatmap packages available officially in osu! are 98% songs I don't care to play. Same with the unofficial mega packs you can find, which have 20 GB of songs on a per-year basis (2011, 2012, 2013, etc.).
I did find that the "most favourites" page in osu!: https://osu.ppy.sh/beatmapsets?sort=favourites_desc has a good chunk of songs that I like or would play.
So I tried to create a Python script that clicks the download button on every beatmap panel.
I learned a lot during this process: ActionChains move_to_element (hover menus), waiting until clickable, StaleElementReferenceException, and scrolling the page with execute_script.
I kept having a hard time with elements disappearing from the page/DOM, so to make a "for element in elements" loop work properly I decided to have the script scroll multiple times to load more beatmaps and then scrape for href links with the word "Download" in them (see the sketch after the link sample below). This worked great for capturing most of the links; I captured at least 3000 unique links.
I put it in a text file and it looks like this:
...
https://osu.ppy.sh/beatmapsets/1457867/download
https://osu.ppy.sh/beatmapsets/881996/download
https://osu.ppy.sh/beatmapsets/779173/download
https://osu.ppy.sh/beatmapsets/10112/download
https://osu.ppy.sh/beatmapsets/996628/download
https://osu.ppy.sh/beatmapsets/415886/download
https://osu.ppy.sh/beatmapsets/490662/download
...
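A minimal sketch of that scroll-then-collect approach, not my exact script: the CSS selector, scroll count, and sleep interval are assumptions and may need tuning for the real page.

import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://osu.ppy.sh/beatmapsets?sort=favourites_desc")

links = set()
for _ in range(30):  # scroll a fixed number of times to trigger lazy loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the newly loaded panels time to render
    for a in driver.find_elements_by_css_selector("a[href*='download']"):
        links.add(a.get_attribute("href"))

with open("beatmap_links.txt", "w") as f:
    f.write("\n".join(sorted(links)))

driver.quit()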
The "Download" button on each panel all have this HREF link. If you click the button you download the beatmap file which is a .osz filetype. However, if you "right-click -> copy-link" from the "Download" button and you open it from a new-page or new-tab it will re-direct to the beatmaps page and not download the file.
I make it work by using the Pandas module to read a .xlxs excel file for URLs and loop for each url. Once the url page is opened it clicks the Download button:
def read_excel():
    import pandas as pd
    df = pd.read_excel('book.xlsx')  # get all the urls from the Excel file
    mylist = df['urls'].tolist()     # 'urls' is the column name
    print(mylist)                    # will print all the urls
    # now loop through each url & perform actions.
    for url in mylist:
        options = webdriver.ChromeOptions()
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        options.add_argument("user-data-dir=C:\\Users\\%UserName%\\AppData\\Local\\Google\\Chrome\\User Data\\Profile1")
        driver = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
        driver.get(url)
        try:
            WebDriverWait(driver, 3).until(EC.alert_is_present(), 'Timed out waiting for alert.')
            alert = driver.switch_to.alert
            alert.accept()
            print("alert accepted")
        except TimeoutException:
            print("no alert")
        time.sleep(1)
        wait = WebDriverWait(driver, 10)
        try:
            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.osu-layout__section.osu-layout__section--full.js-content.beatmaps_show > div > div > div:nth-child(2) > div.beatmapset-header > div > div.beatmapset-header__box.beatmapset-header__box--main > div.beatmapset-header__buttons > a:nth-child(2) > span"))).click()
            time.sleep(1)
        except Exception:
            print("Can't find the Element Download")
        time.sleep(10)
        download_file()
        driver.close()
This is a sequential, "one at a time" function: the download_file() function is a loop that checks the download folder to see if there is a file still being downloaded; if not, it moves on to the next URL.
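For reference, a rough sketch of what such a download_file() helper could look like. This is an assumption, not my actual code: it guesses the default Downloads folder and relies on Chrome's .crdownload temporary files.

import glob
import os
import time

def download_file(download_dir=os.path.expanduser("~/Downloads"), timeout=60):
    # Wait until Chrome has no partially downloaded (.crdownload) files left
    # in the download folder, or give up after `timeout` seconds.
    for _ in range(timeout):
        if not glob.glob(os.path.join(download_dir, "*.crdownload")):
            return True
        time.sleep(1)
    return False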
This works. Of course the website has limitations: you can only download a maximum of 8 at a time, and after 100 to 200 downloads you can't download any more and have to wait a bit. But the loop keeps going and tries each URL unless you stop the script. Luckily you can see the last beatmap that was downloaded, find where it is in the Excel spreadsheet, remove the rows above it, and start the script again. I'm sure I can code it so it stops the loop when no new file shows up in the Download folder.
Finally, the question: is there a way to open these download links and download the file without having to click the "Download" button after opening the page? It redirects to the beatmap page instead of downloading the file automatically. It must be some JavaScript/HTML behavior I don't know about.
I use the Python package selenium to click the "load more" button automatically, which succeeds. But why can't I get the data after "load more"?
I want to crawl reviews from IMDb using Python. It only displays 25 reviews until I click the "load more" button. I use the Python package selenium to click the "load more" button automatically, which succeeds. But why can't I get the data after "load more", and why do I just get the first 25 reviews repeatedly?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
seed = 'https://www.imdb.com/title/tt4209788/reviews'
movie_review = requests.get(seed)
PATIENCE_TIME = 60
LOAD_MORE_BUTTON_XPATH = '//*[@id="browse-itemsprimary"]/li[2]/button/span/span[2]'
driver = webdriver.Chrome('D:/chromedriver_win32/chromedriver.exe')
driver.get(seed)
while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='ipl-load-more__button']")
        review_soup = BeautifulSoup(movie_review.text, 'html.parser')
        review_containers = review_soup.find_all('div', class_='imdb-user-review')
        print('length: ', len(review_containers))
        for review_container in review_containers:
            review_title = review_container.find('a', class_='title').text
            print(review_title)
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break
print("Complete")
I want all the reviews, but now I can only get the first 25.
You have several issues in your script. Hardcoded waits are very inconsistent and certainly the worst option to rely on. The way you have written your scraping logic within the while True: loop will slow down the parsing process by collecting the same items over and over again. Moreover, every title produces a huge line gap in the output, which needs to be stripped. I've slightly changed your script to reflect the suggestions above.
Try this to get the required output:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

URL = "https://www.imdb.com/title/tt4209788/reviews"

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(URL)
soup = BeautifulSoup(driver.page_source, 'lxml')

while True:
    try:
        driver.find_element_by_css_selector("button#load-more-trigger").click()
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
        soup = BeautifulSoup(driver.page_source, 'lxml')
    except Exception:
        break

for elem in soup.find_all(class_='imdb-user-review'):
    name = elem.find(class_='title').get_text(strip=True)
    print(name)

driver.quit()
Your code is fine. Great even. But, you never fetch the 'updated' HTML for the web page after hitting the 'Load More' button. That's why you are getting the same 25 reviews listed all the time.
When you use Selenium to control the web browser, you are clicking the 'Load More' button. This creates an XHR request (or more commonly called AJAX request) that you can see in the 'Network' tab of your web browser's developer tools.
The bottom line is that JavaScript (which is run in the web browser) updates the page. But in your Python program, you only get the HTML once for the page statically using the Requests library.
seed = 'https://www.imdb.com/title/tt4209788/reviews'
movie_review = requests.get(seed)  # <-- SEE HERE? This is always the same HTML. You fetched it once at the beginning.
PATIENCE_TIME = 60
To fix this problem, you need to use Selenium to get the innerHTML of the div box containing the reviews. Then, have BeautifulSoup parse the HTML again. We want to avoid picking up the entire page's HTML again and again because it takes computation resources to have to parse that updated HTML over and over again.
So, find the div on the page that contains the reviews, and parse it again with BeautifulSoup. Something like this should work:
while True:
    try:
        allReviewsDiv = driver.find_element_by_xpath("//div[@class='lister-list']")
        allReviewsHTML = allReviewsDiv.get_attribute('innerHTML')
        loadMoreButton = driver.find_element_by_xpath("//button[@class='ipl-load-more__button']")
        review_soup = BeautifulSoup(allReviewsHTML, 'html.parser')
        review_containers = review_soup.find_all('div', class_='imdb-user-review')
        print('length: ', len(review_containers))
        for review_container in review_containers:
            review_title = review_container.find('a', class_='title').text
            print(review_title)
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break
I've written a script in Python using Selenium to scrape the name and price of different products from the Redmart website. My scraper clicks on a link, goes to its target page, and parses data from there. However, the issue I'm facing with this crawler is that it scrapes very few items from a page because of the webpage's lazy-loading behaviour. How can I get all the data from each page by controlling the lazy-loading process? I tried the "execute_script" method but I did it wrongly. Here is the script I'm trying with:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://redmart.com/bakery")
wait = WebDriverWait(driver, 10)
counter = 0
while True:
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "li.image-facets-pill")))
        driver.find_elements_by_css_selector('img.image-facets-pill-image')[counter].click()
        counter += 1
    except IndexError:
        break

    # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    for elems in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.productPreview"))):
        name = elems.find_element_by_css_selector('h4[title] a').text
        price = elems.find_element_by_css_selector('span[class^="ProductPrice__"]').text
        print(name, price)

    driver.back()

driver.quit()
I guess you could use Selenium for this, but if speed is your concern, then, as @Andersson crafted the code for you in another question on Stack Overflow, you should instead replicate the API calls that the site uses and extract the data from the JSON, like the site itself does.
If you use the Chrome inspector you'll see that, for each of the categories in your outer while loop (the try block in your original code), the site calls an API that returns the overall categories of the site. All this data can be retrieved like so:
import requests

categories_api = 'https://api.redmart.com/v1.5.8/catalog/search?extent=0&depth=1'
r = requests.get(categories_api).json()
For the next API calls you need to grab the uris concerning the bakery stuff. This can be done like so:
bakery_item = [e for e in r['categories'] if e['title'] == 'Bakery']
children = bakery_item[0]['children']
uris = [c['uri'] for c in children]
Uris will now be a list of strings (['bakery-bread', 'breakfast-treats-212', 'sliced-bread-212', 'wraps-pita-indian-breads', 'rolls-buns-212', 'baked-goods-desserts', 'loaves-artisanal-breads-212', 'frozen-part-bake', 'long-life-bread-toast', 'speciality-212']) that you'll pass on to another API found by Chrome Inspector, and that the site uses to load content.
This API has the following form (default returns a smaller pageSize but I bumped it to 500 to be somewhat sure you get all data in one request):
items_API = 'https://api.redmart.com/v1.5.8/catalog/search?pageSize=500&sort=1024&category={}'
for uri in uris:
    r = requests.get(items_API.format(uri)).json()
    products = r['products']
    for product in products:
        name = product['title']
        # testing for promo_price - if it's 0.0 go with the normal price
        price = product['pricing']['promo_price']
        if price == 0.0:
            price = product['pricing']['price']
        print("Name: {}. Price: {}".format(name, price))
Edit: If you still want to stick with Selenium, you could insert something like this to handle the lazy loading. Questions on scrolling have been answered several times before, so yours is actually a duplicate. In the future you should show what you tried (your own effort on the execute_script part) and include the traceback.
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    height = driver.execute_script("return document.body.scrollHeight;")
    if height == check_height:
        break
    check_height = height
Ever since the API was deprecated, it has been very hard to retrieve the Google image search URL using Selenium. I've scoured Stack Overflow, but most of the answers to this question are from years ago when scraping search engines was simpler.
I'm looking for a way to return the URL of the first image in a Google search query. I've tried everything in Selenium, from clicks, to retrieving the innerHTML of elements, to my most recent attempt: using ActionChains to navigate to the URL of the picture and then return the current URL.
def GoogleImager(searchterm, musedict):
    page = "http://www.google.com/"
    landing = driver.get(page)
    actions = ActionChains(driver)
    WebDriverWait(landing, '10')
    images = driver.find_element_by_link_text('Images').click()
    actions.move_to_element(images)
    searchbox = driver.find_element_by_css_selector('#lst-ib')
    WebDriverWait(searchbox, '10')
    sendsearch = searchbox.send_keys('{} "logo" {}'.format('Museum of Bad Art', 'bos') + Keys.ENTER)
    WebDriverWait(sendsearch, '10')
    logo = driver.find_element_by_xpath('//*[@id="rg_s"]/div[1]/a').click()
    WebDriverWait(logo, '10')
    logolink = driver.find_element_by_xpath('//*[@id="irc_cc"]/div[3]/div[1]/div[2]/div[2]/a')
    WebDriverWait(logolink, '10')
    actions.move_to_element(logolink).click(logolink)
    print(driver.current_url)
    return driver.current_url
I'm using this to return the first image for a museum name and city in the search.
I tried to make your code work with Google, got frustrated and switched to Yahoo instead. I couldn't make heads or tails of your musedict access loops so I substituted a simple dictionary for demonstration purposes:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
museum_dictionary = { "louvre": "Paris", "prado": "Madrid"}
driver = webdriver.Firefox()
def YahooImager(searchterm):
    page = "https://images.search.yahoo.com"
    landing = driver.get(page)
    WebDriverWait(driver, 4)
    assert "Yahoo Image Search" in driver.title
    searchbox = driver.find_element_by_name("p")  # find the query box
    city = museum_dictionary[searchterm]
    searchbox.send_keys("{} {}".format(searchterm, city) + Keys.RETURN)
    WebDriverWait(driver, 4)
    try:
        driver.find_element_by_xpath('//*[@id="resitem-0"]/a').click()
    except NoSuchElementException:
        assert 0, '//*[@id="resitem-0"]/a'
        driver.close()
    WebDriverWait(driver, 4)
    try:
        driver.find_element_by_link_text("View Image").click()
    except NoSuchElementException:
        assert 0, "View Image"
        driver.close()
    WebDriverWait(driver, 4)
    # driver.close()
    return driver.current_url
image_url = YahooImager("prado")
print(repr(image_url))
It works, but takes quite a while. (That's probably something someone who knows these libraries better could optimize -- I just wanted to see it work at all.) This example is fragile and occasionally just fails.
I am a newbie to Selenium with Python. I am trying to fetch the profile URLs, which are 10 per page. Without using while, I am able to fetch all 10 URLs, but only for the first page. When I use while, it iterates, but fetches only 3 or 4 URLs per page.
I need to fetch all 10 links and keep iterating through the pages. I think I must do something about StaleElementReferenceException.
Kindly help me solve this problem.
Given the code below.
def test_connect_fetch_profiles(self):
    driver = self.driver
    search_data = driver.find_element_by_id("main-search-box")
    search_data.clear()
    search_data.send_keys("Selenium Python")
    search_submit = driver.find_element_by_name("search")
    search_submit.click()
    noprofile = driver.find_elements_by_xpath("//*[text() = 'Sorry, no results containing all your search terms were found.']")
    self.assertFalse(noprofile)
    while True:
        wait = WebDriverWait(driver, 150)
        try:
            profile_links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[contains(@href,'www.linkedin.com/profile/view?id=')][text()='LinkedIn Member' or contains(@href,'Type=NAME_SEARCH')][contains(@class,'main-headline')]")))
            for each_link in profile_links:
                page_links = each_link.get_attribute('href')
                print(page_links)
                driver.implicitly_wait(15)
                appendFile = open("C:\\Users\\jayaramb\\Documents\\profile-links.csv", 'a')
                appendFile.write(page_links + "\n")
                appendFile.close()
                driver.implicitly_wait(15)
            next = wait.until(EC.visibility_of(driver.find_element_by_partial_link_text("Next")))
            if next.is_displayed():
                next.click()
            else:
                print("End of Page")
                break
        except ValueError:
            print("It seems no values to fetch")
        except NoSuchElementException:
            print("No Elements to Fetch")
        except StaleElementReferenceException:
            print("No Change in Element Location")
        else:
            break
Please let me know if there are any other effective ways to fetch the required profile URL and keep iterating through pages.
I created a similar setup which works alright for me. I've had some problems with Selenium trying to click on the next button but throwing a WebDriverException instead, likely because the next button was not in view. Hence, instead of clicking the next button I get its href attribute and load the new page with driver.get(), thus avoiding an actual click and making the test more stable.
def test_fetch_google_links():
    links = []
    # Setup driver
    driver = webdriver.Firefox()
    driver.implicitly_wait(10)
    driver.maximize_window()
    # Visit google
    driver.get("https://www.google.com")
    # Enter search query
    search_data = driver.find_element_by_name("q")
    search_data.send_keys("test")
    # Submit search query
    search_button = driver.find_element_by_xpath("//button[@type='submit']")
    search_button.click()
    while True:
        # Find and collect all anchors
        anchors = driver.find_elements_by_xpath("//h3//a")
        links += [a.get_attribute("href") for a in anchors]
        try:
            # Find the next page button
            next_button = driver.find_element_by_xpath("//a[@id='pnnext']")
            location = next_button.get_attribute("href")
            driver.get(location)
        except NoSuchElementException:
            break
    # Do something with the links
    for l in links:
        print(l)
    print("Found {} links".format(len(links)))
    driver.quit()