iterate over result pages using selenium and python: StaleElementReferenceException - python

People who really understand the Selenium tool will probably laugh at this, but maybe you can share your knowledge, because I'd really like to be able to laugh about it too.
My code is this:
def getZooverLinks(country):
    global countries
    countries = country
    zooverWeb = "http://www.zoover.nl/"
    url = zooverWeb + country
    driver = webdriver.Firefox()
    driver.get(url)
    button = driver.find_element_by_class_name('next')
    links = []

    for page in xrange(1, 4):
        WebDriverWait(driver, 60).until(lambda driver: driver.find_element_by_class_name('next'))
        divList = driver.find_elements_by_class_name('blue2')
        for div in divList:
            hrefTag = div.find_element_by_css_selector('a').get_attribute('href')
            print(hrefTag)
            newLink = zooverWeb + hrefTag
            links.append(newLink)
        button.click()
        driver.implicitly_wait(10)
        time.sleep(60)

    return links
So I want to iterate over all result pages, always collect the links from the divs with class="blue2", and then follow the "next" link to get to the next result page.
But I always get a StaleElementReferenceException saying:
"Message: Element not found in the cache - perhaps the page has changed since it was looked up"
But the layout of the pages is always the same, so what is the problem here? Is the URL after the click not handed over to the driver because the page changes too? How can I fix this?

It is a little bit tricky to follow the pagination on this particular site.
Here is the set of things that helped me to overcome the issue with StaleElementReferenceException:
find elements inside the loop since the page changes
use Explicit Waits to wait for the specific page numbers to become active
Working code:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

country = "albanie"
zooverWeb = "http://www.zoover.nl/"
url = zooverWeb + country

driver = webdriver.Firefox()
driver.get(url)
driver.implicitly_wait(10)

links = []
for page in xrange(1, 4):
    # tricky part - waiting for the page number on the top to appear
    if page > 1:
        WebDriverWait(driver, 60).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, 'div.entityPagingTop strong'), str(page)))
    else:
        WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.CLASS_NAME, 'next')))

    divList = driver.find_elements_by_class_name('blue2')
    for div in divList:
        hrefTag = div.find_element_by_css_selector('a').get_attribute('href')
        newLink = zooverWeb + hrefTag
        links.append(newLink)

    driver.find_element_by_class_name("next").click()

print links

Related

How to get only links that has a particular id from list of links using selenium

I am new to the Selenium framework and I must say it is an awesome library. I am basically trying to get all the links from a webpage that have a particular id, "pagination", and to separate them from the links that don't have that id, because I want to go through all the pages behind those links.
for j in browser.find_elements(By.CSS_SELECTOR, "div#col-content > div.main-menu2.main-menu-gray strong a[href]"):
    print(j.get_property('href'))
The code above gets all the links with and without pagination.
Example links with pagination:
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2015/results/
https://www.oddsportal.com/soccer/england/premier-league-2020-2021/results/
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2021/results/
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2019/results/
Example links without pagination:
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/
In my code, I try to check whether the given ID exists on the page with pagination = browser.find_element(By.ID, "pagination"), but I stumble on an error. I understand the reason for the error: it comes from the fact that the ID "pagination" does not exist on some of the pages.
no such element: Unable to locate element: {"method":"css selector","selector":"[id="pagination"]"}
I changed the above code to pagination = browser.find_elements(By.ID, "pagination"), which returns links both with and without pagination. So my question is: how can I get only the links that have a particular id from a list of links?
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
import time
import tqdm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# define our URL
url = 'https://oddsportal.com/results/'
path = r'C:\Users\Glodaris\OneDrive\Desktop\Repo\Scraper\chromedriver.exe'

options = ChromeOptions()
options.headless = True
# options=options
browser = Chrome(executable_path=path, options=options)
browser.get(url)

title = browser.title
print('Title', title)

links = []
for i in browser.find_elements(By.CSS_SELECTOR, "div#archive-tables tbody tr[xsid='1'] td a[href]"):
    links.append(i.get_property('href'))

arr = []
condition = True
while condition:
    for link in (links):
        second_link = browser.get(link)
        for j in browser.find_elements(By.CSS_SELECTOR, "div#col-content > div.main-menu2.main-menu-gray strong a[href]"):
            browser.implicitly_wait(2)
            pagination = browser.find_element(By.ID, "pagination")
            if pagination:
                print(pagination.get_property('href'))
            else:
                print(j.get_property('href'))
    try:
        browser.find_elements("xpath", "//*[@id='pagination']/a[6]")
    except:
        condition = False
As you are using Selenium, you can actually click on the pagination's forward button to navigate through the pages.
The following example tests for the cookie button, scrapes the data from the main table as a DataFrame, and checks if there is pagination; if not, it stops there. If there is pagination, it navigates to the next page, gets the data from the table, navigates to the next page again and so on, until the table data from the current page is identical to the table data from the previous page, and then it stops. It can handle any number of pages. The setup in the code below is for Linux; what you need to pay attention to is the imports part, as well as the part after you define the browser/driver.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")

webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# url = 'https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/'
url = 'https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2021/results/'
browser.get(url)

try:
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))).click()
except Exception as e:
    print('no cookie button!')

games_table = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "table[id='tournamentTable']")))
try:
    initial_games_table_data = games_table.get_attribute('outerHTML')
    dfs = pd.read_html(initial_games_table_data)
    print(dfs[0])
except Exception as e:
    print(e, 'Unfortunately, no matches can be displayed because there are no odds available from your selected bookmakers.')

while True:
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight);")
    t.sleep(1)
    try:
        forward_button = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='pagination']//span[text()='»']")))
        forward_button.click()
    except Exception as e:
        print(e, 'no pagination, stopping here')
        break
    games_table = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "table[id='tournamentTable']")))
    dfs = pd.read_html(games_table.get_attribute('outerHTML'))
    games_table_data = games_table.get_attribute('outerHTML')
    if games_table_data == initial_games_table_data:
        print('this is the last page')
        break
    print(dfs[0])
    initial_games_table_data = games_table_data
    print('went to next page')
    t.sleep(3)
You are seeing the error message...
no such element: Unable to locate element: {"method":"css selector","selector":"[id="pagination"]"}
...as not all of the pages contain the element:
<div id="pagination">
    <a ...>
    <a ...>
    <a ...>
</div>
Solution
In these cases your best approach would be to wrap the code block in a try-except block as follows:
for j in browser.find_elements(By.CSS_SELECTOR, "div#col-content > div.main-menu2.main-menu-gray strong a[href]"):
    try:
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, "pagination")))
        print([my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#pagination a[href*='page']")))])
    except:
        print("Pagination not available")
        continue
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Update
A couple of things to note.
The (By.ID, "pagination") element doesn't have an href attribute, but several of its descendants do, so you may get conflicting results.
As you are using WebDriverWait, remember to remove all instances of implicitly_wait(), as mixing implicit and explicit waits can cause unpredictable wait times. For example, setting an implicit wait of 10 seconds and an explicit wait of 15 seconds could cause a timeout to occur after 20 seconds.
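A minimal sketch of the explicit-waits-only pattern (a hypothetical snippet; it assumes browser is the driver created in the question's code and reuses the pagination selector from the answer above):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Reset any implicit wait so only the explicit waits below apply.
browser.implicitly_wait(0)

wait = WebDriverWait(browser, 20)
# Wait up to 20 seconds for the pagination links, then collect their hrefs.
pagination_links = wait.until(
    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#pagination a[href*='page']"))
)
print([a.get_attribute("href") for a in pagination_links])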

Web Scraping - Although tag is shown during inspection can't find it while scraping using Python

Nowadays, Amazon is not showing prices on the category page for many categories when you try to scrape from a different location (country). In fact, it is not showing pricing on the product page either. Only when the user clicks on "See all buying options" do they get to see the pricing of the product. On inspection, that price is inside a span with the class a-offscreen, but with BeautifulSoup that tag returns empty values. Why is this happening, and what is an effective way to get these prices?
from selenium import webdriver
driver = webdriver.Chrome()
driver.get(required Amazon product url)
soup = BeautifulSoup(driver.page_source,"html.parser")
prices = soup.find_all('span',{'class':'a-offscreen'})
But prices does not contain the required information. It contains data from other a-offscreen spans, but not the prices that we get to see by clicking on "See all buying options".
This is the main function, which runs for a particular search term and does the scraping. get_url is the function that generates the URL for the search term, and extract_record is the function that contains the extraction code.
Note: I am extracting multiple pages for particular search terms, as is evident from the code. Where do I fit in the code to change the zip code here?
def main(search_term):
    driver = webdriver.Chrome()
    records = []
    url = get_url(search_term)
    for page in range(1, 10):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
    driver.close()
    with open('csv.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['A', 'B', 'C', 'D', 'E'])
        writer.writerows(records)
Please overlook indentation errors, if any.
This confirm button needs to be pressed. So far I have added another line to the get_zip(driver) function, but it's not working:
driver.find_element_by_xpath('//*[@id="GLUXConfirmClose"]').click()
(Screenshot: "Please enter a valid zip code" error.)
To get the prices specifically of the different products, this could help you.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.amazon.com/Amazing-Spider-Man-Omnibus-Vol/dp/130290082X")  # example url

try:
    driver.find_element_by_xpath('//*[@id="buybox-see-all-buying-choices"]/span/a').click()
    search = WebDriverWait(driver, 40).until(
        EC.presence_of_all_elements_located(
            (By.ID, 'all-offers-display-scroller'))
    )
except:
    driver.quit()

soup = BeautifulSoup(driver.page_source, "html.parser")
prices = [elem.text for elem in soup.find_all('span', {'class': 'a-offscreen'}) if "$" in elem.text]
print(prices)
Click the "See all buying options" button, wait for the display scroller to be located, and scrape all the elements with the class a-offscreen from the page source.
Here's how you would change the zip code. In this example, it will show results for Portland (zip code 97217). Unless you close the driver or stop your program, you should only need to enter the zip code once, so I would do it before scraping anything.
Update: I added some delays; tell me if this works. If not, it might be because the XPaths in my code are referencing the wrong elements for the page you are trying to scrape. I am only confident that this works on the amazon.com homepage.
Update 2: You said that there's an error with my get_zip() function not clicking a "confirm" button.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

def get_zip(driver):
    driver.get("https://www.amazon.com")
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="nav-global-location-popover-link"]')
    )).click()
    zipcode_e = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="GLUXZipUpdateInput"]')
    ))
    zipcode_e.click()
    zipcode_e.send_keys('97217')  # <- example zip code
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="GLUXZipUpdate"]/span/input')
    )).click()
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="a-popover-3"]/div/div[2]/span/span')
    )).click()
Basic functionality
I added a line to clear all cookies, and I also updated the get_zip() function.
# skipped imports
def get_zip(driver):
    driver.get("https://www.amazon.com")
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="nav-global-location-popover-link"]')
    )).click()
    zipcode_e = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="GLUXZipUpdateInput"]')
    ))
    zipcode_e.click()
    zipcode_e.send_keys('97217')  # <- example zip code
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="GLUXZipUpdate"]/span/input')
    )).click()
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="a-popover-3"]/div/div[2]/span/span')
    )).click()

def main(search_term):
    driver = webdriver.Chrome()
    driver.delete_all_cookies()  # clear all cookies
    get_zip(driver)
    records = []
    url = get_url(search_term)
    for page in range(1, 10):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
    driver.close()
    with open('csv.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['A', 'B', 'C', 'D', 'E'])
        writer.writerows(records)

main("shampoo")
Tell me in the comments if you require anything further, or if something doesn't work.

Not sure how to get elements from dynamically loading webpage using selenium

So I am scraping reviews and skin type from Sephora and have run into a problem figuring out how to get elements off of the page.
Sephora.com loads reviews dynamically after you scroll down the page, so I have switched from BeautifulSoup to Selenium to get the reviews.
The reviews have no ID, no name, nor a CSS identifier that seems to be stable. The XPath doesn't seem to be recognized whenever I try to use it, whether I copy it from Chrome or from Firefox.
Here is an example of the HTML from the inspected element that I loaded in Chrome:
(Screenshot: Inspect Element view from the desired page.)
My Attempts thus far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome("/Users/myName/Downloads/chromedriver")
url = 'https://www.sephora.com/product/the-porefessional-face-primer-P264900'
driver.get(url)
reviews = driver.find_elements_by_xpath(
    "//div[@id='ratings-reviews']//div[@data-comp='Ellipsis Box ']")
print("REVIEWS:", reviews)
Output:
| => /Users/myName/anaconda3/bin/python "/Users/myName/Documents/ScrapeyFile Group/attempt32.py"
REVIEWS: []
(base)
So basically an empty list.
ATTEMPT 2:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
# Open up a Firefox browser and navigate to web page.
driver = webdriver.Firefox()
driver.get(
"https://www.sephora.com/product/squalane-antioxidant-cleansing-oil-P416560?skuId=2051902&om_mmc=ppc-GG_1165716902_56760225087_pla-420378096665_2051902_257731959107_9061275_c&country_switch=us&lang=en&ds_rl=1261471&gclid=EAIaIQobChMIisW0iLbK6AIVaR6tBh005wUTEAYYBCABEgJVdvD_BwE&gclsrc=aw.ds"
)
#Scroll to bottom of page b/c its dynamically loading
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
#scrape stats and comments
comments = driver.find_elements_by_css_selector("div.css-7rv8g1")
print("!!!!!!Comments!!!!!")
print(comments)
OUTPUT:
| => /Users/MYNAME/anaconda3/bin/python /Users/MYNAME/Downloads/attempt33.py
!!!!!!Comments!!!!!
[]
(base)
Empty again. :(
I get the same results when I try to use different element selectors:
#scrape stats and comments
comments = driver.find_elements_by_class_name("css-7rv8g1")
I also get nothing when I tried this:
comments = driver.find_elements_by_xpath(
    "//div[@data-comp='GridCell Box']//div[@data-comp='Ellipsis Box ']")
and this (notice the space after "Ellipsis Box" is gone):
comments = driver.find_elements_by_xpath(
    "//div[@data-comp='GridCell Box']//div[@data-comp='Ellipsis Box']")
I have tried using the solutions outlined here and here, but to no avail. I think there is something I don't understand about the page or Selenium that I am missing, since this is my first time using Selenium, so I'm a super newbie :(
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome(executable_path=r"")
driver.maximize_window()
wait = WebDriverWait(driver, 20)
driver.get("https://www.sephora.fr/p/black-ink---classic-line-felt-liner---eyeliner-feutre-precis-waterproof-P3622017.html")

scrolls = 1
while True:
    scrolls -= 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(3)
    if scrolls < 0:
        break

reviewText = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//ol[@class='bv-content-list bv-content-list-reviews']//li//div[@class='bv-content-summary-body']//div[1]")))
for textreview in reviewText:
    print(textreview.text)
I've been scraping reviews from Sephora and basically, even if there is plenty of room for improvement, it works like this:
Clicks on "reviews" to access the reviews
Loads all reviews by scrolling until there aren't any reviews left to load
Finds the review text and skin type by CSS selector
def load_all_reviews(driver):
    while True:
        try:
            driver.execute_script(
                "arguments[0].scrollIntoView(true);",
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, ".bv-content-btn-pages-load-more")
                    )
                ),
            )
            driver.execute_script(
                "arguments[0].click();",
                WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, ".bv-content-btn-pages-load-more")
                    )
                ),
            )
        except Exception as e:
            break

def get_review_text(review):
    try:
        return review.find_element(By.CLASS_NAME, "bv-content-summary-body-text").text
    except:
        return "NA"  # in case it doesnt find a review

def get_skin_type(review):
    try:
        return review.find_element(By.XPATH, '//*[@id="BVRRContainer"]/div/div/div/div/ol/li[2]/div[1]/div/div[2]/div[5]/ul/li[4]/span[2]').text
    except:
        return "NA"  # in case it doesnt find a skin type
To use those, you've got to create a webdriver and first call the load_all_reviews() function.
Then you've got to find the reviews with:
reviews = driver.find_elements(By.CSS_SELECTOR, ".bv-content-review")
and finally, for each review, you can call the get_review_text() and get_skin_type() functions:
for review in reviews:
    print(get_review_text(review))
    print(get_skin_type(review))
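For completeness, a minimal hypothetical sketch of wiring those pieces together might look like this (it assumes the three helper functions above are defined in the same file, that chromedriver is available on your PATH, and it reuses the product URL from the question):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait            # needed by the helper functions above
from selenium.webdriver.support import expected_conditions as EC   # needed by the helper functions above

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://www.sephora.com/product/the-porefessional-face-primer-P264900")

# Keep clicking "load more" until every review is on the page.
load_all_reviews(driver)

# Collect the review containers and pull text and skin type from each.
reviews = driver.find_elements(By.CSS_SELECTOR, ".bv-content-review")
for review in reviews:
    print(get_review_text(review))
    print(get_skin_type(review))

driver.quit()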

how can I get the next page's reviews with selenium?

I'm trying to scrape more than 10 pages of reviews from https://www.innisfree.com/kr/ko/ProductReviewList.do
However, when I move to the next page and try to get the new page's reviews, I still get the first page's reviews only.
I used driver.execute_script("goPage(2)") and also time.sleep(5), but my code only gives me the first page's reviews.
(I did not use a for-loop, just to see whether the results are different between page 1 and page 2.)
(I imported BeautifulSoup and Selenium.)
Here is my code:
url = "https://www.innisfree.com/kr/ko/ProductReviewList.do"
chromedriver = r'C:\Users\hhm\Downloads\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.get(url)

print("this is page 1")
driver.execute_script("goPage(1)")
nTypes = soup.select('.reviewList ul .newType div[class^=reviewCon] .reviewConTxt')
for nType in nTypes:
    product = nType.select_one('.pdtName').text
    print(product)

print('\n')
print("this is page 2")
driver.execute_script("goPage(2)")
time.sleep(5)
nTypes = soup.select('.reviewList ul .newType div[class^=reviewCon] .reviewConTxt')
for nType in nTypes:
    product = nType.select_one('.pdtName').text
    print(product)
If your second page opens as a new window, then you need to switch to that window and move your Selenium control to it.
Example:
# Opens a new tab
driver.execute_script("window.open()")

# Switch to the newly opened tab
driver.switch_to.window(driver.window_handles[1])
Source:
How to switch to new window in Selenium for Python?
https://www.techbeamers.com/switch-between-windows-selenium-python/
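If the second page really does open in a new window on this site, a minimal hypothetical sketch applied to the question's code (assuming driver is already on the review list page) could look like this:
# Remember the original window so we can come back to it.
main_handle = driver.current_window_handle

driver.execute_script("goPage(2)")

# If a second window/tab appeared, move Selenium's focus to it.
if len(driver.window_handles) > 1:
    driver.switch_to.window(driver.window_handles[-1])

# ... scrape the page 2 reviews here ...

# When done, switch back to the original window.
driver.switch_to.window(main_handle)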
Try the following code. You need to click on each pagination link to reach the next page. You will get all 100 review comments.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

url = "https://www.innisfree.com/kr/ko/ProductReviewList.do"
chromedriver = r'C:\Users\hhm\Downloads\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.get(url)

for i in range(2, 12):
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    nTypes = soup.select('.reviewList ul .newType div[class^=reviewCon] .reviewConTxt')
    for nType in nTypes:
        product = nType.select_one('.pdtName').text
        print(product)
    if i == 11:
        break
    nextbutton = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='num']/a[text()='" + str(i) + "']")))
    driver.execute_script("arguments[0].click();", nextbutton)

Can't scrape titles from a website while clicking on the next page button

I've written a script in Python in combination with Selenium to scrape the links of different posts from different pages while clicking on the next page button, and to get the title of each post from its inner page. Although the content I'm trying to deal with here is static, I used Selenium to see how it parses items while clicking through the next pages. I'm only after solutions related to Selenium.
Website address: https://stackoverflow.com/questions/tagged/web-scraping
If I define a blank list and extend all the links into it, then eventually I can parse all the titles by reusing those links from their inner pages once clicking on the next page button is done, but that is not what I want.
However, what I intend to do is collect all the links from each of the pages and parse the title of each post from its inner page while clicking on the next page button. In short, I wish to do the two things simultaneously.
I've tried with:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://stackoverflow.com/questions/tagged/web-scraping"

def get_links(url):
    driver.get(url)
    while True:
        items = [item.get_attribute("href") for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".summary .question-hyperlink")))]
        yield from get_info(items)

        try:
            elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".pager > a[rel='next']")))
            driver.execute_script("arguments[0].scrollIntoView();", elem)
            elem.click()
            time.sleep(2)
        except Exception:
            break

def get_info(links):
    for link in links:
        driver.get(link)
        name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.question-hyperlink"))).text
        yield name

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    for item in get_links(link):
        print(item)
When I run the above script, it parses the titles of different posts by reusing the links from the first page, but then it breaks, throwing this error: raise TimeoutException(message, screen, stacktrace)
when it hits the elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".pager > a[rel='next']"))) line.
How can I scrape the title of each post from its inner page, collecting the links from the first page and then clicking on the next page button, repeating the process until it is done?
The reason you are not getting the next button is that, after traversing each inner link, at the end of that loop the driver is no longer on the listing page and can't find the next button.
You need to build each next-page URL like below and load it.
urlnext = 'https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page={}&pagesize=30'.format(pageno)  # where page will start from 2
Try the code below.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://stackoverflow.com/questions/tagged/web-scraping"

def get_links(url):
    urlnext = 'https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page={}&pagesize=30'
    npage = 2
    driver.get(url)
    while True:
        items = [item.get_attribute("href") for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".summary .question-hyperlink")))]
        yield from get_info(items)
        driver.get(urlnext.format(npage))
        try:
            elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".pager > a[rel='next']")))
            npage = npage + 1
            time.sleep(2)
        except Exception:
            break

def get_info(links):
    for link in links:
        driver.get(link)
        name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.question-hyperlink"))).text
        yield name

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    for item in get_links(link):
        print(item)
