How can I scrape a webpage that only loads it in increments - python

I am trying to count the number of items that contain the word "kudoed" on a particular webpage. The webpage itself only loads a limited number of items initially and then requires a button to be pressed to load the rest. Please see the image below:
[screenshot omitted]
I wrote a selenium + beautiful soup script to do this; I had to use selenium due to some proxy errors. Here is my full code so far:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Edge(executable_path = r"C:\Users\H\Desktop\Automated_Tasks\msedgedriver.exe") # Modify the path here...
# Navigate to URL
driver.get("https://powerusers.microsoft.com/t5/notificationfeed/page")
# Wait for the page to load
wait = WebDriverWait(driver, 10)
# Get all elements on the page
time.sleep(8)
click_button=driver.find_element("xpath", '/html/body/div[2]/center/div[4]/div/div/div/div[1]/div[3]/div/div/div/div/span/a').click()
element = driver.find_element("ID", 'viewMoreLink')
driver.execute_script("arguments[0].click();", element)
from bs4 import BeautifulSoup
# Get the page source
page_source = driver.page_source
# Create a BeautifulSoup object
soup = BeautifulSoup(page_source, 'html.parser')
items = soup.find_all("div", class_="lia-quilt-column-alley lia-quilt-column-alley-right")
count = 0
for item in items:
    if "kudoed" in item.text:
        count += 1
print(f"Number of items containing 'kudoed': {count}")
Is there a way for me to click the button without having to tell selenium to click the button, wait for the next items to load and repeat these steps until the entire list has been loaded?
When it gets to the code:
click_button=driver.find_element("xpath", '/html/body/div[2]/center/div[4]/div/div/div/div[1]/div[3]/div/div/div/div/span/a').click()
I get the following error:
ElementClickInterceptedException: element click intercepted: Element is not clickable at point (476, 2184)
(Session info: MicrosoftEdge=109.0.1518.61)
I tried searching by ID and it still did not work. Here is the full HTML for the button:
[screenshot of the button's HTML omitted]

Usually ElementClickInterceptedException means that the element you are trying to click is covered by another element or sits outside the visible viewport, so before clicking it you have to scroll to it.
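For the single click that fails, a minimal sketch (reusing the viewMoreLink id from your snippet and the wait/By/EC objects already defined in your script) is to scroll the element into the middle of the viewport first and fall back to a JavaScript click if something still overlaps it:
# sketch: scroll the button into view before clicking it (locator taken from the question)
more_link = wait.until(EC.presence_of_element_located((By.ID, 'viewMoreLink')))
driver.execute_script('arguments[0].scrollIntoView({block: "center"});', more_link)
try:
    more_link.click()
except Exception:
    # a JavaScript click is not blocked by overlapping elements such as sticky headers
    driver.execute_script('arguments[0].click();', more_link)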
Is there a way for me to click the button without having to tell
selenium to click the button, wait for the next items to load and
repeat these steps until the entire list has been loaded?
I don't think so, but it is not that hard to do the job:
items_old, items = [], []
while True:
    while len(items) == len(items_old):
        items = WebDriverWait(driver, 9).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'lia-notification-feed-item')))
    print(f'{len(items)=}')
    show_more_btn = driver.find_elements(By.ID, 'viewMoreLink')
    if show_more_btn:
        print('load more items')
        driver.execute_script('arguments[0].scrollIntoView({block: "center"});', show_more_btn[0])
        time.sleep(2)
        show_more_btn[0].click()
        items_old = items.copy()
    else:
        print('all items loaded')
        break
print(f"Number of items containing 'kudoed': {sum(['kudoed' in x.text for x in items])}")
Output
len(items)=25
load more items
len(items)=32
all items loaded
Number of items containing 'kudoed': 2

Related

How to do pagination with scroll in Selenium?

I need to do pagination for this page:
I read this question and I tried this:
scrolls = 10
while True:
    scrolls -= 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(3)
    if scrolls < 0:
        break
I need to scroll down to get all the products, but I don't know how many times I need to scroll to get all of them.
I also tried using a big window size
'SELENIUM_DRIVER_ARGUMENTS': ['--no-sandbox', '--window-size=1920,30000'],
and scroll down
time.sleep(10)
self.driver.execute_script("window.scrollBy(0, 30000);")
Does someone have an idea how to get all the products?
I'm open to another solution, if Selenium is not the best for this case.
Thanks.
UPDATE 1:
I need to have all the product IDs. To get the product IDs I use this:
products = response.css('div.jfJiHa > .iepIep')
for product in products:
    detail_link = product.css('a.jXwbaQ::attr("href")').get()
    product_id = re.findall(r'products/(\d+)', detail_link)[0]
As commented, without seeing your whole spider it is hard to see where you are going wrong here, but if we assume that your parsing is using the scrapy response then that is why you are always just getting 30 products.
You need to create a new selector from the driver after each scroll and query that. A full example of code that gets 300 items from the page is
import re
import time
from pprint import pprint
import parsel
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox
with Firefox() as driver:
driver.get("https://www.compraonline.bonpreuesclat.cat/products/search?q=pasta")
all_items = {}
while True:
sel = parsel.Selector(driver.page_source)
for product in sel.css("div[data-test] h3 > a"):
name = product.css("::text").get()
product_id = re.search("(\d+)", product.attrib["href"]).group()
all_items[product_id] = name
try:
element = driver.find_element_by_css_selector(
"div[data-test] + div.iepIep:not([data-test])"
)
except NoSuchElementException:
break
driver.execute_script("arguments[0].scrollIntoView(true);", element)
time.sleep(1)
pprint(all_items)
print("Number of items =", len(all_items))
The key bits of this are:
After getting the page using driver.get we start looping
We create a new Selector (here I directly use parsel.Selector which is what scrapy uses internally)
We extract the info we need. Displayed products all have a data-test attribute. If this was a scrapy.Spider I'd yield the information, but here I just add it to a dictionary of all items.
After getting all the visible items, we try to find the first following sibling of a div with a data-test attribute that doesn't itself have a data-test attribute (using the CSS + combinator)
If no such element exists (because we have seen all items) then break out of the loop, otherwise scroll that element into view and pause a second
Repeat until all items have been parsed
Try scrolling the page down by one visible screen height at a time, reading the presented products after each scroll, until //button[@data-test='footer-feedback-button'] (or any other element located at the bottom of the page) is visible.
This code may help -
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)
driver.get('https://www.compraonline.bonpreuesclat.cat/products/search?q=pasta')
BaseDivs = driver.find_elements_by_xpath("//div[contains(@class,\"base__Wrapper\")]")
for div in BaseDivs:
    try:
        wait.until(EC.visibility_of_element_located((By.XPATH, "./descendant::img")))
        driver.execute_script("return arguments[0].scrollIntoView(true);", div)
    except StaleElementReferenceException:
        continue
This code will wait for the image to load and then focus on the element. This way it will automatically scroll down till the end of the page.
Mark it answer if this is what you are looking for.
I solved my problem, but not with Selenium. We can get all the products of the search with another request:
https://www.compraonline.bonpreuesclat.cat/api/v4/products/search?limit=1000&offset=0&sort=favorite&term=pasta
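A minimal sketch of calling that endpoint directly with requests (assuming it is still publicly reachable and returns JSON; the exact structure of the payload is an assumption, so inspect it before parsing product IDs):
import requests

url = ("https://www.compraonline.bonpreuesclat.cat/api/v4/products/search"
       "?limit=1000&offset=0&sort=favorite&term=pasta")
resp = requests.get(url, timeout=30)
resp.raise_for_status()
data = resp.json()
# print the top-level keys (or the item count) to explore the structure first
print(list(data) if isinstance(data, dict) else len(data))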

How to web-scrape in for loop, without losing DOM? (Python, Selenium)

I'm trying to get data from the Polish Wiktionary. Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
driver = webdriver.Chrome()
driver.get("https://pl.wiktionary.org/wiki/Kategoria:J%C4%99zyk_polski_-_rzeczowniki")
page = driver.find_element_by_xpath('//*[@id="mw-pages"]/div/div')
words = page.find_elements_by_tag_name('li') #loading all the words
delay = 30
for word in words:
    myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.XPATH, '//*[@id="mw-pages"]/a[2]')))
    word.find_element_by_tag_name('a').click() #entering word
    #COLLECTING DATA
    driver.back()
    # also tried with driver.execute_script("window.history.go(-1)") - same result
    time.sleep(5) #added to make sure that time is not an obstacle
I get this error while trying to enter next word:
StaleElementReferenceException: stale element reference: element is not attached to the page document
(Session info: chrome=88.0.4324.190)
When you click you're changing the page which renders the previous elements stale.
So you need to either collect the pages you want to go to FIRST and step through them or you need to keep track of which element you're viewing and increment when you go back:
i = 0
for word in words:
    myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.XPATH, '//*[@id="mw-pages"]/a[2]')))
    word.find_elements_by_tag_name('a')[i].click() #entering word
    #COLLECTING DATA
    driver.back()
    i += 1
    # also tried with driver.execute_script("window.history.go(-1)") - same result
    time.sleep(5) #added to make sure that time is not an obstacle
But, as you can find on Stack Overflow, there are ways to launch the link in a NEW window, switch_to that window, grab the data, then close that window and proceed on to the next link element.
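A rough sketch of that new-window approach, reusing the XPath from the question (untested, so treat it as a starting point rather than a drop-in solution):
# sketch: open each word in a new tab, scrape it, close the tab, return to the list
main_window = driver.current_window_handle
links = driver.find_elements_by_xpath('//*[@id="mw-pages"]/div/div//li/a')
for link in links:
    href = link.get_attribute('href')
    driver.execute_script("window.open(arguments[0]);", href)  # open the word in a new tab
    driver.switch_to.window(driver.window_handles[-1])         # switch to the new tab
    # COLLECTING DATA
    driver.close()                                             # close the tab
    driver.switch_to.window(main_window)                       # back to the word list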
Normally when we are working with anchor (a href) tags we get their href values and then loop over them with driver.get().
driver.get("https://pl.wiktionary.org/wiki/Kategoria:J%C4%99zyk_polski_-_rzeczowniki")
ahrefs = [x.get_attribute('href') for x in driver.find_elements_by_xpath('//*[@id="mw-pages"]/div/div//li/a')]
for ahref in ahrefs:
    driver.get(ahref)

How do I click-iterate over a list of web elements in Selenium?

I'd like to be able to click on several links on a page of search results. To do this, I've tried to create a list of web elements finding all of the xpaths on a search results page that correspond with links. I would then like to be able to click on those links for each xpath and return to the search results page and click on the next link in the list.
So far I've been able to create a nested loop that scrapes all of the href tags on a page to create/find the xpath. The second level of the loop is intended to click on the link corresponding to that element in the list and then return to the search page to click on the next element in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
driver = webdriver.Chrome('C:\\Users\\km13\\chromedriver.exe')
driver.get("http://www.congreso.gob.pe/pley-2016-2021")
WebDriverWait(driver, 50).until(EC.frame_to_be_available_and_switch_to_it((By.NAME, 'ventana02')))
soup = BeautifulSoup(driver.page_source, 'lxml')
table = soup.find('table', {'cellpadding' : '2'})
table_items = table.find_all('a')
for item in table_items:
    elements = driver.find_elements_by_xpath("//a[@href='" + item.get('href') + "']")
    print(elements)
    for elem in elements:
        elem.click()
        driver.back()
This code will click on the first link on the search results page, go to that link, and then click back to the search results page. But it does not iterate on to the next link in the element list. I'm not sure if there is something wrong with the loop I wrote.
The moment you click on the element and the page refreshes or a new page loads, the Selenium element references go stale. You can no longer access them through the old references, and if you try you will get a StaleElementReferenceException.
To work with the latest references to the elements, you have to locate the element(s) again every time the page is refreshed.
Here is your script with minimal changes to accommodate the request.
# requires `import time`; iLinks was not defined in the original snippet, so count the links up front
iLinks = len(driver.find_elements_by_xpath("(//table)[2]//td//a"))
for iLink in range(iLinks):
    print(iLink)
    link = driver.find_elements_by_xpath("(//table)[2]//td//a")[iLink]
    print(link.get_attribute('href'))
    # add link related logic here
    link.click()
    # you have to wait for the next element to display
    time.sleep(1)
    # click on back in browser
    driver.back()
    time.sleep(3)
    driver.switch_to.frame('ventana02')

Can't scrape titles from a website while clicking on the next page button

I've written a script in Python in combination with selenium to scrape the links of different posts from different pages while clicking on the next page button, and to get the title of each post from its inner page. Although the content I'm trying to deal with here is static, I used selenium to see how it parses items while clicking through the next pages. I'm only after solutions related to selenium.
Website address
If I define a blank list and extend all the links to it, then eventually I can parse all the titles by reusing those links from their inner pages once the clicking on the next page button is done, but that is not what I want.
However, what I intend to do is collect all the links from each of the pages and parse the title of each post from its inner page while clicking on the next page button. In short, I wish to do the two things simultaneously.
I've tried with:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://stackoverflow.com/questions/tagged/web-scraping"
def get_links(url):
    driver.get(url)
    while True:
        items = [item.get_attribute("href") for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".summary .question-hyperlink")))]
        yield from get_info(items)
        try:
            elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".pager > a[rel='next']")))
            driver.execute_script("arguments[0].scrollIntoView();", elem)
            elem.click()
            time.sleep(2)
        except Exception:
            break

def get_info(links):
    for link in links:
        driver.get(link)
        name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.question-hyperlink"))).text
        yield name

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    for item in get_links(link):
        print(item)
When I run the above script, it parses the titles of different posts by reusing the links from the first page, but then it breaks with raise TimeoutException(message, screen, stacktrace)
when it hits the elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".pager > a[rel='next']"))) line.
How can I scrape the title of each post from its inner page, collecting the links from the first page and then clicking on the next page button, repeating the process until it is done?
The reason you are not finding the next button is that after traversing each inner link, the driver is no longer on the listing page at the end of that loop, so it can't locate the next button.
You need to build each next-page URL like below and load it directly.
urlnext = 'https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page={}&pagesize=30'.format(pageno) #where page will start from 2
Try below code.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://stackoverflow.com/questions/tagged/web-scraping"
def get_links(url):
    urlnext = 'https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page={}&pagesize=30'
    npage = 2
    driver.get(url)
    while True:
        items = [item.get_attribute("href") for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".summary .question-hyperlink")))]
        yield from get_info(items)
        driver.get(urlnext.format(npage))
        try:
            elem = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".pager > a[rel='next']")))
            npage = npage + 1
            time.sleep(2)
        except Exception:
            break

def get_info(links):
    for link in links:
        driver.get(link)
        name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.question-hyperlink"))).text
        yield name

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    for item in get_links(link):
        print(item)

My script encounters an error while clicking links

I've written a script in Python in association with selenium to initiate clicks on some links connected to each profile on a webpage. My script works fine for the first link. As there are no valid links connected to each profile (all of them are javascript links), I had to do the clicking. However, when it is supposed to click on the next link and so on, it throws the error stale element reference: element is not attached to the page document. If it were not for the clicking, I could easily have navigated to each profile without that error, following the logic I applied within my script. In the case of clicking links cyclically, I can't see a way to move along.
What can I do now to get rid of that error and click all the links?
This is my try:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def search_links(driver, link):
    driver.get(link)
    items = [item for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#main table tr a"))) if "#" not in item.get_attribute("href")]
    for item in items:
        item.click() #The error is thrown here as soon as the script goes for the second click
        wait.until(EC.staleness_of(item))
if __name__ == '__main__':
    url = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic"
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        search_links(driver, url)
    finally:
        driver.quit()
This works
def search_links(driver, link):
    driver.get(link)
    items = [item for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#main table tr a"))) if "#" not in item.get_attribute("href")]
    for i in range(0, len(items)):
        items = driver.find_elements_by_css_selector("#main table tr a")
        items[i].click()
        driver.find_element_by_link_text("back to directory").click()
        i = i + 1
The list of items you are iterating over contains WebElements. As soon as you click() one, you trigger navigation which makes all of the original WebElements stale (because you have navigated away from the DOM they were attached to).
As a workaround, you could get all the href url's from the elements and use those. Since they don't contain state, it won't matter what page you are on. So rather than clicking, you could do something like:
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#main table tr a")))
hrefs = [item.get_attribute("href") for item in elements if "#" not in item.get_attribute("href")]
for href in hrefs:
    driver.get(href)
