Unable to scrape the Next page URLs using Selenium and scrapy - python

I am struggling to parse/scrape each page after clicking the Next button using Selenium. I am able to go to the second page; however, it fails after that. I'm not sure how to solve this, any suggestions?
Here is the code:
class PropertyFoxSpider(scrapy.Spider):
    name = 'property_fox'
    start_urls = [
        'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
    ]

    def __init__(self):
        # path to driver
        self.driver = webdriver.Chrome('path')

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            try:
                elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
                elem.click()
                url = self.driver.current_url
                yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
            except TimeoutException:
                break

    def parse_page(self, response):
        #self.driver.get(response.url)
        for prop in response.css('div.property-item'):
            link = prop.css('a::attr(href)').get()
            banner = prop.css('div.property-figure-icon div::text').get()
            sold_tag = None
            if banner:
                banner = banner.strip()
                sold_tag = 'sold' if 'sold' in banner.lower() else None
            yield scrapy.Request(
                link,
                callback=self.parse_property,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'offering': 'buy',
                    'banners': banner,
                    'sold_tag': sold_tag,
                }},
            )

    def parse_property(self, response):
        item = response.meta.get('item')
        ...

You can wait until the URL has changed and then scrape it:
from selenium.webdriver.support.ui import WebDriverWait
url = self.driver.current_url
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
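Put together with the original parse() loop, that wait could look something like the sketch below. This is untested; it assumes the same pagerNext button and TimeoutException handling as in the question, and it also queues the landing page itself, which the original loop never yields.
def parse(self, response):
    self.driver.get(response.url)
    # queue the landing page; the original loop only yields pages after a click,
    # and dont_filter=True stops Scrapy's dupe filter from dropping the start URL
    yield scrapy.Request(url=self.driver.current_url, callback=self.parse_page, dont_filter=True)
    while True:
        url = self.driver.current_url
        try:
            elem = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.ID, "pagerNext"))
            )
            elem.click()
            # wait until the click has actually navigated before queueing the new URL
            WebDriverWait(self.driver, 10).until(lambda d: d.current_url != url)
        except TimeoutException:
            break
        yield scrapy.Request(url=self.driver.current_url, callback=self.parse_page, dont_filter=False)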


How to fix Scrapy-Selenium not yielding output?

A plain Selenium request works, but scrapy-selenium does not. The page loads and I get a 200 response from the website, and no error either; the spider simply isn't yielding any output.
class SeamdbTestSpider(scrapy.Spider):
    name = 'steam_db_test'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                callback=self.parse)

    def parse(self, response):
        driver = response.meta['driver']
        initial_page = driver.page_source
        r = Selector(text=initial_page)
        table = r.xpath('//*[@id="table-apps"]/tbody')
        rows = table.css('tr[class="app"]')[0:2]
        for element in rows:
            info_link = "https://steamdb.info" + element.css('::attr(href)').get()
            name = element.css('a ::text').get()
            yield {"Name": name, "Link": info_link}
Actually, SeleniumRequest with Scrapy is not always reliable. The same element selection works with Selenium and BeautifulSoup, but with scrapy-selenium I get empty output as well, just like you.
Scrapy-SeleniumRequest not working
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest

class SeamdbTestSpider(scrapy.Spider):
    name = 'steam_db_test'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                callback=self.parse)

    def parse(self, response):
        driver = response.meta['driver']
        initial_page = driver.page_source
        r = Selector(text=initial_page)
        rows = r.css('table#table-apps tbody tr')
        for element in rows:
            info_link = "https://steamdb.info" + element.css('td:nth-child(3) > a::attr(href)').get()
            name = element.css('td:nth-child(3) > a::text').get()
            yield {"Name": name, "Link": info_link}
Selenium with bs4 is working fine:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# keep Chrome open after the script finishes
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

driver.get("https://steamdb.info/graph/")
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')

for tr in soup.select('table#table-apps tbody tr'):
    link = tr.select_one('td:nth-child(3) > a').get('href')
    link = "https://steamdb.info" + link
    name = tr.select_one('td:nth-child(3) > a').text
    print(link)
    print(name)
Output:
https://steamdb.info/app/730/graphs/
Counter-Strike: Global Offensive
https://steamdb.info/app/570/graphs/
Dota 2
https://steamdb.info/app/578080/graphs/
PUBG: BATTLEGROUNDS
https://steamdb.info/app/1172470/graphs/
Apex Legends
https://steamdb.info/app/1599340/graphs/
Lost Ark
https://steamdb.info/app/271590/graphs/
Grand Theft Auto V
https://steamdb.info/app/440/graphs/
Team Fortress 2
https://steamdb.info/app/1446780/graphs/
MONSTER HUNTER RISE
https://steamdb.info/app/346110/graphs/
ARK: Survival Evolved
https://steamdb.info/app/252490/graphs/
Rust
https://steamdb.info/app/431960/graphs/
Wallpaper Engine
https://steamdb.info/app/1506830/graphs/
FIFA 22
https://steamdb.info/app/1085660/graphs/
Destiny 2
https://steamdb.info/app/1569040/graphs/
Football Manager 2022
https://steamdb.info/app/230410/graphs/
Warframe
https://steamdb.info/app/1203220/graphs/
NARAKA: BLADEPOINT
https://steamdb.info/app/359550/graphs/
Tom Clancy's Rainbow Six Siege
https://steamdb.info/app/381210/graphs/
Dead by Daylight
https://steamdb.info/app/236390/graphs/
.. so on
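If the goal is still to have the Scrapy spider yield those rows as items, one option is to drive Selenium directly from inside the spider and feed driver.page_source into a Scrapy Selector. A rough sketch, untested, assuming chromedriver is available on PATH:
import scrapy
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class SteamDbSeleniumSpider(scrapy.Spider):
    name = "steam_db_selenium"
    start_urls = ["https://steamdb.info/graph/"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        self.driver.get(response.url)
        # wait until the JS-rendered table has at least one row
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table#table-apps tbody tr")))
        sel = Selector(text=self.driver.page_source)
        for row in sel.css("table#table-apps tbody tr"):
            link = row.css("td:nth-child(3) > a::attr(href)").get()
            name = row.css("td:nth-child(3) > a::text").get()
            if link:
                yield {"Name": name, "Link": response.urljoin(link)}

    def closed(self, reason):
        self.driver.quit()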

Spider scrapes only 1 page and does not move on to the next pages

The spider scrapes the data of page 1 only and does not move on to the second page. I have tried different approaches but have not been able to solve the problem. Is there any solution? This is the page URL: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def __init__(self):
        self.driver = webdriver.Chrome(
            r'C:\Program Files (x86)\chromedriver.exe')

    def parse_book(self, response):
        self.driver.get(
            "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")
        title = response.xpath(
            "//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath(
            "//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# test with a smaller number
for i in range(1, 4):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrape here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # just an example
    print(elements)

driver.quit()
These give me the output of only one page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up your driver, open the page, etc.

max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
# you can use a regex here instead
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# test with a smaller number first
for i in range(1, max_page):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrape here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # just an example
    print(elements)

driver.quit()
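To feed each clicked page back into the Scrapy extraction logic, one option (a sketch, not verified against this site's postback behaviour) is to re-parse driver.page_source with a Selector after every click:
from scrapy import Selector

for i in range(1, max_page):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # re-read the rendered HTML so the same XPaths as in the spider can be reused
    sel = Selector(text=driver.page_source)
    for href in sel.xpath("//div[@class='list-group']//@href").getall():
        print(href)  # or build absolute URLs and yield Requests from inside the spider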

TimeoutException Selenium

After starting the scraper, something strange happens: when I use the version with the line that is currently commented out, it either works properly, ends after visiting the second page and clicking the Next button, or somehow ends up on a property page. However, with that line placed where it is now, it seems to work: it visits and scrapes all the pages, but eventually I get a timeout. I am unsure what the issue is. Any tips?
The current code:
class PropertyFoxSpider(scrapy.Spider):
    name = 'property_fox'
    start_urls = [
        'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
    ]

    def __init__(self):
        # path to driver
        self.driver = webdriver.Chrome('my_path')

    def parse(self, response):
        url = self.driver.get(response.url)
        while True:
            WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
            try:
                elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
                elem.click()
                #WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
                url = self.driver.current_url
                yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
            except TimeoutException:
                break

    def parse_page(self, response):
        for prop in response.css('div.property-item'):
            link = prop.css('a::attr(href)').get()
            banner = prop.css('div.property-figure-icon div::text').get()
            sold_tag = None
            if banner:
                banner = banner.strip()
                sold_tag = 'sold' if 'sold' in banner.lower() else None
            yield scrapy.Request(
                link,
                callback=self.parse_property,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'offering': 'buy',
                    'banners': banner,
                    'sold_tag': sold_tag,
                }},
            )

    def parse_property(self, response):
        item = response.meta.get('item')
        ...
It seems that Selenium "recognizes" even the disabled Next button as a clickable element and still tries to click it, even on the last page. You can try the code below to make it work:
def parse(self, response):
    self.driver.get(response.url)
    url = self.driver.current_url
    while True:
        try:
            elem = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//span[@id="pagerNext" and not(@class="disabled")]')))
            elem.click()
        except TimeoutException:
            break
        WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
        url = self.driver.current_url
        yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
Note that I replaced the (By.ID, "pagerNext") locator with (By.XPATH, '//span[@id="pagerNext" and not(@class="disabled")]'), so now only the enabled Next button will be clicked.
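An alternative, if the XPath feels brittle (just a sketch, assuming the pager marks the last-page button with a "disabled" class), is to read the button's class attribute and stop explicitly:
# inside the while True loop, instead of the XPath-based wait:
elem = WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located((By.ID, "pagerNext")))
if "disabled" in (elem.get_attribute("class") or ""):
    break  # last page reached
elem.click()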

Unable to paginate with Selenium and Scrapy

I am scraping a website with Scrapy. My problem is that the pagination uses JavaScript, so I can't simply loop through the page links.
I tried to work around this with Selenium, but I get multiple errors, with a lot of (referer: None) or Unable to locate element: {"method":"xpath","selector":"//li[@class="btn-next"]/a"
My spider code:
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from lp_spider.items import AnnonceItem
from selenium import webdriver
class AnnonceSpider(scrapy.Spider):
name = 'lp_results'
def __init__(self, *args, **kwargs):
data_file = pkgutil.get_data(
"lp_spider", "json/input/db_scrap_url_lp_js_10000_reduced2.json")
self.data = json.loads(data_file)
self.driver = webdriver.Chrome()
def start_requests(self):
for item in self.data:
request = scrapy.Request(item['url_lp'], callback=self.parse)
request.meta['item'] = item
yield request
def parse(self, response):
self.driver.get(response.url)
while True:
next = self.driver.find_element_by_xpath(
"//li[#class='btn-next']/a")
try:
item = response.meta['item']
item['results'] = []
for caritem in response.css("li.li-result"):
data = AnnonceItem()
data["marque"] = caritem.css("span.brand::text").extract_first(
)
item['results'].append(data)
yield item
next.click()
except:
break
self.driver.close()
A screenshot of the pagination's HTML:
EDIT: here is the XHR from the console when I click on the next button:
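A likely culprit here is that the Scrapy response passed to parse() never changes after Selenium clicks the button, so every iteration re-scrapes the first page. Below is a minimal sketch of re-parsing the rendered source after each click; it is untested and assumes the //li[@class='btn-next']/a locator from the error message is correct.
from scrapy import Selector
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def parse(self, response):
    self.driver.get(response.url)
    item = response.meta['item']
    item['results'] = []
    while True:
        # re-read the page after every click; `response` itself never changes
        sel = Selector(text=self.driver.page_source)
        for caritem in sel.css("li.li-result"):
            data = AnnonceItem()
            data["marque"] = caritem.css("span.brand::text").extract_first()
            item['results'].append(data)
        try:
            next_link = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//li[@class='btn-next']/a")))
            next_link.click()
        except TimeoutException:
            break
    yield item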

scrapy crawl a set of links that might contain next pages

I want to:
Extract links for a certain page
For each link, I need some contents for that link, and the contents of 'next pages' of that link.
Then export it as a JSON file (not important, I think, as far as my problem is concerned)
Currently my spider is like this:
class mySpider(scrapy.Spider):
    ...
    def parse(self, response):
        for url in someurls:
            yield scrapy.Request(url=url, callback=self.parse_next)

    def parse_next(self, response):
        for selector in someselectors:
            yield {'contents': ...,
                   ...}
        nextPage = obtainNextPage()
        if nextPage:
            yield scrapy.Request(url=next_url, callback=self.parse_next)
The problem is that, for a set of links the spider processes, it only reaches the 'next page' of the last link in that set; I observed this through selenium + chromedriver. For example, if I have 10 links (No. 1 to No. 10), my spider only gets the next pages for the No. 10 link. I don't know whether this happens because of some structural problem in my spider. Below is the full code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        #options = webdriver.ChromeOptions()
        #options.add_argument('headless')
        #options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  #chrome_options=options)
        self.username = username
        self.password = password

    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()

    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for manual input of the verification code
        time.sleep(15)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit')[:2]:
            #new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_next)

    # checked
    def pageScroll(self, url):
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)

    def parse_next(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''
            try:
                reply = sel.find_element_by_css_selector('ul.j_lzl_m_w').text
            except:
                reply = ''
            yield {'name': name, 'content': content, 'reply': reply}

        # follow to next page ("下一页" means "next page")
        next_sel = self.driver.find_element_by_link_text("下一页")
        next_url_name = next_sel.text
        if next_sel and next_url_name == '下一页':
            next_url = next_sel.get_attribute('href')
            yield scrapy.Request(url=next_url, callback=self.parse_next)
Thanks for your help, and any suggestions regarding my code above are welcome.
Regarding scraping content from one page, storing it, and allowing the spider to continue the crawl to scrape and store items on subsequent pages: you should configure your items.py file with the item names and pass the item through each scrapy.Request using meta.
You should check out https://github.com/scrapy/scrapy/issues/1138
To illustrate how this works, it goes something like this...
First, we set up the items.py file with all the items to be scraped across the pages.
# items.py
import scrapy

class ScrapyProjectItem(scrapy.Item):
    page_one_item = scrapy.Field()
    page_two_item = scrapy.Field()
    page_three_item = scrapy.Field()
Then import the items.py item class into your Scrapy spider:
from scrapyproject.items import ScrapyProjectItem
Then, in your scraper, on each page iteration that has content you want, initialize the items.py class and pass the item using 'meta' to the next request:
# spider.py
def parse(self, response):
    # Initializing the item class
    item = ScrapyProjectItem()
    # Itemizing the... item lol
    item['page_one_item'] = response.css("etcetc::").extract()  # set desired attribute
    # Here we pass the item to the next concurrent request
    for url in someurls:  # there's a million ways to skin a cat, don't know your exact use case
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_next, meta={'item': item})

def parse_next(self, response):
    # We load the meta from the previous request
    item = response.meta['item']
    # We itemize
    item['page_two_item'] = response.css("etcetc::").extract()
    # We pass meta again to the next request
    for url in someurls:
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_again, meta={'item': item})

def parse_again(self, response):
    # We load the meta from the previous request
    item = response.meta['item']
    # We itemize
    item['page_three_item'] = response.css("etcetc::").extract()
    # We pass meta again to the next request
    for url in someurls:
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_again, meta={'item': item})
    # At the end of each iteration of the crawl loop we can yield the result
    yield item
As for the problem of the crawler only reaching the last link, I would like to have more info instead of guessing what the problem could be. In your "parse_next", you could add a "print(response.url)" to see whether the pages are being reached at all. I'm sorry if I didn't understand your problem and wasted everyone's time lol.
EDIT
I think I understand your issue better now... You have a list of URLs, and each URL has its own set of URLs, yes?
In your code, the "obtainNextPage()" might be the issue. In the past, when encountering this type of case, I have had to use some XPath and/or regex magic to properly obtain the next pages. I'm not sure what "obtainNextPage" is doing, but... have you thought of parsing the content and using a selector to find the next page? For example:
class mySpider(scrapy.Spider):
    ...
    def parse(self, response):
        for url in someurls:
            yield scrapy.Request(url=url, callback=self.parse_next)

    def parse_next(self, response):
        for selector in someselectors:
            yield {'contents': ...,
                   ...}
        #nextPage = obtainNextPage()
        next_page = response.xpath('//path/to/nextbutton/orPage').get()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse_next)
You should still add that "print(response.url)" to see whether the URL being requested is correct; it might be a urljoin issue.
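For instance (the locator below is hypothetical; the real class of the next button depends on the page), extracting the href directly and letting urljoin build the absolute URL:
# hypothetical markup: <li class="next"><a href="/page/2">next</a></li>
next_href = response.xpath('//li[@class="next"]/a/@href').get()
if next_href:
    yield scrapy.Request(response.urljoin(next_href), callback=self.parse_next)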
