After starting the scraper something strange happens: it either works properly, ends after visiting the second page and clicking the Next button, or it somehow ends up on the property page, when I use the code with the line that is currently commented out. However, when that line is placed as it is now, it seems to work, it visits all the pages and scrapes them, and eventually, I get a time out. I am unsure what the issue is? Any tips?
The current code:
class PropertyFoxSpider(scrapy.Spider):
name = 'property_fox'
start_urls = [
'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
]
def __init__(self):
#path to driver
self.driver = webdriver.Chrome('my_path')
def parse(self,response):
url = self.driver.get(response.url)
while True:
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
try:
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
#WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
except TimeoutException:
break
def parse_page(self, response):
for prop in response.css('div.property-item'):
link = prop.css('a::attr(href)').get()
banner = prop.css('div.property-figure-icon div::text').get()
sold_tag = None
if banner:
banner = banner.strip()
sold_tag = 'sold' if 'sold' in banner.lower() else None
yield scrapy.Request(
link,
callback=self.parse_property,
meta={'item': {
'agency': self.name,
'url': link,
'offering': 'buy',
'banners': banner,
'sold_tag': sold_tag,
}},
)
def parse_property(self, response):
item = response.meta.get('item')
...
It seem that Selenium "recognize" even disabled Next button as clickable element and still tries to click it even on last page. You can try below code to make it work:
def parse(self,response):
self.driver.get(response.url)
url = self.driver.current_url
while True:
try:
elem = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[#id="pagerNext" and not(#class="disabled")]')))
elem.click()
except TimeoutException:
break
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
Note that I replaced (By.ID, "pagerNext") locator with (By.XPATH, '//span[#id="pagerNext" and not(#class="disabled")]'), so now only enabled Next button will be clicked
Related
I'm working on a scaper that collects property information.
The original code works perfectly.
URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"
class huntsmanCSS(scrapy.Spider):
name = "huntsman"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('parcel_ids.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def parse(self, response):
yield {
'propId': response.css('#dnn_ctr388_View_tdPropertyID::text').extract_first(),
'address': response.css('#dnn_ctr388_View_tdPropertyAddress::text').extract_first(),
'owner': response.css('#dnn_ctr388_View_divOwnersLabel::text').extract_first(),
'propertyClass': response.css('#dnn_ctr388_View_tdGIPropertyClass::text').extract_first(),
'hood': response.css('#dnn_ctr388_View_tdGINeighborhood::text').extract_first(),
'buildType': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(3)::text').extract_first(),
'improveType': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(4)::text').extract_first(),
'yrBuilt': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(5)::text').extract_first(),
'saleDate': response.css('#dnn_ctr388_View_tblSalesHistoryData tr:nth-child(2) > td:nth-child(1)::text').extract_first(),
'TAV': response.css('#dnn_ctr388_View_tdPropertyValueHeader::text').extract_first(),
'price': response.css('#dnn_ctr388_View_tblSalesHistoryData > tr:nth-child(2) > td:nth-child(5)::text').extract_first(),
'sqFt': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(6)::text').extract_first()
}
Using a list of all parcels, it adjusts the URL to go to the next page.
Broken Code:
There is a link to a pdf that is embedded in a javascript button. The pdf contains more information that I want to scrape.
It will retrieve the first link but then throws errors.
URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"
class resDatasheetLink(scrapy.Spider):
name = "resDatasheetLink"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('residential.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def __init__(self):
self.driver = webdriver.Chrome()
def parse(self, response):
self.driver.get(response.url)
while True:
try:
btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="btnDataSheet"]')))
btn.click()
except TimeoutException:
break
time.sleep(5)
link = self.driver.current_url
self.driver.close()
yield {
'datasheet': link
}
Error:
2021-12-30 10:40:36 [scrapy.core.engine] DEBUG:
Crawled (200) <GET
https://orion.lancaster.ne.gov/Property-
Detail/PropertyQuickRefID/R402438> (referer: None)
2021-12-30 10:40:36
[selenium.webdriver.remote.remote_connection]
DEBUG: POST
http://localhost:19113/session/5acb1d8f4ebdb13482ab40a67f846d1d/url {"url": "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438"}
2021-12-30 10:40:36 [urllib3.connectionpool] DEBUG: http://localhost:19113 "POST /session/5acb1d8f4ebdb13482ab40a67f846d1d/url HTTP/1.1" 404 878
2021-12-30 10:40:36 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-12-30 10:40:36 [scrapy.core.scraper] ERROR: Spider error processing <GET https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438> (referer: None)
Traceback (most recent call last):
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id
break will take you out of the while loop. You need to unindent the last few lines just below try-except{} and invoke self.driver.close() (preferably self.driver.quit()) line at the end of parsing as follows:
def parse(self, response):
self.driver.get(response.url)
while True:
try:
btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="btnDataSheet"]')))
btn.click()
except TimeoutException:
break
time.sleep(5)
link = self.driver.current_url
yield {
'datasheet': link
}
self.driver.close()
based on the way the spider is configured the loop was the issue.
class rDataLink(scrapy.Spider):
name = "rDataLink"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('residential.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def __init__(self):
self.driver = webdriver.Chrome()
def parse(self, response):
self.driver.get(response.url)
btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="btnDataSheet"]')))
btn.click()
WebDriverWait(self.driver, 7).until(EC.url_changes(response.url))
link = self.driver.current_url
yield {
'datasheet': link
}
I am working on certain stock-related projects where I have had a task to scrape all data on a daily basis for the last 5 years. i.e from 2016 to date. I particularly thought of using selenium because I can use crawler and bot to scrape the data based on the date. So I used the use of button click with selenium and now I want the same data that is displayed by the selenium browser to be fed by scrappy.
This is the website I am working on right now.
I have written the following code inside scrappy spider.
class FloorSheetSpider(scrapy.Spider):
name = "nepse"
def start_requests(self):
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
floorsheet_dates = ['01/03/2016','01/04/2016', up to till date '01/10/2022']
for date in floorsheet_dates:
driver.get(
"https://merolagani.com/Floorsheet.aspx")
driver.find_element(By.XPATH, "//input[#name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
).send_keys(date)
driver.find_element(By.XPATH, "(//a[#title='Search'])[3]").click()
total_length = driver.find_element(By.XPATH,
"//span[#id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
z = int((total_length.split()[-1]).replace(']', ''))
for data in range(z, z + 1):
driver.find_element(By.XPATH, "(//a[#title='Page {}'])[2]".format(data)).click()
self.url = driver.page_source
yield Request(url=self.url, callback=self.parse)
def parse(self, response, **kwargs):
for value in response.xpath('//tbody/tr'):
print(value.css('td::text').extract()[1])
print("ok"*200)
Update: Error after answer is
2022-01-14 14:11:36 [twisted] CRITICAL:
Traceback (most recent call last):
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/scrapy/crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
I want to send current web html content to scrapy feeder but I am getting unusal error for past 2 days any help or suggestions will be very much appreciated.
The 2 solutions are not very different. Solution #2 fits better to your question, but choose whatever you prefer.
Solution 1 - create a response with the html's body from the driver and scraping it right away (you can also pass it as an argument to a function):
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.http import HtmlResponse
class FloorSheetSpider(scrapy.Spider):
name = "nepse"
def start_requests(self):
# driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver = webdriver.Chrome()
floorsheet_dates = ['01/03/2016','01/04/2016']#, up to till date '01/10/2022']
for date in floorsheet_dates:
driver.get(
"https://merolagani.com/Floorsheet.aspx")
driver.find_element(By.XPATH, "//input[#name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
).send_keys(date)
driver.find_element(By.XPATH, "(//a[#title='Search'])[3]").click()
total_length = driver.find_element(By.XPATH,
"//span[#id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
z = int((total_length.split()[-1]).replace(']', ''))
for data in range(1, z + 1):
driver.find_element(By.XPATH, "(//a[#title='Page {}'])[2]".format(data)).click()
self.body = driver.page_source
response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')
for value in response.xpath('//tbody/tr'):
print(value.css('td::text').extract()[1])
print("ok"*200)
# return an empty requests list
return []
Solution 2 - with super simple downloader middleware:
(You might have a delay here in parse method so be patient).
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
class SeleniumMiddleware(object):
def process_request(self, request, spider):
url = spider.driver.current_url
body = spider.driver.page_source
return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
class FloorSheetSpider(scrapy.Spider):
name = "nepse"
custom_settings = {
'DOWNLOADER_MIDDLEWARES': {
'tempbuffer.spiders.yetanotherspider.SeleniumMiddleware': 543,
# 'projects_name.path.to.your.pipeline': 543
}
}
driver = webdriver.Chrome()
def start_requests(self):
# driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
floorsheet_dates = ['01/03/2016','01/04/2016']#, up to till date '01/10/2022']
for date in floorsheet_dates:
self.driver.get(
"https://merolagani.com/Floorsheet.aspx")
self.driver.find_element(By.XPATH, "//input[#name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
).send_keys(date)
self.driver.find_element(By.XPATH, "(//a[#title='Search'])[3]").click()
total_length = self.driver.find_element(By.XPATH,
"//span[#id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
z = int((total_length.split()[-1]).replace(']', ''))
for data in range(1, z + 1):
self.driver.find_element(By.XPATH, "(//a[#title='Page {}'])[2]".format(data)).click()
self.body = self.driver.page_source
self.url = self.driver.current_url
yield Request(url=self.url, callback=self.parse, dont_filter=True)
def parse(self, response, **kwargs):
print('test ok')
for value in response.xpath('//tbody/tr'):
print(value.css('td::text').extract()[1])
print("ok"*200)
Notice that I've used chrome so change it back to firefox like in your original code.
I am trying to get some data from Google but it I get the "before you continue" google popup. I am trying to make selenium locate the button and click it and return to the getting data but it seems even if I have the button ID in the code it doesn't find it
"""
Search on Google and returns the list of PAA questions in SERP.
"""
def newSearch(browser,query):
if lang== "en":
browser.get("https://www.google.com?hl=en")
WebDriverWait(browser, 10).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe")))
agree = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="L2AGLb"]/span/span')))
agree.click()
browser.switch_to_default_content()
searchbox = browser.find_element_by_xpath("//input[#aria-label='Search']")
else:
browser.get("https://www.google.com?hl=es")
searchbox = browser.find_element_by_xpath("//input[#aria-label='Buscar']")
searchbox.send_keys(query)
sleepBar(2)
tabNTimes()
if lang== "en":
searchbtn = browser.find_elements_by_xpath("//input[#aria-label='Google Search']")
else:
searchbtn = browser.find_elements_by_xpath("//input[#aria-label='Buscar con Google']")
try:
searchbtn[-1].click()
except:
searchbtn[0].click()
sleepBar(2)
paa = browser.find_elements_by_xpath("//span/following-sibling::div[contains(#class,'match-mod-horizontal-padding')]")
hideGBar()
return paa
Try clicking the inner div of the button itself. HTML of the agree popup:
<button id="L2AGLb" class="tHlp8d" data-ved="0ahUKEwj89p7Swob1AhVBxhoKHS0gDxIQiZAHCCE">
<div class="QS5gu sy4vM" role="none">
Acepto
</div>
</button>
Your selector should look like this:
(By.CSS_SELECTOR, "#L2AGLb > div")
Here is a working full example:
def test_google_...(self):
driver = self.driver
if self.LANGUAGE == "en":
driver.get("https://www.google.com?hl=en")
else:
driver.get("https://www.google.com?hl=es")
WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#L2AGLb > div"))
)
driver.find_element(By.CSS_SELECTOR, "#L2AGLb > div").click()
WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="q"]'))
)
query = "your search query"
driver.find_element(By.CSS_SELECTOR, 'input[name="q"]').send_keys(query)
driver.find_element(By.CSS_SELECTOR, 'input[name="q"]').send_keys(Keys.RETURN)
...
I am struggling to parse/scrape each page after clicking the Next button using Selenium. I am able to go to the second page, however, it fails after that. Not sure how to solve this, any suggestions?
Here is the code:
class PropertyFoxSpider(scrapy.Spider):
name = 'property_fox'
start_urls = [
'https://propertyfox.co.za/listing-search?currentpage=1&term_id=62515&keywords=Western+Cape&orderby=createddate:desc&status%5B%5D=Active'
]
def __init__(self):
#path to driver
self.driver = webdriver.Chrome('path')
def parse(self,response):
self.driver.get(response.url)
while True:
try:
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
url = self.driver.current_url
yield scrapy.Request(url=url, callback=self.parse_page, dont_filter=False)
except TimeoutException:
break
def parse_page(self, response):
#self.driver.get(response.url)
for prop in response.css('div.property-item'):
link = prop.css('a::attr(href)').get()
banner = prop.css('div.property-figure-icon div::text').get()
sold_tag = None
if banner:
banner = banner.strip()
sold_tag = 'sold' if 'sold' in banner.lower() else None
yield scrapy.Request(
link,
callback=self.parse_property,
meta={'item': {
'agency': self.name,
'url': link,
'offering': 'buy',
'banners': banner,
'sold_tag': sold_tag,
}},
)
def parse_property(self, response):
item = response.meta.get('item')
...
You can wait until URL changed and then scrape it
from selenium.webdriver.support.ui import WebDriverWait
url = self.driver.current_url
elem = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, "pagerNext")))
elem.click()
WebDriverWait(self.driver, 10).until(lambda driver: self.driver.current_url != url)
url = self.driver.current_url
I want to:
Extract links for a certain page
For each link, I need some contents for that link, and the contents of 'next pages' of that link.
Then export it as json file(not important as far as I think regarding my problem)
Currently my spider is like this:
class mySpider(scrapy.Spider):
...
def parse(self, response):
for url in someurls:
yield scrapy.Request(url=url, callback=self.parse_next)
def parse_next(self, response):
for selector in someselectors:
yield { 'contents':...,
...}
nextPage = obtainNextPage()
if nextPage:
yield scrapy.Request(url=next_url, callback=self.parse_next)
The problem is for a set of links that the spider processed, the spider could only reach 'next page' for the last link of that set of links, I viewed that through selenium + chromedriver. For example, I have 10 links(from No.1 to No.10), my spider could only get the next pages for the No.10 link. I don't know if the problem occurred was because of some structural problem of my spider. Below is the full code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class BaiduSpider(scrapy.Spider):
name = 'baidu'
allowed_domains = ['baidu.com']
start_urls = ['http://tieba.baidu.com']
main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
username = ""
password = ""
def __init__(self, username=username, password=password):
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#options.add_argument('window-size=1200x600')
self.driver = webdriver.Chrome()#chrome_options=options)
self.username = username
self.password = password
# checked
def logIn(self):
elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
elem.click()
wait = WebDriverWait(self.driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,'#TANGRAM__PSP_10__footerULoginBtn')))
elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
elem.click()
elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
elem.send_keys(self.username)
elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
elem.send_keys(self.password)
self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()
# basic checked
def parse(self, response):
self.driver.get(response.url)
self.logIn()
# wait for hand input verify code
time.sleep(15)
self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
for url in self.driver.find_elements_by_css_selector('a.j_th_tit')[:2]:
#new_url = response.urljoin(url)
new_url = url.get_attribute("href")
yield scrapy.Request(url=new_url, callback=self.parse_next)
# checked
def pageScroll(self, url):
self.driver.get(url)
SCROLL_PAUSE_TIME = 0.5
SCROLL_LENGTH = 1200
page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
scrollPosition = 0
while scrollPosition < page_height:
scrollPosition = scrollPosition + SCROLL_LENGTH
self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
time.sleep(SCROLL_PAUSE_TIME)
time.sleep(1.2)
def parse_next(self, response):
self.log('I visited ' + response.url)
self.pageScroll(response.url)
for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
name = sel.find_element_by_css_selector('.d_name').text
try:
content = sel.find_element_by_css_selector('.j_d_post_content').text
except: content = ''
try: reply = sel.find_element_by_css_selector('ul.j_lzl_m_w').text
except: reply = ''
yield {'name': name, 'content': content, 'reply': reply}
#follow to next page
next_sel = self.driver.find_element_by_link_text("下一页")
next_url_name = next_sel.text
if next_sel and next_url_name == '下一页':
next_url = next_sel.get_attribute('href')
yield scrapy.Request(url=next_url, callback=self.parse_next)
Thanks for your help, and welcome any suggestions referring my code above
In reference to scraping content from one page, store it, and allow the spider to continue the crawl to the scrape and store items on subsequent pages. You should be configuring your items.py file with the item names and pass the items through each scrapy.Request using a meta.
You should check out https://github.com/scrapy/scrapy/issues/1138
To illustrate how this works, it goes something like this...
1. First, we set up the item.py file with the total items to be scraped on every page.
#items.py
import scrapy
class ScrapyProjectItem(scrapy.Item):
page_one_item = scrapy.Field()
page_two_item = scrapy.Field()
page_three_item = scrapy.Field()
Then its importing the items.py item class to you scrapy spider.
from scrapyproject.items import ScrapyProjectItem
The in your scraper, through each page iteration that has content you want, its initializing the items.py class the pass the items using 'meta' to the next request.
#spider.py
def parse(self, response):
# Initializing the item class
item = ScrapyProjectItem()
# Itemizing the... item lol
item['page_one_item'] = response.css("etcetc::").extract() # set desired attribute
# Here we pass the items to the next concurrent request
for url in someurls: # Theres a million ways to skin a cat, dont know your exact use case.
yield scrapy.Request(response.urljoin(url),
callback=self.parse_next, meta={'item': item})
def parse_next(self, response):
# We load the meta from the previous request
item = response.meta['item']
# We itemize
item['page_two_item'] = response.css("etcetc::").extract()
# We pass meta again to next request
for url in someurls:
yield scrapy.Request(response.urljoin(url),
callback=self.parse_again, meta={'item': item})
def parse_again(self, response):
# We load the meta from the previous request
item = response.meta['item']
# We itemize
item['page_three_item'] = response.css("etcetc::").extract()
# We pass meta again to next request
for url in someurls:
yield scrapy.Request(response.urljoin(url),
callback=self.parse_again, meta={'item': item})
# At the end of each iteration of the crawl loop we can yield the result
yield item
As to the problem about crawler only reaching the last link, I would like to have more info instead of guessing what the problem could be. In your "parse_next", you should add a "print(response.url)" to see if the pages are being reached at all? Im sorry if I didnt understand your problem and wasted everyones time lol.
EDIT
I think I understand better you issue ... You have a list of urls, and each urls has its own set of urls yes?
In your code, the "obtainNextPage()" might be the issue? I have in the past when encountering this type of case have had to use some xpath and/or regex magic to properly obtain the next pages. Im not sure what "obtainNextPage" is doing but... have you thought of parsing the content and use selector to find the next page?? For example.
class mySpider(scrapy.Spider):
...
def parse(self, response):
for url in someurls:
yield scrapy.Request(url=url, callback=self.parse_next)
def parse_next(self, response):
for selector in someselectors:
yield { 'contents':...,
...}
#nextPage = obtainNextPage()
next_page = response.xpath('//path/to/nextbutton/orPage'):
if next_page is not None:
yield scrapy.Request(response.urljoin(next_page),
callback=self.parse_next)
You should still add that "print(response.url)" to see if the url thats being requested is being called correctly, might be urljoin issue.