Crawled but not scraped - python

I am trying to scrape the following website with Scrapy (https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date), and I see that the page gets crawled but none of the items are returned. Everything works within the Scrapy shell.
Here is the code I have:
class LeadHomeSpider(scrapy.Spider):
    name = "lead_home"
    start_urls = [
        'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date',
    ]

    # parse search page
    def parse(self, response):
        # follow property link
        offering = 'buy' if 'sale' in response.css('h1::text').get() else 'rent'
        for prop in response.css('div.search__PropertyCardWrapper-sc-1j5dndx-0.bsqBpI'):
            link = 'https://www.leadhome.co.za' + prop.css('a::attr(href)').get()
            a = prop.css('p.styles__Label-h53xsw-16.bcSkCI::text').getall()
            #prop_type = attempt_get_property_type(a[0]) if len(a) != 0 else None
            area = a[1] if len(a) > 1 else None
            yield scrapy.Request(
                link,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'area': area,
                    'offering': offering,
                    #'property_type': prop_type,
                }},
                callback=self.parse_property,
            )

        # follow to next page
        next_page_number = response.xpath(
            '//a[contains(@class, "styles__PageNumber-zln67a-0 jRCKhp")]/following-sibling::a/text()').get()
        if next_page_number is not None:
            new_page_link = 'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date&page=' + next_page_number
            next_page = response.urljoin(new_page_link)
            yield scrapy.Request(next_page, callback=self.parse)

    # parse property
    def parse_property(self, response):
        item = response.meta.get('item')
        item['parking'] = response.xpath('//p[contains(text(), "Uncovered Parking:")]/following-sibling::p/text()').get()
        ...
Any idea what might be wrong here? Any suggestions are welcome! Thank you in advance!

You're using auto-generated (random) class values (1j5dndx-0, bsqBpI, etc.) in your CSS expressions; that's why your code doesn't work. Here is the same code, but using XPath's contains() to match only the stable part of a class:
def parse(self, response):
    # follow property link
    offering = 'buy' if 'sale' in response.css('h1::text').get() else 'rent'
    # for prop in response.css('div.search__PropertyCardWrapper-sc-1j5dndx-0.bsqBpI'):
    for prop in response.xpath('//div[contains(@class, "search__PropertyCardWrapper-sc-")]'):
        link = prop.xpath('.//a/@href').get()
        # a = prop.css('p.styles__Label-h53xsw-16.bcSkCI::text').getall()
        prop_type = prop.xpath('(.//p[contains(@class, "styles__Label-")])[1]/text()').get()
        # area = a[1] if len(a) > 1 else None
        link = response.urljoin(link)
        yield scrapy.Request(
            url=link,
            meta={'item': {
                'agency': self.name,
                'url': link,
                # 'area': area,
                'offering': offering,
                'property_type': prop_type,
            }},
            callback=self.parse_property,
        )
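The pagination XPath in the original spider has the same problem: styles__PageNumber-zln67a-0 jRCKhp is also a generated class. A minimal sketch of one workaround that avoids class names entirely and builds the next URL from the ?page=N query parameter (assumptions: the site keeps that parameter and serves no property cards past the last page); it would go at the end of parse() above:

    # follow to next page without relying on hashed class names
    cards = response.xpath('//div[contains(@class, "search__PropertyCardWrapper-sc-")]')
    if cards:
        # default to page 1 when the URL carries no page parameter yet
        current_page = int(response.url.split('page=')[-1]) if 'page=' in response.url else 1
        next_page = ('https://www.leadhome.co.za/search/property-for-sale/western-cape/4'
                     '?sort=date&page={}'.format(current_page + 1))
        yield scrapy.Request(next_page, callback=self.parse)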

Related

Scrapy pagination follows up to 2 pages but it has to follow more

Pagination got the results of page 1 and page 2 while it has to follow more than that, i.e. up to 10 pages. I changed the next_page .css selector to .xpath but nothing works for me.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            avg_rating = response.css('span.starRating--average::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            yield scrapy.Request(final_url, callback=self.parse_site, cb_kwargs={"title": title, "avg_rating": avg_rating})

        next_page = response.urljoin(response.css('a.pagination--next::attr(href)').extract_first())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_site(self, response, title, avg_rating):
        opening_hours = response.css('strong::text').get()
        opening_hours = opening_hours.strip() if opening_hours else ""
        items = {
            'Title': title,
            'Average Rating': avg_rating,
            'Hours': opening_hours
        }
        yield items
I ran the script just now and found that it is doing fine. If you see that the script is grabbing content from the first page only, you will surely want to check out this link manually to see whether you have been rate limited. When you visit the page manually and see the captcha page, take a half-hour break and then run the script again.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            avg_rating = response.css('span.starRating--average::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            yield scrapy.Request(final_url, callback=self.parse_site, cb_kwargs={"title": title, "avg_rating": avg_rating})

        next_page = response.css('a.pagination--next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_site(self, response, title, avg_rating):
        opening_hours = response.css('strong::text').get()
        opening_hours = opening_hours.strip() if opening_hours else ""
        items = {
            'Title': title,
            'Average Rating': avg_rating,
            'Hours': opening_hours
        }
        yield items
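If the captcha / rate limit is the culprit, slowing the crawl down also helps. A minimal sketch using Scrapy's built-in throttling settings (the values below are arbitrary starting points, not tuned for yell.com):

class YellSpider(scrapy.Spider):
    name = 'yell'
    # per-spider settings to stay under the rate limit
    custom_settings = {
        'DOWNLOAD_DELAY': 2,               # pause between requests
        'AUTOTHROTTLE_ENABLED': True,      # adapt the delay to server response times
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 10,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }
    # parse / parse_site unchanged from above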

Scraping all pages on quote.toscrape with scrapy

I'm trying to scrape some information from the website http://quotes.toscrape.com/, but I cannot find a way to scrape all the pages; I only get the first page for now.
Here's my script so far:
import scrapy
from ..items import QuotetutorialItem

class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        items = QuotetutorialItem()
        all_div_quotes = response.css('div.quote')
        for quotes in all_div_quotes:
            title = quotes.css('span.text::text').extract()
            author = quotes.css('.author::text').extract()
            tags = quotes.css('.tag::text').extract()
            items['title'] = title
            items['author'] = author
            items['tags'] = tags
            yield items

        next_page = response.xpath('//*[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I have also tried this:
next_page = response.xpath('//*[@class="next"]/a/@href').get()
absolute_next_page_url = response.urljoin(next_page)
if absolute_next_page_url is not None:
    yield scrapy.Request(absolute_next_page_url)
And this:
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)
But none of these solutions seems to work.
Any ideas? :)
Thanks!
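For reference, a minimal self-contained sketch that wires the last pagination attempt into the spider, yielding plain dicts instead of QuotetutorialItem so it runs outside the project as well; this is essentially the standard quotes.toscrape pagination pattern:

import scrapy

class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # one dict per quote on the current page
        for quote in response.css('div.quote'):
            yield {
                'title': quote.css('span.text::text').get(),
                'author': quote.css('.author::text').get(),
                'tags': quote.css('.tag::text').getall(),
            }
        # follow the "Next" link until it disappears on the last page
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)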

Stop scrapy going to the next page once condition is met

I am trying to understand how Scrapy works and want to know how to stop the spider once a condition is met. I am using the Scrapy tutorial to show that once the author name Pablo Neruda is scraped, the spider should not continue on to the next page. It can finish scraping the current page; it just should not go on to the next page. Any help would be appreciated.
import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'aq1'
    start_urls = ['http://quotes.toscrape.com/']
    stop_page = 0

    def parse(self, response):
        author_page_links = response.css('.author + a')
        yield from response.follow_all(author_page_links, self.parse_author)

        if AuthorSpider.stop_page == 0:
            pagination_links = response.css('li.next a')
            yield from response.follow_all(pagination_links, self.parse)
        else:
            pagination_links = " "
            yield from response.follow_all(pagination_links, self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        yield {
            'Name': extract_with_css('h3.author-title::text'),
        }
        if extract_with_css('h3.author-title::text') == "Pablo Neruda":
            AuthorSpider.stop_page = 1
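One way to keep the flag-based approach and make the stop take effect is simply to not yield any pagination requests once the flag is set; the else branch is not needed. A minimal sketch of a revised parse(), with the rest of the spider unchanged:

    def parse(self, response):
        author_page_links = response.css('.author + a')
        yield from response.follow_all(author_page_links, self.parse_author)

        # only schedule the next listing page while the flag is still unset
        if AuthorSpider.stop_page == 0:
            pagination_links = response.css('li.next a')
            yield from response.follow_all(pagination_links, self.parse)

Note that requests scheduled before parse_author flips the flag will still be downloaded, so a page past the match can still be crawled; stopping the whole crawl immediately would mean raising scrapy.exceptions.CloseSpider instead.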

Scrapy multiple next page

I want to scrape every next page. I've found a way to do it with scrapy shell but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.
alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url

class AnimeScraper_Spider(scrapy.Spider):
    name = "Anime"

    def start_requests(self):
        for url in each_url:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        next_page_url = response.xpath(
            "//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)'):
            url = response.urljoin(href.extract())
            yield Request(url, callback=self.parse_anime)
        yield Request(next_page_url, callback=self.parse)

    def parse_anime(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            return {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip()
            }
I think that you're trying something too complicated; it should be as simple as:
Start from the main page.
Identify all the pages that start with a particular letter.
For each of these pages, take all the next links and repeat.
It looks something like this:
import string
import scrapy
from scrapy import Request

class AnimeSpider(scrapy.Spider):
    name = "Anime"
    start_urls = ['https://myanimelist.net/anime.php']

    def parse(self, response):
        xp = "//div[@id='horiznav_nav']//li/a/@href"
        return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())

    def parse_anime_list_page(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            yield {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
            }

        next_urls = response.xpath("//div[@class='spaceit']//a/@href").extract()
        for next_url in next_urls:
            yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)

Scrapy crawl multiple domains with reoccurring urls per domain

I am trying to crawl some selected domains and take only the essential pages from those websites. My approach is to crawl one page of a domain and take a limited set of URLs; these URLs are then crawled for recurring URLs that I found on the first page. This way I try to eliminate all the URLs that did not recur (content URLs, such as products etc.). The reason I am asking for help is that scrapy.Request is not being executed more than once.
This is what I have so far:
class Finder(scrapy.Spider):
    name = "finder"
    start_urls = ['http://www.nu.nl/']
    uniqueDomainUrl = dict()
    maximumReoccurringPages = 5

    rules = (
        Rule(
            LinkExtractor(
                allow=('.nl', '.nu', '.info', '.net', '.com', '.org', '.info'),
                deny=('facebook', 'amazon', 'wordpress', 'blogspot', 'free', 'reddit',
                      'videos', 'youtube', 'google', 'doubleclick', 'microsoft', 'yahoo',
                      'bing', 'znet', 'stackexchang', 'twitter', 'wikipedia', 'creativecommons',
                      'mediawiki', 'wikidata'),
            ),
            process_request='parse',
            follow=True
        ),
    )

    def parse(self, response):
        self.logger.info('Entering URL: %s', response.url)
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        if currentDomain in self.uniqueDomainUrl:
            yield

        self.uniqueDomainUrl[currentDomain] = currentDomain

        item = ImportUrlList()
        response.meta['item'] = item

        # Reoccurring URLs
        item = self.findReoccurringUrls(response)
        list = item['list']
        self.logger.info('Output: %s', list)

        # Crawl reoccurring urls
        #for href in list:
        #    yield scrapy.Request(response.urljoin(href), callback=self.parse)

    def findReoccurringUrls(self, response):
        self.logger.info('Finding reoccurring URLs in: %s', response.url)
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = urls
        response.meta['item'] = item

        # Get all URLs on each web page (limit 5 pages)
        i = 0
        for value in urls:
            i += 1
            if i > self.maximumReoccurringPages:
                break
            self.logger.info('Parse: %s', value)
            request = Request(value, callback=self.test, meta={'item': item})
            item = request.meta['item']
        return item

    def test(self, response):
        self.logger.info('Page title: %s', response.css('title').extract())
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        item['list'] = set(item['list']) & set(urls)
        return item

    def findUrlsOnCurrentPage(self, response):
        newUrls = []
        currentUrlParse = urlparse.urlparse(response.url)
        currentDomain = currentUrlParse.hostname
        currentUrl = currentUrlParse.scheme + '://' + currentUrlParse.hostname
        for href in response.css('a::attr(href)').extract():
            newUrl = urlparse.urljoin(currentUrl, href)
            urlParse = urlparse.urlparse(newUrl)
            domain = urlParse.hostname

            if href.startswith('#'):
                continue
            if domain != currentDomain:
                continue
            if newUrl not in newUrls:
                newUrls.append(newUrl)
        return newUrls
It seems to be executing only the first page; the other Request() calls are not executed, as far as I can see from the callbacks.
What does ImportUrlList() do? Did you implement it?
You also forgot to use scrapy.Request in findReoccurringUrls:
request = scrapy.Request(value, callback=self.test, meta={'item': item})
def findReoccurringUrls(self, response):
    self.logger.info('Finding reoccurring URLs in: %s', response.url)
    item = response.meta['item']
    urls = self.findUrlsOnCurrentPage(response)
    item['list'] = urls
    response.meta['item'] = item

    # Get all URLs on each web page (limit 5 pages)
    i = 0
    for value in urls:
        i += 1
        if i > self.maximumReoccurringPages:
            break
        self.logger.info('Parse: %s', value)
        request = scrapy.Request(value, callback=self.test, meta={'item': item})
        item = request.meta['item']
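Beyond the missing scrapy. prefix, those Request objects are only constructed and never handed back to Scrapy's engine, so test() never runs. A hedged sketch of one way to restructure this, yielding the follow-up requests directly from parse() (it reuses the asker's names; ImportUrlList and findUrlsOnCurrentPage are assumed to work as posted):

    def parse(self, response):
        self.logger.info('Entering URL: %s', response.url)
        currentDomain = urlparse.urlparse(response.url).hostname
        if currentDomain in self.uniqueDomainUrl:
            return
        self.uniqueDomainUrl[currentDomain] = currentDomain

        item = ImportUrlList()
        item['list'] = self.findUrlsOnCurrentPage(response)

        # yield the requests so Scrapy actually schedules them;
        # each callback receives the shared item via meta
        for value in item['list'][:self.maximumReoccurringPages]:
            yield scrapy.Request(value, callback=self.test, meta={'item': item})

    def test(self, response):
        item = response.meta['item']
        urls = self.findUrlsOnCurrentPage(response)
        # keep only the URLs that reoccur on this page as well
        item['list'] = set(item['list']) & set(urls)
        yield item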
