Scraping all pages on quotes.toscrape.com with Scrapy - Python

I'm trying to scrape some information from the website http://quotes.toscrape.com/, but I cannot find a way to scrape all the pages; so far I only get the first page.
Here's my script so far:
import scrapy
from ..items import QuotetutorialItem


class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        items = QuotetutorialItem()
        all_div_quotes = response.css('div.quote')
        for quotes in all_div_quotes:
            title = quotes.css('span.text::text').extract()
            author = quotes.css('.author::text').extract()
            tags = quotes.css('.tag::text').extract()
            items['title'] = title
            items['author'] = author
            items['tags'] = tags
            yield items
        next_page = response.xpath('//*[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I have also tried this:
next_page = response.xpath('//*[@class="next"]/a/@href').get()
absolute_next_page_url = response.urljoin(next_page)
if absolute_next_page_url is not None:
    yield scrapy.Request(absolute_next_page_url)
And this:
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)
But none of these solutions seems to work.
Any ideas? :)
Thanks!
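For reference, the pagination pattern used in the official Scrapy tutorial for this exact site looks like the following. This is a minimal, self-contained sketch that yields plain dicts instead of QuotetutorialItem so it can be run directly with scrapy runspider; the spider name is illustrative:

import scrapy


class QuotesPaginationSpider(scrapy.Spider):
    # Illustrative name; yields plain dicts so the sketch does not depend on an items module
    name = 'quotes_pagination'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'title': quote.css('span.text::text').get(),
                'author': quote.css('.author::text').get(),
                'tags': quote.css('.tag::text').getall(),
            }
        # 'li.next a' is only present while there is a further page
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)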

Related

How do I go to the next page with Scrapy in Python

I am trying to scrape job vacancies from Indeed. Everything in my scraper works, except that it only scrapes the first page. Does anybody know what the problem might be?
class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    allowed_domains = ['nl.indeed.com']
    start_urls = ['https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0']

    def parse(self, response):
        urls = response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)
        next_page_url = response.css('ul.pagination-list li:nth-child(7) a::attr(href)').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        Page = response.url
        Title = response.css('h1.icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title ::text').extract_first()
        Company = response.css('div.icl-u-lg-mr--sm.icl-u-xs-mr--xs ::text').extract_first()
        Location = response.css('.jobsearch-DesktopStickyContainer-companyrating+ div div ::text').extract_first()
        Description = response.xpath('normalize-space(//div[contains(@class, "jobsearch-jobDescriptionText")])').extract_first()
        Date = response.css('span.jobsearch-HiringInsights-entry--text ::text').extract_first()
        yield {
            'Page': Page,
            'Title': Title,
            'Company': Company,
            'Location': Location,
            'Description': Description,
            'Date': Date
        }
Can anybody help me?
The CSS selector is wrong, so next_page_url is None.
The next-page link is the 6th child, but instead of hard-coding nth-child I used last-child:
import scrapy


class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    allowed_domains = ['nl.indeed.com']
    start_urls = ['https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0']

    def parse(self, response):
        urls = response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            # yield scrapy.Request(url=url, callback=self.parse_details)

        # example with css:
        # next_page_url = response.css('ul.pagination-list li:last-child a::attr(href)').get()
        # example with xpath:
        next_page_url = response.xpath('//ul[@class="pagination-list"]/li[last()]/a/@href').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            print(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        Page = response.url
        Title = response.css('h1.icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title ::text').extract_first()
        Company = response.css('div.icl-u-lg-mr--sm.icl-u-xs-mr--xs ::text').extract_first()
        Location = response.css('.jobsearch-DesktopStickyContainer-companyrating+ div div ::text').extract_first()
        Description = response.xpath('normalize-space(//div[contains(@class, "jobsearch-jobDescriptionText")])').extract_first()
        Date = response.css('span.jobsearch-HiringInsights-entry--text ::text').extract_first()
        yield {
            'Page': Page,
            'Title': Title,
            'Company': Company,
            'Location': Location,
            'Description': Description,
            'Date': Date
        }
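A quick way to verify a pagination selector like this before editing the spider is Scrapy's interactive shell; for example (the output depends on the live page, so none is shown here):

scrapy shell 'https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0'
>>> response.xpath('//ul[@class="pagination-list"]/li[last()]/a/@href').get()
>>> response.css('ul.pagination-list li:last-child a::attr(href)').get()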

Scrapy pagination follows up to 2 pages but it has to follow more

Pagination gets results from page 1 and page 2, while it has to follow more than that, i.e. up to 10 pages. I changed the next_page CSS selector to XPath, but nothing worked for me.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            avg_rating = response.css('span.starRating--average::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            yield scrapy.Request(final_url, callback=self.parse_site, cb_kwargs={"title": title, "avg_rating": avg_rating})
        next_page = response.urljoin(response.css('a.pagination--next::attr(href)').extract_first())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_site(self, response, title, avg_rating):
        opening_hours = response.css('strong::text').get()
        opening_hours = opening_hours.strip() if opening_hours else ""
        items = {
            'Title': title,
            'Average Rating': avg_rating,
            'Hours': opening_hours
        }
        yield items
I ran the script just now and it is working fine. If you see the script grabbing content from the first page only, you should open that link manually to check whether you have been rate limited. If you see a captcha page when you visit it in the browser, take a half-hour break and then run the script again.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            avg_rating = response.css('span.starRating--average::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            yield scrapy.Request(final_url, callback=self.parse_site, cb_kwargs={"title": title, "avg_rating": avg_rating})

        next_page = response.css('a.pagination--next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_site(self, response, title, avg_rating):
        opening_hours = response.css('strong::text').get()
        opening_hours = opening_hours.strip() if opening_hours else ""
        items = {
            'Title': title,
            'Average Rating': avg_rating,
            'Hours': opening_hours
        }
        yield items
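If rate limiting does turn out to be the problem, Scrapy's built-in throttling settings can slow the crawl down. A sketch with illustrative values; these are standard Scrapy settings, placed in the project's settings.py or in custom_settings on the spider:

# settings.py -- illustrative values, tune for the target site
DOWNLOAD_DELAY = 2                    # seconds between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 1    # one request at a time per domain
AUTOTHROTTLE_ENABLED = True           # adjust the delay automatically based on server latency
AUTOTHROTTLE_START_DELAY = 1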

Scrapy not able to scrape the next page

I wanted to scrape information from the subsequent pages as well; however, the code only lets me scrape information from the first page.
My code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from ..items import PropertyItem


class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.com']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        item = PropertyItem()
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item
            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
It's all about your allowed_domains (but you need to fix your indentation too). Also, I'm sure you want to define your item inside your loop:
class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.my']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item = PropertyItem()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Maybe it's due to the indentation? Try changing:
            yield item
            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
to
            yield item
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
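As the first answer points out, the root cause in the question is allowed_domains: Scrapy's offsite middleware drops any request whose host does not fall under one of the listed domains, so with 'starproperty.com' listed, every follow-up request to www.starproperty.my gets filtered and only the first response is ever parsed. The fix is the single line already shown in the code above:

allowed_domains = ['starproperty.my']  # must match the domain of the URLs the spider actually follows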

Python crawler returns the same response for every different URL request

I am building a very simple scraper, but I am making a silly mistake somewhere that I am not able to find.
In the callback method, I get the same response for every URL requested in the loop over all the products on the product list page.
I am adding my code below; please help.
def parse(self, response):
    item = {}
    count = 0
    for single in response.xpath('//div[@class="_3O0U0u"]/div'):
        count += 1
        # print(count)
        item['data_id'] = single.xpath('.//@data-id').extract_first()
        item['price'] = single.xpath('.//div[@class="_1vC4OE"]/text()').extract_first()
        item['url'] = single.xpath('.//div[@class="_1UoZlX"]/a[@class="_31qSD5"]/@href').extract_first()
        if not item['url']:
            item['url'] = single.xpath('.//div[@class="_3liAhj _1R0K0g"]/a[@class="Zhf2z-"]/@href').extract_first()
        # print(item)
        if item['url']:
            yield scrapy.Request('https://www.somewebsite.com' + item['url'], callback=self.get_product_detail, priority=1, meta={'item': item})
        # break
    next_page = response.xpath('//div[@class="_2zg3yZ"]/nav/a[@class="_3fVaIS"]/span[contains(text(),"Next")]/parent::a/@href').extract_first()
    if next_page:
        next_page = 'https://www.somewebsite.com' + response.xpath('//div[@class="_2zg3yZ"]/nav/a[@class="_3fVaIS"]/span[contains(text(),"Next")]/parent::a/@href').extract_first()
        yield scrapy.Request(next_page, callback=self.parse, priority=1)

def get_product_detail(self, response):
    dict_item = response.meta['item']
    sku = dict_item['data_id']
    print('dict SKU ======== ', sku)
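No answer is included above, but one plausible cause, judging from the posted code alone, is that the single item dict is created once and then overwritten on every loop iteration; meta={'item': item} only stores a reference to that same dict, so by the time get_product_detail runs, every callback sees the last product's values, which would explain the identical SKU being printed. A sketch of that fix, reusing the selectors from the question, would change the loop body to build a fresh dict per product:

    for single in response.xpath('//div[@class="_3O0U0u"]/div'):
        # Build a new dict on every iteration so each Request carries its own
        # data instead of a reference to one shared, constantly overwritten dict.
        item = {
            'data_id': single.xpath('.//@data-id').extract_first(),
            'price': single.xpath('.//div[@class="_1vC4OE"]/text()').extract_first(),
            'url': single.xpath('.//div[@class="_1UoZlX"]/a[@class="_31qSD5"]/@href').extract_first(),
        }
        if item['url']:
            yield scrapy.Request('https://www.somewebsite.com' + item['url'],
                                 callback=self.get_product_detail,
                                 priority=1, meta={'item': item})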

How to scrape the URL itself when following links in Scrapy

I am confused about how to scrape the URL itself when following links in Scrapy.
I am crawling this page:
import scrapy
from ..items import SkripsiItem


class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)
        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()
        items['title'] = extract_with_css('h1::text'),
        items['author'] = extract_with_css('.author a::text'),
        items['time'] = extract_with_css('time::text'),
        items['imagelink'] = extract_with_css('.article img::attr(src)'),
        items['content'] = ''.join(content),
        yield items
How do I scrape every URL that is visited when following links, i.e. the links selected by .lnk-t a::attr(href) in the code above?
Save the URL with items['url'] = response.url in the parse_author function.
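For context, a sketch of what that looks like inside parse_author from the question; it assumes a url field (and, for brevity here, a title field) has been declared on SkripsiItem:

    def parse_author(self, response):
        items = SkripsiItem()
        # response.url is the address of the page currently being parsed,
        # i.e. the article link that was followed from the listing page.
        items['url'] = response.url
        items['title'] = response.css('h1::text').get(default='').strip()
        yield items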
