How do I go to the next page with Scrapy in Python?

I am trying to scrape job vacancies from Indeed. Everything in my scraper works, except that it only scrapes the first page. Does anybody know what the problem might be?
class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    allowed_domains = ['nl.indeed.com']
    start_urls = ['https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0']

    def parse(self, response):
        urls = response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)

        next_page_url = response.css('ul.pagination-list li:nth-child(7) a::attr(href)').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        Page = response.url
        Title = response.css('h1.icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title ::text').extract_first()
        Company = response.css('div.icl-u-lg-mr--sm.icl-u-xs-mr--xs ::text').extract_first()
        Location = response.css('.jobsearch-DesktopStickyContainer-companyrating+ div div ::text').extract_first()
        Description = response.xpath('normalize-space(//div[contains(@class, "jobsearch-jobDescriptionText")])').extract_first()
        Date = response.css('span.jobsearch-HiringInsights-entry--text ::text').extract_first()
        yield {
            'Page': Page,
            'Title': Title,
            'Company': Company,
            'Location': Location,
            'Description': Description,
            'Date': Date
        }
Can anybody help me?

The CSS selector is wrong, so "next_page_url" is None.
The next page link is the 6th child, but instead of using "nth-child" I used "last-child".
import scrapy


class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    allowed_domains = ['nl.indeed.com']
    start_urls = ['https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0']

    def parse(self, response):
        urls = response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            # yield scrapy.Request(url=url, callback=self.parse_details)

        # example with css:
        # next_page_url = response.css('ul.pagination-list li:last-child a::attr(href)').get()
        # example with xpath:
        next_page_url = response.xpath('//ul[@class="pagination-list"]/li[last()]/a/@href').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            print(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        Page = response.url
        Title = response.css('h1.icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title ::text').extract_first()
        Company = response.css('div.icl-u-lg-mr--sm.icl-u-xs-mr--xs ::text').extract_first()
        Location = response.css('.jobsearch-DesktopStickyContainer-companyrating+ div div ::text').extract_first()
        Description = response.xpath('normalize-space(//div[contains(@class, "jobsearch-jobDescriptionText")])').extract_first()
        Date = response.css('span.jobsearch-HiringInsights-entry--text ::text').extract_first()
        yield {
            'Page': Page,
            'Title': Title,
            'Company': Company,
            'Location': Location,
            'Description': Description,
            'Date': Date
        }
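As a side note, Scrapy's response.follow accepts the relative hrefs straight from the selector, so the urljoin calls can be dropped. A minimal sketch of the same parse method using it (same selectors as above, not tested against the live site):

def parse(self, response):
    # follow each job detail link; response.follow resolves relative URLs itself
    for url in response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').extract():
        yield response.follow(url, callback=self.parse_details)

    # follow the last pagination entry as the "next" page
    next_page_url = response.xpath('//ul[@class="pagination-list"]/li[last()]/a/@href').get()
    if next_page_url is not None:
        yield response.follow(next_page_url, callback=self.parse)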

Related

Scrapy pagination follows up to 2 pages but it has to follow more

Pagination got results for page_1 and page_2 while it has to follow more than that, i.e. up to 10 pages. I changed the next_page CSS selector to XPath but nothing worked for me.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            avg_rating = response.css('span.starRating--average::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            yield scrapy.Request(final_url, callback=self.parse_site, cb_kwargs={"title": title, "avg_rating": avg_rating})

        next_page = response.urljoin(response.css('a.pagination--next::attr(href)').extract_first())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_site(self, response, title, avg_rating):
        opening_hours = response.css('strong::text').get()
        opening_hours = opening_hours.strip() if opening_hours else ""
        items = {
            'Title': title,
            'Average Rating': avg_rating,
            'Hours': opening_hours
        }
        yield items
I ran the script just now and found that it is working fine. If you see that the script is grabbing content from the first page only, you should check the link manually to be sure you haven't been rate limited. If you visit the page manually and see a captcha page, take a half-hour break and then run the script again.
class YellSpider(scrapy.Spider):
    name = 'yell'
    base_url = 'https://www.yell.com{}'
    start_urls = ['https://www.yell.com/ucs/UcsSearchAction.do?scrambleSeed=770796459&keywords=hospitals&location=united+kingdom']

    def parse(self, response):
        for data in response.css('div.row.businessCapsule--mainRow'):
            title = data.css('.text-h2::text').get()
            avg_rating = response.css('span.starRating--average::text').get()
            business_url = data.css('a.businessCapsule--title::attr(href)').get()
            final_url = self.base_url.format(business_url)
            yield scrapy.Request(final_url, callback=self.parse_site, cb_kwargs={"title": title, "avg_rating": avg_rating})

        next_page = response.css('a.pagination--next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_site(self, response, title, avg_rating):
        opening_hours = response.css('strong::text').get()
        opening_hours = opening_hours.strip() if opening_hours else ""
        items = {
            'Title': title,
            'Average Rating': avg_rating,
            'Hours': opening_hours
        }
        yield items
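If rate limiting does turn out to be the problem, slowing the crawl down usually helps. A minimal sketch of throttling settings (the values are illustrative assumptions, not tuned for yell.com), which can live in settings.py or on the spider as custom_settings:

class YellSpider(scrapy.Spider):
    name = 'yell'
    # Illustrative throttling values -- adjust for the site you are crawling.
    custom_settings = {
        'DOWNLOAD_DELAY': 2,                   # seconds to wait between requests
        'AUTOTHROTTLE_ENABLED': True,          # let Scrapy adapt the delay to server responses
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,   # one request at a time per domain
    }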

How to get a list of URLs and use them in Scrapy (Python) for web data extraction

I am creating a web scraper using Scrapy in Python. Here is my code:
import scrapy


class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = [
        'https://perfumehut.com.pk/shop/',
    ]

    def parse(self, response):
        yield {
            'product_link': response.css('a.product-image-link::attr("href")').get(),
            'product_title': response.css('h3.product-title>a::text').get(),
            'product_price': response.css('span.price > span > bdi::text').get(),
        }

        next_page = response.css('ul.page-numbers>li>a.next.page-numbers::attr("href")').get()
        if next_page is not None:
            print()
            print(next_page)
            print()
            yield scrapy.Request(next_page)

    def parse(self, response):
        yield {
            'title': response.css('h1::text').get(),
            'batt': response.css('td.woocommerce-product-attributes-item__value p::text')[3].get(),
            'brand': response.css('div.woodmart-product-brand img::attr(alt)').get(),
            'brandimg': response.css('div.woodmart-product-brand img::attr(src)').get(),
            'price': response.css('p.price').xpath('./span/bdi/text()').get(),
            'r-price': response.css('p.price').xpath('./del/span/bdi/text()').get(),
            's-sale': response.css('p.price').xpath('./ins/span/bdi/text()').get(),
            'breadcrumbs': response.css('nav.woocommerce-breadcrumb a::text').getall(),
            'tags': response.css('span.tagged_as a::text').getall(),
            'attributes': response.css('td.woocommerce-product-attributes-item__value p::text').getall(),
            'img': response.css('figure.woocommerce-product-gallery__image a::attr("href")').getall(),
            'description': response.css('div.woocommerce-product-details__short-description p::text').get(),
            'description1': response.css('#tab-description > div > div > p::text').getall(),
            'description2': response.css('#tab-description > div > div > div > div > div > div > div > div > p::text').getall()
        }
It's a WooCommerce website.
There are a total of 57 pages with 12 products per page, so an estimated 684 products in total.
But my code returns nothing.
What did I do wrong while scraping the URLs?
To extract the information from all pages you need to extract the next page URL and then parse that URL.
Here is a simple example; I think it will help you sort out the issue.
import scrapy


class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = [
        'https://perfumehut.com.pk/shop/',
    ]

    def parse(self, response):
        yield {
            'product_link': response.css('a.product-image-link::attr("href")').get(),
            'product_title': response.css('h3.product-title>a::text').get(),
            'product_price': response.css('span.price > span > bdi::text').get(),
        }

        next_page = response.css('ul.page-numbers>li>a.next.page-numbers::attr("href")').get()
        if next_page is not None:
            print()
            print(next_page)
            print()
            yield scrapy.Request(next_page)
Okay, this should do it:
class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = [
        'https://perfumehut.com.pk/shop/',
    ]

    def parse(self, response):
        for item in response.css(".product-grid-item"):
            yield {
                'product_link': item.css('a.product-image-link::attr("href")').get(),
                'product_title': item.css('h3.product-title > a::text').get(),
                'product_price': item.css('span.price > span > bdi::text').get(),
            }

        next_page = response.css('a.next:contains(→)::attr("href")').get()
        if next_page:
            yield scrapy.Request(next_page)
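One detail worth noting: scrapy.Request requires an absolute URL, while response.follow also accepts the relative hrefs that come straight out of a selector. A minimal sketch of the pagination step using response.follow, assuming the same a.next selector as above:

        next_page = response.css('a.next:contains(→)::attr("href")').get()
        if next_page:
            # works whether the href is absolute or relative
            yield response.follow(next_page, callback=self.parse)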

Scraping all pages on quotes.toscrape with Scrapy

I'm trying to scrape some information from the website http://quotes.toscrape.com/.
But I cannot find a way to scrape all the pages; I only get the first page for now.
Here's my script so far:
import scrapy
from ..items import QuotetutorialItem


class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        items = QuotetutorialItem()
        all_div_quotes = response.css('div.quote')
        for quotes in all_div_quotes:
            title = quotes.css('span.text::text').extract()
            author = quotes.css('.author::text').extract()
            tags = quotes.css('.tag::text').extract()

            items['title'] = title
            items['author'] = author
            items['tags'] = tags
            yield items

        next_page = response.xpath('//*[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I have also tried this:

next_page = response.xpath('//*[@class="next"]/a/@href').get()
absolute_next_page_url = response.urljoin(next_page)
if absolute_next_page_url is not None:
    yield scrapy.Request(absolute_next_page_url)
And this:

next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)
But none of these solutions seems to work.
Any ideas? :)
Thanks!

How to scrape the URL when following links in Scrapy

I am confused about how to scrape the URL itself when following links in Scrapy.
I am crawling this page here:
import scrapy
from ..items import SkripsiItem


class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)

        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()

        items['title'] = extract_with_css('h1::text'),
        items['author'] = extract_with_css('.author a::text'),
        items['time'] = extract_with_css('time::text'),
        items['imagelink'] = extract_with_css('.article img::attr(src)'),
        items['content'] = ''.join(content),
        yield items
How do I scrape every URL that is visited when following the links, which in the code above are selected with .lnk-t a::attr(href)?
Save the URL with items['url'] = response.url in the parse_author function.
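A minimal sketch of where that line would go, assuming SkripsiItem declares a url field:

    def parse_author(self, response):
        items = SkripsiItem()
        # response.url is the URL of the page currently being parsed,
        # i.e. the link that was followed via .lnk-t a::attr(href)
        items['url'] = response.url
        # ... the rest of the field extraction stays the same ...
        yield items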

ERROR: Spider error processing <GET https://www.imovirtual.com/comprar/apartamento/lisboa/> (referer: None)

I'm trying to create a web crawler (in Python, using Scrapy) that extracts information from an ad: it extracts what is on the main page, then enters the sub-page of the same ad and extracts the remaining information. But it gives this error when I run the code. Any suggestions?
import scrapy


class SapoSpider(scrapy.Spider):
    name = "imo"
    start_urls = ['https://www.imovirtual.com/comprar/apartamento/lisboa/']

    def parse(self, response):
        for Property in response.css('div.offer-item-details'):
            youritem = {
                'preco': Property.css('span.offer-item title::text').extract_first(),
                'autor': Property.css('li.offer-item-price::text').extract(),
                'data': Property.css('li.offer-item-area::text').extract(),
                'data_2': Property.css('li.offer-item-price-perm::text').extract()
            }
            yield scrapy.Request(subpage_link, callback=self.parse_subpage)

        # next_page = response.css('li.pager-next a::attr(href)').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)

    def parse_subpage(self, youritem):
        for i in response.css('header[class=offer-item-header] a::attr(href)'):
            youritem = {
                'info': i.css('ul.main-list::text').extract(),
            }
            yield youritem
There are a few things to change to make it run:

- You have to set subpage_link (it does not seem to be defined).
- Request callbacks take only one parameter, the Scrapy response, so you should replace parse_subpage(self, youritem) with parse_subpage(self, response).
- To send your item with the Request you'd better use the Request meta parameter, which allows you to transfer data from one Scrapy response to another. If you replace scrapy.Request(subpage_link, callback=self.parse_subpage) with scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem}), you will have access to youritem when Scrapy calls parse_subpage, via response.meta.get('item').
This should work.
def parse(self, response):
    for Property in response.css('div.offer-item-details'):
        youritem = {
            'preco': Property.css('span.offer-item title::text').extract_first(),
            'autor': Property.css('li.offer-item-price::text').extract(),
            'data': Property.css('li.offer-item-area::text').extract(),
            'data_2': Property.css('li.offer-item-price-perm::text').extract()
        }
        subpage_link = ......
        yield scrapy.Request(subpage_link, callback=self.parse_subpage,
                             meta={'item': youritem})

    # next_page = response.css('li.pager-next a::attr(href)').extract_first()
    # if next_page is not None:
    #     next_page = response.urljoin(next_page)
    #     yield scrapy.Request(next_page, callback=self.parse)

def parse_subpage(self, response):
    for i in response.css('header[class=offer-item-header] a::attr(href)'):
        youritem = response.meta.get('item')
        youritem['info'] = i.css('ul.main-list::text').extract()
        yield youritem
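On Scrapy 1.7 and later, cb_kwargs is the recommended way to pass data to a callback instead of meta; a minimal sketch of the same idea:

        # inside parse(), instead of the meta version:
        yield scrapy.Request(subpage_link, callback=self.parse_subpage,
                             cb_kwargs={'item': youritem})

def parse_subpage(self, response, item):
    # 'item' arrives as an extra keyword argument supplied via cb_kwargs
    for i in response.css('header[class=offer-item-header] a::attr(href)'):
        item['info'] = i.css('ul.main-list::text').extract()
        yield item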
