Scrapy not able to scrape the next page - Python

I wanted to scrape the information across the following pages; however, the code only allows me to scrape the information from the first page.
My code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from ..items import PropertyItem

class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.com']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        item = PropertyItem()
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item
            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)

It's all about your allowed_domains (but you need to fix your indentation too). Also, I'm sure you want to define your item inside your loop:
class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.my']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item = PropertyItem()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
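(If allowed_domains doesn't match the domain actually being crawled, here starproperty.com vs. www.starproperty.my, Scrapy's offsite middleware silently drops the page-2 request; the DEBUG log should show a "Filtered offsite request" message for it.)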

Maybe it's due to the indentation? Try changing:

            yield item
            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)

to:

            yield item
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
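In the first version the pagination block runs once per property inside the for loop; dedented, it runs once per page, after all the items have been yielded, which is the usual pattern.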

Related

Scrapy file, only running the initial start_urls instead of running through the whole list

As the title states, when I run my Scrapy program it only seems to return the yield from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; I have also left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal, and I'm not sure how to debug since I can't even see the error output.
Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # build an absolute product URL
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
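Worth spelling out why the original only crawled the first URL: most entries in the original start_urls list were missing the separating commas, and Python implicitly concatenates adjacent string literals, so the list actually contained just two strings, the second being one long invalid URL. A minimal sketch of the effect (illustrative URLs):

urls = [
    'https://example.com/a',
    'https://example.com/b'
    'https://example.com/c'
]
print(len(urls))  # 2 -- the last two literals were merged into one string
print(urls[1])    # 'https://example.com/bhttps://example.com/c'

The start_requests version above avoids the problem simply because the commas are back.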

XPath selection only returns the first result for every response

I'm still new to Scrapy. When trying to read data from quotes.toscrape.com, I don't get any useful content back when using XPath selectors. As soon as I use CSS selectors everything works as intended. I just can't find the error, even though the example is super simple.
quotes.py
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item
        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
items.py
import scrapy
from scrapy.loader import ItemLoader

class QuotesLoaderItem(scrapy.Item):
    # define the fields for your item here like:
    author_name = scrapy.Field()
    quote_text = scrapy.Field()
    author_link = scrapy.Field()
    tags = scrapy.Field()
Result
author_name,quote_text,author_link,tags
Albert Einstein,“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,/author/Albert-Einstein,change
Albert Einstein, ...
...
(20 times)
Thank you for your efforts.
I was using a Selector object instead of a Response object, so the syntax has to look like this:
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('.//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('.//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('.//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('.//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item
        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('.//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
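The crucial detail is the leading dot: on a Selector (as opposed to the Response), an XPath starting with // still searches the entire document, while .// restricts the search to that selector's subtree. A quick sketch (assuming a scrapy shell session on http://quotes.toscrape.com/):

quotes = response.xpath('//div[@class="quote"]')
second = quotes[1]
# Absolute: matches from the document root, so this always
# returns the first author on the page, whatever `second` is.
second.xpath('//small[@class="author"]/text()').get()
# Relative: searches only inside this quote's <div>.
second.xpath('.//small[@class="author"]/text()').get()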

Scraping all pages on quotes.toscrape with Scrapy

I'm trying to scrape some information from the website http://quotes.toscrape.com/,
but I cannot find a way to scrape all the pages; so far I only get the first page.
Here's my script so far:
import scrapy
from ..items import QuotetutorialItem

class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        items = QuotetutorialItem()
        all_div_quotes = response.css('div.quote')
        for quotes in all_div_quotes:
            title = quotes.css('span.text::text').extract()
            author = quotes.css('.author::text').extract()
            tags = quotes.css('.tag::text').extract()
            items['title'] = title
            items['author'] = author
            items['tags'] = tags
            yield items
        next_page = response.xpath('//*[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I have also tried this :
next_page = response.xpath('//*[@class="next"]/a/@href').get()
absolute_next_page_url = response.urljoin(next_page)
if absolute_next_page_url is not None:
    yield scrapy.Request(absolute_next_page_url)
And this:
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)
But none of these solutions seems to work.
Any ideas? :)
Thanks!
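(All three snippets are valid Scrapy pagination patterns; as in the first question above, the thing to double-check is that the next_page block sits at the same indentation level as the for statement rather than inside the loop, so it runs once per page.)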

How to scrape a link within a site using Scrapy

I'm trying to use Scrapy to scrape a site, plus a link found within the content of the site. However, when I do this I get an error on the line above the yield statement in parse:
TypeError: 'NoneType' object does not support item assignment
Here is my code:
class PostsSpider(scrapy.Spider):
    name = "posts"
    start_urls = ['https://www.nba.com/teams/bucks']
    allowed_domains = ['nba.com']

    def parse(self, response):
        for post in response.css('.nba-player-index section section'):
            playerPage = response.urljoin(post.css('a').attrib['href'])
            item = yield scrapy.Request(playerPage, callback=self.helper)
            item['number'] = post.css('span.nba-player-trending-item__number::text').get(),
            yield item

    def helper(self, response):
        print("--->" + response.css("title").get())
        item = Item()
        item['title'] = response.css("title::text").get()
        yield item

class Item(scrapy.Item):
    # define the fields for your item here like:
    number = scrapy.Field()
    title = scrapy.Field()
    ppg = scrapy.Field()
What you can do instead is pass the number data to helper through the request's meta.
Something like this:
def parse(self, response):
    for post in response.css('.nba-player-index section section'):
        playerPage = response.urljoin(post.css('a').attrib['href'])
        meta = response.meta.copy()
        meta['number'] = post.css('span.nba-player-trending-item__number::text').get()
        yield scrapy.Request(playerPage, callback=self.helper, meta=meta)

def helper(self, response):
    # `number` is available in response.meta['number'] and can be yielded onward
    item = Item()
    item['number'] = response.meta.get('number')
    yield item
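If you are on Scrapy 1.7 or newer, cb_kwargs is a slightly cleaner channel than meta for this kind of hand-off; a minimal sketch of the same idea:

def parse(self, response):
    for post in response.css('.nba-player-index section section'):
        playerPage = response.urljoin(post.css('a').attrib['href'])
        number = post.css('span.nba-player-trending-item__number::text').get()
        # entries in cb_kwargs arrive as keyword arguments of the callback
        yield scrapy.Request(playerPage, callback=self.helper, cb_kwargs={'number': number})

def helper(self, response, number):
    item = Item()
    item['number'] = number
    item['title'] = response.css("title::text").get()
    yield item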

Scrapy: getting values from multiple sites

I'm trying to pass a value from one callback function to another.
I looked up the docs and just didn't understand it.
Reference:
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    yield item
Here is pseudocode of what I want to achieve:
import scrapy

class GotoSpider(scrapy.Spider):
    name = 'goto'
    allowed_domains = ['first.com', 'second.com']
    start_urls = ['http://first.com/']

    def parse(self, response):
        name = response.xpath(...)
        price = scrapy.Request(second.com, callback=self.parse_check)
        yield(name, price)

    def parse_check(self, response):
        price = response.xpath(...)
        return price
This is how you can pass any value, link, etc. to other methods:
import scrapy

class GotoSpider(scrapy.Spider):
    name = 'goto'
    allowed_domains = ['first.com', 'second.com']
    start_urls = ['http://first.com/']

    def parse(self, response):
        name = response.xpath(...)
        link = response.xpath(...)  # link for second.com where you may find the price
        request = scrapy.Request(url=link, callback=self.parse_check)
        request.meta['name'] = name
        yield request

    def parse_check(self, response):
        name = response.meta['name']
        price = response.xpath(...)
        yield {"name": name, "price": price}  # assuming that in your items.py the fields are declared as name, price
