Scrapy request does not callback - python

I am trying to create a spider that takes data from a csv (two links and a name per row), and scrapes a simple element (price) from each of those links, returning an item for each row, with the item's name being the name in the csv, and two scraped prices (one from each link).
Everything works as expected except the fact that instead of returning the prices, that would be returned from the callback function of each request, I get a request object like this :
< GET https://link.com>..
The callback functions don't get called at all, why is that?
Here is the spider:
f = open('data.csv')
f_reader = csv.reader(f)
f_data = list(f_reader)
parsed_data = []
for product in f_data:
product = product[0].split(';')
parsed_data.append(product)
f.close()
class ProductSpider(scrapy.Spider):
name = 'products'
allowed_domains = ['domain1', 'domain2']
start_urls = ["domain1_but_its_fairly_useless"]
def parse(self, response):
global parsed_data
for product in parsed_data:
item = Product()
item['name'] = product[0]
item['first_price'] = scrapy.Request(product[1], callback=self.parse_first)
item['second_price'] = scrapy.Request(product[2], callback=self.parse_second)
yield item
def parse_first(self, response):
digits = response.css('.price_info .price span').extract()
decimals = response.css('.price_info .price .price_demicals').extract()
yield float(str(digits)+'.'+str(decimals))
def parse_second(self, response):
digits = response.css('.lr-prod-pricebox-price .lr-prod-pricebox-price-primary span[itemprop="price"]').extract()
yield digits
Thanks in advance for your help!

TL;DR: You are yielding an item with Request objects inside of it when you should yield either Item or Request.
Long version:
Parse methods in your spider should either return a scrapy.Item - in which case the chain for that crawl will stop and scrapy will put out an item or a scrapy.Requests in which case scrapy will schedule a request to continue the chain.
Scrapy is asynchronious so to create an item from multiple requests means you need to chain all of those requests while carrying your item to every one of item and fill it up little by little.
Request object has meta attribute where you can store anything you want to (well pretty much) and it will be carried to your callback function. It's very common to use it to chain requests for items that require multiple requests to form a single item.
Your spider should look something like this:
class ProductSpider(scrapy.Spider):
# <...>
def parse(self, response):
for product in parsed_data:
item = Product()
item['name'] = product[0]
# carry next url you want to crawl in meta
# and carry your item in meta
yield Request(product[1], self.parse_first,
meta={"product3": product[2], "item":item})
def parse_first(self, response):
# retrieve your item that you made in parse() func
item = response.meta['item']
# fill it up
digits = response.css('.price_info .price span').extract()
decimals = response.css('.price_info .price .price_demicals').extract()
item['first_price'] = float(str(digits)+'.'+str(decimals))
# retrieve next url from meta
# carry over your item to the next url
yield Request(response.meta['product3'], self.parse_second,
meta={"item":item})
def parse_second(self, response):
# again, retrieve your item
item = response.meta['item']
# fill it up
digits = response.css('.lr-prod-pricebox-price .lr-prod-pricebox-price-primary
span[itemprop="price"]').extract()
item['secodn_price'] = digits
# and finally return the item after 3 requests!
yield item

Related

List elements retrieved by Xpath in scrapy do not output correctly item by item(for,yield)

I am outputting the URL of the first page of the order results page of an exhibitor extracted from a specific EC site to a csv file, reading it in start_requests, and looping through it with a for statement.
Each order result page contains information on 30 products.
https://www.buyma.com/buyer/2597809/sales_1.html
itempage
Specify the links for the 30 items on each order results page and list? type, and I tried to retrieve them one by one and store them in the item as shown in the code below, but it does not work.
class AllSaledataSpider(CrawlSpider):
name = 'all_salesdata_copy2'
allowed_domains = ['www.buyma.com']
def start_requests(self):
with open('/Users/morni/researchtool/AllshoppersURL.csv', 'r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
for row in reader:
for n in range(1, 300):
url =str((row[2])[:-5]+'/sales_'+str(n)+'.html')
yield scrapy.Request(
url=url,
callback=self.parse_firstpage_item,
dont_filter=True
)
def parse_firstpage_item(self, response):
loader = ItemLoader(item = ResearchtoolItem(), response = response)
Conversion_date = response.xpath('//*[#id="buyeritemtable"]/div/ul/li[2]/p[3]/text()').getall()
product_name = response.xpath('//*[#id="buyeritemtable"]/div/ul/li[2]/p[1]/a/text()').getall()
product_URL = response.xpath('//*[#id="buyeritemtable"]/div/ul/li[2]/p[1]/a/#href').getall()
for i in range(30):
loader.add_value("Conversion_date", Conversion_date[i])
loader.add_value("product_name", product_name[i])
loader.add_value("product_URL", product_URL[i])
yield loader.load_item()
Specify the links for the 30 items on each order results page and list? type, and I tried to retrieve them one by one and store them in the item as shown in the code below, but it does not work.
The output is as follows, where each item contains multiple items of information at once.
Current status:
{"product_name": ["product1", "product2"]), "Conversion_date":["Conversion_date1", "Conversion_date2" ], "product_URL":["product_URL1", "product_URL2"]},
Ideal:
[{"product_name": "product1", "Conversion_date": Conversion_date1", "product_URL": "product_URL1"},{"product_name": "product2", "Conversion_date": Conversion_date2", "product_URL": "product_URL2"}]
This may be due to my lack of understanding of basic for statements and yield.
You need to create a new loader each iteration
for i in range(30):
loader = ItemLoader(item = ResearchtoolItem(), response = response)
loader.add_value("Conversion_date", Conversion_date[i])
loader.add_value("product_name", product_name[i])
loader.add_value("product_URL", product_URL[i])
yield loader.load_item()
EDIT:
add_value appends a value to the list. Since you had zero elements in the list, then after you append you'll have a list with one element.
In order to get the values as a string you can use a processor. Example:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class ProductItem(scrapy.Item):
name = scrapy.Field(output_processor=TakeFirst())
price = scrapy.Field(output_processor=TakeFirst())
class ExampleSpider(scrapy.Spider):
name = 'exampleSpider'
start_urls = ['https://scrapingclub.com/exercise/list_infinite_scroll/']
def parse(self, response, **kwargs):
names = response.xpath('//div[#class="card-body"]//h4/a/text()').getall()
prices = response.xpath('//div[#class="card-body"]//h5//text()').getall()
length = len(names)
for i in range(length):
loader = ItemLoader(item=ProductItem(), response=response)
loader.add_value('name', names[i])
loader.add_value('price', prices[i])
yield loader.load_item()

Trying to add multiple yields into a single json file using Scrapy

I am trying to figure out if my scrapy tool is correctly hitting the product_link for the request callback - 'yield scrapy.Request(product_link, callback=self.parse_new_item)'
product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A'
but I have not been able to confirm if my program is jumping into the next step created so that I can retrieve the correct yield return. Thank you!
# Import the required libraries
import scrapy
# Import the Item class with fields
# mentioned int he items.py file
from ..items import AntairaItem
# Spider class name
class productJumper(scrapy.Spider):
# Name of the spider
name = 'productJumper'
# The domain to be scraped
allowed_domains = ['antaira.com']
# The URLs to be scraped from the domain
start_urls = ['https://www.antaira.com/products/10-100Mbps']
#target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']
# First Step: Find every div with the class 'product-container' and step into the links
def parse(self, response):
#product_link = response.urljoin(rel_product_link)
# creating items dictionary
items = AntairaItem()
rel_product_link = response.css('div.center767')
for url in rel_product_link:
rel_product_link = response.xpath('//div[#class="product-container"]//a/#href').get(),
product_link = response.urljoin('rel_product_link'),
items['rel_product_link'] = rel_product_link,
items['product_link'] = product_link
#yield items
# 2nd Step: Return a list of the all products-links that will be scrapped
#yield {
# take the first relative product link
# 'rel_product_link' : rel_product_link,
# 'product_link' : product_link,
#}
yield scrapy.Request(product_link, callback=self.parse_new_item)
# Final Step: Run through each product and Yield the results
def parse_new_item(self, response):
for product in response.css('main.products'):
name = product.css(('h1.product-name::text').strip(' \t\n\r')).get()
features = product.css('section.features h3 + ul').getall()
overview = product.css('.products .product-overview::text').getall()
main_image = product.css('div.selectors img::attr(src)').get()
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items
You have a couple of issues:
scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
your second parse callback is referencing a variable items that it doesn't have access too because it was defined in your first parse callback.
In your urljoin method you are using a string literal instead of a variable for rel_product_link
In the example below I fixed those issues and made some additional notes
import scrapy
from ..items import AntairaItem
class ProductJumper(scrapy.Spider): # classes should be TitleCase
name = 'productJumper'
allowed_domains = ['antaira.com']
start_urls = ['https://www.antaira.com/products/10-100Mbps']
def parse(self, response):
# iterate through each of the relative urls
for url in response.xpath('//div[#class="product-container"]//a/#href').getall():
product_link = response.urljoin(url) # use variable
yield scrapy.Request(product_link, callback=self.parse_new_item)
def parse_new_item(self, response):
for product in response.css('main.products'):
items = AntairaItem() # Unique item for each iteration
items['product_link'] = response.url # get the product link from response
name = product.css(('h1.product-name::text').get().strip()
features = product.css('section.features h3 + ul').getall()
overview = product.css('.products .product-overview::text').getall()
main_image = product.css('div.selectors img::attr(src)').get()
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items

scrapy : scrape multiple items from 2 levels

I'm fairly new to scrapy and I'm looking for a solution for my personal exercise. What I'm trying to do is to crawl IMDB top chart movies to get the ranking, the title, the year, and the plot.
I manage to go through the links and crawl the movie pages but I can't find a way to get the ranking for each movie.
Currently my code looks like this :
import scrapy
from tutorial.items import IMDB_dict # We need this so that Python knows about the item object
class MppaddressesSpider(scrapy.Spider):
name = "mppaddresses" # The name of this spider
# The allowed domain and the URLs where the spider should start crawling:
allowed_domains = ["imdb.com"]
start_urls = ['https://www.imdb.com/chart/top/']
def parse(self, response):
# The main method of the spider. It scrapes the URL(s) specified in the
# 'start_url' argument above. The content of the scraped URL is passed on
# as the 'response' object.
for rank in response.xpath(".//tbody[#class='lister-list']/tr/td[#class='titleColumn']/text()").extract():
rank=" ".join(rank.split())
item = IMDB_dict()
item['rank'] = rank
for url in response.xpath(".//tbody[#class='lister-list']/tr/td[#class='titleColumn']/a/#href").extract():
# This loops through all the URLs found inside an element of class 'mppcell'
# Constructs an absolute URL by combining the response’s URL with a possible relative URL:
full_url = response.urljoin(url)
print("FOOOOOOOOOnd URL: "+full_url)
# The following tells Scrapy to scrape the URL in the 'full_url' variable
# and calls the 'get_details() method below with the content of this
# URL:
#yield {'namyy' : response.xpath(".//tbody[#class='lister-list']/tr/td[#class='titleColumn']/text()").extract().strip("\t\r\n '\""),}
yield scrapy.Request(full_url, callback=self.get_details)
def get_details(self, response):
# This method is called on by the 'parse' method above. It scrapes the URLs
# that have been extracted in the previous step.
#item = OntariomppsItem() # Creating a new Item object
# Store scraped data into that item:
item = IMDB_dict()
item['name'] = response.xpath(".//div[#class='title_bar_wrapper']/div[#class='titleBar']/div[#class='title_wrapper']/h1/text()").extract_first().strip("\t\r\n '\"")
item['phone'] = response.xpath(".//div[#class='titleBar']/div[#class='title_wrapper']/h1/span[#id='titleYear']/a/text()").extract_first().strip("\t\r\n '\"")
item['email'] = response.xpath(".//div[#class='plot_summary ']/div[#class='summary_text']/text()").extract_first().strip("\t\r\n '\"")
# Return that item to the main spider method:
yield item
Besides, my item.py has:
import scrapy
class IMDB_dict(scrapy.Item):
# define the fields for your item here like:
rank = scrapy.Field()
name = scrapy.Field()
phone = scrapy.Field()
email = scrapy.Field()
Main question: How can I get the ranking associated with the title?
Last question (if possible): I can access URLs like I did when URLs are relative (with urljoin), but I can't find a way to access URLs when they are absolute...
Many thanks for your help.
Best,
You need to send rank to your get_details callback using meta:
def parse(self, response):
for movie in response.xpath(".//tbody[#class='lister-list']/tr/td[#class='titleColumn']"):
movie_rank = movie.xpath('./text()').re_first(r'(\d+)')
movie_url = movie.xpath('./a/#href').extract_first()
movie_full_url = response.urljoin(movie_url)
print("FOOOOOOOOOnd URL: " + movie_url)
yield scrapy.Request(movie_full_url, callback=self.get_details, meta={"rank": movie_rank})
def get_details(self, response):
item = IMDB_dict()
item['rank'] = response.meta["rank"]
item['name'] = response.xpath(".//div[#class='title_bar_wrapper']/div[#class='titleBar']/div[#class='title_wrapper']/h1/text()").extract_first().strip("\t\r\n '\"")
item['phone'] = response.xpath(".//div[#class='titleBar']/div[#class='title_wrapper']/h1/span[#id='titleYear']/a/text()").extract_first().strip("\t\r\n '\"")
item['email'] = response.xpath(".//div[#class='plot_summary ']/div[#class='summary_text']/text()").extract_first().strip("\t\r\n '\"")
# Return that item to the main spider method:
yield item
UPDATE
If you check logs you'll find this error
AttributeError: 'NoneType' object has no attribute 'strip'
Sometimes .extract_first() returns None and you can't strip() it. I recommend you to use Scrapy Item Loaders

In Scrapy, how do I pass the urls generated in one class to the next class in the script?

The following is my spider's code:
import scrapy
class ProductMainPageSpider(scrapy.Spider):
name = 'ProductMainPageSpider'
start_urls = ['http://domain.com/main-product-page']
def parse(self, response):
for product in response.css('article.isotopeItem'):
yield {
'title': product.css('h3 a::text').extract_first().encode("utf-8"),
'category': product.css('h6 a::text').extract_first(),
'img': product.css('figure a img::attr("src")').extract_first(),
'url': product.css('h3 a::attr("href")').extract_first()
}
class ProductSecondaryPageSpider(scrapy.Spider):
name = 'ProductSecondaryPageSpider'
start_urls = """ URLS IN product['url'] FROM PREVIOUS CLASS """
def parse(self, response):
for product in response.css('article.isotopeItem'):
yield {
'title': product.css('h3 a::text').extract_first().encode("utf-8"),
'thumbnail': product.css('figure a img::attr("src")').extract_first(),
'short_description': product.css('div.summary').extract_first(),
'description': product.css('div.description').extract_first(),
'gallery_images': product.css('figure a img.gallery-item ::attr("src")').extract_first()
}
The first class/part works correctly if I remove the second class/part. It generates my json file correctly with the items requested in it. However, the website I need to crawl is a two-parter. It has a product archive page that shows a products as a thumbnail, title, and category (and this info is not in the next page). Then if you click on one of the thumbnails or titles you get sent to a single product page where there is specific info on the product.
There are a lot of products so I would like to pipe (yield?) the urls in product['url'] to the second class as the "start_urls" list. But I simply don't know how to do that. My knowledge doesn't go far enough to even know what I'm missing or what is going wrong so that I can find a solution.
Check out on line 20 what I want to do.
You don't have to create two spiders for this - you can simply go to the next url and carry over your item i.e.:
def parse(self, response):
item = MyItem()
item['name'] = response.xpath("//name/text()").extract()
next_page_url = response.xpath("//a[#class='next']/#href").extract_first()
yield Request(next_page_url,
self.parse_next,
meta={'item': item} # carry over our item
)
def parse_next(self, response):
# get our carried item from response meta
item = response.meta['item']
item['description'] = response.xpath("//description/text()").extract()
yield item
However if for some reason you realy want to split logic of these two steps you can simply save the results in a file (a json for example: scrapy crawl first_spider -o results.json) and open/iterate through it in your second spider in start_requests() class method which would yield urls, i.e.:
import json
from scrapy import spider
class MySecondSpider(spider):
def start_requests(self):
# this overrides `start_urls` logic
with open('results.json', 'r') as f:
data = json.loads(f.read())
for item in data:
yield Request(item['url'])

Python Scrapy, return from child page to carry on scraping

My spider function is on a page and I need to go to a link and get some data from that page to add to my item but I need to go to various pages from the parent page without creating more items. How would I go about doing that because from what I can read in the documentation I can only go in a linear fashion:
parent page > next page > next page
But I need to:
parent page > next page
> next page
> next page
You should return Request instances and pass item around in meta. And you would have to make it in a linear fashion and build a chain of requests and callbacks. In order to achieve it, you can pass around a list of requests for completing an item and return an item from the last callback:
def parse_main_page(self, response):
item = MyItem()
item['main_url'] = response.url
url1 = response.xpath('//a[#class="link1"]/#href').extract()[0]
request1 = scrapy.Request(url1, callback=self.parse_page1)
url2 = response.xpath('//a[#class="link2"]/#href').extract()[0]
request2 = scrapy.Request(url2, callback=self.parse_page2)
url3 = response.xpath('//a[#class="link3"]/#href').extract()[0]
request3 = scrapy.Request(url3, callback=self.parse_page3)
request.meta['item'] = item
request.meta['requests'] = [request2, request3]
return request1
def parse_page1(self, response):
item = response.meta['item']
item['data1'] = response.xpath('//div[#class="data1"]/text()').extract()[0]
return request.meta['requests'].pop(0)
def parse_page2(self, response):
item = response.meta['item']
item['data2'] = response.xpath('//div[#class="data2"]/text()').extract()[0]
return request.meta['requests'].pop(0)
def parse_page3(self, response):
item = response.meta['item']
item['data3'] = response.xpath('//div[#class="data3"]/text()').extract()[0]
return item
Also see:
How can i use multiple requests and pass items in between them in scrapy python
Almost Asynchronous Requests for Single Item Processing in Scrapy
Using the Scrapy Requests you can perform extra operations on the next URL in the scrapy.Request's callback .

Categories

Resources