Using request meta in Scrapy passes arguments in different order - python

I'm trying to scrape data from a page that lists products. I'm currently getting all the links and scraping the details OK, but the problem is that the product manufacturer/brand appears only on the listing page, not on the product page.
I've tried passing it to the callback through the request meta, but the manufacturer data arrives out of order, so the rows show the wrong manufacturer.
This is the example page: https://www.toolmania.cl/sierras-sable-561
This is the code now:
def parse(self, response):
    """Process a toolmania.cl product listing page.

    Yields one request per product link found on the page (handled by
    ``parse_product``, with the brand passed along in ``meta``), then a
    request for the next listing page when a "next" link is present.
    """
    # XPath attribute tests are written with "@" (@class, @href, @rel).
    XPATH_PRODUCT_LINK = "//a[@class='thumbnail product-thumbnail']/@href"
    XPATH_PRODUCT_BRAND = ".//h4[@class='product-manufacturer']/text()"
    XPATH_NEXT_PAGE = (
        "//li[@class='page-item directional js-search-link']"
        "//a[@rel='next']/@href"
    )

    products = response.xpath(XPATH_PRODUCT_LINK).extract()
    for product in products:
        # NOTE(review): this is the behaviour the question is about. The
        # brand is looked up on the whole response and .get() returns only
        # the first match, so the value does not depend on which product
        # link is being processed.
        brand = response.xpath(XPATH_PRODUCT_BRAND).get()
        yield scrapy.Request(
            product,
            callback=self.parse_product,
            meta={'brand': brand},
        )

    # follow pagination link
    next_page = response.xpath(XPATH_NEXT_PAGE).get()
    if next_page:
        yield scrapy.Request(url=next_page, callback=self.parse)
def parse_product(self, response):
    """Get details from a single product page.

    Yields one dict per ``single-product`` container, combining the fields
    scraped from the product page with the brand received via
    ``response.meta['brand']``.
    """
    # XPaths are loop-invariant, so they are defined once up front.
    XPATH_SINGLE_PRODUCT = "//div[@class='single-product']"
    XPATH_PRODUCT_MODEL = ".//h5[@class='product-reference-single']/text()"
    XPATH_PRODUCT_NAME = ".//h1[@class='product-name-single mb-md-4']/text()"
    XPATH_PRODUCT_PRICE = (
        ".//div[@class='product-prices margin__bottom__20']"
        "//span[@itemprop='price']/@content"
    )
    MODEL_PREFIX = 'Código de referencia: '

    for product in response.xpath(XPATH_SINGLE_PRODUCT):
        product_model = product.xpath(XPATH_PRODUCT_MODEL).get()
        # .get() returns None when nothing matches; only clean real text so
        # a missing reference does not abort the whole item.
        if product_model is not None:
            product_model = product_model.replace(MODEL_PREFIX, '')

        yield {
            'product_brand': response.meta['brand'],
            'product_model': product_model,
            'product_price': product.xpath(XPATH_PRODUCT_PRICE).extract(),
            'product_name': product.xpath(XPATH_PRODUCT_NAME).extract(),
            'product_link': response.url,
        }

Use product instead of response inside the following loop, so that each lookup is scoped to a single product. Also note that I am using CSS selectors instead of XPath.
def parse(self, response):
    """Process toolmania.cl products.

    Iterates over each product container so that the brand and the link
    are read from the same product, then requests the product page with
    the brand attached in ``meta``.
    """
    products = response.css('div.product-list')
    for product in products:
        # use "product" instead of "response" so both lookups are scoped
        # to this one product
        brand = product.css('.product-manufacturer::text').get()
        url = product.css(".thumbnail::attr(href)").get()
        if url is None:
            # no link in this container; nothing to request
            continue
        # Request needs the URL string, not the selector object.
        yield scrapy.Request(
            response.urljoin(url),
            callback=self.parse_product,
            meta={'brand': brand},
        )

Related

Python Scrapy, How to get second image on the page with scrapy?

I want to extract exactly one image from every page that Scrapy visits. For example, I want to extract http://eshop.erhanteknik.com.tr/photo/foto_w720_604e44853371a920a52b0a31a3548b8b.jpg from the http://eshop.erhanteknik.com.tr/tos_svitavy/tos_svitavy/uc_ayakli_aynalar_t0803?DS7641935 page, which Scrapy visits first. With this code I currently get all of the images with the .getall() command, but I cannot figure out how to get one specific image.
from scrapy.http import Request
class BooksSpider(Spider):
    """Crawl the product listing and yield the title and image URLs of each product."""

    name = 'books'
    allowed_domains = ['eshop.erhanteknik.com.tr']
    start_urls = ['http://eshop.erhanteknik.com.tr/urunlerimiz?categoryId=1']

    def parse(self, response):
        """Request every product linked from a listing page, then the next page."""
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield Request(absolute_url, callback=self.parse_book)

        # process next page (only when the listing actually has one)
        next_page_url = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield Request(response.urljoin(next_page_url))

    def parse_book(self, response):
        """Yield the title and every image URL found on a product page."""
        title = response.css('h1::text').extract_first()
        # getall() returns every <img> src on the page, which is the
        # behaviour the question asks how to narrow down.
        image_url = response.xpath('//img/@src').getall()
        yield {
            'title': title,
            'image_url': image_url,
        }
You need to target the src of the images under the slide class.
# The ".slide img" selector only matches <img> elements nested inside an
# element with class "slide"; extract_first() keeps just the first match.
image_url = response.css('.slide img::attr(src)').extract_first()
extract_first() will grab the first item of the list.
If you use extract(), you will get a list.

How Do I Set Up Pagination correctly?

I'm currently working on a Scrapy spider that will extract 3 types of data for each product. I called the data "title", "price", and "upc". For each product, my program scrapes the title and price correctly, but I am having trouble scraping the UPC, since the UPC is on another page.
What I want my program to do for each product, is to extract the title and price on the mainpage, then go inside another page to extract UPC code. Once it gets the upc code, I want the program to go to the next product on main page and repeat the same method for the remaining products.
Here is my code.
import scrapy
from scrapy.utils.response import open_in_browser
from ..items import QuotetutorialItem
data={hidden}
headers={hidden}
class BrickseekSpider(scrapy.Spider):
    """Log in to brickseek.com and yield the title and price of each listed product."""

    name = 'brickseek1'
    allowed_domains = ['brickseek.com']

    def start_requests(self):
        """Submit the login form; the response is handled by ``parse``."""
        # dont_filter has to be passed to the request to have any effect;
        # as a bare local variable it was never used.
        yield scrapy.http.FormRequest(
            url='https://brickseek.com/login/',
            headers=headers,
            formdata=data,
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        """Yield one item (title, price) per product tile on the page."""
        products = response.css('div.item-list__tile')
        for product in products:
            # A fresh item per product: re-yielding one shared instance
            # means later iterations overwrite fields of items that were
            # already handed to the pipeline.
            items = QuotetutorialItem()
            items['title'] = product.css('.item-list__title span::text').extract()
            items['price'] = product.css(
                '.item-list__price-column--highlighted .price-formatted__dollars::text'
            ).extract()
            # Asker's attempt at following the detail page for the UPC:
            #another_page = response.css('div.item-list__tile a::attr(href)').get()
            #if another_page:
            #upc = product.css('div.item-overview__meta-item::text').extract()[6]
            #yield response.follow(another_page, callback=self.parse)
            #items['upc'] = upc
            yield items
All you need to do is put your item (after filling in title and price) into meta when you request the next page (assuming your CSS selectors are correct):
def parse(self, response):
    """Build one item per product tile and follow its detail page for the UPC.

    The partially filled item travels to ``parse_upc`` in the request's
    ``meta``; when a tile has no link the item is yielded as-is.
    """
    products = response.css('div.item-list__tile')
    for product in products:
        item = QuotetutorialItem()
        item['title'] = product.css('.item-list__title span::text').extract()
        item['price'] = product.css(
            '.item-list__price-column--highlighted .price-formatted__dollars::text'
        ).extract()
        # Look for the link inside this tile. Querying the whole response
        # would return the first tile's link for every product.
        another_page = product.css('a::attr(href)').get()
        if another_page:
            yield response.follow(
                another_page,
                callback=self.parse_upc,
                meta={'item': item},
            )
        else:
            yield item
def parse_upc(self, response):
    """Add the UPC from the product detail page to the item carried in meta."""
    item = response.meta['item']
    # The detail page is available as "response" here; there is no
    # "product" selector in this callback.
    meta_texts = response.css('div.item-overview__meta-item::text').extract()
    # The UPC is taken from the 7th text node; fall back to None rather
    # than losing the whole item when the page has fewer entries.
    item['upc'] = meta_texts[6] if len(meta_texts) > 6 else None
    yield item

Scrapy, crawling a dynamic page with multiple pages

For an assignment I am trying to build a spider which is able to fetch data from the "www.kaercher.com" webshop. All the products in the webshop are being called by an AJAX call. In order to load in more products, a button named "show more products", has to be pressed. I managed to fetch the required data from the corresponding URL which is being called by the AJAX Call.
However, for my assignment, I am supposed to fetch all of the products (all pages) for a certain product category. I've been digging around but I can't find a solution. I suppose I am supposed to do something with "isTruncated = true": true indicates that more products can be loaded, false means that there are no more products. (FIXED)
When I manage to fetch the data from all the pages, I need to find a way to fetch all the data from a list of products (create a .csv file with multiple kaercher products, each product has a unique ID which can be seen in the URL, in this case the ID 20035386 is for the high pressure washer). (FIXED)
Links:
Webshop: https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html
High pressure washer: https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html
API Url (page1): https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL
OLD CODE
Spider file
import scrapy
from krc.items import KrcItem
import json
class KRCSpider(scrapy.Spider):
    """Fetch the first page of products from the kaercher.com search API."""

    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def parse(self, response):
        """Yield one KrcItem per entry in the JSON response's "products" list."""
        data = json.loads(response.text)
        for company in data.get('products', []):
            # Create a new item for each product so that yielded items are
            # independent objects rather than one instance overwritten on
            # every iteration.
            item = KrcItem()
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item
Items file
import scrapy
class KrcItem(scrapy.Item):
    """Container for one product returned by the search API."""

    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
NEW CODE
EDIT: 15/08/2019
Thanks to @gangabass I managed to fetch data from all of the product pages. I also managed to fetch the data for the different products listed in a keywords.csv file, which lets me fetch data for a whole list of products. See below for the new code:
Spider file (.py)
import scrapy
from krc.items import KrcItem
import json
import os
import csv
class KRCSpider(scrapy.Spider):
    """Fetch every page of products for each id listed in keywords.csv."""

    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    # Single template for both the first page and every follow-up page, so
    # pagination always stays on the product id that was searched for.
    search_url = (
        "https://www.kaercher.com/api/v1/products/search/shoppableproducts"
        "/partial/{search_text}?page={page}&size=8&isocode=nl-NL"
    )

    def start_requests(self):
        """Read keywords from the keywords file and construct the search URLs."""
        keywords_path = os.path.join(
            os.path.dirname(__file__), "../resources/keywords.csv"
        )
        # newline="" is what the csv module expects for files it reads.
        with open(keywords_path, newline="") as search_keywords:
            for keyword in csv.DictReader(search_keywords):
                search_text = keyword["keyword"]
                url = self.search_url.format(search_text=search_text, page=1)
                # The meta is used to send our search text into the parser as metadata
                yield scrapy.Request(
                    url,
                    callback=self.parse,
                    meta={"search_text": search_text, "page": 1},
                )

    def parse(self, response):
        """Yield the products on this page, then request the next page if any.

        The current page number and the search text are read from
        ``response.meta`` and forwarded to the follow-up request.
        """
        search_text = response.meta["search_text"]
        current_page = response.meta.get("page", 1)
        next_page = current_page + 1

        data = json.loads(response.text)
        for company in data.get('products', []):
            # New item per product; a single shared instance would be
            # overwritten on every iteration.
            item = KrcItem()
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            # strip the euro sign from the formatted price
            item["price"] = company["priceFormatted"].replace("\u20ac", "").strip()
            yield item

        # "isTruncated" signals that more products can be loaded.
        if data.get("isTruncated"):
            yield scrapy.Request(
                url=self.search_url.format(search_text=search_text, page=next_page),
                callback=self.parse,
                meta={"search_text": search_text, "page": next_page},
            )
Items file (.py)
import scrapy
class KrcItem(scrapy.Item):
    """Container for one product returned by the search API."""

    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
    producttype = scrapy.Field()
keywords file (.csv)
keyword,keywordtype
20035386,Hogedrukreiniger
20072956,Floor Cleaner
You can use response.meta to send current page number between requests:
def parse(self, response):
    """Yield the products on this page, then request the next page if any.

    The current page number is carried between requests in
    ``response.meta['page']`` (defaulting to 1 for the first request).
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    current_page = response.meta.get("page", 1)
    next_page = current_page + 1

    data = json.loads(response.text)
    for company in data.get('products', []):
        # New item per product; a single shared instance would be
        # overwritten on every iteration.
        item = KrcItem()
        item["productid"] = company["id"]
        item["name"] = company["name"]
        item["description"] = company["description"]
        item["price"] = company["priceFormatted"]
        yield item

    # "isTruncated" signals that more products can be loaded.
    if data.get("isTruncated"):
        # Build the next URL from the one just fetched, replacing only the
        # "page" parameter, so this works for any product id rather than
        # one hard-coded in the URL.
        parts = urlsplit(response.url)
        query = dict(parse_qsl(parts.query))
        query["page"] = str(next_page)
        yield scrapy.Request(
            url=urlunsplit(parts._replace(query=urlencode(query))),
            callback=self.parse,
            meta={'page': next_page},
        )

Scrapy close after 2 minutes running

I need to monitor webpage to find available products and I use scrapy framework.
If I found a product I'll notify it.
This webpage has in main page the list of product with some information about them and other information in product page.
class Spider(scrapy.Spider):
    """Poll a product listing and send a notification for each product found."""

    name = 'productpage'
    start_urls = ['https://www.productpage.com']

    def parse(self, response):
        """Request the JSON details of every listed product, then re-poll the page."""
        for article in response.css('article'):
            href = article.css('a::attr(href)').get()
            if href is None:
                # no link in this article; nothing to follow
                continue
            # Resolve against the page URL instead of concatenating a
            # hand-written (and malformed) base string.
            link = response.urljoin(href)
            product_id = link.split('/')[-1]
            title = article.css('a > span::attr(content)').get()
            price = article.css('a > figure::text').get()
            # image and size are only known once the JSON details arrive;
            # start with placeholders that parse_product fills in.
            product = Product(self.name, product_id, title, price, None, [], link)
            yield scrapy.Request(
                '{}.json'.format(link),
                callback=self.parse_product,
                meta={'product': product},
            )
        # NOTE(review): placed after the loop so the listing is re-requested
        # once per pass; the original indentation was lost — confirm.
        # Asker's remark: the program passes this line and after some
        # minutes it closes without error.
        yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True)

    def parse_product(self, response):
        """Complete the product from its JSON details and send the notification."""
        product = response.meta['product']
        # response.text replaces the deprecated body_as_unicode()
        jsonresponse = json.loads(response.text)
        product.image = jsonresponse['images'][0]['small_url']
        for size in jsonresponse['available_sizes']:
            product.size.append(u'{} | {}'.format(size['name'], size['id']))
        send(product)
Why does the program go past this line?
yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True)

Web Scraping all Urls from a website with Scrapy and Python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[@class="work_area_content"]/a/@href'))
from a website and return the Title and Url of all the leafs sectioned by the leafs parent. I have two scrapers: one in python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or single dimension list ( I believe multi-dimensional but it enhances complication)? Which of the below code is better? If the scraper code is better, how do I migrate the python code to the Scrapy code?
From what I understand, a callback passes a function's arguments on to another function; however, if the callback refers to itself, the data gets overwritten and therefore lost, and you're unable to go back to the root data. Is this correct?
python:
# Standalone attempt that walks three levels of links with requests and an
# HTML parser exposed as `html` (presumably lxml.html; the imports are not
# shown in this snippet).
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the three `for` loops below cannot be confirmed from the text alone.
# NOTE(review): `#class` / `#href` in the XPath strings look like `@class` /
# `@href` with the "@" lost in transcription — confirm before running.
# Nested list meant to hold URLs; it starts with exactly one element at each
# of the three outer levels.
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
# NOTE(review): .extract() is a Scrapy selector method; if `html` is
# lxml.html, xpath() already returns a plain list — confirm.
urls = tree.xpath('//div[#class="work_area_content"]/a/#href').extract()
# Initial indices; each is rebound by the enumerate() loop that uses it.
i = 0
j = 0
k = 0
for i, url in enumerate(urls):
absolute_url = "".join(['http://1.1.1.1:1234/', url])
# url_storage only has index 0 at each outer level, so i/j/k > 0 would be
# out of range here.
url_storage[i][j][k].append(absolute_url)
print(url_storage)
#url_storage.insert(i, absolute_url)
# url_storage[i][j][k] is the list that was just appended to, not a single
# URL string.
page = requests.get(url_storage[i][j][k])
tree2 = html.fromstring(page.content)
urls2 = tree2.xpath('//div[#class="work_area_content"]/a/#href').extract()
for j, url2 in enumerate(urls2):
absolute_url = "".join(['http://1.1.1.1:1234/', url2])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree3 = html.fromstring(page.content)
urls3 = tree3.xpath('//div[#class="work_area_content"]/a/#href').extract()
for k, url3 in enumerate(urls3):
absolute_url = "".join(['http://1.1.1.1:1234/', url3])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree4 = html.fromstring(page.content)
# urls3 is reassigned here while it is also the sequence named in the
# `for k, url3` loop header above.
urls3 = tree4.xpath('//div[#class="work_area_content"]/a/#href').extract()
title = tree4.xpath('//span[#class="page_title"]/text()').extract()
# NOTE(review): `yield` and `self` only make sense inside a spider method;
# no enclosing `def` is visible in this snippet — confirm where this belongs.
yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3 })
#yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3 })
def end_page_parse_TS(self, response):
    """Print the page body and yield the URL/Title pair carried in meta."""
    print(response.body)
    meta = response.meta
    yield {'URL': meta.get('URL'), 'Title': meta.get('Title')}
def end_page_parse_TC(self, response):
    """Yield the URL/Title pair from meta plus the description scraped from the page."""
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    # The description comes from the page itself; the value previously read
    # from meta was overwritten immediately and never used.
    # NOTE(review): the original expression was not valid XPath (unclosed
    # quote and bracket, "/parent" step). This selects the wiki_table whose
    # tbody has a cell containing "description" — confirm that is the
    # intended target.
    description = response.xpath(
        '//table[@class="wiki_table"]'
        '/tbody[.//td[contains(text(), "description")]]/..'
    ).extract()
    yield {'URL': url, 'Title': title, 'Description': description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
class DatabloggerSpider(CrawlSpider):
    """Follow every link from the start URL and record (url_from, url_to) pairs."""

    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped).
    # Entries are bare host names: no scheme, port, or path.
    allowed_domains = ['1.1.1.1']

    # The URLs to start with
    start_urls = ['http://1.1.1.1:1234/TestSuites']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True,
            ),
            follow=True,
            callback="parse_items",
        )
    ]

    def start_requests(self):
        """Start the crawl by requesting every URL in ``start_urls``."""
        for url in self.start_urls:
            # No explicit callback: the request then uses the spider's
            # default callback, which is where CrawlSpider applies `rules`.
            yield scrapy.Request(url, dont_filter=True)

    def parse_items(self, response):
        """Return one item per unique link found on the page."""
        # Only extract canonicalized and unique links (with respect to the
        # current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

        # The list of items that are found on the particular page
        items = []
        for link in links:
            # A new item per link: appending one shared instance would leave
            # the list holding the same object, with only the last url_to.
            item = DatabloggerScraperItem()
            item['url_from'] = response.url
            item['url_to'] = link.url
            items.append(item)

        # Return all the found items
        return items

Categories

Resources