I'm trying to extract webpage data, and I also want to follow the next few pages up to a limit that I can alter. To learn Scrapy, I've tested whether I can at least extract the next few pages, but it only returns the items from the first page.
How do I extract the next pages while setting a limit, i.e. 5 pages?
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess


class StatisticsItem(scrapy.Item):
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url)

    def parse(self, response):
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)
            # get the next page
            next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
            # The last page does not have a valid url and ends with '#'
            if next_page_url is None or str(next_page_url).endswith("#"):
                self.log("eBay products collected successfully !!!")
            else:
                print('\n' + '-' * 30)
                print('Next page: {}'.format(next_page_url))
                yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data


process = CrawlerProcess(
    settings={
        'FEED_URI': 'collectible_cards.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()
You can try it like this: first build the list of page URLs, then let start_requests work through them (range(1, 6) covers pages 1 through 5):
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1, 6)]
I only want to extract exactly one image from every page that Scrapy visits. For example, I want to extract http://eshop.erhanteknik.com.tr/photo/foto_w720_604e44853371a920a52b0a31a3548b8b.jpg from the http://eshop.erhanteknik.com.tr/tos_svitavy/tos_svitavy/uc_ayakli_aynalar_t0803?DS7641935 page, which Scrapy looks at first. With this code I currently get all the images with the .getall() command, but I cannot figure out how to get one specific image.
from scrapy import Spider
from scrapy.http import Request


class BooksSpider(Spider):
    name = 'books'
    allowed_domains = ['eshop.erhanteknik.com.tr']
    start_urls = ['http://eshop.erhanteknik.com.tr/urunlerimiz?categoryId=1']

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield Request(absolute_url, callback=self.parse_book)
        # process next page
        next_page_url = response.xpath('//a[@rel="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        image_url = response.xpath('//img/@src').getall()
        yield {
            'title': title,
            'image_url': image_url,
        }
You need to target the src of the images under the slide class.
image_url = response.css('.slide img::attr(src)').extract_first()
extract_first() will grab the first item of the list.
If you use extract(), you will get a list.
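Applied to the spider above, parse_book would become something like this (a sketch assuming the product photo sits under an element with class slide, as described):

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        # grab only the first image under the "slide" element
        image_url = response.css('.slide img::attr(src)').extract_first()
        yield {
            'title': title,
            'image_url': image_url,
        }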
I'm trying to scrape data from a page with a listing of products. I'm currently getting all the links and scraping the details OK, but the problem is that the product manufacturer/brand only appears on the listing page, not on the product page.
I've tried passing the brand along in the request meta on the callback, but the manufacturer data arrives unordered, so the rows end up showing the wrong manufacturer.
This is the example page: https://www.toolmania.cl/sierras-sable-561
This is the code now:
def parse(self, response):
    """Process toolmania.cl products"""
    # define product url xpath
    XPATH_PRODUCT_LINK = "//a[@class='thumbnail product-thumbnail']/@href"
    products = response.xpath(XPATH_PRODUCT_LINK).extract()
    XPATH_PRODUCT_BRAND = ".//h4[@class='product-manufacturer']/text()"
    for product in products:
        # obtain product brand
        brand = response.xpath(XPATH_PRODUCT_BRAND).get()
        #url = product
        yield scrapy.Request(product, callback=self.parse_product, meta={'brand': brand})
    # follow pagination link
    XPATH_NEXT_PAGE = "//li[@class='page-item directional js-search-link']//a[@rel='next']/@href"
    next_page = response.xpath(XPATH_NEXT_PAGE).get()
    if next_page:
        yield scrapy.Request(url=next_page, callback=self.parse)

def parse_product(self, response):
    """Get details from single product"""
    XPATH_SINGLE_PRODUCT = "//div[@class='single-product']"
    for product in response.xpath(XPATH_SINGLE_PRODUCT):
        # define xpaths for product details
        XPATH_PRODUCT_MODEL = ".//h5[@class='product-reference-single']/text()"
        XPATH_PRODUCT_NAME = ".//h1[@class='product-name-single mb-md-4']/text()"
        XPATH_PRODUCT_PRICE = ".//div[@class='product-prices margin__bottom__20']//span[@itemprop='price']/@content"
        product_model = product.xpath(XPATH_PRODUCT_MODEL).get()
        # clean product model
        product_model = re.sub('Código de referencia: ', '', product_model)
        yield {
            'product_brand': response.meta['brand'],
            'product_model': product_model,
            'product_price': product.xpath(XPATH_PRODUCT_PRICE).extract(),
            'product_name': product.xpath(XPATH_PRODUCT_NAME).extract(),
            'product_link': response.url,
        }
Use product instead of response in the loop, so the brand is read from each product's own node rather than from the whole page; also note that I'm using CSS selectors instead of XPath here:
def parse(self, response):
    """Process toolmania.cl products"""
    products = response.css('div.product-list')
    for product in products:
        # use "product" instead of "response" so each brand
        # comes from this product's own node
        brand = product.css('.product-manufacturer::text').get()
        url = product.css(".thumbnail::attr(href)").get()
        yield scrapy.Request(url, callback=self.parse_product, meta={'brand': brand})
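As a side note, on Scrapy 1.7 and later the same per-product data can be passed with cb_kwargs, which is the documented way to pass user data to a callback instead of meta. A sketch of the same two callbacks using it:

    def parse(self, response):
        for product in response.css('div.product-list'):
            brand = product.css('.product-manufacturer::text').get()
            url = product.css('.thumbnail::attr(href)').get()
            # cb_kwargs delivers 'brand' straight into the callback's signature
            yield scrapy.Request(url, callback=self.parse_product,
                                 cb_kwargs={'brand': brand})

    def parse_product(self, response, brand):
        # 'brand' arrives as a keyword argument rather than via response.meta
        yield {'product_brand': brand, 'product_link': response.url}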
I need to monitor a webpage to find available products, and I use the Scrapy framework.
If I find a product I'll send a notification.
The main page of this website has the list of products with some information about them; other information is on each product's page.
import json

import scrapy


class Spider(scrapy.Spider):
    name = 'productpage'
    start_urls = ['https://www.productpage.com']

    def parse(self, response):
        for product in response.css('article'):
            link = 'https://www.productpage.com' + product.css('a::attr(href)').get()
            id = link.split('/')[-1]
            title = product.css('a > span::attr(content)').get()
            price = product.css('a > figure::text').get()
            product = Product(self.name, id, title, price, image, size, link)
            yield scrapy.Request('{}.json'.format(link), callback=self.parse_product, meta={'product': product})
        yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True)
        # The program passes this line, and after some minutes it closes without any error

    def parse_product(self, response):
        product = response.meta['product']
        jsonresponse = json.loads(response.text)
        product.image = jsonresponse['images'][0]['small_url']
        for size in jsonresponse['available_sizes']:
            product.size.append(u'{} | {}'.format(size['name'], size['id']))
        send(product)
Why does the program eventually stop going through this line?
yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True)
I want to crawl the township directory of China. The website is structured in 4 levels, which are province page, city page, county page, and township page. For example, on the province page, all the provinces are listed. If we click the link of one province, then it takes us to the city page and a list of the cities in that province is displayed.
I want each of my items to be a township. It includes town_name, town_id (gbcode), and the corresponding county_name, city_name, and prov_name. So the spider should collect information along the way as it goes deeper toward the township page. However, my current approach using a for loop does not seem to work. There is no problem with prov_name, but the city and county names are mostly incorrect: they are always the last city/county in the list on their corresponding page. I think the problem is that the spider does not go deep enough, only issuing the parse_county request at the end of the loop. But changing the depth priority in the settings does not solve the problem.
---------- Sample Result --------
town_name, year, gbcode, city, province, county
建国门街道办事处,2016,110101008000,市辖区,北京市,延庆区
东直门街道办事处,2016,110101009000,市辖区,北京市,延庆区
和平里街道办事处,2016,110101010000,市辖区,北京市,延庆区
前门街道办事处,2016,110101011000,市辖区,北京市,延庆区
崇文门外街道办事处,2016,110101012000,市辖区,北京市,延庆区
import scrapy
import re
from scrapy.spiders import Spider
from admincode.items import AdmincodeItem


class StatsSpider(Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        for item in self.parse_provincetr(response, response.selector.css(".provincetr")):
            yield item

    def get_text_href(self, td):
        if not td.xpath('a'):
            return td.xpath('text()').extract()[0], None
        else:
            return td.xpath('a/text()').extract()[0], td.xpath('a/@href').extract()[0]

    def parse_provincetr(self, response, trs):
        year_pattern = re.compile('(tjyqhdmhcxhfdm/)([0-9][0-9][0-9][0-9])')
        year = year_pattern.search(response.url).group(2)
        for td in trs.xpath('td'):
            scraped = {}
            scraped['year'] = year
            scraped['prov_name'], href = self.get_text_href(td)
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_citytr,
                                 meta={'scraped': scraped})

    def parse_2td(self, response, trs, var_name, nextparse):
        for tr in trs:
            scraped = response.meta['scraped']
            scraped[var_name], href = self.get_text_href(tr.xpath('td')[1])
            if nextparse:
                url = response.urljoin(href)
                yield scrapy.Request(url, callback=nextparse, meta={'scraped': scraped})
            else:
                item = AdmincodeItem()
                item['year'] = scraped['year']
                item['prov_name'] = scraped['prov_name']
                item['city_name'] = scraped['city_name']
                item['county_name'] = scraped['county_name']
                item['town_name'] = scraped['town_name']
                item['gbcode'], href = self.get_text_href(tr.xpath('td')[0])
                yield item

    def parse_citytr(self, response):
        for city in self.parse_2td(response, response.selector.css(".citytr"), 'city_name', self.parse_countytr):
            yield city

    def parse_countytr(self, response):
        for county in self.parse_2td(response, response.selector.css(".countytr"), 'county_name', self.parse_towntr):
            yield county

    def parse_towntr(self, response):
        for town in self.parse_2td(response, response.selector.css(".towntr"), 'town_name', None):
            yield town
I think you've just made things a bit more complex than necessary. This is a simple scraper: what you need to do is pass information from one page to the next using meta. Since meta is a dictionary in memory, we need to make sure we create copies of the information for the items to come. For that we use copy.deepcopy, which makes sure the data is not overwritten before the items are yielded.
Below is a scraper which does that:
from copy import deepcopy

from scrapy import Spider


class StatsSpider(Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        for item in response.css(".provincetr a"):
            name = item.xpath("./text()").extract_first().strip()
            link = item.xpath("./@href").extract_first().strip()
            yield response.follow(link, callback=self.parse_province, meta={'item': {'province': name}})

    def parse_province(self, response):
        meta = response.meta['item']
        for cityrow in response.css(".citytr"):
            city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
            city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
            city_code = cityrow.xpath("./td[1]/a/text()").extract_first()
            # deep-copy the shared dict so later rows cannot overwrite this one
            meta_new = deepcopy(meta)
            meta_new['city_name'] = city_name
            meta_new['city_code'] = city_code
            yield response.follow(city_link, callback=self.parse_city, meta={'item': meta_new})

    def parse_city(self, response):
        meta = response.meta['item']
        for countyrow in response.css(".countytr"):
            county_link = countyrow.xpath("./td[2]/a/@href").extract_first()
            county_name = countyrow.xpath("./td[2]/a/text()").extract_first()
            county_code = countyrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['county_name'] = county_name
            meta_new['county_code'] = county_code
            yield response.follow(county_link, callback=self.parse_county, meta={"item": meta_new})

    def parse_county(self, response):
        meta = response.meta['item']
        for townrow in response.css(".towntr"):
            town_link = townrow.xpath("./td[2]/a/@href").extract_first()
            town_name = townrow.xpath("./td[2]/a/text()").extract_first()
            town_code = townrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['town_name'] = town_name
            meta_new['town_code'] = town_code
            yield meta_new
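To see why the deepcopy matters, here is a minimal standalone illustration (plain Python, no Scrapy; the names are made up for the demo) of how a single shared dict gets overwritten when every pending request holds a reference to the same object:

from copy import deepcopy

shared = {'province': '北京市'}

# Without copying: every "request" stores a reference to the same dict,
# so after the loop all of them show the last city.
no_copy = []
for city in ['City A', 'City B']:
    shared['city_name'] = city
    no_copy.append(shared)
print([m['city_name'] for m in no_copy])   # ['City B', 'City B']

# With deepcopy: each "request" gets its own snapshot of the data.
copied = []
for city in ['City A', 'City B']:
    meta_new = deepcopy(shared)
    meta_new['city_name'] = city
    copied.append(meta_new)
print([m['city_name'] for m in copied])    # ['City A', 'City B']

This is exactly the "always the last city/county" symptom from the question: the requests were all reading the one dict after the loop had finished mutating it.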