How to crawl multiple pages in a single spider using scrapy

How to crawl multiple pages in a single spider using scrapy - python

I need to fetch the urls of each product from this page http://www.stalkbuylove.com/new-arrivals/week-2.html#/page/1
and then need to fetch the details of each product from the product link. I am not sure how to do it.
import scrapy
import json
import redis
r_server = redis.Redis('localhost')
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["stalkbuylove.com"]
start_urls = [
"http://www.stalkbuylove.com/new-arrivals/week-2.html#/page/1"
]
def parse(self, response):
for sel in response.css('.product-detail-slide'):
name = sel.xpath('div/a/#title').extract()
price = sel.xpath('div/span/span/text()').extract()
productUrl = sel.xpath('div/a/#href').extract()
request = scrapy.Request(''.join(productUrl), callback=self.parseProductPage)
r_server.hset(name,"Name",name)
r_server.hset(name,"Price",price)
r_server.hset(name,"ProductUrl",productUrl)
print name, price, productUrl
def parseProductPage(self, response):
for sel in response.css('.top-details-product'):
availability = sel.xpath('div/link/#href').extract()
print availability
Can anyone help? When I got the product url how to crawl that url? Right now I am calling parseProductUrlPage which is not working.

Related

How to scrape JSON web pages

Hey so I have some experience scraping html but never json and so I need to scrape the following web page using scrapy, http://www.starcitygames.com/buylist/search?search-type=category&id=5061, and I found a tutorial online that uses scrapy along with jmspath to scrape json data from the web. And I got the tutorial to work but I am trying to alter it to work with my website to no luck. No errors but it does not return any data. Any help would be greatly appreciated!
items.py
import scrapy
class NameItem(scrapy.Item):
"""User item definition for jsonplaceholder /LoginSpider endpoint."""
name = scrapy.Field()
condition = scrapy.Field()
price = scrapy.Field()
rarity = scrapy.Field()
LoginSpider.py
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, SelectJmes
class UserSpider(scrapy.Spider):
"""Spider to scrape `http://www.starcitygames.com/buylist/search?search-type=category&id=5061`."""
name = 'LoginSpider'
allowed_domains = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
start_urls = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
# dictionary to map UserItem fields to Jmes query paths
jmes_paths = {
'name': 'name',
'condition': 'condition',
'price': 'price',
'rarity': 'rarity',
}
def parse(self, response):
jsonresponse = json.loads(response.body_as_unicode())
for user in jsonresponse:
loader = ItemLoader(item=NameItem()) # create an ItemLoader to populate a NameItem
loader.default_input_processor = MapCompose(str) # apply str conversion on each value
loader.default_output_processor = Join(' ')
for (field, path) in self.jmes_paths.items():
loader.add_value(field, SelectJmes(path)(user))
yield loader.load_item()

The response of this url http://www.starcitygames.com/buylist/search?search-type=category&id=5061has 3 levels:
'Ok'
'search'
'results' ## this contain the data
And results key has multiple values what you should iterate.
Inside the values are the data.
Try this code, I hope you can help.
This is the module items.py
class SoResponseItem(scrapy.Item):
name = scrapy.Field()
condition = scrapy.Field()
price = scrapy.Field()
rarity = scrapy.Field()
This is the spider
import scrapy
import json
from SO_response.items import SoResponseItem
class LoginspiderSpider(scrapy.Spider):
name = 'LoginSpider'
allowed_domains = ['www.starcitygames.com']
url = 'http://www.starcitygames.com/'
def start_requests(self):
yield scrapy.Request(url=self.url, callback=self.parse)
def parse(self, response):
url = response.urljoin('buylist/search?search-type=category&id=5061')
yield scrapy.Request(url=url, callback=self.parse_data)
def parse_data(self, response):
jsonreponse = json.loads(response.body)
for result in jsonreponse['results']:
for index in range(len(result)):
items = SoResponseItem()
items['name'] = result[index]['name']
items['condition'] = result[index]['condition']
items['price'] = result[index]['price']
items['rarity'] = result[index]['rarity']
yield items
Try in your shell:
scrapy crawl -o jmes.json

How to do multiple page scraping using Scrapy?

#----\
#-----*-----\
#----/ \
\
#----\ \
#-----*-------- * <-- START
#----/ /
/
#----\ /
#-----*-----/
#----/
Here is a structure of a website I want to scrap with scrapy, where * is a page and --- indicates link. I want to scrape data of # pages.
I have already done a scraper which can scrape data from a single # page.
import scrapy
class MyItem(scrapy.Item):
topic = scrapy.Field()
symptoms = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "medical"
allowed_domains = ['medlineplus.gov']
start_urls = ['https://medlineplus.gov/ency/article/000178.htm']
def parse(self, response):
item = MyItem()
item["topic"] = response.css('h1.with-also::text').extract_first()
item["symptoms"] = response.css("article div#section-2 li::text").extract()
yield item
The starting webpage is https://medlineplus.gov/encyclopedia.html
I want to scrape info about all diseases in the encyclopedia.

You would need to start with the "encyclopedia.html" page, follow the "alpha" links (the A-Z articles links), then, for every followed page, follow the links to the articles.
You can do this with a CrawlSpider and the Link Extractors, but, since the crawling depth is small, we can do this with a regular Spider:
from urlparse import urljoin # Python 2 only
import scrapy
from scrapy.http import Request
class MyItem(scrapy.Item):
topic = scrapy.Field()
symptoms = scrapy.Field()
class MedicalSpider(scrapy.Spider):
name = "medical"
allowed_domains = ['medlineplus.gov']
start_urls = ['https://medlineplus.gov/encyclopedia.html']
def parse(self, response):
for link in response.css("ul.alpha-links li a::attr(href)").extract():
yield Request(urljoin(response.url, link), callback=self.parse_alpha_page)
def parse_alpha_page(self, response):
for link in response.css("ul#index li a::attr(href)").extract():
yield Request(urljoin(response.url, link), callback=self.parse_page)
def parse_page(self, response):
item = MyItem()
item["topic"] = response.css('h1.with-also::text').extract_first()
item["symptoms"] = response.css("article div#section-2 li::text").extract()
yield item
Note that it looks like there is a better way to get the desired data from the MedlinePlus (check out the "For Developers" page).

Scrapy crawler will not crawl any webpages

I have been trying to get this crawler working but I keep getting errors.
Can anyone suggest any ways to get it to run?
The main spider code is
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
class gameSpider(scrapy.Spider):
name = "game_spider.py"
allowed_domains = ["*"]
start_urls = [
"http://www.game.co.uk/en/grand-theft-auto-v-with-gta-online-3-500-000-1085837?categoryIdentifier=706209&catGroupId="
]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//ul[#class="directory-url"]/li')
items = []
for site in sites:
item = Website()
item['name'] = site.xpath('//*[#id="details301149"]/div/div/h2/text()').extract()
"""item['link'] = site.xpath('//a/#href').extract()
item['description'] = site.xpath('//*[#id="overview"]/div[3]()').re('-\s[^\n]*\\r')"""
items.append(item)
print items
return items
The item code is
import scrapy
class GameItem(Item):
name = Field()
pass

Your start_urls link returns erorr 500.
There's no items.
In [7]: sites = response.xpath('//ul[#class="directory-url"]/li')
In [8]: sites
Out[8]: []

Extracting data with Scrapy which loops subpages

There is a page on my website that contains a list of staff members. Each staff member name links to their own individual pages.
I want to output a csv file that has lists each staff member's name and title, so the spider will need to loop through each of the links on the stafflist page, pulling the names and titles.
So far, this code words only to pull out the very last name and title on the list. The problem I'm having is making it go through each person's page to get a complete list.
How do I go about making this loop work?
class scrapeSpider(scrapy.Spider):
name = "scrape"
allowed_domains = ["example.com", "example.co.uk"]
start_urls = [
'http://example.com/stafflist/',
]
def parse(self, response):
for href in response.xpath('//div[contains(concat(" ",normalize-space(#class)," "), "span8")]//a/#href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_SCRAPE)
def parse_SCRAPE(self, response):
items = []
for sel in response.xpath('//div[contains(concat(" ",normalize-space(#class)," "), "span9")]'):
item = scrapeItem()
item['name'] = sel.xpath('h1/text()').extract()
item['titles'] = sel.xpath('h2/text()').extract()
print item['name'], item['titles']
items.append(item)
return items

Use CrawlSpider. e.g.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from myspider.items import PersonItem
from pyquery import PyQuery as pq # PyQuery is awesome!
from urlparse import urlparse, parse_qs
class MySpider(CrawlSpider):
name = 'myspider'
allowed_domains = ['example.si']
start_urls = ['http://example.com/stafflist/']
rules = (
# if you have paginator this Rule will extract links
Rule(LinkExtractor(
restrict_xpaths=('//div[#class="paging"]//a[last()]')),
follow=True),
# restrict crawler to look for links only inside restrict_xpaths
# and then process those links with 'parse_item'
Rule(LinkExtractor(
restrict_xpaths=('//div[contains(concat(" ",normalize-space(#class)," "), "span8")]//a/#href')),
callback='parse_item',
follow=False),
)
def parse_item(self, response):
"""
process persons page
"""
self.response = response
self.doc = pq(self.response.body)
i = PersonItem()
i["name"] = self.doc("h1").text()
i["titles"] = self.doc("h2").text()
...
return i

Scrapy Crawls only 1st page

heya I am making a Project using scrapy in which I need to scrap the business details from a business directory http://directory.thesun.co.uk/find/uk/computer-repair
the problem I am facing is: when I am trying to crawl the page my crawler fetches the details of only 1st page whereas I need to fetch the details of the rest 9 pages also; that is all 10 pages..
i am showing below my Spider code and items.py and settings .py
please see my code and help me to solve it
spider code::
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
class ProjectSpider(BaseSpider):
name = "project2spider"
allowed_domains = ["http://directory.thesun.co.uk/"]
start_urls = [
"http://directory.thesun.co.uk/find/uk/computer-repair"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="abTbl "]')
items = []
for site in sites:
item = Project2Item()
item['Catogory'] = site.select('span[#class="icListBusType"]/text()').extract()
item['Bussiness_name'] = site.select('a/#title').extract()
item['Description'] = site.select('span[last()]/text()').extract()
item['Number'] = site.select('span[#class="searchInfoLabel"]/span/#id').extract()
item['Web_url'] = site.select('span[#class="searchInfoLabel"]/a/#href').extract()
item['adress_name'] = site.select('span[#class="searchInfoLabel"]/span/text()').extract()
item['Photo_name'] = site.select('img/#alt').extract()
item['Photo_path'] = site.select('img/#src').extract()
items.append(item)
return items
My items.py code is as follows::
from scrapy.item import Item, Field
class Project2Item(Item):
Catogory = Field()
Bussiness_name = Field()
Description = Field()
Number = Field()
Web_url = Field()
adress_name = Field()
Photo_name = Field()
Photo_path = Field()
my settings.py is:::
BOT_NAME = 'project2'
SPIDER_MODULES = ['project2.spiders']
NEWSPIDER_MODULE = 'project2.spiders'
please help
me to extract details from other pages too...

Fetching description .select('span/text()') you are selecting text from ALL spans in //div[#class="abTbl "].
To extract last span you can use 'span[last()]/text()' xpath
btw this http://www.w3schools.com/xpath/xpath_syntax.asp should help you with XPathes

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to crawl multiple pages in a single spider using scrapy - python

Related

How to scrape JSON web pages

How to do multiple page scraping using Scrapy?

Scrapy crawler will not crawl any webpages

Extracting data with Scrapy which loops subpages

Scrapy Crawls only 1st page

Categories

Resources