I have the following script for crawling a website recursively:
#!/usr/bin/python

import scrapy
from scrapy.selector import Selector
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

class GivenSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/",
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    rules = (Rule(LinkExtractor(allow=r'/'), callback=parse, follow=True),)

    def parse(self, response):
        select = Selector(response)
        titles = select.xpath('//a[@class="listinglink"]/text()').extract()
        print ' [*] Start crawling at %s ' % response.url
        for title in titles:
            print '\t %s' % title

#configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(GivenSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()
When I invoke it:
$ python spide.py
NameError: name 'Rule' is not defined
If you go by the documentation and search for the word Rule, you'll find this:
http://doc.scrapy.org/en/0.20/topics/spiders.html?highlight=rule#crawling-rules
As you didn't import it anywhere, it is clear why Rule isn't defined.
class scrapy.contrib.spiders.Rule(link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None)
So, in theory, you should be able to import the Rule class with from scrapy.contrib.spiders import Rule
Loïc Faure-Lacroix is right. But in the current version of Scrapy (1.6), you need to import Rule from scrapy.spiders like this:
from scrapy.spiders import Rule
See the documentation for more information.
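For reference, here is a rough sketch of how the original spider could be wired up with Rule under Scrapy 1.x. The renamed callback (parse_page) is my own choice, not from the question, since CrawlSpider reserves parse() for its internal logic and rule callbacks are given as strings:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GivenSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = ["http://www.dmoz.org/"]

    # callback is a string; don't override parse() on a CrawlSpider
    rules = (
        Rule(LinkExtractor(allow=r'/'), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        titles = response.xpath('//a[@class="listinglink"]/text()').extract()
        print(' [*] Start crawling at %s ' % response.url)
        for title in titles:
            print('\t %s' % title)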
I am using scrapy to get the content inside some urls on a page, similar to this question here:
Use scrapy to get list of urls, and then scrape content inside those urls
I am able to get the subURLs from my start urls (first def). However, my second def doesn't seem to be passing through, and the result file is empty. I have tested the content inside the function in scrapy shell and it gets the info I want, but not when I run the spider.
import scrapy
from scrapy.selector import Selector
#from scrapy import Spider
from WheelsOnlineScrapper.items import Dealer
from WheelsOnlineScrapper.url_list import urls
import logging
from urlparse import urljoin

logger = logging.getLogger(__name__)

class WheelsonlinespiderSpider(scrapy.Spider):
    logger.info('Spider starting')
    name = 'wheelsonlinespider'
    rotate_user_agent = True  # lives in middleware.py and settings.py
    allowed_domains = ["https://wheelsonline.ca"]
    start_urls = urls  # this list is created in url_list.py
    logger.info('URLs retrieved')

    def parse(self, response):
        subURLs = []
        partialURLs = response.css('.directory_name::attr(href)').extract()
        for i in partialURLs:
            subURLs = urljoin('https://wheelsonline.ca/', i)
            yield scrapy.Request(subURLs, callback=self.parse_dealers)
            logger.info('Dealer ' + subURLs + ' fetched')

    def parse_dealers(self, response):
        logger.info('Beginning of page')
        dlr = Dealer()

        # Extracting the content using css selectors
        try:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first() + ' ' + response.css(".dealer_head_aux_name::text").extract_first()
        except TypeError:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first()
        dlr['MailingAddress'] = ','.join(response.css(".dealer_address_right::text").extract())
        dlr['PhoneNumber'] = response.css(".dealer_head_phone::text").extract_first()

        logger.info('Dealer fetched ' + dlr['DealerName'])
        yield dlr
        logger.info('End of page')
Your allowed_domains list contains the protocol (https). It should have only the domain name as per the documentation:
allowed_domains = ["wheelsonline.ca"]
Also, you should've received a message in your log:
URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://wheelsonline.ca in allowed_domains
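For reference, a minimal sketch of the corrected pieces, with the rest of the spider unchanged. Note that response.urljoin() is Scrapy's built-in helper for resolving relative hrefs, used here instead of the hand-rolled urljoin call:

    allowed_domains = ["wheelsonline.ca"]

    def parse(self, response):
        partialURLs = response.css('.directory_name::attr(href)').extract()
        for partial in partialURLs:
            # resolve the relative href against the current page URL
            sub_url = response.urljoin(partial)
            yield scrapy.Request(sub_url, callback=self.parse_dealers)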
I'm very new to Scrapy, so it's hard for me to figure out what I am doing wrong when I get no results in the CSV file. I can see results in the console, though. Here is what I tried:
Main folder is named "realyp".
Spider file is named "yp.py" and the code:
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from realyp.items import RealypItem

class MySpider(BaseSpider):
    name = "YellowPage"
    allowed_domains = ["yellowpages.com"]
    start_urls = ["https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=2"]

    def parse(self, response):
        title = Selector(response)
        page = title.xpath('//div[@class="info"]')
        items = []
        for titles in page:
            item = RealypItem()
            item["name"] = titles.xpath('.//span[@itemprop="name"]/text()').extract()
            item["address"] = titles.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()').extract()
            item["phone"] = titles.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()').extract()
            items.append(item)
        return items
"items.py" file includes:
from scrapy.item import Item, Field

class RealypItem(Item):
    name = Field()
    address = Field()
    phone = Field()
To get the CSV output, my command line is:
cd desktop
cd realyp
scrapy crawl YellowPage -o items.csv -t csv
Any help will be greatly appreciated.
As stated by @Granitosauros, you should use yield instead of return, and the yield should be inside the for loop.
In the for loop, if the XPath starts with // then all elements in the document that fulfill the criteria are selected, not just descendants of the current node (see here).
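For example (a hypothetical snippet just to illustrate the difference):

    # '//span[...]' matches every such span in the whole document,
    # even when called on a sub-selector inside the loop
    titles.xpath('//span[@itemprop="name"]/text()')

    # './/span[...]' only searches descendants of the current node
    titles.xpath('.//span[@itemprop="name"]/text()')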
Here's some (rough) code that works for me:
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from realyp.items import RealypItem

class MySpider(BaseSpider):
    name = "YellowPage"
    allowed_domains = ["yellowpages.com"]
    start_urls = ["https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=2"]

    def parse(self, response):
        for titles in response.xpath('//div[@class = "result"]/div'):
            item = RealypItem()
            item["name"] = titles.xpath('div[2]/div[2]/h2/a/span[@itemprop="name"]/text()').extract()
            item["address"] = titles.xpath('string(div[2]/div[2]/div/p[@itemprop="address"])').extract()
            item["phone"] = titles.xpath('div[2]/div[2]/div/div[@itemprop="telephone" and @class="phones phone primary"]/text()').extract()
            yield item
I have written a crawler in Scrapy, but I want to initiate the crawling with a main method.
import sys, getopt
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re

class TutsplusItem(scrapy.Item):
    title = scrapy.Field()

class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def __init__(self, *args):
        try:
            opts, args = getopt.getopt(args, "hi:o:", ["ifile=", "ofile="])
        except getopt.GetoptError:
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit(2)
        super(MySpider, self).__init__(self, *args)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()

        # We stored already crawled links in this list
        crawledLinks = []

        # Pattern to check proper link
        # I only want to get the tutorial posts
        # linkPattern = re.compile("^\/tutorials\?page=\d+")

        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            #if linkPattern.match(link) and not link in crawledLinks:
            if not link in crawledLinks:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)

        titles = response.xpath('//a[contains(@class, "media__link")]/text()').extract()
        count = 0
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
Instead of using scrapy runspider Crawler.py arg1 arg2,
I would like to have a separate class with a main function and initiate Scrapy from there. How can I do this?
There are different ways to approach this, but I suggest the following:
Have a main.py file on the same directory that will open a new process and launch the spider with the parameters you need.
The main.py file would have something like the following:
import subprocess
scrapy_command = 'scrapy runspider {spider_name} -a param_1="{param_1}"'.format(spider_name='your_spider', param_1='your_value')
process = subprocess.Popen(scrapy_command, shell=True)
With this code, you just need to call your main file.
python main.py
Hope it helps.
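If you also need to forward arguments from your own main into the spider, a variant along these lines works; the -a argument names (ifile, ofile) are purely illustrative here:

import subprocess
import sys

# take two values from main.py's own command line and pass them to the spider
param_1, param_2 = sys.argv[1], sys.argv[2]
scrapy_command = 'scrapy runspider your_spider.py -a ifile="{0}" -a ofile="{1}"'.format(param_1, param_2)
process = subprocess.Popen(scrapy_command, shell=True)
process.wait()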
I currently have a Spider-based spider that I wrote for crawling an input JSON array of start_urls:
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from foo.items import AtlanticFirearmsItem
from scrapy.contrib.loader import ItemLoader

import json
import datetime
import re

class AtlanticFirearmsSpider(Spider):
    name = "atlantic_firearms"
    allowed_domains = ["atlanticfirearms.com"]

    def __init__(self, start_urls='[]', *args, **kwargs):
        super(AtlanticFirearmsSpider, self).__init__(*args, **kwargs)
        self.start_urls = json.loads(start_urls)

    def parse(self, response):
        l = ItemLoader(item=AtlanticFirearmsItem(), response=response)
        product = l.load_item()
        return product
I can call it from the command line like so, and it does a wonderful job:
scrapy crawl atlantic_firearms -a start_urls='["http://www.atlanticfirearms.com/component/virtuemart/shipping-rifles/ak-47-receiver-aam-47-detail.html", "http://www.atlanticfirearms.com/component/virtuemart/shipping-accessories/nitride-ak47-7-62x39mm-barrel-detail.html"]'
However, I'm trying to add a CrawlSpider-based spider for crawling the entire site that inherits from it and re-uses the parse method logic. My first attempt looked like this:
class AtlanticFirearmsCrawlSpider(CrawlSpider, AtlanticFirearmsSpider):
    name = "atlantic_firearms_crawler"
    start_urls = [
        "http://www.atlanticfirearms.com"
    ]
    rules = (
        # I know, I need to update these to LxmlLinkExtractor
        Rule(SgmlLinkExtractor(allow=['detail.html']), callback='parse'),
        Rule(SgmlLinkExtractor(allow=[], deny=['/bro', '/news', '/howtobuy', '/component/search', 'askquestion'])),
    )
Running this spider with
scrapy crawl atlantic_firearms_crawler
crawls the site but never parses any items. I think it's because CrawlSpider apparently has its own definition of parse, so somehow I'm screwing things up.
When I change callback='parse' to callback='parse_item' and rename the parse method in AtlanticFirearmsSpider to parse_item, it works wonderfully, crawling the whole site and parsing items successfully. But then if I try to call my original atlantic_firearms spider again, it errors out with NotImplementedError, apparently because Spider-based spiders really want one to define the parse method as parse.
What's the best way for me to re-use my logic between these spiders so that I can both feed a JSON array of start_urls as well as do full-site crawls?
You can avoid multiple inheritance here.
Combine both spiders into a single one. If start_urls is passed from the command line, it behaves like a regular spider; otherwise it works as a CrawlSpider:
from scrapy import Item
from scrapy.contrib.spiders import CrawlSpider, Rule
from foo.items import AtlanticFirearmsItem
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.linkextractors import LinkExtractor

import json

class AtlanticFirearmsSpider(CrawlSpider):
    name = "atlantic_firearms"
    allowed_domains = ["atlanticfirearms.com"]

    def __init__(self, start_urls=None, *args, **kwargs):
        if start_urls:
            self.start_urls = json.loads(start_urls)
            self.rules = []
            self.parse = self.parse_response
        else:
            self.start_urls = ["http://www.atlanticfirearms.com/"]
            self.rules = [
                Rule(LinkExtractor(allow=['detail.html']), callback='parse_response'),
                Rule(LinkExtractor(allow=[], deny=['/bro', '/news', '/howtobuy', '/component/search', 'askquestion']))
            ]
        super(AtlanticFirearmsSpider, self).__init__(*args, **kwargs)

    def parse_response(self, response):
        l = ItemLoader(item=AtlanticFirearmsItem(), response=response)
        product = l.load_item()
        return product
Or, alternatively, just extract the logic inside the parse() method into a library function and call it from both spiders, which would then be unrelated, separate spiders.
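A rough sketch of that alternative (the module and function names here are hypothetical):

# foo/parsers.py -- shared parsing logic
from foo.items import AtlanticFirearmsItem
from scrapy.contrib.loader import ItemLoader

def parse_product(response):
    l = ItemLoader(item=AtlanticFirearmsItem(), response=response)
    return l.load_item()

Each spider then keeps its own callback name and simply delegates to parse_product(response).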
Following the documentation, I can run Scrapy from a Python script, but I can't get the scrapy result.
This is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from items import DmozItem

class DmozSpider(BaseSpider):
    name = "douban"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/group/xxx/discussion"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select("//table[@class='olt']/tr/td[@class='title']/a")
        items = []
        # print sites
        for row in rows:
            item = DmozItem()
            item["title"] = row.select('text()').extract()[0]
            item["link"] = row.select('@href').extract()[0]
            items.append(item)
        return items
Notice the last line: I try to use the returned parse result. If I run:
scrapy crawl douban
the terminal prints the returned result.
But I can't get the return result from the Python script. Here is my Python script:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from spiders.dmoz_spider import DmozSpider
from scrapy.xlib.pydispatch import dispatcher

def stop_reactor():
    reactor.stop()

dispatcher.connect(stop_reactor, signal=signals.spider_closed)

spider = DmozSpider(domain='www.douban.com')
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()

log.start()
log.msg("------------>Running reactor")
result = reactor.run()
print result
log.msg("------------>Running stoped")
I tried to get the result from reactor.run(), but it returns nothing.
How can I get the result?
The terminal prints the result because the default log level is set to DEBUG.
When you are running your spider from the script and call log.start(), the default log level is set to INFO.
Just replace:
log.start()
with
log.start(loglevel=log.DEBUG)
UPD:
To get the result as string, you can log everything to a file and then read from it, e.g.:
log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
reactor.run()
with open("results.log", "r") as f:
    result = f.read()
print result
Hope that helps.
I found your question while asking myself the same thing, namely: "How can I get the result?". Since this wasn't answered here, I endeavoured to find the answer myself, and now that I have, I can share it:
items = []

def add_item(item):
    items.append(item)

dispatcher.connect(add_item, signal=signals.item_passed)
Or, for Scrapy 0.22 (http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script), replace the last line of my solution with:
crawler.signals.connect(add_item, signals.item_passed)
My solution is freely adapted from http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/.
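Putting it together with the script from the question, a rough sketch (using the same old-style Scrapy API as above) looks like this:

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from spiders.dmoz_spider import DmozSpider

items = []

def add_item(item):
    items.append(item)

def stop_reactor():
    reactor.stop()

# collect every scraped item and stop the reactor when the spider closes
dispatcher.connect(add_item, signal=signals.item_passed)
dispatcher.connect(stop_reactor, signal=signals.spider_closed)

spider = DmozSpider(domain='www.douban.com')
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()

# once the reactor has stopped, the scraped items are in `items`
print items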
In my case, I placed the script file at the Scrapy project level, e.g. if the spiders live in scrapyproject/scrapyproject/spiders, then I placed it at scrapyproject/myscript.py.