Running Scrapy crawler from a main function - python

I have written a crawler in Scrapy, but I would like to initiate the crawling from a main method:
import sys, getopt
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re

class TutsplusItem(scrapy.Item):
    title = scrapy.Field()

class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def __init__(self, *args):
        try:
            opts, args = getopt.getopt(args, "hi:o:", ["ifile=", "ofile="])
        except getopt.GetoptError:
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit(2)
        super(MySpider, self).__init__(*args)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # We store already crawled links in this list
        crawledLinks = []
        # Pattern to check for a proper link
        # I only want to get the tutorial posts
        # linkPattern = re.compile("^\/tutorials\?page=\d+")
        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            # if linkPattern.match(link) and link not in crawledLinks:
            if link not in crawledLinks:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)
        titles = response.xpath('//a[contains(@class, "media__link")]/text()').extract()
        count = 0
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
Instead of using scrapy runspider Crawler.py arg1 arg2,
I would like to have a separate class with a main function and initiate Scrapy from there. How can I do this?

There are different ways to approach this, but I suggest the following:
Have a main.py file in the same directory that opens a new process and launches the spider with the parameters you need.
The main.py file would contain something like the following:
import subprocess
scrapy_command = 'scrapy runspider {spider_name} -a param_1="{param_1}"'.format(spider_name='your_spider', param_1='your_value')
process = subprocess.Popen(scrapy_command, shell=True)
With this code, you just need to call your main file.
python main.py
Hope it helps.
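If the original command-line arguments (arg1 arg2) still need to reach the spider, main.py can forward them as -a options. A rough sketch, not the asker's code: it assumes the spider file is named Crawler.py and that the spider is adjusted to read ifile/ofile as keyword arguments in __init__ (Scrapy passes -a values as keyword arguments to the spider, not through getopt):

# main.py -- a rough sketch; file and argument names are illustrative.
import subprocess
import sys

def main(argv):
    # Usage (illustrative): python main.py input.txt output.txt
    ifile, ofile = argv[0], argv[1]
    scrapy_command = (
        'scrapy runspider Crawler.py '
        '-a ifile="{ifile}" -a ofile="{ofile}"'.format(ifile=ifile, ofile=ofile)
    )
    process = subprocess.Popen(scrapy_command, shell=True)
    process.wait()  # block until the crawl finishes

if __name__ == '__main__':
    main(sys.argv[1:])

process.wait() keeps main.py alive until the crawl finishes, which is useful if you want to run further processing after the spider has completed.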

Related

can't get an output from scrapy

When I run
scrapy runspider divar.py -o data.json
in the terminal, I get an empty file. Am I doing something wrong here? I want to get the categories and subcategories from the URL in start_urls, append them to result, print them, and also get a JSON file (mostly the JSON file).
import scrapy

class ws(scrapy.Spider):
    name = 'wsDivar'
    result = []
    start_urls = ["https://divar.ir/s/tehran"]

    def parse(self, response):
        for category in response.xpath("//*/ul[@class='kt-accordion-item__header']"):
            x = {'cats': category.xpath("//*/ul[@class='kt-accordion-item__header']/a").extract_first()}
            self.result.append(x)
            yield x
        print(self.result)
        next_L = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_L is not None:
            next_link = response.urljoin(next_L)
            yield scrapy.Request(url=next_link, callback=self.parse)
import scrapy

class ws(scrapy.Spider):
    name = 'wsDivar'
    start_urls = ["https://divar.ir/s/tehran"]

    def parse(self, response):
        for category in response.xpath("//*/ul[@class='kt-accordion-item__header']"):
            x = {'cats': category.xpath("//*/ul[@class='kt-accordion-item__header']/a").extract_first()}
            yield x
        next_L = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_L is not None:
            next_link = response.urljoin(next_L)
            yield scrapy.Request(url=next_link, callback=self.parse)
If your XPath works fine, yield will also print the result for you (scraped items show up in the DEBUG log).
Instead of this:
scrapy runspider divar.py -o data.json
use this:
scrapy crawl wsDivar -o data.json
Also, run the command from the project directory, which is supposed to contain the scrapy.cfg file.
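For reference, scrapy crawl only works from inside a Scrapy project. A typical layout, using default directory names rather than ones taken from the question, looks like this:

myproject/
    scrapy.cfg            # run "scrapy crawl wsDivar -o data.json" from this directory
    myproject/
        settings.py
        items.py
        spiders/
            divar.py      # file containing the wsDivar spider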

2 functions in scrapy spider and the second one not running

I am using Scrapy to get the content inside some URLs on a page, similar to this question:
Use scrapy to get list of urls, and then scrape content inside those urls
I am able to get the sub-URLs from my start URLs (first def). However, my second def doesn't seem to be executed, and the result file is empty. I have tested the content inside the function in the Scrapy shell and it gets the info I want, but not when I run the spider.
import scrapy
from scrapy.selector import Selector
#from scrapy import Spider
from WheelsOnlineScrapper.items import Dealer
from WheelsOnlineScrapper.url_list import urls
import logging
from urlparse import urljoin

logger = logging.getLogger(__name__)

class WheelsonlinespiderSpider(scrapy.Spider):
    logger.info('Spider starting')
    name = 'wheelsonlinespider'
    rotate_user_agent = True  # lives in middleware.py and settings.py
    allowed_domains = ["https://wheelsonline.ca"]
    start_urls = urls  # this list is created in url_list.py
    logger.info('URLs retrieved')

    def parse(self, response):
        subURLs = []
        partialURLs = response.css('.directory_name::attr(href)').extract()
        for i in partialURLs:
            subURLs = urljoin('https://wheelsonline.ca/', i)
            yield scrapy.Request(subURLs, callback=self.parse_dealers)
            logger.info('Dealer ' + subURLs + ' fetched')

    def parse_dealers(self, response):
        logger.info('Beginning of page')
        dlr = Dealer()
        # Extracting the content using css selectors
        try:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first() + ' ' + response.css(".dealer_head_aux_name::text").extract_first()
        except TypeError:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first()
        dlr['MailingAddress'] = ','.join(response.css(".dealer_address_right::text").extract())
        dlr['PhoneNumber'] = response.css(".dealer_head_phone::text").extract_first()
        logger.info('Dealer fetched ' + dlr['DealerName'])
        yield dlr
        logger.info('End of page')
Your allowed_domains list contains the protocol (https). It should have only the domain name as per the documentation:
allowed_domains = ["wheelsonline.ca"]
Also, you should've received a message in your log:
URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://wheelsonline.ca in allowed_domains
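In context, only that one attribute needs to change; a minimal sketch of the corrected spider header (the rest of the spider stays as posted):

class WheelsonlinespiderSpider(scrapy.Spider):
    name = 'wheelsonlinespider'
    rotate_user_agent = True
    # Bare domain only, so the offsite middleware does not filter out
    # requests to pages under https://wheelsonline.ca
    allowed_domains = ["wheelsonline.ca"]
    start_urls = urls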

Scrapy CLI output - CSV_DELIMITER parameter not working

I am trying to run the scrapy exporter with a custom delimiter via CLI like this:
scrapy runspider beneficiari_2016.py -o beneficiari_2016.csv -t csv -a CSV_DELIMITER="\n"
The export works perfectly, but the delimiter is still the default comma (",").
Please let me know if you have any idea how it can be fixed. Thank you!
The code:
import scrapy
from scrapy.item import Item, Field
import urllib.parse

class anmdm(Item):
    nume_beneficiar = Field()

class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = ['http://www.anm.ro/sponsorizari/afisare-2016/beneficiari?page=1']

    def parse(self, response):
        doctor = anmdm()
        doctors = []
        for item in response.xpath('//tbody/tr'):
            doctor['nume_beneficiar'] = item.xpath('td[5]//text()').extract_first()
            yield doctor
        next_page = response.xpath("//ul/li[@class='active']/following-sibling::li/a/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            print(next_page)
            yield response.follow(next_page, self.parse)
CSV_DELIMITER needs to be changed as a setting, not passed as a spider argument with -a.
To change settings on the command line, use -s:
scrapy runspider beneficiari_2016.py -o beneficiari_2016.csv -t csv -s CSV_DELIMITER="\n"
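If the -s flag does not take effect in your Scrapy version, a more explicit route is a custom CSV exporter registered through FEED_EXPORTERS. A rough sketch, not part of the original question: the module name my_exporters and the exporter class name are illustrative.

# my_exporters.py -- illustrative module name; keep it importable from the project.
from scrapy.exporters import CsvItemExporter

class NewlineCsvItemExporter(CsvItemExporter):
    def __init__(self, *args, **kwargs):
        # Extra keyword arguments are forwarded to the underlying csv.writer,
        # so this sets the delimiter written between fields.
        kwargs['delimiter'] = '\n'
        super(NewlineCsvItemExporter, self).__init__(*args, **kwargs)

It can then be registered for the csv format, for example via the spider's custom_settings:

class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    custom_settings = {
        'FEED_EXPORTERS': {'csv': 'my_exporters.NewlineCsvItemExporter'},
    }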

Scrapy put two spiders in single file

I have written two spiders in a single file. When I run scrapy runspider two_spiders.py, only the first spider is executed. How can I run both of them without splitting the file into two files?
two_spiders.py:
import scrapy

class MySpider1(scrapy.Spider):
    # first spider definition
    ...

class MySpider2(scrapy.Spider):
    # second spider definition
    ...
Let's read the documentation:
Running multiple spiders in the same process
By default, Scrapy runs a single spider per process when you run scrapy crawl. However, Scrapy supports running multiple spiders per process using the internal API.
Here is an example that runs multiple spiders simultaneously:
import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider1(scrapy.Spider):
    # Your first spider definition
    ...

class MySpider2(scrapy.Spider):
    # Your second spider definition
    ...

process = CrawlerProcess()
process.crawl(MySpider1)
process.crawl(MySpider2)
process.start()  # the script will block here until all crawling jobs are finished
(There are a few more examples in the documentation.)
From your question it is not clear how you have put the two spiders into one file; it is not enough to simply concatenate the contents of two single-spider files.
Try what is written in the documentation, or at least show us your code. Without it we can't help you.
Here is a full Scrapy project with 2 spiders in one file.
# quote_spiders.py
import json
import string

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Item, Field

class TextCleaningPipeline(object):
    def _clean_text(self, text):
        text = text.replace('“', '').replace('”', '')
        table = str.maketrans({key: None for key in string.punctuation})
        clean_text = text.translate(table)
        return clean_text.lower()

    def process_item(self, item, spider):
        item['text'] = self._clean_text(item['text'])
        return item

class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open(spider.settings['JSON_FILE'], 'a')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

class QuoteItem(Item):
    text = Field()
    author = Field()
    tags = Field()
    spider = Field()

class QuotesSpiderOne(scrapy.Spider):
    name = "quotes1"

    def start_requests(self):
        urls = ['http://quotes.toscrape.com/page/1/', ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            item = QuoteItem()
            item['text'] = quote.css('span.text::text').get()
            item['author'] = quote.css('small.author::text').get()
            item['tags'] = quote.css('div.tags a.tag::text').getall()
            item['spider'] = self.name
            yield item

class QuotesSpiderTwo(scrapy.Spider):
    name = "quotes2"

    def start_requests(self):
        urls = ['http://quotes.toscrape.com/page/2/', ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            item = QuoteItem()
            item['text'] = quote.css('span.text::text').get()
            item['author'] = quote.css('small.author::text').get()
            item['tags'] = quote.css('div.tags a.tag::text').getall()
            item['spider'] = self.name
            yield item

if __name__ == '__main__':
    settings = dict()
    settings['USER_AGENT'] = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    settings['HTTPCACHE_ENABLED'] = True
    settings['JSON_FILE'] = 'items.jl'
    settings['ITEM_PIPELINES'] = dict()
    settings['ITEM_PIPELINES']['__main__.TextCleaningPipeline'] = 800
    settings['ITEM_PIPELINES']['__main__.JsonWriterPipeline'] = 801

    process = CrawlerProcess(settings=settings)
    process.crawl(QuotesSpiderOne)
    process.crawl(QuotesSpiderTwo)
    process.start()
Install Scrapy and run the script:
$ pip install Scrapy
$ python quote_spiders.py
No other file is needed.
This example, coupled with the graphical debugger of PyCharm or VS Code, can help you understand the Scrapy workflow and make debugging easier.

Confused about running Scrapy from within a Python script

Following the documentation, I can run Scrapy from a Python script, but I can't get the crawl result.
This is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from items import DmozItem

class DmozSpider(BaseSpider):
    name = "douban"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/group/xxx/discussion"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select("//table[@class='olt']/tr/td[@class='title']/a")
        items = []
        # print sites
        for row in rows:
            item = DmozItem()
            item["title"] = row.select('text()').extract()[0]
            item["link"] = row.select('@href').extract()[0]
            items.append(item)
        return items
Notice the last line: I try to use the returned parse result. If I run:
scrapy crawl douban
the terminal prints the returned result.
But I can't get the returned result from the Python script. Here is my Python script:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from spiders.dmoz_spider import DmozSpider
from scrapy.xlib.pydispatch import dispatcher

def stop_reactor():
    reactor.stop()

dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = DmozSpider(domain='www.douban.com')
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
log.msg("------------>Running reactor")
result = reactor.run()
print result
log.msg("------------>Running stoped")
I try to get the result from reactor.run(), but it returns nothing.
How can I get the result?
The terminal prints the result because the default log level is set to DEBUG.
When you are running your spider from the script and call log.start(), the default log level is set to INFO.
Just replace:
log.start()
with
log.start(loglevel=log.DEBUG)
UPD:
To get the result as a string, you can log everything to a file and then read from it, e.g.:
log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
reactor.run()
with open("results.log", "r") as f:
    result = f.read()
    print result
Hope that helps.
I found your question while asking myself the same thing, namely: "How can I get the result?". Since this wasn't answered here, I endeavoured to find the answer myself, and now that I have, I can share it:
items = []

def add_item(item):
    items.append(item)

dispatcher.connect(add_item, signal=signals.item_passed)
Or, for Scrapy 0.22 (http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script), replace the last line of my solution with:
crawler.signals.connect(add_item, signals.item_passed)
My solution is freely adapted from http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/.
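On newer Scrapy versions the same idea can be written with CrawlerProcess and the item_scraped signal. A minimal sketch, not taken from these answers, assuming DmozSpider is importable as in the question:

from pydispatch import dispatcher  # PyDispatcher, installed as a Scrapy dependency
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from spiders.dmoz_spider import DmozSpider  # import path taken from the question

items = []

def collect_item(item, response, spider):
    # item_scraped fires for every item that made it through the pipelines.
    items.append(item)

dispatcher.connect(collect_item, signal=signals.item_scraped)

process = CrawlerProcess()
process.crawl(DmozSpider)
process.start()  # blocks here until the crawl is finished

print(items)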
In my case, I placed the script file at the Scrapy project level, e.g. if the spiders live in scrapyproject/scrapyproject/spiders, then I placed it at scrapyproject/myscript.py.
