Running scrapy tasks in a loop - python

I have this code:
from logging import INFO

import scrapy


class LinkedInAnonymousSpider(scrapy.Spider):
    name = "linkedin_anonymous"
    allowed_domains = ["linkedin.com"]
    start_urls = []

    base_url = "https://www.linkedin.com/pub/dir/?first=%s&last=%s&search=Search"

    def __init__(self, input=None, first=None, last=None):
        self.input = input  # source file name
        self.first = first
        self.last = last

    def start_requests(self):
        if self.first and self.last:  # taking input from command line parameters
            url = self.base_url % (self.first, self.last)
            yield self.make_requests_from_url(url)
        elif self.input:  # taking input from file
            i = 0
            self.log('Input from file: %s' % self.input, INFO)
            for line in open(self.input, 'r').readlines():
                i += 1
                if line.strip():  # no blank line
                    t = line.split("\t")
                    name = t[0]
                    parts = [n.strip() for n in name.split(' ')]
                    last = parts.pop()
                    first = " ".join(parts)
                    if first and last:
                        url = self.base_url % (first, last)
                        yield self.make_requests_from_url(url)
        else:
            raise Exception('No input.')

    def parse(self, response):
        # if there is exactly one match, the person's profile page is returned
        if response.xpath('//div[@class="profile-overview-content"]').extract():
            yield scrapy.Request(response.url, callback=self.parse_full_profile_page)
        else:
            # extracting profile URLs from the search results
            for sel in response.css('div.profile-card'):
                url = sel.xpath('./*/h3/a/@href').extract()[0]  # person's full profile URL on LinkedIn
                yield scrapy.Request(url, callback=self.parse_full_profile_page)
........
With this code, I get the profile details of a list of people from LinkedIn.
I have written the following main function to do that:
import sys

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

from linkedin_anonymous_spider import LinkedInAnonymousSpider

if __name__ == "__main__":
    firstname = ['Hasan', 'James']
    lastname = ['Arslan', 'Bond']
    for a in range(len(firstname)):
        settings = get_project_settings()
        crawler = CrawlerProcess(settings)
        spider = LinkedInAnonymousSpider()
        crawler.crawl(spider, [], firstname[a], lastname[a])
        crawler.start()
When the loop reaches its second iteration, I get this error:
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
How can I fix the problem?
Thanks.

You can only run one reactor, so call crawler.start() just once.
Try moving crawler.start() out of the loop.

Here is a correct version:
firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']

settings = get_project_settings()
crawler = CrawlerProcess(settings)

for a in range(len(firstname)):
    crawler.crawl(LinkedInAnonymousSpider, [], firstname[a], lastname[a])

crawler.start()
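
If you ever need the crawls to run one after another rather than concurrently, the CrawlerRunner pattern from the Scrapy docs is the usual way to do it while still starting the reactor only once. A minimal sketch, untested against this particular project:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from linkedin_anonymous_spider import LinkedInAnonymousSpider

runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    firstname = ['Hasan', 'James']
    lastname = ['Arslan', 'Bond']
    for first, last in zip(firstname, lastname):
        # each crawl() call returns a Deferred; yielding it waits for that spider to finish
        yield runner.crawl(LinkedInAnonymousSpider, None, first, last)
    reactor.stop()

crawl()
reactor.run()  # blocks here until reactor.stop() is called above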

Related

How can I properly run Scrapy spiders from an external Python script and get their item output?

I'm making a couple of scrapers, and now I'm trying to write a script that runs the corresponding spiders with URLs collected from a database, but I can't find a way to do this.
I have this in my spider:
class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'
    url = ''
    DEBUG = False

    def start_requests(self):
        if self.url != '':
            yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        # Get product name
        try:
            self.p_name = response.xpath('//*[@id="product-info"]/h2[1]/a/text()').get()
        except:
            print(f'{CERROR} Problem while getting product name from website - {self.name}')

        # Get product price
        try:
            self.price_no_cent = response.xpath('//*[@id="price-container"]/div/span[2]/text()').get()
            self.cent = response.xpath('//*[@id="price-container"]/div/span[2]/span[1]/text()').get()
            self.currency = response.xpath('//*[@id="price-container"]/div/span[2]/span[2]/text()').get()
            if self.currency is None:
                self.currency = response.xpath('//*[@id="price-container"]/div/span[2]/span[1]/text()').get()
                self.cent = None
        except:
            print(f'{CERROR} Problem while getting product price from website - {self.name}')

        # Join self.price_no_cent with self.cent
        try:
            if self.cent is not None:
                self.price = str(self.price_no_cent) + str(self.cent)
                self.price = self.price.replace(',', '.')
            else:
                self.price = self.price_no_cent
        except:
            print(f'{ERROR} Problem while joining price with cents - {self.name}')

        # Return data
        if self.DEBUG:
            print([self.p_name, self.price, self.currency])

        data_collected = ShopScrapersItems()
        data_collected['url'] = response.url
        data_collected['p_name'] = self.p_name
        data_collected['price'] = self.price
        data_collected['currency'] = self.currency
        yield data_collected
Normally when I run the spider from the console I do:
scrapy crawl ElCorteIngles -a url='https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/'
and now I need a way to do the same from an external script and get the output of yield data_collected.
What I currently have in my external script is this:
import scrapy
from scrapy.crawler import CrawlerProcess
import sqlalchemy as db

# Import internal libraries
from Ruby.Ruby.spiders import *

# Variables
engine = db.create_engine('mysql+pymysql://DATABASE_INFO')


class Worker(object):

    def __init__(self):
        self.crawler = CrawlerProcess({})

    def scrape_new_links(self):
        conn = engine.connect()
        # Get all new links from DB and scrape them
        query = 'SELECT * FROM Ruby.New_links'
        result = conn.execute(query)
        for x in result:
            telegram_id = x[1]
            email = x[2]
            phone_number = x[3]
            url = x[4]
            spider = x[5]
            # In this case the spider will be ElCorteIngles and the url
            # https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/
            self.crawler.crawl(spider, url=url)
        self.crawler.start()


Worker().scrape_new_links()
I also don't know if doing url=url in self.crawler.crawl() is the proper way to give the URL to the spider, but let me know what you think.
All the data from yield is being returned by a pipeline.
I don't think there is any need for extra info, but if you need anything just let me know!
Scrapy works asynchronously... ignore my imports, but this is from a JSON API I made for Scrapy. You need to make a custom runner with an item_scraped signal. There was originally a klein endpoint, and when the spider finished it would return a JSON list. I think this is what you want, but without the klein endpoint, so I've taken it out. My spider was GshopSpider; I replaced it with your spider's name.
By taking advantage of Deferreds we are able to use callbacks and send signals each time an item is scraped. So using this code we collect each item into a list with a signal, and when the spider finishes we have a callback set up to return_spider_output.
# server.py
import json

from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

from googleshop.spiders.gshop import GshopSpider


class MyCrawlerRunner(CrawlerRunner):

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        # keep all items scraped
        self.items = []

        crawler = self.create_crawler(crawler_or_spidercls)
        crawler.signals.connect(self.item_scraped, signals.item_scraped)

        dfd = self._crawl(crawler, *args, **kwargs)
        dfd.addCallback(self.return_items)
        return dfd

    def item_scraped(self, item, response, spider):
        self.items.append(item)

    def return_items(self, result):
        return self.items


def return_spider_output(output):
    return json.dumps([dict(item) for item in output])


if __name__ == "__main__":
    settings = get_project_settings()
    runner = MyCrawlerRunner(settings)
    deferred = runner.crawl(ElCorteIngles)  # pass the spider class, not an instance
    deferred.addCallback(return_spider_output)
    # the removed klein endpoint used to return this deferred to the caller
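
One caveat worth adding (my note, not part of the original answer): unlike CrawlerProcess, a CrawlerRunner does not start the Twisted reactor for you, so with the klein endpoint removed you still have to drive the reactor yourself. A rough sketch of what the tail of server.py could look like, with print_and_stop being a hypothetical helper:

from twisted.internet import reactor

def print_and_stop(json_output):
    print(json_output)  # the JSON string produced by return_spider_output
    reactor.stop()

if __name__ == "__main__":
    settings = get_project_settings()
    runner = MyCrawlerRunner(settings)
    deferred = runner.crawl(ElCorteIngles)
    deferred.addCallback(return_spider_output)
    deferred.addCallback(print_and_stop)
    deferred.addErrback(lambda failure: (failure.printTraceback(), reactor.stop()))
    reactor.run()  # blocks here until the spider has finished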
The easiest way to do this would be something like this:
class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'
    url = ''
    DEBUG = False

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Establish your db connection here. This can be any database connection.
        # Reuse this connection object anywhere else.
        self.conn = engine.connect()

    def start_requests(self):
        with self.conn.cursor() as cursor:
            cursor.execute('''SELECT * FROM Ruby.New_links WHERE url IS NOT NULL OR url != %s''', ('',))
            result = cursor.fetchall()
        for url in result:
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)

    def parse(self, response):
        # Your parse code here
After doing this, you can initiate the crawler with something like this:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from project_name.spiders.filename import ElCorteIngles

process = CrawlerProcess(get_project_settings())
process.crawl(ElCorteIngles)
process.start()
Hope this helps.
I would also recommend having a queue if you are working with a large number of URLs. This will enable multiple spider processes to work on these URLs in parallel. You can initiate the queue in the __init__ method.
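
As for the url=url question in the original post: keyword arguments passed to crawl() are forwarded to the spider and become instance attributes, exactly like -a url=... on the command line, so that part of the script is fine. A minimal sketch (the URL is just the example from the question):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# the keyword argument ends up as `self.url` on the spider instance,
# equivalent to: scrapy crawl ElCorteIngles -a url='...'
process.crawl(
    'ElCorteIngles',  # spider name, resolved through the project's spider loader
    url='https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/',
)
process.start()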

How to write Python Scrapy code for extracting the URLs present in a site's sitemap and exporting them to CSV

I've found a working solution (from here) for writing Scrapy code that extracts the URLs present in a site's sitemap, but I don't know how to export the data to a CSV file!
When I try to run scrapy crawl myspider -o mydata.csv it returns an empty CSV file, although the list of URLs is printed on screen!
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests


class GetpagesfromsitemapSpider(SitemapSpider):
    name = "myspider"
    handle_httpstatus_list = [404]

    def parse(self, response):
        print(response.url)

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return
            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            break
            print(sites)

    def __init__(self, spider=None, *a, **kw):
        super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
        self.spider = spider
        l = []
        url = "http://www.example.com/"
        resp = requests.head(url + "/sitemap.xml")
        if resp.status_code != 404:
            l.append(resp.url)
        else:
            resp = requests.head(url + "/robots.txt")
            if resp.status_code == 200:
                l.append(resp.url)
        self.sitemap_urls = l
        print(self.sitemap_urls)


def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l
First, you aren't making any requests with Scrapy, and you're also combining Scrapy with requests, which I think is not the best idea. Try changing __init__ to:
def start_requests(self):
    l = []
    url = "http://www.example.com"
    l.append(url + '/sitemap.xml')
    l.append(url + '/robots.txt')
    for link in l:
        yield Request(link, callback=self._parse_sitemap)
Also, your self._parse_sitemap SHOULD yield dict-like items or Requests (not only self._parse_sitemap, but every callback in your Scrapy spider; see the docs):
def _parse_sitemap(self, response):
    # handle status responses here (200, 401, etc.)
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.info('Ignoring invalid sitemap: %s', response.url)
        return
    s = Sitemap(body)
    sites = []  # collect the URLs, then yield them as dict-like items
    if s.type == 'sitemapindex':
        for loc in iterloc(s, self.sitemap_alternate_links):
            if any(x.search(loc) for x in self._follow):
                yield Request(loc, callback=self._parse_sitemap)
    elif s.type == 'urlset':
        for loc in iterloc(s):
            for r, c in self._cbs:
                if r.search(loc):
                    sites.append(loc)
                    break
    for loc in sites:
        yield {'url': loc}  # yield (instead of print) is what populates your .csv file
The whole file (probably doesn't work as-is, but it explains the idea):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests


class GetpagesfromsitemapSpider(SitemapSpider):
    name = "myspider"
    handle_httpstatus_list = [404]

    def parse(self, response):
        print(response.url)

    def _parse_sitemap(self, response):
        # handle status responses here (200, 401, etc.)
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.info('Ignoring invalid sitemap: %s', response.url)
            return
        s = Sitemap(body)
        sites = []  # collect the URLs, then yield them as dict-like items
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        sites.append(loc)
                        break
        for loc in sites:
            yield {'url': loc}  # yield (instead of print) populates your .csv file

    def start_requests(self):
        l = []
        url = "http://www.example.com"
        l.append(url + '/sitemap.xml')
        l.append(url + '/robots.txt')
        for link in l:
            yield Request(link, callback=self._parse_sitemap)


def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l
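
With items yielded instead of printed, the built-in feed exporter can pick them up, so the same command from the question should now produce a populated file:
scrapy crawl myspider -o mydata.csv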

Reducing data usage of Scrapy Spider/How to be a better internet citizen?

I recently resurrected an old Scrapy spider (0.18.4) that crawls Craigslist. The spider grabs links from the first page of apartment listings every 15 minutes and scrapes the links of new postings. (See the code below.)
This spider was downloading more than 1 GB of data every 15 minutes! I was able to cut that in half, but it is still a lot of load on the Craigslist site. Note that this is from the spider alone, as I tested with pipelines disabled. I have also tried limiting it through MAX_DEPTH and DOWNLOAD_MAXSIZE in settings.py, in spider class settings, and in the meta settings of individual follow-on requests. I have spent several hours in the docs, messed around with Rules, updated to the latest version of Scrapy, etc., all to no avail. Granted, I wrote this script several years ago when I was new to Python, but perhaps my folly can be the community's gain...
Given my code below, what can I, or anyone else using Scrapy, do to reduce the amount of data I'm downloading when I only care about a couple of kB of text? Which calls are data greedy?
Spider:
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from craigslist_2br.items import CraigslistItem
from craigslist_2br import craigslistTools
from scrapy.http import Request
from urlparse import urljoin
import types
import exceptions
import datetime
import ipdb as pdb
#from ghost import Ghost


class Suppressor:

    def __init__(self, exception_type, localList):
        self._exception_type = exception_type
        self.safe_dict = dict([(k, locals().get(k, None)) for k in localList])

    def __call__(self, expression):
        try:
            exec(expression, {"__builtins__": None}, self.safe_dict)
        except self._exception_type as e:
            print 'Suppressor: suppressed exception %s with content \'%s\'' % (type(self._exception_type), e)


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/apa/"]
    #g=Ghost()

    def parse(self, response):
        hxsM = HtmlXPathSelector(response)
        ##titles = hxs.select('//a[@class="result-title hdrlnk"]')
        postings = hxsM.select('//p[@class="result-info"]')
        resultRows = hxsM.select('//li[@class="result-row"]')
        try:
            IDf = open("/home/dapper/python/scrapy/scrapy_craig2br/craigslist_2br/craigslist_2br/postingIDs.txt", "r")
            IDvec = [int(sl.strip("\n")) for sl in IDf.readlines()]
        except:
            IDvec = []
        finally:
            try:
                IDf.close()
            except:
                pass
        for posting, resultRow in zip(postings, resultRows):
            item = CraigslistItem()
            try:
                item["ID"] = posting.select("a/@data-id").extract()[0]
                if (int(item["ID"]) == int(resultRow.select("@data-pid").extract()[0])):
                    try:
                        item["repostID"] = resultRow.select("@data-repost-of").extract()[0]
                    except:
                        item["repostID"] = ''
                else:
                    item["repostID"] = ''
            except:
                item["ID"] = ''
                item["repostID"] = ''
            try:
                item["price"] = posting.select("span[@class='result-meta']/span[@class='result-price']/text()").extract()[0]
            except:
                item["price"] = 0
            try:
                item["neighborhood"] = posting.select("span[@class='result-meta']/span[@class='result-hood']/text()").extract()[0]
            except:
                item["neighborhood"] = ''
            if ((not any(item["ID"]) or (int(item["ID"]) not in IDvec)) and
                    (craigslistTools.FixPriceStr(item['price']) < 3000.0) and
                    (not ((any(item["repostID"]) and (int(item["repostID"]) in IDvec)) and (any(item['neighborhood']) and craigslistTools.outDaHoods(item['neighborhood']))))):
                #s = Suppressor(exception_type=exceptions.IndexError, localList=[item, titles, postings])  # TODO: put your exception type here
                item["title"] = posting.select("a/text()").extract()[0]
                item["link"] = posting.select("a/@href").extract()[0]
                item["dateAdded"] = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
                try:
                    item["size"] = posting.select("span[@class='result-meta']/span[@class='housing']/text()").extract()[0]
                except:
                    item["size"] = ''
                if item['link']:
                    if 'http://' not in item['link']:
                        if isinstance(item['link'], types.StringTypes):
                            item['link'] = urljoin(response.url, item['link'])
                        else:
                            item['link'] = urljoin(response.url, item['link'][0])
                    yield Request(item['link'],
                                  meta={'item': item, 'download_maxsize': 8**8, 'depth_limit': 1},
                                  callback=self.anchor_page)

    def anchor_page(self, response):
        hxs = HtmlXPathSelector(response)
        old_item = response.request.meta['item']  # receiving the parse method's item that was in Request meta
        # parse some more values
        # place them in old_item
        # e.g.
        old_item['postingbody'] = hxs.select("//section [@id='postingbody']").extract()
        try:
            latit = float(hxs.select("//div [@id='map']//@data-latitude").extract()[0])
            longi = float(hxs.select("//div [@id='map']//@data-longitude").extract()[0])
            #print '%f,%f' % (latit, longi)
        except:
            latit = 0
            longi = 0
        try:
            old_item["address"] = hxs.select(".//div [@class='mapaddress']/text()").extract()[0]
        except:
            old_item["address"] = []
        try:
            if any(latit) & (not any([char.isdigit() for char in old_item["address"][0]])):
                try:
                    old_item["address"] = craigslistTools.coord2addr(latit, longi)
                except:
                    pass
            elif (not any(old_item["address"])):
                try:
                    zips = [94611, 94610, 94609, 94606, 94618, 94705]
                    for z in zips:
                        for line in old_item['postingbody'].split("\n"):
                            if str(z) in line:
                                old_item["address"] = line
                except:
                    pass
        except:
            pass
        if (latit == 0) & (any(old_item["address"])):
            try:
                cities = ['Oakland', 'oakland', 'Piedmont', 'piedmont', 'Berkeley', 'berkeley', 'montclair', 'Montclair']
                if not any([c in old_item["address"] for c in cities]):
                    old_item["address"] += ', Oakland'
                geoRes = Geocoder.geocode(old_item["address"])
                latit = geoRes[0].latitude
                longi = geoRes[0].longitude
            except:
                pass
        old_item["latit"] = latit
        old_item["longi"] = longi
        try:
            (bartDist, bartCoord) = craigslistTools.bartDist((latit, longi))
        except:
            bartDist = 0
        try:
            if ((bartDist != 0) and (bartDist < 2)):
                bartWalkingTime = craigslistTools.bartWalking((latit, longi), bartCoord)
            else:
                bartWalkingTime = 0
        except:
            bartWalkingTime = 0
        old_item["bartDist"] = bartDist
        old_item["BartWalkingTime"] = bartWalkingTime
        try:
            if ((bartDist < 1) and (bartDist != 0)):
                old_item['LLNLDrivingTime'] = craigslistTools.LLNLdriving((latit, longi))
            else:
                old_item['LLNLDrivingTime'] = 0
        except:
            old_item['LLNLDrivingTime'] = 0
        try:
            old_item["Br"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[0].strip('BR'))
            old_item["baths"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[1].strip('Ba'))
        except:
            try:
                old_item["Br"] = int(re.findall(r"(\d+)br", old_item['size'])[0])
                old_item["baths"] = 1
            except:
                old_item["Br"] = -1
                old_item["baths"] = -1
        try:
            old_item['amenities'] = hxs.select("//p [@class='attrgroup']/span/text()").extract()
        except:
            old_item['amenities'] = []
        yield old_item
settings.py:
# Scrapy settings for craigslist_2br project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'craigslist_2br'
SPIDER_MODULES = ['craigslist_2br.spiders']
NEWSPIDER_MODULE = 'craigslist_2br.spiders'
ITEM_PIPELINES = {'craigslist_2br.pipelines.Craigslist2BrPipeline':0}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'craigslist_2br (+http://www.craigslist.org)'
DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = False
CONCURRENT_REQUESTS_PER_DOMAIN = 1 # Default: 8
#SCHEDULER = 'scrapy.core.scheduler.Scheduler'
#HTTPCACHE_ENABLED = True
DEPTH_LIMIT = 1
Good news! The problem I experienced will likely not affect you (just what you wanted to read when searching for answers ;) ). Using the deprecated HtmlXPathSelector from v0.18 can cause major issues (surprise, surprise), so don't do it.
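
Beyond that, a few built-in settings are worth experimenting with to cut bandwidth. This is only a sketch with illustrative values: DOWNLOAD_MAXSIZE/DOWNLOAD_WARNSIZE need a reasonably recent Scrapy, and HTTPCACHE_ENABLED is already sitting commented out in the settings.py above.

# settings.py -- possible additions for trimming bandwidth (values are illustrative)
DOWNLOAD_MAXSIZE = 2 * 1024 * 1024    # abort responses bigger than ~2 MB
DOWNLOAD_WARNSIZE = 512 * 1024        # warn above ~512 kB so the greedy requests show up in the log
HTTPCACHE_ENABLED = True              # re-serve unchanged pages from the local cache
HTTPCACHE_EXPIRATION_SECS = 15 * 60   # matches the 15-minute crawl interval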

How to save data to MongoDB via a pipeline when using multiple spiders in Scrapy?

There are two spiders that I use to get data from a webpage, and I use CrawlerProcess() to run them at the same time.
The spiders' code:
class GDSpider(Spider):
    name = "GenDis"
    allowed_domains = ["gold.jgi.doe.gov"]
    base_url = "https://gold.jgi.doe.gov/projects"
    stmp = []
    term = "man"
    for i in range(1, 1000):
        url = "https://gold.jgi.doe.gov/projects?page=" + str(i) + "&Project.Project+Name=" + term + "&count=25"
        stmp.append(url)
    start_urls = stmp

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for site in sites:
            item = GenDis()
            item['Id'] = site.xpath('td/a/text()').extract()
            item['Link'] = site.xpath('td/a/@href').extract()
            item['Name'] = map(unicode.strip, site.xpath('td[2]/text()').extract())
            item['Status'] = map(unicode.strip, site.xpath('td[3]/text()').extract())
            item['Add_Date'] = map(unicode.strip, site.xpath('td[4]/text()').extract())
            yield item


class EPGD_spider(Spider):
    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    term = "man"
    start_urls = ["http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=" + term + "&submit=Feeling+Lucky"]
    MONGODB_DB = name + "_" + term
    MONGODB_COLLECTION = name + "_" + term

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        url_list = []
        base_url = "http://epgd.biosino.org/EPGD"
        for site in sites:
            item = EPGD()
            item['genID'] = map(unicode.strip, site.xpath('td[1]/a/text()').extract())
            item['genID_url'] = base_url + map(unicode.strip, site.xpath('td[1]/a/@href').extract())[0][2:]
            item['taxID'] = map(unicode.strip, site.xpath('td[2]/a/text()').extract())
            item['taxID_url'] = map(unicode.strip, site.xpath('td[2]/a/@href').extract())
            item['familyID'] = map(unicode.strip, site.xpath('td[3]/a/text()').extract())
            item['familyID_url'] = base_url + map(unicode.strip, site.xpath('td[3]/a/@href').extract())[0][2:]
            item['chromosome'] = map(unicode.strip, site.xpath('td[4]/text()').extract())
            item['symbol'] = map(unicode.strip, site.xpath('td[5]/text()').extract())
            item['description'] = map(unicode.strip, site.xpath('td[6]/text()').extract())
            yield item

        sel_tmp = Selector(response)
        link = sel_tmp.xpath('//span[@id="quickPage"]')
        for site in link:
            url_list.append(site.xpath('a/@href').extract())
        for i in range(len(url_list[0])):
            if cmp(url_list[0][i], "#") == 0:
                if i + 1 < len(url_list[0]):
                    print url_list[0][i + 1]
                    actual_url = "http://epgd.biosino.org/EPGD/search/" + url_list[0][i + 1]
                    yield Request(actual_url, callback=self.parse)
                    break
                else:
                    print "The index is out of range!"


process = CrawlerProcess()
process.crawl(EPGD_spider)
process.crawl(GDSpider)
process.start()  # the script will block here until all crawling jobs are finished
I want to save the data to a MongoDB database. Here is my pipeline code:
class EPGD_pipeline(object):

    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
                settings['MONGODB_DB'],
                settings['MONGODB_COLLECTION'],
                settings['MONGODB_SERVER'],
                settings['MONGODB_PORT']))
        return item
It works correctly when I use one spider at a time. But when I run them at the same time, it seems that the pipeline doesn't work any more: neither the databases nor the collections get set up.
I've read the CrawlerProcess() part of the Scrapy documentation many times, but it doesn't mention anything about pipelines. So can anybody tell me what's wrong with my code?
This should fix the problem:
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
process.crawl(EPGD_spider)
process.crawl(GDSpider)
process.start()
You will also likely need to refactor your spider code to open a connection for each spider (this example is using "Bonus Tip 2" below):
# In your pipeline
class EPGD_pipeline(object):

    def __init__(self):
        self.collections = {
            spider_name: self.setup_db_connection(dj_mongo_database_url.parse(url))
            for spider_name, url in settings['MONGODB_PIPELINE_SETTINGS'].items()
        }

    def process_item(self, item, spider):
        collection = self.collections[spider.name]
        ...

# In settings.py
MONGODB_PIPELINE_SETTINGS = {
    "GenDis": "mongodb://myhost:29297/test_db/collection",
    "EPGD": "mongodb://myhost:29297/test_db/collection2",
}
Bonus Tip 1: Use txmongo instead of pymongo; otherwise you'll potentially get very bad performance (see also here).
Bonus Tip 2: All those settings get difficult to manage. Consider using something like django-mongo-database-url to "pack" them all into a single URL and keep them more manageable (it would be cleaner if the collection were also in the URL).
Bonus Tip 3: You probably do way too many writes/transactions. If the use case allows, save the results to .jl file(s) and use mongoimport to bulk-import them when the crawl finishes. Here's how to do that in more detail.
Assuming a project named tutorial and a spider named example that creates 100 items, you create an extension in tutorial/extensions.py:
import logging
import subprocess

from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class MyBulkExtension(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        settings = crawler.settings

        self._feed_uri = settings.get('FEED_URI', None)
        if self._feed_uri is None:
            raise NotConfigured('Missing FEED_URI')
        self._db = settings.get('BULK_MONGO_DB', None)
        if self._db is None:
            raise NotConfigured('Missing BULK_MONGO_DB')
        self._collection = settings.get('BULK_MONGO_COLLECTION', None)
        if self._collection is None:
            raise NotConfigured('Missing BULK_MONGO_COLLECTION')

        crawler.signals.connect(self._closed, signal=signals.spider_closed)

    def _closed(self, spider, reason, signal, sender):
        logger.info("writing file %s to db %s, collection %s" %
                    (self._feed_uri, self._db, self._collection))
        command = ("mongoimport --db %s --collection %s --drop --file %s" %
                   (self._db, self._collection, self._feed_uri))
        p = subprocess.Popen(command.split())
        p.communicate()
        logger.info('Import done')
On your tutorial/settings.py, you activate the extension and set the two settings:
EXTENSIONS = {
    'tutorial.extensions.MyBulkExtension': 500
}

BULK_MONGO_DB = "test"
BULK_MONGO_COLLECTION = "foobar"
You can then run your crawl like this:
$ scrapy crawl -L INFO example -o foobar.jl
...
[tutorial.extensions] INFO: writing file foobar.jl to db test, collection foobar
connected to: 127.0.0.1
dropping: test.foobar
check 9 100
imported 100 objects
[tutorial.extensions] INFO: Import done
...

Scrapy callback function not scraping all the data?

First of all, this is my code:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
#from scrapy import log, signals
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import re

#query=raw_input("Enter a product to search for= ")
query = 'apple'
query1 = query.replace(" ", "+")


class DmozItem(scrapy.Item):
    productname = scrapy.Field()
    product_link = scrapy.Field()
    current_price = scrapy.Field()
    mrp = scrapy.Field()
    offer = scrapy.Field()
    imageurl = scrapy.Field()
    outofstock_status = scrapy.Field()
    add = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["http://www.bestmercato.com"]

    def start_requests(self):
        task_urls = []
        i = 1
        for i in range(1, 2):
            temp = ("https://www.bestmercato.com/index.php?route=product/search&search=" + query1 + "&page=" + str(i))
            task_urls.append(temp)
            i = i + 1

        start_urls = (task_urls)
        # p=len(task_urls)
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        items = []
        for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
            item = DmozItem()
            item['productname'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/text()').extract())[3:-2]
            item['product_link'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/@href').extract())[3:-2]
            point1 = sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]').extract()
            point = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/@class').extract())[3:-2]
            checker = "options" in point
            item['current_price'] = ""
            if checker:
                i = 1
                p = 1
                while i == 1:
                    t = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/div/select/option[' + str(p) + ']/text()').extract())[3:-2]
                    #print t
                    if 'Rs' not in t:
                        i = 2
                    elif 'Rs' in t:
                        i = 1
                    t = " ".join(t)
                    s = t.translate(None, '\ t')[:-2]
                    item['current_price'] = item['current_price'] + ' ; ' + s
                    p = p + 1
                item['current_price'] = item['current_price'][3:-3]
            else:
                item['current_price'] = 'Rs. ' + str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[not (@class="name") or not(@class="description") or not(@class="qty") or not(@class="box_btn_icon")]/text()').extract())[46:-169]
                re.findall(r"[-+]?\d*\.\d+|\d+", item["current_price"])
            try:
                test1 = str(sel.xpath('div/div[2]/div[3]/span[1]/text()').extract())[3:-2]
                _digits = re.compile('\d')
                if bool(_digits.search(test1)):
                    print 'hi'
                    test1 = test1[:2] + '. ' + test1[3:]
                    item['mrp'] = test1
                    #item['mrp'][2:2]='.'
                    test2 = str(sel.xpath('div/div[2]/div[3]/span[2]/text()').extract())[3:-2]
                    test2 = test2[:2] + '. ' + test2[3:]
                    item['current_price'] = test2
                else:
                    item['mrp'] = item['current_price']
            except:
                item['mrp'] = item['current_price']
            item['offer'] = 'No additional offer available'
            item['imageurl'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="image"]/a[not (@class="sft_quickshop_icon")]/img[@class="img-responsive"]/@src').extract())[3:-2]
            item['outofstock_status'] = str('In Stock')

            request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
            request.meta['item'] = item
            # print item
            items.append(item)
            return request
        print(items)

    def parse2(self, response):
        item = response.meta['item']
        item['add'] = response.url
        return item


spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS", 100)
#)
#settings.set("DEPTH_PRIORITY", 1)
#settings.set("SCHEDULER_DISK_QUEUE", "scrapy.squeues.PickleFifoDiskQueue")
#settings.set("SCHEDULER_MEMORY_QUEUE", "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
Now, these are the issues that I am facing.
1. There are numerous divs that can be found with this XPath: '//html/body/div/div/div[4]/div/div/div[5]/div'. However, the above code scrapes the contents of only the first div, i.e. the one with the XPath 'html/body/div/div/div[4]/div/div/div[5]/div[1]', and not all of them.
The moment I comment out these three lines, the scraper scrapes everything, but then of course I am not able to add the 'add' field to the item:
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
return request
So I want to scrape all the divs, in addition to having the 'add' field in my item class (note the class DmozItem). How do I do that? Please give corrected code for my SPECIFIC case; it would be best that way!
2. Secondly, as I said, when I comment out the three lines mentioned above, the program scrapes everything in a time close to 5 seconds (around 4.9 seconds).
But as soon as I un-comment those three lines, the program's run time increases drastically, to around 9 seconds (8.8-8.9 seconds). Why does this happen? Is it because of dont_filter=True? Please suggest ways to overcome this, as the run time can prove to be a very big problem for me. Also, can I somehow decrease the initial time of about 5 seconds (around 4.9)?
Use html/body/div/div/div[4]/div/div/div[5]//div to get all divs after div[5].
EDIT:
This is the correct XPath: //html/body/div/div/div[4]/div/div/div[5]/div, which gives all the divs after div[5]. The one mentioned previously gave multiple errors!
If you use a return statement inside the loop, you end the whole method execution. So if you enable those three lines, you end the execution of your method (and the for loop) after the first element.
This means you should yield your request instead of returning it.
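
Concretely, the end of the loop in parse would look roughly like this (only the relevant lines; everything else stays as in the question):

            # ... still inside the `for sel in response.xpath(...)` loop ...
            request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
            request.meta['item'] = item
            yield request  # yield lets the loop continue; `return` stopped it after the first div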
