I'm trying to store all the data I'm scraping in MongoDB so I can monitor property prices.
I've already run a lot of tests, but it isn't working.
I'll put the code here; if anyone could help me, please do.
__init__.py
import scrapy
from realstatedata.items import RealstatedataItem


class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)

        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2 => Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()

            yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy


class RealstatedataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    prop_area = scrapy.Field()
    prop_rooms = scrapy.Field()
    prop_bath = scrapy.Field()
    prop_parking = scrapy.Field()
    price_rent = scrapy.Field()
    price_cond = scrapy.Field()
    realstate_name = scrapy.Field()
    pass
pipelines.py
In this part of the code I've tried two different approaches, but neither of them works.
import pymongo
import logging


class MongoPipeline(object):

    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Properties added to MongoDB")
        return item
Your settings for enabling the pipeline are wrong: ITEM_PIPELINES should be defined as a dict, not a list. As written, the pipeline is not loaded at all.
ITEM_PIPELINES = {
    "realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict represents the priority used to order pipelines when more than one is enabled; lower values run first.
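As a side note, unrelated to the activation problem: Collection.insert() is deprecated in pymongo 3 and removed in pymongo 4, so once the pipeline is loaded you may also want process_item() to use insert_one(). A minimal sketch, keeping the rest of your MongoPipeline unchanged:

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated insert() (removed in pymongo 4)
        self.db[self.collection_name].insert_one(dict(item))
        logging.debug("Properties added to MongoDB")
        return item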
Related
I want to store the scraped data in MongoDB, but I am getting an error:
File "C:\Python27\lib\site-packages\six.py", line 599, in iteritems
    return d.iteritems(**kw)
AttributeError: 'list' object has no attribute 'iteritems'
I have not used iteritems anywhere in the program.
Here is the program code:
ex.py
import scrapy
from example.items import ExampleItem


class ExampleSpider(scrapy.Spider):
    name = 'aaa'
    allowed_domains = ["in.bookmyshow.com"]
    start_urls = ["https://in.bookmyshow.com/movies"]

    def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        item = {}
        item['Moviename'] = map(unicode.strip, response.xpath('.//h1[@id="eventTitle"]/text()').extract())
        item['Language'] = map(unicode.strip, response.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/span[1]/a/text()').extract())
        item['Info'] = map(unicode.strip, response.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/span[3]/a/text()').extract())
        yield item
settings.py:
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
ITEM_PIPELINES = ['example.pipelines.MongoDBPipeline', ]
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "ticketbook"
MONGODB_COLLECTION = "movies"
pipelines.py
import pymongo
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class ExamplePipeline(object):

    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item
I would like to know where I have gone wrong.
The traceback comes from Scrapy trying to iterate ITEM_PIPELINES as a dictionary, which fails because you have defined it as a list. In your settings.py, change ITEM_PIPELINES from a list to a dictionary like so:
ITEM_PIPELINES = { 'example.pipelines.MongoDBPipeline': 100 }
See explanation: http://doc.scrapy.org/en/latest/topics/item-pipeline.html#activating-an-item-pipeline-component
I want to run a Scrapy spider from my script, but it only works for one request. The callback self.parse_product from scrapy.http.Request(product_url, callback=self.parse_product) is never executed.
I guess this is caused by the line crawler.signals.connect(callback, signal=signals.spider_closed). Please advise how to correctly crawl all links and sub-links.
The whole script is shown below.
import json

import scrapy
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor

# https://gist.github.com/alecxe/fc1527d6d9492b59c610


# define an item class
class WebStoreItem(Item):
    name = Field()
    price = Field()
    developer = Field()
    date_added = Field()
    date_modified = Field()
    votes = Field()
    views = Field()
    sales = Field()
    avg_rating = Field()
    comments = Field()


# define an item loader with input and output processors
class WebStoreItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()
    desc_out = Join()


# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')

    def __del__(self):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


# define a spider
class WebStoreSpider(Spider):
    name = "WebStore"
    allowed_domains = ["http://www.WebStore.com"]
    start_urls = [
        "http://www.WebStore.com/index.php"
    ]

    def parse(self, response):
        for meta in response.xpath('//div[@class="extension-grid"]'):
            for product_block in meta.xpath('//div[@class="image-holder image"]'):
                item = WebStoreItem()

                avg_rating = meta.xpath('//div[@class="rating"]/text()').extract()[0]
                item['avg_rating'] = avg_rating[avg_rating.find(': ') + 1:].strip()

                comment = meta.xpath('//div[@class="comment"]/text()').extract()[0]
                item['comments'] = comment[comment.find(': ') + 1:].strip()

                print 'product_block: ', product_block
                product_url = product_block.xpath('a[1]/@href').extract()[0]
                print 'product_url: ', product_url

                request = scrapy.http.Request(product_url, callback=self.parse_product)
                request.meta['item'] = item
                yield request
    def parse_product(self, response):
        item = response.meta['item']

        product_meta_block = response.xpath('//div[@class="name"]')
        print 'product_meta_block: ', product_meta_block

        product_rows = product_meta_block.xpath('//tr')
        print 'product_rows: ', product_rows

        i = 0
        for row in product_rows:
            if i == 1:
                item['name'] = row.select('td/text()').extract()
            elif i == 3:
                item['votes'] = row.select('td/text()').extract()
            i += 1

        return item
# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?
    # stop the reactor
    reactor.stop()


def stop_reactor():
    reactor.stop()


if __name__ == '__main__':
    # instantiate settings and provide a custom configuration
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })

    # instantiate a crawler passing in settings
    crawler = Crawler(settings)

    # instantiate a spider
    spider = WebStoreSpider()

    # configure signals
    crawler.signals.connect(callback, signal=signals.spider_closed)

    # configure and start the crawler
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    # start logging
    log.start()

    # start the reactor (blocks execution)
    reactor.run()
Your spider is being blocked from visiting pages after the start page by your allowed_domains specification. The value should include just the domain, not the protocol. Try
allowed_domains = ["www.WebStore.com"]
Also the line desc_out = Join() in your WebStoreItemLoader definition may give an error as you have no desc field.
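If you are on a reasonably recent Scrapy release, a simpler way to run a spider from a script is CrawlerProcess, which manages the reactor and shutdown for you. A minimal sketch, reusing the spider and pipeline from the script above (this is an alternative approach, not a drop-in copy of your code):

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()
settings.set('ITEM_PIPELINES', {'__main__.JsonWriterPipeline': 100})

process = CrawlerProcess(settings)
process.crawl(WebStoreSpider)  # pass the spider class, not an instance
process.start()                # starts the reactor and blocks until the crawl finishes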
My main File:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request


class Product(scrapy.Item):
    brand = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    heading = scrapy.Field()
    data = scrapy.Field()
    Model_name = scrapy.Field()


class aqaqspider(CrawlSpider):
    name = "mouth_shut_new"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476"]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=('.*\-page-.*',)),
            callback="parse_start_url",
            follow=True),
    )

    def parse_start_url(self, response):
        products = response.xpath('//div[@id="allreviews"]/ul/li')
        items = []

        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            #item['Model_name'] = product.xpath('/html/body/form/div[12]/div/div[5]/div/div[1]/div[3]/ul/li[1]/h1/a/span/text()').extract()
            item['name'] = product.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]

            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item

        # yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme",
        #               headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
        #               callback=self.parse,
        #               dont_filter=True)
My settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'mouth'
SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'
My pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

def process_item(self, item, spider):
    self.collection.insert(dict(item))
    log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
        settings['MONGODB_DATABASE'],
        settings['MONGODB_COLLECTION'],
        settings['MONGODB_HOST'],
        settings['MONGODB_PORT']))
    return item
I ran scrapy crawl mouth_shut_new, but my data wasn't stored in the database. The output should show that the data was stored in Mongo along with the collection name. What am I missing?
The process_item() method is not indented properly; it should be:
class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item
You also didn't yield the item in your callback function (callback="parse_start_url"). You should do it like this:
def parse_start_url(self, response):
    ...
    for product in products:
        item = Product()
        ....
        yield item
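Once both fixes are in, a quick way to check whether the items actually reached MongoDB is to query the collection directly with pymongo. A minimal sketch, assuming pymongo 3.7+ and the host, database and collection names from your settings.py:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['mobiles_complaints']['Yu_Yureka']

print(collection.count_documents({}))  # number of stored reviews
print(collection.find_one())           # a sample document, or None if nothing was written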
I have a problem where the scraped values print correctly in the terminal, but the exported XML file is not populated with any content.
The output in the terminal is this:
[u'Tove'] [u'Jani'] [u'Reminder'] [u"Don't forget me this weekend!"]
However, the resulting site_products.xml looks like this (which is wrong, there is no data):
<?xml version="1.0" encoding="utf-8"?>
<items></items>
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem


class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        to = selector.xpath('//to/text()').extract()
        who = selector.xpath('//from/text()').extract()
        heading = selector.xpath('//heading/text()').extract()
        body = selector.xpath('//body/text()').extract()
        return item
pipelines.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter


class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy


class CrawlerItem(scrapy.Item):
    to = scrapy.Field()
    who = scrapy.Field()
    heading = scrapy.Field()
    body = scrapy.Field()
    pass
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
ITEM_PIPELINES = {'crawler.pipelines.XmlExportPipeline': 300,}
Any help with this would be really appreciated.
You need to instantiate a CrawlerItem in your parse_node() method and fill its fields:
def parse_node(self, response, selector):
    item = CrawlerItem()
    item['to'] = selector.xpath('//to/text()').extract()
    item['who'] = selector.xpath('//from/text()').extract()
    item['heading'] = selector.xpath('//heading/text()').extract()
    item['body'] = selector.xpath('//body/text()').extract()
    return item
I am a beginner to Python and I am working with Scrapy. I have used XmlItemExporter to export my scraped data to an XML file, but all I get in the XML file is </item>.
My items.py is as follows:
from scrapy.item import Item, Field


class WorkwithitemsItem(Item):
    title = Field()
    link = Field()
    publish = Field()
    description = Field()
And the spider is:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector

from workwithitems.items import WorkwithitemsItem


class MySpider(BaseSpider):
    name = 'spidey'
    allowed_domains = ['ekantipur.com']
    start_urls = [
        'http://www.ekantipur.com/en/rss',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)

        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        link = sel.xpath('//link/text()').extract()
        publish = sel.xpath('//pubDate/text()').extract()
        description = sel.xpath('//description/text()').extract()

        WorkwithitemsItem(title = title[2:], link = link[2:],
                          publish = publish, description = description[1:])
And the pipelines.py is:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter


class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
The settings.py is:
BOT_NAME = 'workwithitems'
SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'
FEED_EXPORTERS_BASE = {
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}
ITEM_PIPELINES = {
    'workwithitems.pipelines.XmlExportPipeline': 800,
}
I can't figure out where my problem is.
OK! I found the problem. What I did was just put a return on the last line of spider.py:
return WorkwithitemsItem(title = title[2:], link = link[2:],
                         publish = publish, description = description[1:]
                         )
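For what it's worth, yielding the item works just as well and is the usual pattern when a callback produces more than one item, for example one item per RSS entry:

# equivalent to the return above, but also works inside a loop
yield WorkwithitemsItem(title = title[2:], link = link[2:],
                        publish = publish, description = description[1:])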