Scrapy appends duplicate keys to MongoDB list - Python

I have made a scraper with Scrapy to extract data from Google Patents based on a chemical search. I search for chemicals like O1C(=CCCC1C)C, extract the publication numbers from the search results, and store them in a MongoDB database. The structure of the collection is this:
{ "_id" : ObjectId("6123733f10bd1504a29a9c75"),
"chemical" : "O=C(NCC1N(CC)CCC1)C2=C(O)C(=CC(Cl)=C2OC)CC",
"publication_number" : ["EP3185946B1", "US10751533B2"]
}
The problem is this: if a chemical returns more than one page of results, MongoDB stores the same chemical twice, each time with different publication numbers. What I want to do is check whether the chemical already exists in MongoDB; if it does, append the publication numbers to the existing record, and if it does not, insert it as a new document.
scraper.py
from pathlib import Path

import scrapy
from scrapy_splash import SplashRequest

from pattents.items import PattentsItem

BASE_DIR = Path(__file__).resolve().parent.parent


class PattentLinksSpider(scrapy.Spider):
    name = 'pattent_links'
    allowed_domains = ['patents.google.com']

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        while not splash:select('#resultsContainer') do
            splash:wait(3)
        end
        splash:wait(4)
        return splash:html()
    end
    '''

    def start_requests(self):
        with open(BASE_DIR.joinpath('spiders/urls.txt'), "rt") as f:
            start_urls = [url.strip().replace('=', '%3d') for url in f.readlines()]
        for url in start_urls:
            yield SplashRequest(
                url='https://patents.google.com/?q=CL%3d' + url + '&page=0&num=100',
                callback=self.parse,
                endpoint='execute',
                args={
                    'lua_source': self.script
                },
                meta={
                    'page_number': 0,
                    'chemical': url
                }
            )

    def parse(self, response):
        items = response.xpath('//search-result-item')
        if len(items) > 0:
            item = PattentsItem()
            pn = response.xpath('//span[@class="style-scope search-result-item"]/text()').getall()
            item['chemical'] = response.meta['chemical'].replace('%3d', '=')
            item['publication_number'] = pn
            yield item
            page_number = int(response.meta['page_number']) + 1
            yield SplashRequest(
                url=response.url.replace(f'&page={page_number - 1}', f'&page={page_number}'),
                callback=self.parse,
                endpoint='execute',
                args={
                    'lua_source': self.script
                },
                meta={
                    'chemical': response.meta['chemical'],
                    'page_number': page_number
                }
            )
pipelines.py
import pymongo
from itemadapter import ItemAdapter


class PattentsPipeline(object):
    collection_name = 'items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
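To get the append-or-insert behaviour described above, one option is to replace the insert_one call in process_item with a single upsert. A minimal, untested sketch for the pipeline above (it assumes publication_number is always a list on the item):

    def process_item(self, item, spider):
        data = ItemAdapter(item).asdict()
        # Match on the chemical; $addToSet with $each appends only those publication
        # numbers not already in the array, and upsert=True creates the document the
        # first time a chemical is seen.
        self.db[self.collection_name].update_one(
            {'chemical': data['chemical']},
            {'$addToSet': {'publication_number': {'$each': data['publication_number']}}},
            upsert=True
        )
        return item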

Related

Scrapy use case - new links only

At a high level - I think I'm trying to use the Scrapy framework like a scraping library.
My use case is this: I have a webpage with links to meeting minutes I'd like to scrape, and as time passes more links to meeting minutes are added.
My plan was to use a regular spider to scrape the links to the meeting minutes and export the list of links to a CSV via a pipeline with CsvItemExporter.
Regular spider 1 - scrapes the webpage with links to meeting minutes and exports them to CSV:
class QuotesSpider(scrapy.Spider):
    name = "easthamptontown-links"
    custom_settings = {
        'ITEM_PIPELINES': {
            'themis.pipelines.ThemisPipeline': 400
        }
    }

    def start_requests(self):
        urls = [
            'http://easthamptontown.iqm2.com/Citizens/Calendar.aspx?From=1/1/1900&To=12/31/9999',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        rowtops = response.xpath('//div[@class="RowTop"]')
        for meeting in rowtops:
            yield {
                'meeting': meeting.css("a[href*='Detail_Meeting']").get(),
                'files': meeting.css("a[href*='FileView']").getall(),
            }
Pipeline 1
class ThemisPipeline:
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        file_output = {}
        _item = ItemAdapter(item).asdict()
        if len(_item['files']) > 0:
            for filelink in _item['files']:
                parser = MyHTMLParser()
                parser.feed(filelink)
                file_output['filelink'] = parser.lsHref
                file_output['filetype'] = parser.lsData
                parser.feed(_item['meeting'])
                file_output['meetinglink'] = parser.lsHref
                file_output['meetingtitle'] = parser.lsTitle
                file_output['meetingdate'] = parser.lsData.strip()
                self.exporter.export_item(file_output)
        else:
            DropItem(item)
        return item
A csv reader/list comprehension feeds the links from the CSV into start_urls of a second regular spider, which scrapes the meeting minutes from those links and exports each one, via a pipeline with CsvItemExporter, to a .txt file named for the link, e.g. meeting123.txt.
The second time I run the first scraper, I want to compare the links in the new CSV to the original CSV and scrape only the meeting minutes linked from the new CSV but not the original, again exporting each to a .txt file named for the link, e.g. meeting124.txt.
My immediate problem is that passing the scraped minutes link to the pipeline, so the file can be named after it, is harder than I would have guessed - the framework doesn't seem to be designed for this.
Regular spider 2 - scrapes meeting minutes from URLs supplied by a CSV:
class ASpider(scrapy.Spider):
    name = "town-text"
    custom_settings = {
        'ITEM_PIPELINES': {
            'themis.pipelines.MinutesPipeline': 400
        }
    }
    meetings = csvreader('./town-links.csv')
    # don't override start_requests, default scrapy.Request(url=url, callback=self.parse)
    start_urls = ['http://http://easthamptontown.iqm2.com/Citizens/' + meeting['filelink']
                  for meeting in meetings
                  if 'Zoning' in meeting['meetingtitle'] and
                  'Regular Meeting' in meeting['meetingtitle'] and
                  meeting['filetype'] == 'Minutes']

    def parse(self, response):
        for element in response.xpath('//div[@id="Content"]/div/*'):
            yield {
                'line': element.xpath('.//text()').getall(),
            }
pipeline:
class MinutesPipeline:
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.txt' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        _item = ItemAdapter(item).asdict()
        self.exporter.export_item(_item)
        return item
I'd like to be able to pass the particular URL whose HTML I am scraping, meeting['filelink'], into the CSV filename for the items. I tried changing scrapy.Spider to CrawlSpider in order to use parse_start_url(), but the selector did not return any data with CrawlSpider.
Any thoughts on design for this use case unique to the Scrapy framework would be appreciated.
If you want to use the URL as the filename, all you need to do is pass the URL along with the item, then create a new file with that name and export to it.
For example:
In your parse method, add a url field to the dictionary with response.url as its value.
def parse(self, response):
    for element in response.xpath('//div[@id="Content"]/div/*'):
        yield {
            'line': element.xpath('.//text()').getall(),
            'url': response.url
        }
Then in your pipeline:
def process_item(self, item, spider):
    url = item["url"]
    filename = url.split("/")[-1]
    # CsvItemExporter expects a file object, not a filename
    with open('%s.csv' % filename, 'wb') as f:
        exporter = CsvItemExporter(f)
        exporter.start_exporting()
        # ... do text formatting on item["line"] if needed
        exporter.export_item(item)
        exporter.finish_exporting()
    # drop the item so it isn't exported anywhere else
    raise DropItem("%s written" % filename)
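For the "new links only" part of the plan (scrape only links that appear in the new CSV but not in the original), a plain set difference over the two exports may be all that's needed before building start_urls for the second spider. A minimal sketch, assuming both CSVs contain the filelink column produced by the pipeline above (the filenames here are hypothetical):

import csv

def new_filelinks(old_csv="town-links-old.csv", new_csv="town-links.csv"):
    """Return the filelink values present in new_csv but not in old_csv."""
    def links(path):
        with open(path, newline="") as f:
            return {row["filelink"] for row in csv.DictReader(f)}
    return links(new_csv) - links(old_csv)

# Feed only the unseen links to the second spider, e.g.:
# start_urls = ['http://easthamptontown.iqm2.com/Citizens/' + link
#               for link in new_filelinks()]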

Scrapy Pipeline with MongoDB is not working

I'm trying to put all of the data I'm scraping into MongoDB to monitor property prices.
I've already run a lot of tests, but it's not working.
I'll put the code here; I'd appreciate any help.
__init__.py
import scrapy
from realstatedata.items import RealstatedataItem


class RsdataSpider(scrapy.Spider):
    name = 'realstatedata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        nextpageurl = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)
        if nextpageurl:
            path = nextpageurl.extract_first()
            #print(path)
            # Got #pagina=2 => Replace with ?pagina=2
            path = '?' + path[1:]
            #print(path)
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            item['price_cond'] = resource.xpath('.//strong[@class="js-condo-price"]/text()').extract_first()
            item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
            yield item
settings.py
BOT_NAME = 'realstatedata'
SPIDER_MODULES = ['realstatedata.spiders']
NEWSPIDER_MODULE = 'realstatedata.spiders'
ITEM_PIPELINES = ['realstatedata.pipelines.MongoPipeline', ]
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = "vivareal_db"
items.py
import scrapy
class RealstatedataItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
description = scrapy.Field()
address = scrapy.Field()
prop_area = scrapy.Field()
prop_rooms = scrapy.Field()
prop_bath = scrapy.Field()
prop_parking = scrapy.Field()
price_rent = scrapy.Field()
price_cond = scrapy.Field()
realstate_name = scrapy.Field()
pass
pipeline.py
In this part of the code I've tried two different approaches, but neither works.
import pymongo
import logging


class MongoPipeline(object):
    collection_name = 'rent_properties'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Properties added to MongoDB")
        return item
Your setting for enabling the pipeline is wrong: ITEM_PIPELINES should be defined as a dict, not a list. With your current code the pipeline is not loaded at all.
ITEM_PIPELINES = {
    "realstatedata.pipelines.MongoPipeline": 100,
}
The value in the dict is the priority that decides the order in which pipelines run when more than one is enabled (lower values run first).
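Separately (this is not what stops the pipeline from loading), Collection.insert() is deprecated and has been removed in recent PyMongo releases; insert_one() is the supported call for a single document:

    def process_item(self, item, spider):
        # insert_one() replaces the legacy insert() for single documents
        self.db[self.collection_name].insert_one(dict(item))
        logging.debug("Properties added to MongoDB")
        return item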

Scrapy yield items from multiple requests

I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops; if I take it out, other errors occur. I am not sure how to combine yielding a request with yielding items in each callback.
import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser


class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        items = PrintersItem()
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action)
        for items in self.postlogin2(response):
            yield items

    def action(self, response):
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2)
        for items in self.action(response):
            yield items

    def action2(self, response):
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        for items in self.action2(response):
            yield items
If you want to send items from parse to postlogin2, etc., then add them as meta data in the Request:
yield Request( ..., meta={"items": items})
and get it in other function
items = response.meta["items"]
and yield it only in the last function
yield items
Doc: Request and Response, Request.meta special keys
class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35',
                  'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            #url=response.urljoin("/general/information.html?kind=item"),
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action,
            meta={"items": items})

    def action(self, response):
        items = response.meta["items"]
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            #url=response.urljoin("/net/wired/tcpip.html"),
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2,
            meta={"items": items})

    def action2(self, response):
        items = response.meta["items"]
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        yield items
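On Scrapy 1.7+ the same hand-off can also be done with cb_kwargs instead of meta, which delivers the values as named callback arguments. A minimal, simplified sketch (the URL and XPaths below are placeholders, not the poster's actual ones):

import scrapy


class CbKwargsExampleSpider(scrapy.Spider):
    name = 'cb_kwargs_example'
    start_urls = ['http://192.168.137.9']

    def parse(self, response):
        items = {'contact': response.xpath('//li[1]/text()').getall()}
        # entries in cb_kwargs arrive as named arguments of the callback
        yield scrapy.Request(
            response.urljoin('/general/information.html?kind=item'),
            callback=self.action,
            cb_kwargs={'items': items},
        )

    def action(self, response, items):
        items['drum'] = response.xpath('//dd[1]/text()').getall()
        yield items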

Scrapy & MySQL database:

I'm trying to scrape a website with Scrapy and store the information to a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to Python and Scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem


class expertSpider(Spider):
    name = 'expert'
    start_urls = [
        'https://www.expert.ie/products/home-appliances/laundry',
    ]

    def parse(self, response):
        # follow links to product pages
        for href in response.css('a.product-list-link::attr(href)'):
            yield response.follow(href, self.parse_product)
        # follow pagination links
        for href in response.css('a.UnselectedPage::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_product(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        item = ExpertScraperItem()
        item['model'] = extract_with_css('header strong::text')[12:100].strip()
        item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
        yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class ScraperPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # pipeline default call
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        return item

    # each row is updated or written to the database
    def _do_upinsert(self, conn, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                # raise DropItem("Missing {0}!".format(data))
                # print "Missing data"
        if valid:
            result = conn.execute("""
                insert into pricing(model, price)
                values(%s, %s)
            """, (item['model'], item['price']))
            if result:
                print "added a model into db"
            else:
                print "failed insert into pricing"

Storing the scraped data in MongoDB

I want to store the scraped data in MongoDB, but I am getting an error.
File "C:\Python27\lib\site-packages\six.py", line 599, in iteritems
return d.iteritems(**kw)
AttributeError: 'list' object has no attribute 'iteritems'
I have not used the iteritems attribute anywhere in my program.
Here is the program code:
ex.py
import scrapy
from example.items import ExampleItem


class ExampleSpider(scrapy.Spider):
    name = 'aaa'
    allowed_domains = ["in.bookmyshow.com"]
    start_urls = ["https://in.bookmyshow.com/movies"]

    def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        item = {}
        item['Moviename'] = map(unicode.strip, response.xpath('.//h1[@id="eventTitle"]/text()').extract())
        item['Language'] = map(unicode.strip, response.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/span[1]/a/text()').extract())
        item['Info'] = map(unicode.strip, response.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/span[3]/a/text()').extract())
        yield item
settings.py:
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
ITEM_PIPELINES = ['example.pipelines.MongoDBPipeline', ]
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "ticketbook"
MONGODB_COLLECTION = "movies"
pipelines.py
import pymongo
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class ExamplePipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item
I would like to know where I have gone wrong.
In your settings.py, change ITEM_PIPELINES from a list to a dictionary like so:
ITEM_PIPELINES = { 'example.pipelines.MongoDBPipeline': 100 }
See explanation: http://doc.scrapy.org/en/latest/topics/item-pipeline.html#activating-an-item-pipeline-component
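Two further mismatches stand out in the quoted code, although they are not what this traceback complains about: the pipeline reads MONGODB_HOST and MONGODB_DATABASE while settings.py defines MONGODB_SERVER and MONGODB_DB, and pymongo.Connection has long been replaced by MongoClient. A minimal sketch with the names aligned to the settings shown above and the class named to match the ITEM_PIPELINES entry:

import pymongo
from scrapy.conf import settings  # on newer Scrapy, use scrapy.utils.project.get_project_settings


class MongoDBPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # insert_one() is the current PyMongo call for inserting a single document
        self.collection.insert_one(dict(item))
        return item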
