Dropping duplicate items from Scrapy pipeline - python

My Scrapy crawler collects data from the PTT website and writes it into a Google spreadsheet using gspread. The spider parses the latest 40 posts on PTT every day, and I would like to drop duplicate data from those 40 posts: for example, if the post_title or post_link is the same as yesterday's, the post should not be written to the spreadsheet.
I know I should use DropItem in Scrapy, but honestly I don't know how to fix my code (I am a complete beginner in Python), so I would like to ask for help. Thanks.
This is my ptt spider code:
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.exceptions import CloseSpider
from myFirstScrapyProject.items import MyfirstscrapyprojectItem


class PttSpider(scrapy.Spider):
    count_page = 1
    name = 'ptt'
    allowed_domains = ['www.ptt.cc/']
    start_urls = ['https://www.ptt.cc/bbs/e-shopping/search?q=%E8%9D%A6%E7%9A%AE'] + ['https://www.ptt.cc/bbs/e-seller/search?q=%E8%9D%A6%E7%9A%AE']
    # start_urls = ['https://www.ptt.cc/bbs/e-shopping/index.html']

    def parse(self, response):
        items = MyfirstscrapyprojectItem()
        for q in response.css('div.r-ent'):
            items['push'] = q.css('div.nrec > span.h1::text').extract_first()
            items['title'] = q.css('div.title > a::text').extract_first()
            items['href'] = q.css('div.title > a::attr(href)').extract_first()
            items['date'] = q.css('div.meta > div.date ::text').extract_first()
            items['author'] = q.css('div.meta > div.author ::text').extract_first()
            yield items
And this is my pipeline:
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Thanks to Sharmiko, I rewrote it, but it doesn't seem to work. What should I fix?
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    # def process_item(self, item, spider):
    #     self.exporter.export_item(item)
    #     return item

    # class DuplicatesTitlePipeline(object):
    def __init__(self):
        self.article = set()

    def process_item(self, item, spider):
        href = item['href']
        if href in self.article:
            raise DropItem('duplicates href found %s', item)
        self.exporter.export_item(item)
        return item
This is the code that exports to the Google sheet:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from scrapy.exporters import BaseItemExporter


class GoogleSheetItemExporter(BaseItemExporter):
    def __init__(self):
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        credentials = ServiceAccountCredentials.from_json_keyfile_name('pythonupload.json', scope)
        gc = gspread.authorize(credentials)
        self.spreadsheet = gc.open('Community')
        self.worksheet = self.spreadsheet.get_worksheet(1)

    def export_item(self, item):
        self.worksheet.append_row([item['push'], item['title'],
                                   item['href'], item['date'], item['author']])

You should modify your process_item function to check for duplicate elements; if one is found, you can just drop it.
from scrapy.exceptions import DropItem
...

def process_item(self, item, spider):
    if [your duplicate check logic goes here]:
        raise DropItem('Duplicate element found')
    else:
        self.exporter.export_item(item)
        return item
Dropped items are no longer passed to other pipeline components. You can read more about pipelines here.
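One caveat for the follow-up above: a set created in __init__ is empty at the start of every run, so it only catches duplicates within the same crawl. Since the goal is to skip posts that were already exported on a previous day, the seen links have to be persisted between runs. Below is a minimal sketch of that idea, assuming the GoogleSheetItemExporter shown earlier; the seen_hrefs.json file name is just an illustrative choice.

import json
import os

from scrapy.exceptions import DropItem

from myFirstScrapyProject.exporters import GoogleSheetItemExporter


class MyfirstscrapyprojectPipeline(object):
    seen_file = 'seen_hrefs.json'  # hypothetical path used to persist seen links

    def open_spider(self, spider):
        # load links exported in previous runs (empty set on the very first run)
        if os.path.exists(self.seen_file):
            with open(self.seen_file) as f:
                self.seen = set(json.load(f))
        else:
            self.seen = set()
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        # remember everything seen so far for the next run
        with open(self.seen_file, 'w') as f:
            json.dump(sorted(self.seen), f)

    def process_item(self, item, spider):
        href = item['href']
        if href in self.seen:
            raise DropItem('duplicate href found: %s' % href)
        self.seen.add(href)
        self.exporter.export_item(item)
        return item

The same pattern works with the title instead of (or in addition to) the link; the only requirement is that whatever key you deduplicate on is saved to disk before the spider closes.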

Related

Yield items with scrapy

I'm having trouble with my spider; the way I have set it up doesn't seem to work. The spider should be able to scrape multiple pages (1, 2, 3), all on the same website. I'm not sure whether I should use a for loop or an if/else statement to extract all the data.
I'm getting this output after I run it: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min).
Any help would be greatly appreciated!
Shown below are the code for the spider, items.py, and pipelines.py:
class abcSpider(scrapy.Spider):
    name = 'abc'
    page_number = 2
    allowed_domains = ['']

    def parse(self, response):
        items = folder1Item()
        deal_number_var = response.css(".mclbEl a::text").extract()
        deal_type_var = response.css('.#ContentContainer1_ctl00_Content_ListCtrl1_LB1_VDTBL .mclbEl:nth-child(9)').css('::text').extract()
        items['deal_number_var'] = deal_number_var
        items['deal_type_var'] = deal_type_var
        yield items

        next_page = '' + str(abcSpider.page_number) + '/'
        if abcSpider.page_number < 8:
            abcSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
This is my items.py page:
import scrapy


class folder1Item(scrapy.Item):
    deal_number_var = scrapy.Field()
    deal_type_var = scrapy.Field()
I would like to save the data as a .db file to import into sqlite3. It looks like this in my pipelines.py:
import sqlite3


class folder1Pipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = sqlite3.connect("abc.db")
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS abc_tb""")
        self.curr.execute("""create table abc_tb(deal_number_var text, deal_type_var text)""")

    def process_item(self, items, spider):
        self.store_db(items)
        return items

    def store_db(self, items):
        self.curr.execute("""insert into abc_tb values (?,?,?)""", (items['deal_number_var'][0], items['deal_type_var'][0]))
        self.conn.commit()
Middleware.py code:
from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        driver.get(request.url)
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
I assume this is your entire code? If so, you did not define any start_urls. Furthermore, you either have to set allowed_domains correctly or remove the variable completely, because right now it declares that no URL is allowed.
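For reference, a minimal sketch of how the top of the spider could look once both attributes are filled in. The domain and URL below are placeholders, since the original question elides the real site.

import scrapy


class abcSpider(scrapy.Spider):
    name = 'abc'
    page_number = 2
    # allowed_domains takes bare domain names (no scheme); an empty string blocks every URL
    allowed_domains = ['example.com']               # placeholder domain
    start_urls = ['https://example.com/deals/1/']   # placeholder listing URL

    def parse(self, response):
        ...

Separately, the insert statement in pipelines.py declares three ? placeholders but supplies only two values, so sqlite3 will complain about the number of bindings once items start flowing; insert into abc_tb values (?,?) matches the two columns.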

Scrapy pipeline only save one page of results

I have a spider crawling CourseTalk, with a pipeline that saves two types of items:
moocs.csv which contains the course data.
moocs_review.csv which contains the reviews data.
This is the spider code I have:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin

from moocs.items import MoocsItem, MoocsReviewItem


class MoocsSpiderSpider(scrapy.Spider):
    name = "moocs_spider"
    # allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url) for relative_url in response.xpath(courses_xpath).extract()]
        for course_url in courses_url[0:3]:
            print course_url
            yield Request(url=course_url, callback=self.parse_reviews)

        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        # print response.body
        l = ItemLoader(item=MoocsItem(), response=response)
        l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        l.add_value('course_link', response.url)
        l.add_value('course_provider', response.url)
        l.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        l.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        # '//*[@class="course-info__school__name"]'
        item = l.load_item()

        for review in response.xpath('//*[@class="review-body"]'):
            r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            r.add_value('course_title', item['course_title'])
            r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            r.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield r.load_item()

        yield item
The spider goes to each course page and saves the details into the corresponding item. I'm handling the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider visits the next pages, but the results are not saved in the output file.
I'm guessing the problem is in the pipeline, where the files are created:
class MultiCSVItemPipeline(object):
    CSVDir = '/moocs/scripts/moocs/moocs/'
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
Are you sure the spider is doing the pagination properly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results, which you then pass into the url parameter of the Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so doing that will generate the following error:
TypeError: Request url must be str or unicode, got list:
It can be solved by using the extract_first() method, and I would also check that the value is not None:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
if next_page_url:
    yield Request(url=next_page_url)  # parse is the callback by default
Please try this and tell me if it solves your problem.
If you use -t csv, this will also work instead of a pipeline:
scrapy crawl moocs -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in the spider folder.

Scrapy output is showing empty rows per column

My output is as follows:
0 winner loser
1 winner1
2 loser1
3 winner2
4 loser2
5 winner3
6 loser3
How do I remove the empty cells so that the winner and loser values are on the same row? I've tried adding newline parameters to the pipeline, but with no luck. Is there any way to override the pipeline so that it only writes a value to the row when the item has one, so the output ends up on the same row?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import scrapy
import json
from scrapy.http import Request, FormRequest


class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 3.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            yield il.load_item()
pipelines.py
from scrapy.exporters import CsvItemExporter


class ScrapejsPipeline(object):
    def process_item(self, item, spider):
        return item


class CsvPipeline(object):
    def __init__(self):
        self.file = open("quotedata2.csv", 'w+b')
        self.exporter = CsvItemExporter(self.file, str)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector


class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    # date = scrapy.Field()


class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
Check this one, the problem is located: https://stackoverflow.com/a/48859488/9270398
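The linked answer boils down to this: the loop iterates over '.event-team', which matches one team element at a time, so each yielded item can only ever carry a winner or a loser, never both. Iterating over the element that wraps a whole match and reading both teams relative to it keeps the pair in one item. A rough sketch of that idea follows; the '.cell' wrapper selector is illustrative, since the real class name depends on the page markup.

    def parse(self, response):
        # iterate over one container per match instead of one per team
        for match in response.css('.cell'):  # illustrative wrapper selector
            il = SofascoreItemLoader(selector=match)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            yield il.load_item()

With one item per match, the CSV exporter writes winner and loser on the same row without any pipeline changes.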

Scrapy & MySQL database:

I'm trying to scrape a website with Scrapy and store the information in a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to Python and Scrapy.
Here is my Spider:
from scrapy import Spider
# from scrapy.selector import Selector
from scraper.items import ExpertScraperItem


class expertSpider(Spider):
    name = 'expert'
    start_urls = [
        'https://www.expert.ie/products/home-appliances/laundry',
    ]

    def parse(self, response):
        # follow links to product pages
        for href in response.css('a.product-list-link::attr(href)'):
            yield response.follow(href, self.parse_product)

        # follow pagination links
        for href in response.css('a.UnselectedPage::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_product(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        item = ExpertScraperItem()
        item['model'] = extract_with_css('header strong::text')[12:100].strip()
        item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
        yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class ScraperPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # pipeline default call
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        return item

    # each row is updated or written to the database
    def _do_upinsert(self, conn, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                # raise DropItem("Missing {0}!".format(data))
                # print "Missing data"
        if valid:
            result = conn.execute("""
                insert into pricing(model, price)
                values(%s, %s)
            """, (item['model'], item['price']))
        if result:
            print "added a model into db"
        else:
            print "failed insert into pricing"

Renaming downloaded images in Scrapy 0.24 with content from an item field while avoiding filename conflicts?

I'm attempting to rename the images that are downloaded by my Scrapy 0.24 spider. Right now the downloaded images are stored with a SHA1 hash of their URLs as the file names. I'd like to instead name them the value I extract with item['model']. This question from 2011 outlines what I want, but the answers are for previous versions of Scrapy and don't work with the latest version.
Once I manage to get this working I'll also need to make sure I account for different images being downloaded with the same filename. So I'll need to download each image to its own uniquely named folder, presumably based on the original URL.
Here is a copy of the code I am using in my pipeline. I got this code from a more recent answer in the link above, but it's not working for me. Nothing errors out, and the images are downloaded as normal; my extra code doesn't seem to have any effect on the filenames, which still appear as SHA1 hashes.
pipelines.py
class AllenheathPipeline(object):
    def process_item(self, item, spider):
        return item


import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem


class MyImagesPipeline(ImagesPipeline):
    # Name download version
    def file_path(self, request, response=None, info=None):
        item = request.meta['item']  # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)

    # Name thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        # yield Request(item['images'])  # Adding meta. I don't know how to put it in one line :-)
        for image in item['images']:
            yield Request(image)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
products.py (my spider)
import scrapy
import urlparse
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
class productsSpider(scrapy.Spider):
name = "products"
allowed_domains = ["http://www.allen-heath.com/"]
start_urls = [
"http://www.allen-heath.com/ahproducts/ilive-80/",
"http://www.allen-heath.com/ahproducts/ilive-112/"
]
def parse(self, response):
for sel in response.xpath('/html'):
item = ProductItem()
item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract() # The value I'd like to use to name my images.
item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
item['desc'] = sel.css('#tab1 #productcontent').extract()
item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
item['image_urls'] = sel.css('#tab1 #productcontent .col-sm-9 img').xpath('./#src').extract()
item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
yield item
items.py
import scrapy
class ProductItem(scrapy.Item):
model = scrapy.Field()
itemcode = scrapy.Field()
shortdesc = scrapy.Field()
desc = scrapy.Field()
series = scrapy.Field()
imageorig = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
Here's a pastebin of the output I get from the command prompt when I run the spider: http://pastebin.com/ir7YZFqf
Any help would be greatly appreciated!
The pipelines.py:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy import log


class MyImagesPipeline(ImagesPipeline):
    # Name download version
    def file_path(self, request, response=None, info=None):
        image_guid = request.meta['model'][0]
        log.msg(image_guid, level=log.DEBUG)
        return 'full/%s' % (image_guid)

    # Name thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        log.msg(image_guid, level=log.DEBUG)
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
You're using the settings.py wrong. You should use this:
ITEM_PIPELINES = {'allenheath.pipelines.MyImagesPipeline': 1}
For thumbnails to work, add this to settings.py:
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (100, 100),
}
Since the URL hash guarantees a unique identifier, you could simply write the item's value and the URL hash together to a separate file.
After the crawl is done, you can loop over that file and do the renaming (using a Counter dictionary to append a number based on how many items share the same value).
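A rough sketch of that post-processing step, assuming the mapping was written as "hash<TAB>model" lines to a file called name_map.txt (both the file name and the format are made up for illustration, and the files are assumed to be .jpg):

import os
from collections import Counter

IMAGES_STORE = 'c:/allenheath/images'  # same directory as in settings.py
MAP_FILE = 'name_map.txt'              # hypothetical "hash<TAB>model" mapping file

counts = Counter()
with open(MAP_FILE) as f:
    for line in f:
        sha1, model = line.rstrip('\n').split('\t')
        counts[model] += 1
        # append a running number so two items with the same model don't collide
        new_name = '%s_%d.jpg' % (model, counts[model])
        src = os.path.join(IMAGES_STORE, 'full', sha1 + '.jpg')
        dst = os.path.join(IMAGES_STORE, 'full', new_name)
        os.rename(src, dst)

Keeping the renaming outside the pipeline also means the images pipeline's own collision handling (one file per URL hash) stays untouched.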
