Yield items with scrapy

Yield items with scrapy - python

I'm having trouble with my spider; the way I have set it up doesn't seem to work. The spider should be able to scrape multiple pages (1, 2, 3), all on the same website. I'm not sure whether I should use a for loop or an if/else statement to extract all the data.
I'm getting this message after I run it: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min).
Any help would be greatly appreciated!
Shown below are the code for the spider, items.py, and pipelines.py:
class abcSpider(scrapy.Spider):
    """Spider that scrapes deal numbers and deal types from numbered pages.

    NOTE(review): no ``start_urls`` is defined and ``allowed_domains`` only
    contains an empty string, so the spider has nothing to crawl until both
    are filled in with the real site.
    """

    name = 'abc'
    # Number of the next page to request; stored on the class and advanced
    # every time parse() schedules a follow-up request.
    page_number = 2
    allowed_domains = ['']

    def parse(self, response):
        """Yield one item for the page, then follow the next page while below page 8."""
        items = folder1Item()
        deal_number_var = response.css(".mclbEl a::text").extract()
        # An id selector starts with '#'; a leading '.#' is not valid CSS.
        deal_type_var = response.css('#ContentContainer1_ctl00_Content_ListCtrl1_LB1_VDTBL .mclbEl:nth-child(9)').css('::text').extract()
        items['deal_number_var'] = deal_number_var
        items['deal_type_var'] = deal_type_var
        yield items

        next_page = '' + str(abcSpider.page_number) + '/'
        if abcSpider.page_number < 8:
            abcSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
This is my items.py page:
import scrapy
class folder1Item(scrapy.Item):
    """Item with the two fields filled in by the spider."""

    deal_number_var = scrapy.Field()
    deal_type_var = scrapy.Field()
I would like to save the data as a .db file to import into sqlite3. It looks like this in my pipelines.py:
import sqlite3
class folder1Pipeline(object):
    """Item pipeline that stores every scraped item as one row of ``abc_tb`` in ``abc.db``."""

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        """Open the SQLite database file and keep a cursor on it."""
        self.conn = sqlite3.connect("abc.db")
        self.curr = self.conn.cursor()

    def create_table(self):
        """Drop and recreate ``abc_tb`` so each run starts with an empty table."""
        self.curr.execute("""DROP TABLE IF EXISTS abc_tb""")
        self.curr.execute("""create table abc_tb(deal_number_var text, deal_type_var text)""")

    def process_item(self, items, spider):
        """Store ``items`` in the database and return it unchanged."""
        self.store_db(items)
        return items

    def store_db(self, items):
        """Insert the first value of each field as one row and commit."""
        # The table has two columns, so the statement takes exactly two
        # placeholders; a third one makes sqlite3 reject the two-value tuple.
        self.curr.execute(
            """insert into abc_tb values (?,?)""",
            (self._first(items['deal_number_var']), self._first(items['deal_type_var'])),
        )
        self.conn.commit()

    @staticmethod
    def _first(values):
        """Return the first element of ``values``, or None when it is empty."""
        return values[0] if values else None

    def close_spider(self, spider):
        """Close the database connection when the crawl ends."""
        self.conn.close()
Middleware.py code:
from scrapy.http import HtmlResponse
from selenium import webdriver
class JSMiddleware(object):
    """Downloader middleware that loads each request in a browser and returns the rendered HTML."""

    def process_request(self, request, spider):
        """Render ``request.url`` with PhantomJS and return it as an HtmlResponse."""
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            body = driver.page_source
            current_url = driver.current_url
        finally:
            # A new driver is started for every request, so it has to be shut
            # down again or one browser process is leaked per request.
            driver.quit()
        return HtmlResponse(current_url, body=body, encoding='utf-8', request=request)

I assume this is your entire code? If so: you did not define any start_urls. Furthermore you either have to set the allowed_domains correctly or remove the variable completely because right now you define that no url is allowed.

Related

Scrapy only get the data of last page

I'm using Python 3.6 and Scrapy 2.4.1, and I wrote a spider that scrapes about 5 pages and then uses xlsxwriter to save the results to Excel. However, the spider only gets the data of the last page and I can't figure out why. Here is my spider code:
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem
class EbaySpiderSpider(scrapy.Spider):
    """Spider collecting the titles, prices and links shown on each listing page."""

    name = 'ebay_spider'
    allowed_domains = ['www.ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        """Yield one item holding the page's lists, then request the next page if a link exists."""
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        item_title = [title.xpath('string(.)').get() for title in item_title_list]

        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        item_price_extract = [price.xpath('string(.)').get().strip() for price in item_price]

        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info

        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if next_url_href:
            yield scrapy.Request(next_url_href, callback=self.parse)
and pipeline code
import xlsxwriter
class EbayPipeline:
    """Pipeline that appends every scraped item to one Excel worksheet."""

    def open_spider(self, spider):
        """Create the workbook once for the whole crawl."""
        # Creating the workbook inside process_item would start a new file for
        # every item, so only the last item would survive in the output.
        self.workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
        self.worksheet = self.workbook.add_worksheet()
        self.columns = {}   # item key -> column index, in order of first appearance
        self.next_row = 1   # row 0 is reserved for the header

    def process_item(self, item, spider):
        """Write each field's values below the rows already written and return the item."""
        item_source = dict(item)
        longest = 0
        for key, values in item_source.items():
            col_num = self.columns.get(key)
            if col_num is None:
                # First time this key is seen: give it a column and a header cell.
                col_num = self.columns[key] = len(self.columns)
                self.worksheet.write(0, col_num, key)
            self.worksheet.write_column(self.next_row, col_num, values)
            longest = max(longest, len(values))
        # Continue below the longest column so the next item does not overwrite this one.
        self.next_row += longest
        return item

    def close_spider(self, spider):
        """Write the workbook to disk when the crawl ends."""
        self.workbook.close()
Does anyone know the reason why? Everything seems to be OK, but I can only get the data of the last page.
By the way, is there any way to pass data to another function? I want to scrape the detail page as well, pass that data on to the process_item function, and yield everything together.

It is better to scrape every listing page first and then get the data from each product page.
class EbaySpiderSpider(scrapy.Spider):
    """Spider that visits the listing pages and then scrapes every product page linked from them."""

    name = "ebay_spider"

    def start_requests(self):
        """Request the five URLs formed by appending 1..5 to ``base_url``."""
        base_url = 'https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs='
        for i in range(1, 6):
            page = base_url + str(i)  # i will be the page number and add to base_url
            yield scrapy.Request(url=page, callback=self.parse)

    # scraped all product links first and yield to parse_contents
    def parse(self, response):
        """Request every product link found on a listing page."""
        links = response.xpath('//h3[@class="lvtitle"]/a/@href').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    # scraped desired data on product page
    def parse_contents(self, response):
        """Yield one item with the title and price of a product page."""
        title = response.xpath('//h1/text()').extract()[0]
        price = response.xpath('//span[@itemprop="price"]/text()').extract()[0]
        item = EbayItem()
        item['product_title'] = title
        item['product_price'] = price
        yield item  # to items.py
items.py — make sure that every item key you use is declared as a scrapy.Field():
class EbayItem(scrapy.Item):
    """Item with the two fields filled in on a product page."""

    product_title = scrapy.Field()
    product_price = scrapy.Field()


# Keeps the earlier spelling importable; the spider instantiates ``EbayItem``.
EbayITem = EbayItem
pipelines.py
import xlsxwriter
class EbayPipeline:
    """Pipeline skeleton that reads the title and price of each item."""

    def process_item(self, item, spider):
        """Read the item's fields and pass the item on."""
        title = item['product_title']
        price = item['product_price']
        # process your worksheet here
        # Return the item so that any later pipeline stage still receives it.
        return item

Working version of your code
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem
class EbaySpiderSpider(scrapy.Spider):
    """Spider collecting titles, prices and links page by page, following the "next" link."""

    name = 'ebay_spider'
    allowed_domains = ['ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        """Yield one item for the page and schedule the next page when there is one."""
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title.append(title.xpath('string(.)').get())

        for price in response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]'):
            item_price_extract.append(price.xpath('string(.)').get().strip())

        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info

        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if next_url_href is not None:
            # urljoin makes the link absolute in case the page gives a relative href.
            next_url_href = response.urljoin(next_url_href)
            yield scrapy.Request(next_url_href, callback=self.parse)
You will have to set ROBOTSTXT_OBEY=False in settings.py (which is not good practice), or else your spider won't scrape any data and will log this message:
[scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt: <GET https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1>

Scrapy response uniform blank rows making it impossible to format response output

I want to remove the [ ] brackets Scrapy adds to all its output. To do this you simply add [0] at the end of an XPath statement, as follows:
'a[@class="question-hyperlink"]/text()').extract()[0]
this solves the [ ] problem in some cases but in other cases scrapy returns every second row of output as blank and as such the moment it gets to the second row when using [0] i'm given the error:
Index error: list index out of range
How can I prevent scrapy from creating blank rows ? It seems like this is a common problem, but everyone faces this problem when exporting to CSV while for me it's with the scrapy response before exporting as CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field
class QuestionItem(Item):
    """Item holding a question's title and URL."""

    title = Field()
    url = Field()
class PopularityItem(Item):
    """Item holding a question's vote, answer and view counts."""

    votes = Field()
    answers = Field()
    views = Field()
class ModifiedItem(Item):
    """Item with the ``lastModified`` and ``modName`` fields."""

    lastModified = Field()
    modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem
class QuestionSpider(Spider):
    """Spider yielding the title and link of every question summary on the start page."""

    name = "questions"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        """Yield one QuestionItem per ``h3`` found inside a summary block."""
        questions = Selector(response).xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = QuestionItem()
            # [0] takes the single match out of the list returned by extract().
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem
class PopularitySpider(Spider):
    """Spider yielding the vote, answer and view counts found on the start page."""

    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "https://stackoverflow.com/",
    ]

    def parse(self, response):
        """Yield one PopularityItem per child ``div`` of every narrow question summary."""
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            item = PopularityItem()
            # The fields keep the full extract() lists; the [0] index is left
            # commented out because some rows produce empty lists.
            item['votes'] = poppart.xpath(
                'div[contains(@class, "votes")]//span/text()').extract()  # [0]
            item['answers'] = poppart.xpath(
                'div[contains(@class, "answered")]//span/text()').extract()  # [0]
            item['views'] = poppart.xpath(
                'div[contains(@class, "views")]//span/text()').extract()  # [0]
            yield item
Pipelines.py
import pymongo
import logging
class StackPipeline(object):
    """Pass-through pipeline that leaves every item untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged."""
        return item
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class MongoDBPipeline(object):
    """Pipeline storing each item in a MongoDB collection named after the item's class."""

    def __init__(self):
        """Connect using the MONGODB_SERVER, MONGODB_PORT and MONGODB_DB settings."""
        connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        self.db = connection[settings['MONGODB_DB']]

    def process_item(self, item, spider):
        """Insert the item as a document, log the insert's result, and return the item."""
        # The collection name is the lower-cased item class name,
        # e.g. PopularityItem -> "popularityitem".
        collection = self.db[type(item).__name__.lower()]
        logging.info(collection.insert(dict(item)))
        return item

The easiest way to handle an error like this is to catch it and deal with it then (in this case, by just moving on past the blank lines).
class PopularitySpider(Spider):
    """Spider yielding vote, answer and view counts, skipping rows that lack any of them."""

    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ["https://stackoverflow.com/"]

    def parse(self, response):
        """Yield one PopularityItem per row for which all three values are present."""
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            try:
                item = PopularityItem()
                item['votes'] = poppart.xpath('div[contains(@class, "votes")]//span/text()').extract()[0]
                item['answers'] = poppart.xpath('div[contains(@class, "answered")]//span/text()').extract()[0]
                item['views'] = poppart.xpath('div[contains(@class, "views")]//span/text()').extract()[0]
            except IndexError:
                # An empty extract() result means this row has no counts; move on.
                continue
            yield item

Scrapy pipeline only save one page of results

I have a spider to crawl course_tal which has a pipeline to save two types of items:
moocs.csv which contains the course data.
moocs_review.csv which contains the reviews data.
This is the spider code I have:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin
from moocs.items import MoocsItem,MoocsReviewItem
class MoocsSpiderSpider(scrapy.Spider):
    """Spider that scrapes course details and their reviews from the course listing."""

    name = "moocs_spider"
    #allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        """Request the first three courses of a listing page, then the next listing page."""
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url) for relative_url in response.xpath(courses_xpath).extract()]
        for course_url in courses_url[0:3]:
            print(course_url)
            yield Request(url=course_url, callback=self.parse_reviews)
        # NOTE(review): extract() returns a list, while Request needs a single
        # URL string; this is the line the accompanying discussion is about.
        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        """Yield one MoocsReviewItem per review on the course page, then the MoocsItem itself."""
        #print response.body
        l = ItemLoader(item=MoocsItem(), response=response)
        l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        l.add_value('course_link', response.url)
        l.add_value('course_provider', response.url)
        l.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        l.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        #'//*[@class="course-info__school__name"]'
        item = l.load_item()

        for review in response.xpath('//*[@class="review-body"]'):
            r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            # Copy the course title so each review row can be matched to its course.
            r.add_value('course_title', item['course_title'])
            r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            r.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield r.load_item()

        yield item
Which goes to each course page and save the details into the corresponding item. I'm getting the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider goes to next pages but the result is not saved in the output file.
I'm guessing the problem is in the pipeline, where the files are created:
class MultiCSVItemPipeline(object):
    """Pipeline that exports each kind of item to its own CSV file."""

    CSVDir = '/moocs/scripts/moocs/moocs/'
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        """Hook the open/close handlers to the spider signals."""
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one file and one exporter per entry in ``SaveTypes``."""
        # CSVDir is a class attribute, so it must be reached through self;
        # the bare name is not visible from inside a method.
        self.files = {name: open(self.CSVDir + name + '.csv', 'w+b') for name in self.SaveTypes}
        self.exporters = {name: CsvItemExporter(self.files[name]) for name in self.SaveTypes}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish every exporter and close its file."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for csv_file in self.files.values():
            csv_file.close()

    def process_item(self, item, spider):
        """Export the item with the exporter matching its type, if any, and return it."""
        what = item_type(item)
        if what in self.SaveTypes:
            self.exporters[what].export_item(item)
        return item

Are you sure the spider is doing the pagination properly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results that you are then passing into the url parameter of the Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so, doing that will generate the following error:
TypeError: Request url must be str or unicode, got list:
It can be solved by using the extract_first() method, and I would also check that the value is not None:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
if next_page_url:
yield Request(url=next_page_url) # parse is the callback by default
Please, try this and tell me if it solved your problem

If you use -t csv, this will also work, instead of a pipeline:
scrapy crawl moocs -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in spider folder.

Scrapy & MySQL database:

I'm trying to scrape a website with Scrapy and store the information to a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to python and scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem
class expertSpider(Spider):
    """Spider that follows product and pagination links and yields model/price items."""

    name = 'expert'
    start_urls = [
        'https://www.expert.ie/products/home-appliances/laundry',
    ]

    def parse(self, response):
        """Follow every product link and every pagination link on the page."""
        # follow links to product pages
        for href in response.css('a.product-list-link::attr(href)'):
            yield response.follow(href, self.parse_product)
        # follow pagination links
        for href in response.css('a.UnselectedPage::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_product(self, response):
        """Yield one item with the sliced model and price text of a product page."""
        def extract_with_css(query):
            # Default to '' so a missing element gives an empty string
            # instead of failing on None.strip().
            return response.css(query).extract_first(default='').strip()

        item = ExpertScraperItem()
        item['model'] = extract_with_css('header strong::text')[12:100].strip()
        item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
        yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScraperPipeline(object):
    """Pipeline that inserts scraped items into the ``pricing`` table through a connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* values in ``settings``."""
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # pipeline the default call
    def process_item(self, item, spider):
        """Schedule the insert on the pool and return the item straight away."""
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        # Without an errback a failed insert only surfaces as
        # "Unhandled error in Deferred".
        d.addErrback(self._handle_error, item, spider)
        return item

    # Each row is updated or written to the database
    def _do_upinsert(self, conn, item, spider):
        """Insert the item's model and price, skipping items where either is missing or empty."""
        # Check the field values: iterating the item itself only yields the
        # field names, which are never empty.
        if not all(item.get(field) for field in ('model', 'price')):
            return
        result = conn.execute("""
            insert into pricing(model, price)
            values(%s, %s)
            """, (item['model'], item['price']))
        if result:
            print("added a model into db")
        else:
            print("failed insert into pricing")

    def _handle_error(self, failure, item, spider):
        """Log a failed database interaction."""
        spider.logger.error(failure)

Using middleware to prevent scrapy from double-visiting websites

I have a problem like this:
how to filter duplicate requests based on url in scrapy
So, I do not want a website to be crawled more than once. I adapted the middleware and wrote a print statement to test whether it correctly classifies already seen websites. It does.
Nonetheless the parsing seems to be executed multiple times because the json-File I receive contains double entries.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from crawlspider.items import KickstarterItem
from HTMLParser import HTMLParser
### code for stripping off HTML tags:
class MLStripper(HTMLParser):
    """HTML parser that collects only the text between tags."""

    def __init__(self):
        # Run the base initialiser rather than reset() alone, so that every
        # attribute the parser expects is set up; it calls reset() itself.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Remember one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into one string."""
        return str(''.join(self.fed))


def strip_tags(html):
    """Return ``html`` with all tags removed."""
    s = MLStripper()
    s.feed(html)
    # close() makes the parser process any text it is still holding back.
    s.close()
    return s.get_data()
###
items = []
class MySpider(CrawlSpider):
    """Crawl spider that follows the comics category and scrapes every project page."""

    name = 'kickstarter'
    allowed_domains = ['kickstarter.com']
    start_urls = ['http://www.kickstarter.com']
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('discover/categories/comics', ))),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(SgmlLinkExtractor(allow=('projects/', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Return one KickstarterItem built from a project page."""
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = KickstarterItem()
        item['date'] = hxs.select('//*[@id="about"]/div[2]/ul/li[1]/text()').extract()
        item['projname'] = hxs.select('//*[@id="title"]/a').extract()
        item['projname'] = strip_tags(str(item['projname']))
        item['projauthor'] = hxs.select('//*[@id="name"]')
        item['projauthor'] = item['projauthor'].select('string()').extract()[0]
        item['backers'] = hxs.select('//*[@id="backers_count"]/data').extract()
        item['backers'] = strip_tags(str(item['backers']))
        item['collmoney'] = hxs.select('//*[@id="pledged"]/data').extract()
        item['collmoney'] = strip_tags(str(item['collmoney']))
        item['goalmoney'] = hxs.select('//*[@id="stats"]/h5[2]/text()').extract()
        # Return only this page's item. Returning a list shared by all calls
        # would hand back every earlier item again on each page and so
        # duplicate them in the output.
        return item
My items.py looks like that:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
class KickstarterItem(Item):
    """Item holding the values scraped from one project page."""

    # define the fields for your item here like:
    date = Field()
    projname = Field()
    projauthor = Field()
    backers = Field()
    collmoney = Field()
    goalmoney = Field()
My middleware looks like this:
import os
from scrapy.dupefilter import RFPDupeFilter
from scrapy.utils.request import request_fingerprint
class CustomFilter(RFPDupeFilter):
    """Dupe filter that treats requests as duplicates when they share a numeric project id."""

    def __getid(self, url):
        """Return the fifth '/'-separated part of ``url``, or '' when the URL is shorter."""
        parts = url.split("/")
        # extracts project-id (is a number) from project-URL
        mm = parts[4] if len(parts) > 4 else ""
        print("_____________ %s" % mm)
        return mm

    def request_seen(self, request):
        """Return True when the request's project id has been recorded before, else False."""
        fp = self.__getid(request.url)
        # .isdigit() checks whether fp comes from a project ID
        if fp.isdigit() and fp in self.fingerprints:
            print("______fp is a number (therefore a project-id) and has been encountered before______")
            return True
        # Record the id only after the membership test; adding it first would
        # make the test succeed for every request, including the first one.
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
        return False
I added this line to settings.py:
DUPEFILTER_CLASS = 'crawlspider.duplicate_filter.CustomFilter'
I call the script using "scrapy crawl kickstarter -o items.json -t json". Then I see the correct print statements from the middleware code.
Any comments on why the json contains multiple entries containing the same data?

So now these are the three modifications that removed the duplicates:
I added this to settings.py:
ITEM_PIPELINES = ['crawlspider.pipelines.DuplicatesPipeline',]
to let scrapy know that I added a function DuplicatesPipeline in pipelines.py:
from scrapy import signals
from scrapy.exceptions import DropItem
class DuplicatesPipeline(object):
    """Pipeline that drops every item whose ``projname`` has already been seen."""

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Return the item the first time its ``projname`` appears; raise DropItem afterwards."""
        if item['projname'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['projname'])
        return item
You do not need to adjust the spider and do not use the dupefilter/middleware stuff I posted before.
But I got the feeling that my solution doesn't reduce the communication as the Item-object has to be created first before it is evaluated and possibly dropped. But I am okay with that.
(Solution found by asker, moved into an answer)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Yield items with scrapy - python

I assume this is your entire code? If so: you did not define any start_urls. Furthermore you either have to set the allowed_domains correctly or remove the variable completely because right now you define that no url is allowed.

Related

Scrapy only get the data of last page

Scrapy response uniform blank rows making it impossible to format response output

Scrapy pipeline only save one page of results

Scrapy & MySQL database:

Using middleware to prevent scrapy from double-visiting websites

Categories

Resources