Scrapy pipeline only saves one page of results - python

I have a spider that crawls CourseTalk; its pipeline saves two types of items:
moocs.csv which contains the course data.
moocs_review.csv which contains the reviews data.
This is the spider code I have:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin
from moocs.items import MoocsItem,MoocsReviewItem
class MoocsSpiderSpider(scrapy.Spider):
    """Crawl the CourseTalk data-science listing and scrape courses and reviews.

    ``parse`` walks a listing page and schedules ``parse_reviews`` for course
    pages; ``parse_reviews`` yields one ``MoocsReviewItem`` per review block,
    followed by the ``MoocsItem`` describing the course itself.
    """

    name = "moocs_spider"
    #allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        """Schedule the first three course pages of a listing, then the next listing page."""
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [
            urljoin(response.url, relative_url)
            for relative_url in response.xpath(courses_xpath).extract()
        ]
        # Only the first three course links of each listing page are followed.
        for course_url in courses_url[0:3]:
            print(course_url)
            yield Request(url=course_url, callback=self.parse_reviews)
        # NOTE: kept exactly as asked about. extract() returns a *list* of
        # matches, so ``url`` below is not a string; this is the pagination
        # issue the answer addresses.
        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        """Yield every review found on a course page, then the course item."""
        l = ItemLoader(item=MoocsItem(), response=response)
        l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        l.add_value('course_link', response.url)
        l.add_value('course_provider', response.url)
        l.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        l.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        item = l.load_item()
        # load_item() omits fields that collected nothing, so a page without a
        # title must not raise KeyError while the reviews are being built.
        course_title = item.get('course_title')
        for review in response.xpath('//*[@class="review-body"]'):
            # ``selector=review`` scopes the relative ('.//') XPaths to this review.
            r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            r.add_value('course_title', course_title)
            r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            r.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield r.load_item()
        yield item
The spider goes to each course page and saves the details into the corresponding item. I'm getting the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider goes to next pages but the result is not saved in the output file.
I'm guessing the problem is in the pipeline, where the files are created:
class MultiCSVItemPipeline(object):
    """Item pipeline that writes each known item type to its own CSV file.

    One file and one ``CsvItemExporter`` are created per entry of
    ``SaveTypes`` when the spider opens, and closed again when it closes.
    """

    # Directory prefix that the per-type file names are appended to.
    CSVDir = '/moocs/scripts/moocs/moocs/'
    # Item type names; each becomes '<name>.csv' under CSVDir.
    SaveTypes = ['moocs','moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one CSV file and exporter per save type and start exporting."""
        # Class attributes are not visible as bare names inside a method, so
        # the directory has to be read through ``self``.
        self.files = {
            name: open(self.CSVDir + name + '.csv', 'w+b')
            for name in self.SaveTypes
        }
        self.exporters = {
            name: CsvItemExporter(self.files[name])
            for name in self.SaveTypes
        }
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish every exporter, then close the underlying files."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for csv_file in self.files.values():
            csv_file.close()

    def process_item(self, item, spider):
        """Export ``item`` to the CSV matching its type and pass it on.

        Items whose type name is not listed in ``SaveTypes`` are returned
        without being written anywhere.
        """
        # item_type() is defined outside this snippet; presumably it maps an
        # item to one of the SaveTypes names (TODO confirm).
        what = item_type(item)
        if what in self.SaveTypes:
            self.exporters[what].export_item(item)
        return item

Are you sure the spider is doing the pagination properly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results that you are then passing into the url parameter of the Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so, doing that will generate the following error:
TypeError: Request url must be str or unicode, got list:
It can be solved by using the extract_first() method, and I would also check that the value is not None:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
if next_page_url:
    yield Request(url=next_page_url) # parse is the callback by default
Please, try this and tell me if it solved your problem

If you use the -t csv option, Scrapy's built-in feed export will also do the job, without a custom pipeline:
scrapy crawl moocs_spider -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in spider folder.

Related

Activating a Pipeline Component in Scrapy to write JSON

I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return the item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES': ...}). Thanks.
The pipeline
import json,datetime
class JsonWriterPipeline(object):
    """Item pipeline that dumps every item into its own timestamp-named JSON file."""

    def process_item(self, item, spider):
        """Write ``item`` as JSON into the working directory and pass it on.

        Args:
            item: the scraped item; serialized via ``dict(item)``.
            spider: the spider that produced the item. Only its ``logger``
                is used, and only when the write fails.

        Returns:
            The unchanged ``item``, whether or not the write succeeded.
        """
        # %f (microseconds) makes it far less likely that items processed
        # within the same second overwrite each other's file.
        fileName = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") + '.json'
        try:
            with open(fileName, 'w') as fp:
                json.dump(dict(item), fp)
        except (OSError, TypeError, ValueError):
            # Writing stays best-effort, but a failure is reported instead of
            # being swallowed, so a missing output file can be diagnosed.
            spider.logger.exception("Could not write item to %s", fileName)
        return item
class ProjectItem(scrapy.Item):
    """Container for one scraped article."""

    # Set by the spider's getContent callback.
    title = scrapy.Field()
    # Set by the spider's scrape callback from the article link.
    url = scrapy.Field()
class mySpider(CrawlSpider):
    """Follow the "November 2019" archive and yield one ProjectItem per article.

    Flow: ``parse`` -> ``scrape`` (archive listing, paginated) ->
    ``getContent`` (single article page).
    """

    # NOTE(review): no ``rules`` are defined and ``parse`` is overridden, so
    # the CrawlSpider machinery appears unused here; a plain scrapy.Spider
    # would presumably do. Confirm before changing the base class.
    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        """Locate the "November 2019" archive link and follow it."""
        monthSelector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        monthLink = response.xpath(monthSelector).extract_first()
        # extract_first() gives None when nothing matches; there is nothing
        # to follow in that case.
        if monthLink:
            yield response.follow(monthLink, callback=self.scrape)

    def scrape(self, response):
        """Request every article on a listing page, then the next listing page."""
        # get the links to all individual articles
        linkSelector = '.entry-title a::attr(href)'
        for link in response.css(linkSelector).extract():
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            # Hand the partially filled item to getContent, which adds the
            # title and yields the finished item.
            request.meta['item'] = item
            yield request
        nextPageSelector = 'span.page-link a::attr(href)'
        nextPageLink = response.css(nextPageSelector).extract_first()
        # The last listing page has no "next" link.
        if nextPageLink:
            yield response.follow(nextPageLink, callback=self.scrape)

    def getContent(self, response):
        """Complete the item carried in ``response.meta`` with the article title."""
        item = response.meta['item']
        TITLE_SELECTOR = '.entry-title ::text'
        item['title'] = response.css(TITLE_SELECTOR).extract_first()
        yield item
To settings.py, add:
# Pipelines enabled for the project, keyed by import path; the integer is the
# pipeline's order value.
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 100,
}
where myproject is the name of your project/folder.
See the very last heading on this page : https://docs.scrapy.org/en/latest/topics/item-pipeline.html
When running a spider from a script, the project settings have to be loaded explicitly, using the method described in the related question "Running scrapy from script not including pipeline".

Scrapy: How to populate an item with data from two websites

I'd like to collect an item's data from 2 different websites.
It should work as follows:
parse_website_1 fetches a persons name from website_1 and populates
the item
parse_website_1 yields a request for parse_website_2
parse_website_2 parses website_2, collects the persons hair-color, based on the person name which was scraped from website_1 and populates the item
parse_website_2 loads the item
Would this be in the right direction, given that the item is defined in items.py:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem
class MySpider(scrapy.Spider):
    """Build one item from two websites using a loader stored on the spider.

    NOTE: the shared ``self.item`` loader is the design being asked about and
    is kept as posted; see the comments in ``__init__``.
    """

    name = "myspider"

    def __init__(self):
        # NOTE: kept as asked about. No ``response`` exists yet when the
        # spider is constructed, and a loader stored on ``self`` is shared by
        # every request this spider makes.
        self.item = ItemLoader(item=MyItem(), response=response)

    def start_requests(self):
        """Start the crawl at website_1."""
        # The request has to be yielded; merely constructing it schedules nothing.
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        """Read the person's name and request that person's page on website_2."""
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        self.item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)

    def parse_website_2(self, response):
        """Add the hair colour and emit the loaded item."""
        self.item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield self.item.load_item()
The idea is right, but the implementation is not correct in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous so it would not work as expected.
The correct way to do it is outlined in the Scrapy FAQ. Pass the partial item data on to the next request through the Request's meta attribute, read it back in the callback from the Response's meta attribute, add the remaining data, and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem
class MySpider(scrapy.Spider):
    """Build one item from two websites, carrying it between requests in ``meta``."""

    name = "myspider"

    def start_requests(self):
        """Start the crawl at website_1."""
        # The request has to be yielded; merely constructing it schedules nothing.
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        """Read the person's name and request that person's page on website_2."""
        loader = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        if name is None:
            # Without a name there is no website_2 page to look up.
            self.logger.warning("No name found on %s", response.url)
            return
        loader.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        # Pass the partially loaded item rather than the loader: the loader is
        # bound to this response, so its XPaths would keep querying website_1.
        yield scrapy.Request(
            url=website_2_path,
            callback=self.parse_website_2,
            meta={'item': loader.load_item()},
        )

    def parse_website_2(self, response):
        """Add the hair colour from website_2 and emit the finished item."""
        # A fresh loader around the carried item evaluates XPaths against the
        # website_2 response.
        loader = ItemLoader(item=response.meta['item'], response=response)
        loader.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield loader.load_item()

What are the best practices for calling an external api?

So let's say I want to write a spider that uses the Facebook API to count the likes on every page of a website. If I import the requests library, I'm able to call the Facebook Graph API as follows.
import scrapy
import json
import requests
API_KEY="KEY_GOES_HERE"
class WebSite(scrapy.Spider):
    """Crawl website.com and attach Facebook Graph share data to each page.

    The Graph API is called synchronously with ``requests`` from inside the
    ``parse`` callback.
    """

    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        """Fetch the Graph API record for ``url`` and return the parsed triple."""
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        """Return ``(id, comment_count, share_count)`` from a Graph API response."""
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        """Yield a request for every link on the page, then one item for the page."""
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        # parse_likes returns (id, comment_count, share_count); unpack in that
        # order so comments and shares do not end up swapped.
        item['fb_url'], item['comments'], item['shares'] = self.get_likes(response.url)
        for link in links:
            link = response.urljoin(link)
            # One item per page: 'link' is overwritten on each pass, so only
            # the last link of the page is kept.
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
However, I can't seem to get this code to work if, rather than using the requests, I use the scrapy.Request call. Something like this.
import scrapy
import json
import requests
API_KEY="KEY_GOES_HERE"
class WebSite(scrapy.Spider):
    """Variant of the spider that builds a scrapy.Request for the Graph API."""

    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        """Build (but do not send) a Graph API request for ``url``."""
        graph_url = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(graph_url, callback=self.parse_likes)

    def parse_likes(self, data):
        """Return ``(id, comment_count, share_count)`` from a Graph API response."""
        payload = json.loads(data.text)
        fb_id = payload['id']
        share = payload['share']
        return fb_id, share['comment_count'], share['share_count']

    def parse(self, response):
        """Yield a request for every link on the page, then one item for the page."""
        hrefs = response.css('a::attr(href)').extract()
        item = {'url': response.url}
        # As posted: get_likes() only returns a Request object, and ``.body``
        # is read from that unsent request, not from any API response.
        item['fb_data'] = self.get_likes(response.url).body
        for href in hrefs:
            absolute = response.urljoin(href)
            item['link'] = absolute
            yield scrapy.Request(absolute, callback=self.parse)
        yield item
In this case, I just get a blank response for the Facebook data. I think i'm missing some understanding about how the scrapy.Request method works relative to the standard requests library. Any ideas?
This is a very common case; see "How to yield an item from multiple URLs?"
The most common solution is to chain the requests, carrying your item along in the request's meta parameter.
For your example, an implementation with this logic could look like:
class WebSite(scrapy.Spider):
    """Chain a Graph API request after each link, carrying the item in ``meta``.

    Sketch only: spider attributes such as ``name`` and ``start_urls`` are
    not shown here.
    """

    # Bound ``str.format``: called as ``self.base(url, token)``.
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        """Start one item per link and request its Graph API data."""
        links = response.css('a::attr(href)').extract()
        for link in links:
            item = {}
            item['url'] = response.url
            item['link'] = response.urljoin(link)
            # The template's first placeholder is the URL and the second the
            # token; query the absolute link, not the raw href.
            api_url = self.base(item['link'], self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        """Add the Graph API figures to the carried item and emit it."""
        item = response.meta['item']
        data = json.loads(response.text)
        # Stored as one tuple: (id, comment_count, share_count).
        share_count = data['id'], data['share']['comment_count'], data['share']['share_count']
        item['share_count'] = share_count
        yield item

Scrapy only returned 1 item, my brain STUCK

here's the spider.py:
import scrapy
from scrapy.loader import ItemLoader
from dts.items import DtItem
class dtSpider(scrapy.Spider):
    """Collect post titles, with the author taken from the page URL."""

    name = 'dts'
    urls = ['s','s','s','s']

    def start_requests(self):
        """Request every URL listed in ``urls``."""
        for url in self.urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Load a DtItem from the title attributes of the page's h2 links."""
        # The last six matches are skipped.
        for title in response.xpath('//h2/a/@title').extract()[:-6]:
            url_array = response.url.split('/')
            # The author is the path segment right after 'author'.
            author = url_array[url_array.index('author')+1]
            l = ItemLoader(item=DtItem(), response=response)
            l.add_value('title',title)
            l.add_value('author',author)
            # NOTE: kept as asked about. ``return`` leaves parse() during the
            # first pass through the loop, so one item per page is produced.
            return l.load_item()
I wrote this to get the titles only.
But, why can't I get all the titles? Scrapy only returned 1 item per page.
change:
return l.load_item()
to:
yield l.load_item()
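return ends the function on the first pass through the loop, so only one item is produced per page; yield hands back an item and lets the loop carry on with the next title.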

Scrapy spider not saving to csv

I have a spider which reads a list of URLs from a text file and saves the title and body text from each. The crawl works, but the data does not get saved to CSV. I set up a pipeline to save to CSV because the normal -o option did not work for me. I did change settings.py to enable the pipeline. Any help with this would be greatly appreciated.
The code is as follows:
Items.py
from scrapy.item import Item, Field
class PrivacyItem(Item):
    """Item holding the title and body text of one page."""

    # Fields are declared as class attributes, e.g. ``name = Field()``.
    title = Field()
    desc = Field()
PrivacySpider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from privacy.items import PrivacyItem
class PrivacySpider(CrawlSpider):
    """Scrape title and paragraph text from every URL listed in urls.txt."""

    name = "privacy"
    # Read the start URLs once, when the class is defined; the context
    # manager closes the file even if reading fails.
    with open("urls.txt") as f:
        start_urls = [url.strip() for url in f.readlines()]

    def parse(self, response):
        """Build PrivacyItem objects from the response and return them as a list."""
        hxs = HtmlXPathSelector(response)
        items =[]
        # NOTE: kept as asked about. This loop over start_urls is what the
        # answer addresses; each pass reads the same ``response``.
        for url in start_urls:
            item = PrivacyItem()
            item['desc'] = hxs.select('//body//p/text()').extract()
            item['title'] = hxs.select('//title/text()').extract()
            items.append(item)
        return items
Pipelines.py
import csv
class CSVWriterPipeline(object):
    """Item pipeline that appends one ``title, desc`` row per item to CONTENT.csv."""

    def __init__(self):
        # Text mode with newline='' is what the csv module expects on
        # Python 3; the handle is kept so close_spider() can close it.
        self.csvfile = open('CONTENT.csv', 'w', newline='')
        self.csvwriter = csv.writer(self.csvfile)

    def process_item(self, item, spider):
        """Write the first ``title`` and ``desc`` value of ``item`` as a row.

        A field whose list of extracted values is empty is written as an
        empty cell instead of raising IndexError.

        Returns:
            The unchanged ``item``.
        """
        self.csvwriter.writerow([
            self._first(item['title']),
            self._first(item['desc']),
        ])
        return item

    def close_spider(self, spider):
        """Close the CSV file so buffered rows reach the disk."""
        self.csvfile.close()

    @staticmethod
    def _first(values):
        """Return the first element of ``values``, or '' when it is empty."""
        return values[0] if values else ''
you don't have to loop on start_urls, scrapy is doing something like this:
for url in spider.start_urls:
request url and call spider.parse() with its response
so your parse function should look something like:
def parse(self, response):
    """Return a single PrivacyItem built from this response."""
    selector = HtmlXPathSelector(response)
    # Called once per response, so exactly one item is built per page.
    return PrivacyItem(
        desc=selector.select('//body//p/text()').extract(),
        title=selector.select('//title/text()').extract(),
    )
also try to avoid returning lists as item fields, do something like: hxs.select('..').extract()[0]

Categories

Resources