Scrapy use case - new links only - python

At a high level, I think I'm trying to use the Scrapy framework like a scraping library.
My use case is this: I have a webpage with links to meeting minutes I'd like to scrape, and as time passes, more links to meeting minutes are added.
My plan was to use a regular spider to scrape the links to the meeting minutes and to pipeline/CsvItemExporter the list of links to a CSV.
Regular Spider 1 - scrapes the webpage with links to meeting minutes I'd like to scrape, exports to CSV:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "easthamptontown-links"
    custom_settings = {
        'ITEM_PIPELINES': {
            'themis.pipelines.ThemisPipeline': 400
        }
    }

    def start_requests(self):
        urls = [
            'http://easthamptontown.iqm2.com/Citizens/Calendar.aspx?From=1/1/1900&To=12/31/9999',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        rowtops = response.xpath('//div[@class="RowTop"]')
        for meeting in rowtops:
            yield {
                'meeting': meeting.css("a[href*='Detail_Meeting']").get(),
                'files': meeting.css("a[href*='FileView']").getall(),
            }
Pipeline 1:
from itemadapter import ItemAdapter
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter


class ThemisPipeline:
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        file_output = {}
        _item = ItemAdapter(item).asdict()
        if len(_item['files']) > 0:
            for filelink in _item['files']:
                parser = MyHTMLParser()  # custom HTMLParser subclass defined elsewhere
                parser.feed(filelink)
                file_output['filelink'] = parser.lsHref
                file_output['filetype'] = parser.lsData
                parser.feed(_item['meeting'])
                file_output['meetinglink'] = parser.lsHref
                file_output['meetingtitle'] = parser.lsTitle
                file_output['meetingdate'] = parser.lsData.strip()
                self.exporter.export_item(file_output)
        else:
            raise DropItem(item)  # DropItem has to be raised, not just instantiated
        return item
A csv reader/list comprehension feeds the links from the CSV into start_urls of a second regular spider, which scrapes the meeting minutes from those links and pipelines/CsvItemExporters each one to a .txt file named for the link, e.g. meeting123.txt.
The second time I run the first scraper, I want to compare the links in the new CSV to those in the original CSV, scrape the meeting minutes only for the links that are in the new CSV but not the original CSV, and again pipeline/CsvItemExporter each one to a .txt file named for the link, e.g. meeting124.txt.
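For the second-run comparison, a minimal sketch of the diff step (assuming both exported CSVs have the filelink column written by the pipeline above; the filenames are only illustrative):
import csv

def read_filelinks(path):
    # return the set of filelink values from one exported CSV
    with open(path, newline='') as f:
        return {row['filelink'] for row in csv.DictReader(f)}

# hypothetical filenames for the previous export and the fresh one
old_links = read_filelinks('easthamptontown-links-old.csv')
new_links = read_filelinks('easthamptontown-links.csv')

# meeting minutes to scrape this run: links present now but not last time
links_to_scrape = new_links - old_links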
My immediate problem is that passing the scraped minutes link to the pipeline, so the output file can be named after that link, is harder than I would have guessed - the framework doesn't seem to be designed for this.
Regular Spider 2 - scrapes meeting minutes from URLs supplied by the CSV:
import scrapy


class ASpider(scrapy.Spider):
    name = "town-text"
    custom_settings = {
        'ITEM_PIPELINES': {
            'themis.pipelines.MinutesPipeline': 400
        }
    }
    meetings = csvreader('./town-links.csv')  # custom helper returning the CSV rows as dicts
    # don't override start_requests; the default is scrapy.Request(url=url, callback=self.parse)
    start_urls = ['http://easthamptontown.iqm2.com/Citizens/' + meeting['filelink']
                  for meeting in meetings
                  if 'Zoning' in meeting['meetingtitle'] and
                  'Regular Meeting' in meeting['meetingtitle'] and
                  meeting['filetype'] == 'Minutes']

    def parse(self, response):
        for element in response.xpath('//div[@id="Content"]/div/*'):
            yield {
                'line': element.xpath('.//text()').getall(),
            }
Pipeline 2:
from itemadapter import ItemAdapter
from scrapy import signals
from scrapy.exporters import CsvItemExporter


class MinutesPipeline:
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.txt' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        _item = ItemAdapter(item).asdict()
        self.exporter.export_item(_item)
        return item
I'd like to be able to pass the particular URL whose HTML I am scraping, meeting['filelink'], to the pipeline so it can be used as the CSV filename for the items. I tried changing scrapy.Spider to CrawlSpider to attempt to use parse_start_url(), but the selector did not return any data with CrawlSpider.
Any thoughts on a design for this use case, specific to the Scrapy framework, would be appreciated.

If you want to use the URL as the filename, all you need to do is pass the URL with the item, create a new file with that filename, and export to it.
For example:
In your parse method, add a url field to the dictionary with response.url as its value.
def parse(self, response):
    for element in response.xpath('//div[@id="Content"]/div/*'):
        yield {
            'line': element.xpath('.//text()').getall(),
            'url': response.url
        }
Then in your pipeline:
def process_item(self, item, spider):
    url = item["url"]
    filename = url.split("/")[-1]
    with open(filename, "wb") as f:     # CsvItemExporter expects a file object, not a filename
        exporter = CsvItemExporter(f)
        exporter.start_exporting()
        # ... do text formatting on item["line"] if needed
        exporter.export_item(item)
        exporter.finish_exporting()
    raise DropItem("exported to its own file")
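A follow-up on that design: opening a fresh exporter for every item recreates the file each time, so with many items per URL only the last one survives. A variation closer to the per-item exporters example in the Scrapy docs keeps one exporter per filename in a dict and closes them all when the spider closes - a rough sketch, with names chosen for illustration:
from itemadapter import ItemAdapter
from scrapy.exporters import CsvItemExporter


class PerUrlCsvPipeline:
    # sketch: one CSV file (and one exporter) per scraped URL

    def open_spider(self, spider):
        self.files = {}
        self.exporters = {}

    def close_spider(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

    def _exporter_for(self, url):
        filename = url.split("/")[-1] + ".csv"
        if filename not in self.exporters:
            f = open(filename, "wb")
            self.files[filename] = f
            exporter = CsvItemExporter(f)
            exporter.start_exporting()
            self.exporters[filename] = exporter
        return self.exporters[filename]

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        self._exporter_for(adapter["url"]).export_item(item)
        return item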

Related

How to scrape a site using different rules for a spider?

I separated the spider from the crawler. I need to extract some data from a website using Python Scrapy, applying different conditions to get results. So I have these functions in the first file:
def parse(self, response):
    xpath = '//div[@class="proinfor"]//div[@class="prolist_casinforimg"]/a/@href'
    urls = response.xpath(xpath).extract()
    for url in urls:
        url = url.replace("//", "", 1)
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_requem)
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_obj)

def parse_requem(self, response):
    ...
    yield scrapy.Request(callback=self.parse_item)

def parse_item(self, response):
    parser = BaseParser(response)
    return parser.construct_item()

def parse_obj(self, response):
    parser = BaseParser(response)
    return parser.construct()
And the code in the BaseParser class:
def parse_price(self):
    Price = response.body
    return Price

def parse_ex(self):
    exists = self.xpath('//text()').extract_first()
    return exists

def construct(self):
    item = dict()
    item['ex'] = self.parse_ex()
    return item

def construct_item(self):
    item = dict()
    item['price'] = self.parse_price()
    return item
As you can see, I'm trying to separate the data retrieval logic, but I'm only getting the execution result of a single function.
How do I separate the parsing logic for a spider?
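One thing worth noting for this question: the parse method above yields two Requests to the same URL, and by default Scrapy's duplicate request filter drops the second one, which would explain why only one callback's result shows up. A rough sketch of one workaround, assuming both callbacks really need their own request (dont_filter=True disables the duplicate check for that request):
def parse(self, response):
    xpath = '//div[@class="proinfor"]//div[@class="prolist_casinforimg"]/a/@href'
    for url in response.xpath(xpath).extract():
        url = response.urljoin(url.replace("//", "", 1))
        yield scrapy.Request(url, callback=self.parse_requem)
        # dont_filter=True stops the dupefilter from discarding this second
        # request to the same URL
        yield scrapy.Request(url, callback=self.parse_obj, dont_filter=True)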

Scrapy append duplicate keys to mongodb list

I have made a scraper with Scrapy to extract data from Google Patents based on a chemicals search. I search for chemicals like this: O1C(=CCCC1C)C, extract the publication numbers from the search results, and store them in a MongoDB database. The structure of the collection is this:
{ "_id" : ObjectId("6123733f10bd1504a29a9c75"),
"chemical" : "O=C(NCC1N(CC)CCC1)C2=C(O)C(=CC(Cl)=C2OC)CC",
"publication_number" : ["EP3185946B1", "US10751533B2"]
}
The problem is this: if a chemical returns more than one page of results, MongoDB will store the same chemical twice, each time with different publication numbers. What I want to do is check whether the chemical already exists in MongoDB; if it does, I want to append the publication numbers to the existing chemical record, otherwise insert the chemical as a new document (see the sketch after the pipeline code below).
scraper.py
from pathlib import Path
import scrapy
from scrapy_splash import SplashRequest
from pattents.items import PattentsItem

BASE_DIR = Path(__file__).resolve().parent.parent


class PattentLinksSpider(scrapy.Spider):
    name = 'pattent_links'
    allowed_domains = ['patents.google.com']
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        while not splash:select('#resultsContainer') do
            splash:wait(3)
        end
        splash:wait(4)
        return splash:html()
    end
    '''

    def start_requests(self):
        with open(BASE_DIR.joinpath('spiders/urls.txt'), "rt") as f:
            start_urls = [url.strip().replace('=', '%3d') for url in f.readlines()]
        for url in start_urls:
            yield SplashRequest(
                url='https://patents.google.com/?q=CL%3d' + url + '&page=0&num=100',
                callback=self.parse,
                endpoint='execute',
                args={
                    'lua_source': self.script
                },
                meta={
                    'page_number': 0,
                    'chemical': url
                }
            )

    def parse(self, response):
        items = response.xpath('//search-result-item')
        if len(items) > 0:
            item = PattentsItem()
            pn = response.xpath('//span[@class="style-scope search-result-item"]/text()').getall()
            item['chemical'] = response.meta['chemical'].replace('%3d', '=')
            item['publication_number'] = pn
            yield item
            page_number = int(response.meta['page_number']) + 1
            yield SplashRequest(
                url=response.url.replace(f'&page={page_number - 1}', f'&page={page_number}'),
                callback=self.parse,
                endpoint='execute',
                args={
                    'lua_source': self.script
                },
                meta={
                    'chemical': response.meta['chemical'],
                    'page_number': page_number
                }
            )
pipelines.py
import pymongo
from itemadapter import ItemAdapter


class PattentsPipeline(object):
    collection_name = 'items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
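A minimal sketch of the append-or-insert behaviour described in the question, replacing insert_one with an upsert that uses MongoDB's $addToSet so duplicate publication numbers are not re-added (otherwise the same pipeline as above):
def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    self.db[self.collection_name].update_one(
        {'chemical': adapter['chemical']},        # match an existing record for this chemical
        {'$addToSet': {                           # append only numbers not already in the array
            'publication_number': {'$each': adapter['publication_number']}
        }},
        upsert=True                               # insert a new document if the chemical is not there yet
    )
    return item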

Python, Scrapy Pipeline csv out problem, error in for loop

I am working on Google search crawling using Scrapy. This is the code, and it works well to get search results.
GoogleBot.py:
import scrapy


class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']

    def parse(self, response):
        item = {}
        all_page = response.xpath('//*[@id="main"]')
        for page in all_page:
            title = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
            link = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
            for title, link in zip(title, link):
                print(title)
                print(link.lstrip("/url?q="))
My next step is to use a Scrapy pipeline to save the results to a CSV file.
Here is the code that I have written so far.
settings.py:
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter


class GooglePipeline(object):
    def __init__(self):
        self.file = open("GoogleSearchResult.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
This is my modified spider code.
GoogleBot.py:
def parse(self, response):
    item = {}
    all_page = response.xpath('//*[@id="main"]')
    for page in all_page:
        item['title'] = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
        item['link'] = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
        for title, link in zip(title, link):
            print(title)
            print(link.lstrip("/url?q="))
        yield item
It throws an error at:
for title, link in zip(title, link):
    print(title)
    print(link.lstrip("/url?q="))
I get this error:
for title, link in zip(title, link):
UnboundLocalError: local variable 'title' referenced before assignment
Here is the working code, updated according to the comments.
import scrapy


class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']

    def parse(self, response):
        all_page = response.xpath('//*[@id="main"]')
        for page in all_page:
            titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
            for title in titles:
                links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
                for link in links:
                    item = {
                        'Title': title,
                        'Link': link
                    }
                    yield item
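One caveat on that version: the nested loops yield every title paired with every link (a cross product) rather than matched pairs. If the two lists line up one to one, a sketch that pairs them with zip, using the same XPaths, would be:
def parse(self, response):
    for page in response.xpath('//*[@id="main"]'):
        titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
        links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
        # zip() pairs the n-th title with the n-th link instead of crossing the lists
        for title, link in zip(titles, links):
            yield {'Title': title, 'Link': link}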

Activating a Pipeline Component in Scrapy to write JSON

I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return the item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES'}). Thanks.
The pipeline, item, and spider:
import datetime
import json

import scrapy
from scrapy.spiders import CrawlSpider


class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        # return item
        fileName = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.json'
        try:
            with open(fileName, 'w') as fp:
                json.dump(dict(item), fp)
            return item
        except:
            return item


class ProjectItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()


class mySpider(CrawlSpider):
    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        monthSelector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        monthLink = response.xpath(monthSelector).extract_first()
        yield response.follow(monthLink, callback=self.scrape)

    def scrape(self, response):
        # get the links to all individual articles
        linkSelector = '.entry-title a::attr(href)'
        allLinks = response.css(linkSelector).extract()
        for link in allLinks:
            # item = articleItem()
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            request.meta['item'] = item
            item = request.meta['item']
            yield item
        nextPageSelector = 'span.page-link a::attr(href)'
        nextPageLink = response.css(nextPageSelector).extract_first()
        yield response.follow(nextPageLink, callback=self.scrape)

    def getContent(self, response):
        item = response.meta['item']
        TITLE_SELECTOR = '.entry-title ::text'
        item['title'] = response.css(TITLE_SELECTOR).extract_first()
        yield item
To settings.py, add:
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 100
}
where myproject is the name of your project/folder.
See the very last heading on this page: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
When running a spider inside a script, the settings need to be passed in using the method described in: Running scrapy from script not including pipeline
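Since the question mentions running the spider through CrawlerProcess, a minimal sketch of enabling the pipeline from the script itself (rather than settings.py), assuming the pipeline is importable as myproject.pipelines.JsonWriterPipeline:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # map the pipeline path to an order value; a bare name is not enough
    'ITEM_PIPELINES': {'myproject.pipelines.JsonWriterPipeline': 100},
})
process.crawl(mySpider)  # the CrawlSpider subclass defined above
process.start()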

Scrapy pipeline only saves one page of results

I have a spider to crawl coursetalk that has a pipeline saving two types of items:
moocs.csv, which contains the course data.
moocs_review.csv, which contains the review data.
This is the spider code I have:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin

from moocs.items import MoocsItem, MoocsReviewItem


class MoocsSpiderSpider(scrapy.Spider):
    name = "moocs_spider"
    #allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url) for relative_url in response.xpath(courses_xpath).extract()]
        for course_url in courses_url[0:3]:
            print course_url
            yield Request(url=course_url, callback=self.parse_reviews)
        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        #print response.body
        l = ItemLoader(item=MoocsItem(), response=response)
        l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        l.add_value('course_link', response.url)
        l.add_value('course_provider', response.url)
        l.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        l.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        #'//*[@class="course-info__school__name"]'
        item = l.load_item()
        for review in response.xpath('//*[@class="review-body"]'):
            r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            r.add_value('course_title', item['course_title'])
            r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            r.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield r.load_item()
        yield item
This goes to each course page and saves the details into the corresponding item. I'm getting the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider goes to the next pages, but the results are not saved in the output file.
I'm guessing the problem is in the pipeline, where the files are created:
class MultiCSVItemPipeline(object):
    CSVDir = '/moocs/scripts/moocs/moocs/'
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
Are you sure the spider is doing the pagination properly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results that you are then passing into the url parameter of the Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so doing that will generate the following error:
TypeError: Request url must be str or unicode, got list:
It can be solved by using the extract_first() method, and I would also check that the value is not None:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
if next_page_url:
    yield Request(url=next_page_url)  # parse is the callback by default
Please try this and tell me if it solves your problem.
If you use -t csv, this will also work, instead of a pipeline:
scrapy crawl moocs -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in the spider folder.
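As a side note, in recent Scrapy versions (2.1+) the same export can be configured with the FEEDS setting instead of command-line flags; this writes every yielded item to one CSV, so splitting the two item types would still need the pipeline or feed filters:
# settings.py - roughly equivalent to `-o moocs.csv -t csv`
FEEDS = {
    'moocs.csv': {
        'format': 'csv',
    },
}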
