I'm writing a Scrapy web crawler that saves the HTML from the pages I visit, and I'm uploading that HTML to S3. Since it ends up in S3 anyway, there's no point in keeping a local copy.
Spider class
class MySpider(CrawlSpider):
    name = 'my name'
    start_urls = ['my url']
    allowed_domains = ['my domain']
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        item = MyItem()
        item['url'] = response.url
        item['html'] = response.body
        return item
pipelines.py
import os

save_path = 'My path'

if not os.path.exists(save_path):
    os.makedirs(save_path)

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        with open(os.path.join(save_path, filename), 'wb') as f:
            f.write(item['html'])
        self.UploadtoS3(filename)
        return item

    def UploadtoS3(self, filename):
        ...
I read in the Python docs that I can create a NamedTemporaryFile: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
I'm a little fuzzy on when it gets deleted. If I were to use a NamedTemporaryFile, how could I delete the file after a successful upload to S3?
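Here is roughly what I have in mind, as a sketch rather than working code (the boto3 client and the bucket name are assumptions on my part). My understanding is that with the default delete=True the file disappears as soon as it is closed, so delete=False plus a manual os.remove after the upload seems to be the way to do it:
import os
import tempfile

import boto3  # assumption: boto3 is what I'd use for the S3 upload

s3 = boto3.client('s3')

def upload_html(html_bytes, key, bucket='my-bucket'):  # hypothetical bucket name
    # delete=False keeps the file on disk after the "with" block closes it
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp:
        tmp.write(html_bytes)
        tmp_path = tmp.name
    try:
        s3.upload_file(tmp_path, bucket, key)
    finally:
        os.remove(tmp_path)  # clean up the temp file whether or not the upload worked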
Expanding on my comment:
You could use an in-memory buffer instead of saving/reading/deleting a file. Since response.body is bytes, io.BytesIO is the right class here (io.StringIO is its text counterpart).
It would be something like this:
import io

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        file = io.BytesIO(item['html'])  # response.body is bytes, so BytesIO
        self.UploadtoS3(filename, file)
        return item

    def UploadtoS3(self, filename, file):
        # instead of reading a file from disk, upload the buffer passed to this method
        ...
Documentation: https://docs.python.org/3/library/io.html
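As a concrete (untested) sketch of what UploadtoS3 could look like with that buffer, assuming boto3 is the S3 client and using a made-up bucket name:
import io

import boto3  # assumption: boto3 is the S3 client in use

s3 = boto3.client('s3')

def upload_to_s3(filename, buffer, bucket='my-bucket'):  # hypothetical bucket name
    buffer.seek(0)  # rewind the buffer so the upload reads it from the start
    # upload_fileobj accepts any file-like object, so nothing is written to disk
    s3.upload_fileobj(buffer, bucket, filename)

# usage from process_item:
# upload_to_s3(filename, io.BytesIO(item['html']))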
I would like to download all PDFs found on a site, e.g. https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html. I also tried to use rules, but I think they're not necessary here.
This is my approach:
import scrapy
from scrapy.linkextractors import IGNORED_EXTENSIONS, LinkExtractor
from scrapy.spiders import Rule

CUSTOM_IGNORED_EXTENSIONS = IGNORED_EXTENSIONS.copy()
CUSTOM_IGNORED_EXTENSIONS.remove('pdf')

class PDFParser(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'

    # URL of the page listing the pdf files
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']

    rules = (
        Rule(LinkExtractor(allow=r'.*\.pdf', deny_extensions=CUSTOM_IGNORED_EXTENSIONS), callback='parse', follow=True),
    )

    def parse(self, response):
        # selector of pdf file
        for pdf in response.xpath("//a[contains(@href, 'pdf')]"):
            yield scrapy.Request(
                url=response.urljoin(pdf),
                callback=self.save_pdf
            )

    def save_pdf(self, response):
        path = response.url.split('/')[-1]
        self.logger.info('Saving PDF %s', path)
        with open(path, 'wb') as f:
            f.write(response.body)
It seems there are two problems. The first one appears when extracting all the pdf links with XPath:
TypeError: Cannot mix str and non-str arguments
and the second problem is about handling the pdf file itself. I just want to store it locally in a specific folder or similar. It would be really great if someone has a working example for this kind of site.
To download files you need to use the FilesPipeline. This requires that you enable it in ITEM_PIPELINES and then provide a field named file_urls in your yielded item. In the example below, I have created an extension of the FilesPipeline in order to retain the filename of the pdf as provided on the website. The files will be saved in a folder named downloaded_files in the current directory.
Read more about the FilesPipeline in the docs.
import scrapy
from scrapy.pipelines.files import FilesPipeline

class PdfPipeline(FilesPipeline):
    # save with the name of the pdf from the website instead of a hash
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name

class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']
    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files"
    }

    def parse(self, response):
        links = response.xpath("//a[@class='download pdf pdf']/@href").getall()
        links = [response.urljoin(link) for link in links]  # make them absolute urls
        yield {
            "file_urls": links
        }
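As for the TypeError in your original spider: it comes from passing a Selector object (not a string) to response.urljoin. If you keep that approach, extract the href strings first; a rough sketch (the XPath is only a guess at the page structure):
def parse(self, response):
    # select the href attribute values (strings), not the <a> selectors themselves
    for href in response.xpath("//a[contains(@href, '.pdf')]/@href").getall():
        yield scrapy.Request(
            url=response.urljoin(href),  # urljoin needs a str
            callback=self.save_pdf
        )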
I am trying to collect all the poems listed under "Índice general alfabético" on this site, http://amediavoz.com/. That index shows the poem titles, which one has to click to get to the actual poems. Basically, I want to copy the text of each poem from each of these pages (the text within <p></p> under the XPath "/html/body/blockquote[2]/blockquote" on each page), except the closing information about the poem that sits inside <i></i> in the HTML. I would like to save everything to .txt files, either one big one or one per page.
This code is an attempt to do this.
import scrapy

class FirstSpider(scrapy.Spider):
    name = "FirstSpider"
    start_urls = ['http://amediavoz.com/']

    def start_requests(self):
        url = ['http://amediavoz.com/']
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        xp = "//a[@target='_blank']/@href"
        for url in response.xpath(xp).extract():
            page = response.url.split("/")[-2]
            filename = 'Poems=%s.txt' % page
            sub = url.css('blockquote')[1]
            with open(filename, 'wb') as f:
                f.write(sub.xpath('//font/text()').extract())
                self.log('Saved file %s' % filename)
                f.close()
When I run the code I don't get any error message, but no output either, i.e. no text file is created.
Any help is appreciated.
Sorry, I don't know Spanish, so I just extract the text roughly, which may not be exactly what you need. If you can mark which data you need to extract from the HTML, I will help you modify the code.
from simplified_scrapy.spider import Spider, SimplifiedDoc

class FirstSpider(Spider):
    name = 'FirstSpider'
    start_urls = ['http://amediavoz.com/']
    refresh_urls = True

    def extract(self, url, html, models, modelNames):
        try:
            doc = SimplifiedDoc(html)
            if url['url'] == self.start_urls[0]:
                lstA = doc.listA(url=url['url'], start='blockquote', end='La voz de los poetas')
                return [{"Urls": lstA}]
            blockquotes = doc.getElementsByTag('blockquote')
            page = url['url'].split("/")[-1]
            filename = 'data/Poems=%s.txt' % page
            with open(filename, 'w') as f:
                for blockquote in blockquotes:
                    f.write(blockquote.getText('\n'))
                    f.write('\n')
            print('Saved file %s' % filename)
            return True
        except Exception as e:
            print('extract', e)

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(FirstSpider())  # start scraping
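If you would rather stay with plain Scrapy, a rough, untested sketch could look like this; the XPaths are the ones mentioned in the question and may need adjusting, and it writes one .txt file per poem page:
import scrapy

class PoemsSpider(scrapy.Spider):
    name = 'poems'
    start_urls = ['http://amediavoz.com/']

    def parse(self, response):
        # follow every link that opens in a new tab, as in the original attempt
        for href in response.xpath("//a[@target='_blank']/@href").getall():
            yield response.follow(href, callback=self.parse_poem)

    def parse_poem(self, response):
        page = response.url.split('/')[-1]
        filename = 'Poems=%s.txt' % page
        # text inside the second blockquote, skipping the <i> credits at the end
        lines = response.xpath(
            "/html/body/blockquote[2]/blockquote//text()[not(ancestor::i)]").getall()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(line.strip() for line in lines if line.strip()))
        self.log('Saved file %s' % filename)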
I'm writing a Scrapy web crawler that saves the HTML from the pages I visit. I also want to save the other files that I crawl with their own file extensions.
This is what I have so far
Spider class
class MySpider(CrawlSpider):
    name = 'my name'
    start_urls = ['my url']
    allowed_domains = ['my domain']
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        item = MyItem()
        item['url'] = response.url
        item['html'] = response.body
        return item
pipelines.py
import os

save_path = 'My path'

if not os.path.exists(save_path):
    os.makedirs(save_path)

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        with open(os.path.join(save_path, filename), 'wb') as f:
            f.write(item['html'])
        self.UploadtoS3(filename)
        return item

    def UploadtoS3(self, filename):
        ...
Is there an easy way to detect whether the link ends in a file extension and, if so, save with that extension? What I currently have saves everything as .html regardless of the extension.
I think that I could remove
filename = '%s.html' % page
and it would save with its own extension, but there are cases where I want to save as .html instead, such as when the URL ends in .aspx.
Try this ...
import os

extension = os.path.splitext(url)[-1].lower()

# check if the URL has GET request parameters and remove them (page.html?render=true)
if '?' in extension:
    extension = extension.split('?')[0]
You might want to check whether that returns an empty string, for cases such as 'http://google.com' where there is no extension at the end.
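For example, folding that into the pipeline could look something like this (the urlsplit usage and the .html fallback are my assumptions, not part of your code):
import os
from urllib.parse import urlsplit

def filename_for(url):
    path = urlsplit(url).path          # drops ?render=true style query strings
    page = path.split('/')[-1]
    extension = os.path.splitext(page)[-1].lower()
    if not extension:
        # no extension at all (e.g. http://google.com/) -> fall back to .html
        return '%s.html' % (page or 'index')
    return page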
I ended up doing
if '.' not in page:
    fileName = '%s.html' % page
else:
    fileName = page
So I am trying to make my first crawler with Scrapy, and all has gone well so far, but for some reason I can't get my crawler to output to a CSV file.
It creates the file, but when I close the command prompt to stop the crawler and then open the file it created, the file is empty...
Can anyone see what I may be doing wrong and why nothing is being written to the file?
I am trying to get a list of titles and image URLs from Wikipedia.
class WikispyderSpider(CrawlSpider):
    name = "wikiSpyder"
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'DOWNLOAD_DELAY': 5
    }
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Wikipedia:Unusual_articles']
    rules = (
        Rule(LinkExtractor(canonicalize=True, unique=True), follow=True, callback="parse_link"),
    )

    def parse_link(self, response):
        hxs = HtmlXPathSelector(response)
        item = WikicrawlerItem()
        item['title'] = hxs.select('//h1[contains(@id, "firstHeading")]/text()').extract()
        item['imgURL'] = hxs.select('//div[contains(@class, "thumbinner")]//a/@href')[0].extract()
        print(item)
        yield item
pipelines.py
import csv

class WikicrawlerPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('results.csv', 'w'))

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['title'][0], item['imgURL'][0]])
        return item
items.py
import scrapy

class WikicrawlerItem(scrapy.Item):
    title = scrapy.Field()
    imgURL = scrapy.Field()
For some reason my program would not work when I opened my csv.writer in the __init__ method. After reading more documentation I tried opening the csv.writer in the open_spider method and voilà! I have a working writer.
pipelines.py
import csv

class WikicrawlerPipeline(object):
    def open_spider(self, spider):
        self.csvwriter = csv.writer(open('results.csv', 'a'))
        self.csvwriter.writerow(['Title', 'ImageURL'])  # a list, so the column order is fixed
        self.ids_seen = set()
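For reference, a slightly fuller sketch along the same lines keeps the file handle around and closes it in close_spider, so buffered rows are flushed even when the crawl is interrupted (the field names are the ones from the item above):
import csv

class WikicrawlerPipeline(object):
    def open_spider(self, spider):
        self.file = open('results.csv', 'w', newline='')
        self.csvwriter = csv.writer(self.file)
        self.csvwriter.writerow(['Title', 'ImageURL'])
        self.ids_seen = set()

    def close_spider(self, spider):
        self.file.close()  # flushes any buffered rows to disk

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['title'][0], item['imgURL'][0]])
        return item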
I need Scrapy to take an argument (-a FILE_NAME="stuff") from the command line and apply it to the file created by my CSVWriterPipeLine in my pipelines.py file. (The reason I went with a pipeline was that the built-in exporter was repeating data and repeating the header in the output file. Same code, but writing it in the pipeline fixed that.)
I tried from scrapy.utils.project import get_project_settings as seen in
How to access scrapy settings from item Pipeline
but I couldn't change the file name from the command line.
I've also tried implementing @avaleske's solution that's on the page, since it specifically addresses this, but I don't know where to place the code he talks about in my scrapy folder.
Help?
settings.py:
BOT_NAME = 'internal_links'
SPIDER_MODULES = ['internal_links.spiders']
NEWSPIDER_MODULE = 'internal_links.spiders'
CLOSESPIDER_PAGECOUNT = 100
ITEM_PIPELINES = ['internal_links.pipelines.CsvWriterPipeline']
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'internal_links (+http://www.mycompany.com)'
FILE_NAME = "mytestfilename"
pipelines.py:
import csv

class CsvWriterPipeline(object):
    def __init__(self, file_name):
        header = ["URL"]
        self.file_name = file_name
        self.csvwriter = csv.writer(open(self.file_name, 'wb'))
        self.csvwriter.writerow(header)

    def process_item(self, item, internallinkspider):
        # build your row to export, then export the row
        row = [item['url']]
        self.csvwriter.writerow(row)
        return item
spider.py:
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from internal_links.items import MyItem

class MySpider(CrawlSpider):
    name = 'internallinkspider'
    allowed_domains = ['angieslist.com']
    start_urls = ['http://www.angieslist.com']
    rules = (Rule(SgmlLinkExtractor(), callback='parse_url', follow=True), )

    def parse_url(self, response):
        item = MyItem()
        item['url'] = response.url
        return item
You can use the settings mechanism and the -s command-line option:
scrapy crawl internallinkspider -s FILE_NAME="stuff"
Then, in the pipeline:
import csv

class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        file_name = settings.get("FILE_NAME")
        return cls(file_name)

    def __init__(self, file_name):
        header = ["URL"]
        self.csvwriter = csv.writer(open(file_name, 'wb'))
        self.csvwriter.writerow(header)

    def process_item(self, item, internallinkspider):
        # build your row to export, then export the row
        row = [item['url']]
        self.csvwriter.writerow(row)
        return item
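If you specifically want the -a FILE_NAME="stuff" form from your question instead: spider arguments passed with -a become attributes on the spider instance, so a pipeline can read them in open_spider. A sketch (Python 3 file handling; the default filename is an assumption):
import csv

class CsvWriterPipeline(object):
    def open_spider(self, spider):
        # scrapy crawl internallinkspider -a FILE_NAME="stuff"
        # -a arguments end up as attributes on the spider instance
        file_name = getattr(spider, 'FILE_NAME', 'output.csv')  # assumed default name
        self.file = open(file_name, 'w', newline='')
        self.csvwriter = csv.writer(self.file)
        self.csvwriter.writerow(['URL'])

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['url']])
        return item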