I'm writing a Scrapy web crawler that saves the HTML from the pages I visit. I also want to save the files that I crawl with their original file extension.
This is what I have so far
Spider class
class MySpider(CrawlSpider):
    name = 'my name'
    start_urls = ['my url']
    allowed_domains = ['my domain']
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        item = MyItem()
        item['url'] = response.url
        item['html'] = response.body
        return item
pipelines.py
import os

save_path = 'My path'

if not os.path.exists(save_path):
    os.makedirs(save_path)

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        with open(os.path.join(save_path, filename), 'wb') as f:
            f.write(item['html'])
        self.UploadtoS3(filename)
        return item

    def UploadtoS3(self, filename):
        ...
Is there an easy way to detect whether the link ends in a file extension and save with that extension? What I currently have will save as .html regardless of the extension.
I think that I could remove
filename = '%s.html' % page
and it would save with its own extension, but there are cases where I want to save as .html instead, such as when the URL ends in .aspx.
Try this ...
import os

extension = os.path.splitext(url)[-1].lower()

# check if the URL has GET request parameters and remove them (page.html?render=true)
if '?' in extension:
    extension = extension.split('?')[0]
You might want to check whether that returns an empty string, for cases such as 'http://google.com' where there is no extension at the end.
I ended up doing
if '.' not in page:
    fileName = '%s.html' % page
else:
    fileName = page
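Putting the two together, here is a minimal sketch of the filename logic for the pipeline; make_filename and DYNAMIC_EXTENSIONS are hypothetical names, not part of the original code:

import os

# Hypothetical list of "dynamic page" extensions that should be saved as .html instead.
DYNAMIC_EXTENSIONS = {'.aspx', '.php', '.jsp'}

def make_filename(url):
    page = url.split('/')[-1].split('?')[0]  # drop GET parameters such as ?render=true
    extension = os.path.splitext(page)[-1].lower()
    if not extension or extension in DYNAMIC_EXTENSIONS:
        return '%s.html' % (page or 'index')
    return page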
I would like to download all PDFs found on a site, e.g. https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html. I also tried to use rules, but I think they're not necessary here.
This is my approach:
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor, IGNORED_EXTENSIONS

CUSTOM_IGNORED_EXTENSIONS = IGNORED_EXTENSIONS.copy()
CUSTOM_IGNORED_EXTENSIONS.remove('pdf')

class PDFParser(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'

    # URL of the page that lists the pdf files
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']

    rules = (
        Rule(LinkExtractor(allow=r'.*\.pdf', deny_extensions=CUSTOM_IGNORED_EXTENSIONS), callback='parse', follow=True),
    )

    def parse(self, response):
        # selector of pdf file.
        for pdf in response.xpath("//a[contains(@href, 'pdf')]"):
            yield scrapy.Request(
                url=response.urljoin(pdf),
                callback=self.save_pdf
            )

    def save_pdf(self, response):
        path = response.url.split('/')[-1]
        self.logger.info('Saving PDF %s', path)
        with open(path, 'wb') as f:
            f.write(response.body)
It seems there are two problems. The first occurs when extracting all the PDF links with XPath:
TypeError: Cannot mix str and non-str arguments
The second problem is handling the PDF file itself. I just want to store it locally in a specific folder or similar. It would be really great if someone had a working example for this kind of site.
To download files you need to use the FilesPipeline. This requires that you enable it in ITEM_PIPELINES and then provide a field named file_urls in your yielded item. In the example below, I have created an extension of the FilesPipeline in order to retain the filename of the PDF as provided on the website. The files will be saved in a folder named downloaded_files in the current directory.
Read more about the FilesPipeline in the docs.
import scrapy
from scrapy.pipelines.files import FilesPipeline

class PdfPipeline(FilesPipeline):
    # to save with the name of the pdf from the website instead of hash
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name

class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']
    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files"
    }

    def parse(self, response):
        links = response.xpath("//a[@class='download pdf pdf']/@href").getall()
        links = [response.urljoin(link) for link in links]  # to make them absolute urls
        yield {
            "file_urls": links
        }
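Depending on your Scrapy version, file_path may also receive the item being processed; on Scrapy 2.4 or newer, a sketch of the same override with the extra keyword looks like this (the behaviour is unchanged):

class PdfPipeline(FilesPipeline):
    # Scrapy 2.4+ also passes the source item to file_path; accepting it
    # keeps the override compatible while still naming files after the URL.
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.url.split('/')[-1]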
I am trying to collect all the poems under the category "Índice general alfabético" on this site, http://amediavoz.com/. There the titles of the poems appear, and one has to click them to get to the actual poems. Basically, I want to copy all the text of each poem from each of these pages (the text within <p></p> under the XPath "/html/body/blockquote[2]/blockquote" on each page), except the closing information about the poem, which sits inside <i></i> in the HTML. I would like to save everything to .txt files, either one big file or one per page.
This code is an attempt to do this.
import scrapy

class FirstSpider(scrapy.Spider):
    name = "FirstSpider"
    start_urls = ['http://amediavoz.com/']

    def start_requests(self):
        url = ['http://amediavoz.com/']
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        xp = "//a[@target='_blank']/@href"
        for url in response.xpath(xp).extract():
            page = response.url.split("/")[-2]
            filename = 'Poems=%s.txt' % page
            sub = url.css('blockquote')[1]
            with open(filename, 'wb') as f:
                f.write(sub.xpath('//font/text()').extract())
                self.log('Saved file %s' % filename)
            f.close()
When I run the code I don't get any error message, but no output either, that is, no text file.
Any help is appreciated.
Sorry, I don't know Spanish, so I just extract the text roughly; it is not necessarily right. If you can mark which data you need to extract from the HTML, I will help you modify the code.
from simplified_scrapy.spider import Spider, SimplifiedDoc

class FirstSpider(Spider):
    name = 'FirstSpider'
    start_urls = ['http://amediavoz.com/']
    refresh_urls = True

    def extract(self, url, html, models, modelNames):
        try:
            doc = SimplifiedDoc(html)
            if url['url'] == self.start_urls[0]:
                lstA = doc.listA(url=url['url'], start='blockquote', end='La voz de los poetas')
                return [{"Urls": lstA}]
            blockquotes = doc.getElementsByTag('blockquote')
            page = url['url'].split("/")[-1]
            filename = 'data/Poems=%s.txt' % page
            with open(filename, 'w') as f:
                for blockquote in blockquotes:
                    f.write(blockquote.getText('\n'))
                    f.write('\n')
            print('Saved file %s' % filename)
            return True
        except Exception as e:
            print('extract', e)

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(FirstSpider())  # start scraping
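For comparison, here is a minimal sketch of the same idea in plain Scrapy, reusing the selectors from the question; the spider name is made up and the XPath expressions are assumptions that may need adjusting to the real page layout:

import scrapy

class PoemsSpider(scrapy.Spider):
    name = 'poems'  # hypothetical name
    start_urls = ['http://amediavoz.com/']

    def parse(self, response):
        # follow every link that opens in a new tab (the alphabetical index)
        for href in response.xpath("//a[@target='_blank']/@href").getall():
            yield response.follow(href, callback=self.parse_page)

    def parse_page(self, response):
        page = response.url.split('/')[-1]
        filename = 'Poems=%s.txt' % page
        # take the text inside the second blockquote, skipping the <i> footer
        texts = response.xpath('/html/body/blockquote[2]/blockquote//text()[not(ancestor::i)]').getall()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(t.strip() for t in texts if t.strip()))
        self.log('Saved file %s' % filename)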
I wish to download all the files from the first post of several forum topics on a specific forum page. I have my own files pipeline set up to take the item fields file_url, file_name and source (the topic name), in order to save the files to the folder ./source/file_name.
However, the file links are relative and I need absolute URLs. I tried response.urljoin, and it gives me a string with the absolute URL, but only for the last file of the post.
Running the spider gives me the error ValueError: Missing scheme in request url: h
This happens because the absolute URL is a single string and not a list.
Here is my code:
import scrapy
from ..items import FilespipelineItem

class MTGSpider(scrapy.Spider):
    name = 'mtgd'
    base_url = 'https://www.slightlymagic.net/forum'
    subforum_url = '/viewforum.php?f=48'
    start_urls = [base_url + subforum_url]

    def parse(self, response):
        for topic_url in response.css('.row dl dt a.topictitle::attr(href)').extract():
            yield response.follow(topic_url, callback=self.parse_topic)

    def parse_topic(self, response):
        item = FilespipelineItem()
        item['source'] = response.xpath('//h2/a/text()').get()
        item['file_name'] = response.css('.postbody')[0].css('.file .postlink::text').extract()

        # Problematic code
        for file_url in response.css('.postbody')[0].css('.file .postlink::attr(href)').extract():
            item['file_url'] = response.urljoin(file_url)

        yield item
If it helps, here's the pipeline code:
import re
from scrapy.pipelines.files import FilesPipeline
from scrapy import Request

class MyFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        for file_url in item['file_url']:
            yield Request(file_url,
                          meta={
                              'name': item['file_name'],
                              'source': item['source']
                          })

    # Rename files to their original name and not the hash
    def file_path(self, request, response=None, info=None):
        file = request.meta['name']
        source = request.meta['source']
        # Get names from previous function meta
        source = re.sub(r'[?\\*|"<>:/]', '', source)
        # Clean source name for windows compatible folder name
        filename = u'{0}/{1}'.format(source, file)
        # Folder storage key: {0} corresponds to topic name; {1} corresponds to filename
        return filename
So my question is:
In a topic with more than one file to be downloaded, how can I save the several absolute URLs into the file_url item? The for loop is not working as intended, since it only saves the last file's URL.
Do I need a for loop for this problem? If so, what should it be?
In:
for file_url in response.css('.postbody')[0].css('.file .postlink::attr(href)').extract():
    item['file_url'] = response.urljoin(file_url)
You are overwriting item['file_url'] every time with a new URL, and as a result the value of the last one is the value that stays.
Use a Python list comprehension instead of a for loop:
file_urls = response.css('.postbody')[0].css('.file .postlink::attr(href)').extract()
item['file_urls'] = [response.urljoin(file_url) for file_url in file_urls]
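Note that the pipeline then has to read the same field; a minimal sketch of the adjusted get_media_requests, assuming the item key is renamed from file_url to file_urls:

from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

class MyFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # iterate over the list of absolute URLs built in parse_topic
        for file_url in item['file_urls']:
            yield Request(file_url, meta={
                'name': item['file_name'],
                'source': item['source']
            })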
I'm writing a Scrapy web crawler that saves the HTML from the pages I visit, and I'm uploading the pages to S3. Since they are uploaded to S3, there's no point in keeping a local copy.
Spider class
class MySpider(CrawlSpider):
    name = 'my name'
    start_urls = ['my url']
    allowed_domains = ['my domain']
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        item = MyItem()
        item['url'] = response.url
        item['html'] = response.body
        return item
pipelines.py
import os

save_path = 'My path'

if not os.path.exists(save_path):
    os.makedirs(save_path)

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        with open(os.path.join(save_path, filename), 'wb') as f:
            f.write(item['html'])
        self.UploadtoS3(filename)
        return item

    def UploadtoS3(self, filename):
        ...
I read in the Python docs that I can create a NamedTemporaryFile: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
I'm a little fuzzy on when it gets deleted. If I were to use a NamedTemporaryFile, how could I delete the file after a successful upload to S3?
Extending on my comment:
You could use io.BytesIO to create an in-memory buffer instead of saving/reading/deleting a file (the item stores response.body, which is bytes, so a bytes buffer is a better fit than io.StringIO).
It would be something like this:
import io
import os

if not os.path.exists(save_path):
    os.makedirs(save_path)

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        file = io.BytesIO()
        file.write(item['html'])
        file.seek(0)  # rewind so the upload reads from the start of the buffer
        self.UploadtoS3(filename, file)
        return item

    def UploadtoS3(self, filename, file):
        # here, instead of reading a file from disk to upload to S3, use the buffer passed to the method
        ...
Documentation: https://docs.python.org/3/library/io.html
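For the S3 side, a minimal sketch of UploadtoS3 using boto3's upload_fileobj, which accepts any file-like object; the bucket name and key prefix are placeholders, not part of the original code:

import boto3

class HtmlFilePipeline(object):
    def open_spider(self, spider):
        self.s3 = boto3.client('s3')

    def UploadtoS3(self, filename, file):
        # the buffer never touches disk, so there is no temporary file to delete afterwards
        self.s3.upload_fileobj(file, 'my-bucket', 'crawled-html/%s' % filename)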
So I am trying to make my first crawler with Scrapy and all has gone well so far, but for some reason I can't get my crawler to output to a CSV file.
It creates the file, but when I close the command prompt to stop the crawler and then open the file it created, the file is empty...
Can anyone see what I may be doing wrong and why nothing is being written to the file?
I am trying to get a list of titles and image URLs from Wikipedia.
class WikispyderSpider(CrawlSpider):
    name = "wikiSpyder"

    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'DOWNLOAD_DELAY': 5
    }

    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Wikipedia:Unusual_articles']

    rules = (
        Rule(LinkExtractor(canonicalize=True, unique=True), follow=True, callback="parse_link"),
    )

    def parse_link(self, response):
        hxs = HtmlXPathSelector(response)
        item = WikicrawlerItem()
        item['title'] = hxs.select('//h1[contains(@id, "firstHeading")]/text()').extract()
        item['imgURL'] = hxs.select('//div[contains(@class, "thumbinner")]//a/@href')[0].extract()
        print(item)
        yield item
pipelines.py
import csv

class WikicrawlerPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('results.csv', 'w'))

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['title'][0], item['imgURL'][0]])
        return item
items.py
import scrapy

class WikicrawlerItem(scrapy.Item):
    title = scrapy.Field()
    imgURL = scrapy.Field()
    pass
For some reason, my program would not work when I opened my csv.writer in the __init__ function. After reading more documentation, I tried opening the csv.writer in the open_spider function and voilà! I have a working writer.
pipelines.py
import csv

class WikicrawlerPipeline(object):
    def open_spider(self, spider):
        self.csvwriter = csv.writer(open('results.csv', 'a'))
        self.csvwriter.writerow(['Title', 'ImageURL'])
        self.ids_seen = set()
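For completeness, a sketch of the full pipeline with the remaining methods filled in; process_item and close_spider were not part of my original snippet, and closing the file in close_spider is what guarantees the buffered rows actually end up in results.csv:

import csv

class WikicrawlerPipeline(object):
    def open_spider(self, spider):
        self.file = open('results.csv', 'a', newline='')
        self.csvwriter = csv.writer(self.file)
        self.csvwriter.writerow(['Title', 'ImageURL'])
        self.ids_seen = set()

    def process_item(self, item, spider):
        # skip duplicate titles, then write one row per item
        if item['title'] and item['title'][0] not in self.ids_seen:
            self.ids_seen.add(item['title'][0])
            self.csvwriter.writerow([item['title'][0], item['imgURL'][0]])
        return item

    def close_spider(self, spider):
        # closing the file flushes any buffered rows to disk
        self.file.close()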