How to use Scrapy to parse PDFs? - python

I would like to download all PDFs found on a site, e.g. https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html. I also tried to use rules, but I think they're not necessary here.
This is my approach:
import scrapy
from scrapy.linkextractors import IGNORED_EXTENSIONS

CUSTOM_IGNORED_EXTENSIONS = IGNORED_EXTENSIONS.copy()
CUSTOM_IGNORED_EXTENSIONS.remove('pdf')

class PDFParser(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'

    # URL of the pdf file
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']

    rules = (
        Rule(LinkExtractor(allow=r'.*\.pdf', deny_extensions=CUSTOM_IGNORED_EXTENSIONS), callback='parse', follow=True),
    )

    def parse(self, response):
        # selector of pdf file.
        for pdf in response.xpath("//a[contains(@href, 'pdf')]"):
            yield scrapy.Request(
                url=response.urljoin(pdf),
                callback=self.save_pdf
            )

    def save_pdf(self, response):
        path = response.url.split('/')[-1]
        self.logger.info('Saving PDF %s', path)
        with open(path, 'wb') as f:
            f.write(response.body)
It seems there are two problems. The first one occurs when extracting all the PDF links with XPath:
TypeError: Cannot mix str and non-str arguments
The second problem is about handling the PDF file itself: I just want to store it locally in a specific folder or similar. It would be really great if someone had a working example for this kind of site.

To download files you need to use the FilesPipeline. This requires that you enable it in ITEM_PIPELINES and then provide a field named file_urls in your yielded item. In the example below, I have created an extension of the FilesPipeline in order to retain the filename of the PDF as provided on the website. The files will be saved in a folder named downloaded_files in the current directory.
Read more about the FilesPipeline in the docs.
import scrapy
from scrapy.pipelines.files import FilesPipeline

class PdfPipeline(FilesPipeline):
    # to save with the name of the pdf from the website instead of hash
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name

class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']

    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files"
    }

    def parse(self, response):
        links = response.xpath("//a[@class='download pdf pdf']/@href").getall()
        links = [response.urljoin(link) for link in links]  # to make them absolute urls
        yield {
            "file_urls": links
        }
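If you'd rather run this as a standalone script instead of via scrapy crawl, a minimal sketch using CrawlerProcess (the same approach used in one of the examples further down) could look like this; the USER_AGENT value is only illustrative:
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    # Illustrative runner: the spider's custom_settings (pipeline and
    # FILES_STORE) are applied automatically when the crawl starts.
    process = CrawlerProcess({"USER_AGENT": "Mozilla/5.0"})
    process.crawl(StadtKoelnAmtsblattSpider)
    process.start()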

Related

How to dynamically change download folder in scrapy?

I am downloading some HTML files from a website using scrapy, but all the downloads are being stored in one folder. I would rather store them in different folders dynamically, say HTML files from page 1 go into folder_1 and so on...
This is what my spider looks like:
import scrapy

class LearnSpider(scrapy.Spider):
    name = "learn"
    start_urls = ["someUrlWithIndexstart=" + chr(i) for i in range(ord('a'), ord('z') + 1)]

    def parse(self, response):
        for song in response.css('.entity-title'):
            songs = song.css('a ::attr(href)').get()
            yield {
                'file_urls': [songs + ".html"]
            }
Ideally, what I want is for the HTML files scraped from each letter to go into a subfolder for that letter.
Following is my settings file.
BOT_NAME = 'learn'
SPIDER_MODULES = ['learn.spiders']
NEWSPIDER_MODULE = 'learn.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = 'downloaded_files'
Any solution/idea will be helpful, thank you.
Create a pipeline:
pipelines.py:
import os
from itemadapter import ItemAdapter
from urllib.parse import unquote
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request

class ProcessPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        return [Request(u) for u in urls]

    def file_path(self, request, response=None, info=None, *, item=None):
        file_name = os.path.basename(unquote(request.url))
        return item['path'] + file_name
Change ITEM_PIPELINES in the settings to this class (ITEM_PIPELINES = {'projectsname.pipelines.ProcessPipeline': 1})
When you yield the item also add the path to the directory you want to download to:
yield {
    'file_urls': [songs + ".html"],
    'path': f'folder{page}/'  # of course you'll need to provide the page variable
}
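For completeness, here is a hypothetical parse() for the spider above. It assumes the last character of each start URL is the index letter (as in the start_urls list) and passes it through the path field; the folder naming is only illustrative:
def parse(self, response):
    # assumption: the last character of the URL is the index letter (a-z)
    letter = response.url[-1]
    for song in response.css('.entity-title'):
        songs = song.css('a ::attr(href)').get()
        yield {
            'file_urls': [songs + ".html"],
            'path': f'folder_{letter}/'
        }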

List with several absolute urls "urljoin'ed"

I wish to download all the files from the first post of several forum topics of a specific forum page. I have my own file pipeline set up to take the item's file_url, file_name and source (the topic name), in order to save them to the folder ./source/file_name.
However, the file links are relative and I need to use the absolute path. I tried response.urljoin, and it gives me a string of the absolute URL, but only of the last file of the post.
Running the spider gives me the error ValueError: Missing scheme in request url: h
This happens because the absolute url is a string and not a list
Here is my code:
import scrapy
from ..items import FilespipelineItem

class MTGSpider(scrapy.Spider):
    name = 'mtgd'
    base_url = 'https://www.slightlymagic.net/forum'
    subforum_url = '/viewforum.php?f=48'
    start_urls = [base_url + subforum_url]

    def parse(self, response):
        for topic_url in response.css('.row dl dt a.topictitle::attr(href)').extract():
            yield response.follow(topic_url, callback=self.parse_topic)

    def parse_topic(self, response):
        item = FilespipelineItem()
        item['source'] = response.xpath('//h2/a/text()').get()
        item['file_name'] = response.css('.postbody')[0].css('.file .postlink::text').extract()

        # Problematic code
        for file_url in response.css('.postbody')[0].css('.file .postlink::attr(href)').extract():
            item['file_url'] = response.urljoin(file_url)

        yield item
If it helps, here's the pipeline code:
import re
from scrapy.pipelines.files import FilesPipeline
from scrapy import Request

class MyFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        for file_url in item['file_url']:
            yield Request(file_url,
                          meta={
                              'name': item['file_name'],
                              'source': item['source']
                          })

    # Rename files to their original name and not the hash
    def file_path(self, request, response=None, info=None):
        # Get names from the meta passed in get_media_requests
        file = request.meta['name']
        source = request.meta['source']
        # Clean source name for a Windows-compatible folder name
        source = re.sub(r'[?\\*|"<>:/]', '', source)
        # Folder storage key: {0} corresponds to topic name; {1} corresponds to filename
        filename = u'{0}/{1}'.format(source, file)
        return filename
So my question is: in a topic with more than one file to be downloaded, how can I save the several absolute URLs into the file_url item? The for loop is not working as intended, since it only saves the last file's URL.
Do I need a for loop for this problem? If so, what should it be?
In:
for file_url in response.css('.postbody')[0].css('.file .postlink::attr(href)').extract():
    item['file_url'] = response.urljoin(file_url)
you are overwriting item['file_url'] with a new URL on every iteration, so only the last value remains.
Use a list comprehension instead of the for loop:
file_urls = response.css('.postbody')[0].css('.file .postlink::attr(href)').extract()
item['file_urls'] = [response.urljoin(file_url) for file_url in file_urls]
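Note that if you keep the custom pipeline from the question, its get_media_requests should then read the renamed file_urls field. A minimal adjustment sketch (a drop-in replacement for the method in MyFilesPipeline above):
def get_media_requests(self, item, info):
    # iterate the renamed 'file_urls' field so every absolute URL is requested
    for file_url in item['file_urls']:
        yield Request(file_url,
                      meta={
                          'name': item['file_name'],
                          'source': item['source']
                      })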

Unable to rename downloaded images through pipelines without the usage of item.py

I've created a script using python's scrapy module to download and rename movie images from multiple pages of a torrent site and store them in a desktop folder. When it comes to downloading and storing those images in a desktop folder, my script does that without errors. However, what I'm struggling with now is renaming those files on the fly. As I didn't make use of an item.py file, and I do not wish to either, I hardly understand how the logic of the pipelines.py file should handle the renaming process.
My spider (It downloads the images flawlessly):
from scrapy.crawler import CrawlerProcess
import scrapy, os

class YifySpider(scrapy.Spider):
    name = "yify"
    allowed_domains = ["www.yify-torrent.org"]
    start_urls = ["https://www.yify-torrent.org/search/1080p/p-{}/".format(page) for page in range(1, 5)]

    custom_settings = {
        'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
        'IMAGES_STORE': r"C:\Users\WCS\Desktop\Images",
    }

    def parse(self, response):
        for link in response.css("article.img-item .poster-thumb::attr(src)").extract():
            img_link = response.urljoin(link)
            yield scrapy.Request(img_link, callback=self.get_images)

    def get_images(self, response):
        yield {
            'image_urls': [response.url],
        }

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    })
    c.crawl(YifySpider)
    c.start()
pipelines.py contains (the following lines are placeholders to let you know I at least tried):
from scrapy.http import Request

class YifyPipeline(object):
    def file_path(self, request, response=None, info=None):
        image_name = request.url.split('/')[-1]
        return image_name

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
How can I rename the images through pipelines.py without the usage of item.py?
You need to subclass the original ImagesPipeline:
from scrapy.pipelines.images import ImagesPipeline

class YifyPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        image_name = request.url.split('/')[-1]
        return image_name
And then refer to it in your settings:
custom_settings = {
    'ITEM_PIPELINES': {'my_project.pipelines.YifyPipeline': 1},
}
But be aware that simply using the exact filename will cause issues when different files share the same name, unless you add a unique folder structure or an additional component to the filename. That's one reason checksum-based filenames are used by default. Refer to the original file_path in case you want to include some of the original logic to prevent that.
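If you want to keep readable names while still avoiding collisions, one option (a sketch, not the only way) is to prefix a short hash of the URL to the original basename:
import hashlib
from scrapy.pipelines.images import ImagesPipeline

class YifyPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # a short, URL-derived prefix keeps names unique across identical basenames
        url_hash = hashlib.sha1(request.url.encode()).hexdigest()[:10]
        image_name = request.url.split('/')[-1]
        return '{}_{}'.format(url_hash, image_name)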

Images downloaded from scrapy smaller than expected (jpegs) or unreadable (tifs)

I'm not sure how best to pose this question. I'm new to both python and scrapy.
Essentially, the files I download using my scrapy script do not match the files I would download manually. All the files (even the smallest JPEG image) are reduced in size. When I open the images in Photoshop, the 'tif' files are in an unrecognizable format; the jpegs open fine. Further, the files I download manually are grayscale, while the ones my scrapy script pulls are RGB.
As far as I can tell, the documentation on the image_pipeline is pretty much all there is for processing images with scrapy, but it does mention that it uses the Pillow library for processing.
My thinking is that it's doing something under the hood by default to adjust the images and/or limit the size of the downloads. But I don't know what that could be or how to disable it. I'd like to download the images 'as is', i.e., with as little (read: none) processing as possible.
If it helps, below are the relevant files. I've omitted some of the code for my spider for brevity, the omitted parts only relate to scraping metadata such as titles and reference numbers.
items.py
import scrapy

class FsaImageData(scrapy.Item):
    title = scrapy.Field()
    digital_id = scrapy.Field()
    source_url = scrapy.Field()
    project = scrapy.Field()
    call_nums = scrapy.Field()
    next_url = scrapy.Field()
    image_sizes = scrapy.Field()
    image_names = scrapy.Field()
    # fields also used to download.
    image_urls = scrapy.Field()
    image = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class GetFsaImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for i, url in enumerate(item['image_urls']):
            image_name = item['image_names'][i]
            yield scrapy.Request(url, meta={'image_name': image_name})

    def file_path(self, request, response=None, info=None):
        return request.meta['image_name']
settings.py
BOT_NAME = 'LOC_FSA_1935'
SPIDER_MODULES = ['LOC_FSA_1935.spiders']
NEWSPIDER_MODULE = 'LOC_FSA_1935.spiders'
# Files Pipeline:
ITEM_PIPELINES = {'LOC_FSA_1935.pipelines.GetFsaImagesPipeline':1}
IMAGES_STORE = '/Volumes/FSA_IMAGES/1935/'
# Probably just for testing for now:
IMAGES_EXPIRES = 0
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# AUTOTHROTTLE (BE NICE!!)
AUTOTHROTTLE_ENABLED = True
spider.py
import scrapy
from LOC_FSA_1935.items import FsaImageData
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin

class FSA_1935_Spider(scrapy.Spider):
    name = "fsa1935"
    start_urls = ['http://www.loc.gov/pictures/']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        # TODO: include below in FINAL version of spider!
        #'LOG_FILE': '.fsa1935.log',
        #'LOG_STDOUT': 'True',
    }

    def parse(self, response):
        # navigate to search results page 1
        results = BeautifulSoup(response.text, 'lxml').find('div',
                                                            class_='results_item ')
        return scrapy.Request(url=urljoin(response.url, results.a['href']),
                              callback=self.parseFirst)

    def parseFirst(self, response):
        # navigate to first image returned by FSA searched
        detail = BeautifulSoup(response.text, 'lxml').find('a',
                                                           text='View Larger').parent
        return scrapy.Request(url=urljoin(response.url, detail.a['href']),
                              callback=self.parsePage)

    def parsePage(self, response):
        # pull metadata and image_urls for each page entry in the search,
        # pass url to next entry in search to next iteration
        data = FsaImageData()
        ex_msg = ('EXCEPTION: Unable to gather {} for {}.'
                  '\n\tException Type: {}:{}')
        soup = BeautifulSoup(response.text, "lxml")

        # get digital_id, project, & source_url
        description = soup.find('div', {'id': 'description'})
        if description != None:
            # get image_urls, _sizes, and _names:
            img_urls = []
            img_sizes = []
            img_names = []
            for img in description.find_all(
                    'a', text=re.compile('JPEG|TIFF \([0-9.a-zA-Z]*\)')):
                img_urls.append(urljoin(response.url, img['href']))
                img_sizes.append(img.get_text())
                img_names.append(img['href'].split('/')[-1])
            data['image_urls'] = img_urls
            data['image_sizes'] = img_sizes
            data['image_names'] = img_names
        else:
            print('WARNING: Item description does not exist!')

        # scape image_data:
        yield data
Looks like I figured it out and solved my own issue! I poked around in the source code for the ImagesPipeline and discovered that, by default, scrapy calls a convert_images method from get_images. convert_images was the issue, as it converts both the file type and the colorspace for images that are not JPEG or BMP.
I re-wrote get_images to handle both the TIFF and JPEG formats I was interested in scraping:
# These imports are needed at the top of pipelines.py for the override below:
from io import BytesIO
from PIL import Image

def get_images(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    image = Image.open(BytesIO(response.body))
    buf = BytesIO()
    ext = response.url.split('.')[-1]
    if ext == 'tif':
        # preserve the TIFF tags instead of letting the image be converted to JPEG
        exif = image.tag_v2
        image.save(buf, 'TIFF', tiffinfo=exif)
    else:
        image.save(buf, 'JPEG')
    yield path, image, buf
Hope that helps others in the future!

Scrapy Save Downloadable Files

I'm writing a scrapy web crawler that saves the html from the pages that I visit. I also want to save the files that I crawl with their file extension.
This is what I have so far
Spider class
class MySpider(CrawlSpider):
    name = 'my name'
    start_urls = ['my url']
    allowed_domains = ['my domain']

    rules = (Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),
             )

    def parse_item(self, response):
        item = MyItem()
        item['url'] = response.url
        item['html'] = response.body
        return item
pipelines.py
import os

save_path = 'My path'

if not os.path.exists(save_path):
    os.makedirs(save_path)

class HtmlFilePipeline(object):
    def process_item(self, item, spider):
        page = item['url'].split('/')[-1]
        filename = '%s.html' % page
        with open(os.path.join(save_path, filename), 'wb') as f:
            f.write(item['html'])
        self.UploadtoS3(filename)

    def UploadtoS3(self, filename):
        ...
Is there an easy way to detect if the link ends in a file extension and save with that extension? What I currently have will save everything as .html regardless of the extension.
I think that I could remove
filename = '%s.html' % page
and it would save with its own extension, but there are cases where I want to save as html instead, such as when it ends in aspx.
Try this ...
import os

extension = os.path.splitext(url)[-1].lower()

# check if URL has GET request parameters and remove them (page.html?render=true)
if '?' in extension:
    extension = extension.split('?')[0]
You might want to check whether that returns an empty string, for cases such as 'http://google.com' where there is no file extension at the end.
I ended up doing
if not '.' in page:
    fileName = '%s.html' % page
else:
    fileName = page
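A possible way to combine both snippets into one helper (the name choose_filename is hypothetical): strip any query string, keep the URL's own extension when there is one, and fall back to .html otherwise:
import os
from urllib.parse import urlparse

def choose_filename(url):
    # urlparse().path drops the query string (page.html?render=true -> page.html)
    page = urlparse(url).path.split('/')[-1] or 'index'
    extension = os.path.splitext(page)[-1].lower()
    # keep the URL's own extension if present, otherwise default to .html
    return page if extension else '%s.html' % page
Inside process_item this would be used as filename = choose_filename(item['url']).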
