Download images to an absolute path - python

def parse_images(self, response):
    Name = response.meta['Name']
    album = response.meta['Album Name']
    os.makedirs(f'Master/{Name}/{album}', exist_ok=True)
    for ind, image in enumerate(response.xpath('//ul/li/a/img')):
        img = image.xpath('@srcset').extract_first().split(', ')[-1].split()[0]  # image URL
        print(img)
        imageName = f'image_{ind+1}' + os.path.splitext(img)[1]  # image_1.jpg
        path = os.path.join('Master', Name, album, imageName)
        abs_path = os.path.abspath(path)  # path where I want to download
How can I create a pipeline that stores the image at the absolute path I created? I checked Downloading Item Images but I can't find a way to change the storage location.
Note: I would prefer to stay with Scrapy and not use requests to actually download the images.

This example gets images from http://books.toscrape.com/ and uses a pipeline to put them in subfolders named after the first character of the filename.
In the settings I set the path to Master.
It can be relative
'IMAGES_STORE': 'Master',
or an absolute path
'IMAGES_STORE': '/full/path/to/Master',
This folder has to exist before running the code. If it doesn't exist, the pipeline will not create it and will not download anything. However, the pipeline creates subfolders automatically, so you don't need makedirs().
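For example, a minimal sketch to make sure the store folder exists before starting the crawl (assuming the relative 'Master' path used below):

import os

# create the IMAGES_STORE folder if it is missing; subfolders are created by the pipeline
os.makedirs('Master', exist_ok=True)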
In the parser I add name and album to the item, so these values are sent to the pipeline:
def parse(self, response):
    print('url:', response.url)
    #open_in_browser(response)  # to see the url in a web browser

    # download images and convert to JPG (even if it is already JPG)
    for url in response.css('img::attr(src)').extract():
        url = response.urljoin(url)
        image = url.rsplit('/')[-1]  # filename (its first char becomes the album)
        yield {'image_urls': [url], 'name': 'books', 'album': image[0]}
In the pipeline, in get_media_requests(), I get the values from the item and put them in meta to send them to file_path(), which generates the local path for the file (inside IMAGES_STORE).
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        # send `meta` to `file_path()`
        yield scrapy.Request(image_url, meta={'name': item['name'], 'album': item['album']})
In the pipeline, in file_path(), I get the values from meta and finally create the path name/album/image.jpg. Originally the pipeline uses a hash of the URL as the filename.
def file_path(self, request, response=None, info=None):
    # get `meta`
    name = request.meta['name']
    album = request.meta['album']
    image = request.url.rsplit('/')[-1]
    #print('file_path:', request.url, request.meta, image)
    return '%s/%s/%s' % (name, album, image)
And this saves the image as IMAGES_STORE/name/album/image.jpg.
Minimal working example.
You can put all the code in one file and run it as a normal script - python script.py - without creating a Scrapy project. This way everyone can easily test this code.
import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
#from scrapy.commands.view import open_in_browser
#import json


class MySpider(scrapy.Spider):

    name = 'myspider'

    #allowed_domains = []

    # see page created for scraping: http://toscrape.com/
    start_urls = ['http://books.toscrape.com/'] #'http://quotes.toscrape.com']

    def parse(self, response):
        print('url:', response.url)
        #open_in_browser(response)  # to see the url in a web browser

        # download images and convert to JPG (even if it is already JPG)
        for url in response.css('img::attr(src)').extract():
            url = response.urljoin(url)
            image = url.rsplit('/')[-1]  # filename (its first char becomes the album)
            yield {'image_urls': [url], 'name': 'books', 'album': image[0]}


# --- pipelines ---

import os

# --- original code ---  # needed only if you use `image_guid`
#import hashlib
#from scrapy.utils.python import to_bytes
# --- original code ---


class RenameImagePipeline(ImagesPipeline):
    '''Pipeline to change file names - to add folder names'''

    def get_media_requests(self, item, info):
        # --- original code ---
        #for image_url in item['image_urls']:
        #    yield scrapy.Request(image_url)
        # --- original code ---

        for image_url in item['image_urls']:
            # send `meta` to `file_path()`
            yield scrapy.Request(image_url, meta={'name': item['name'], 'album': item['album']})

    def file_path(self, request, response=None, info=None):
        # --- original code ---
        #image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        #return 'full/%s.jpg' % (image_guid,)
        # --- original code ---

        # get `meta`
        name = request.meta['name']
        album = request.meta['album']
        image = request.url.rsplit('/')[-1]
        #image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()

        print('file_path:', request.url, request.meta, image) #, image_guid)

        #return '%s/%s/%s.jpg' % (name, album, image_guid)
        return '%s/%s/%s' % (name, album, image)


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',      # csv, json, xml
    'FEED_URI': 'output.csv',

    # download images to `IMAGES_STORE/full` (standard folder) and convert to JPG (even if it is already JPG)
    # it needs `yield {'image_urls': [url]}` in `parse()` and both ITEM_PIPELINES and IMAGES_STORE to work
    #'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},  # standard ImagesPipeline (download to IMAGES_STORE/full)
    'ITEM_PIPELINES': {'__main__.RenameImagePipeline': 1},  # pipeline created in the current file (needs `__main__`)
    #'IMAGES_STORE': '/full/path/to/valid/dir',  # this folder has to exist before downloading
    'IMAGES_STORE': 'Master',  # this folder has to exist before downloading
})

c.crawl(MySpider)
c.start()
BTW: Using
import scrapy
print(scrapy.__file__)
you can find the source code and see how the original ImagesPipeline looks. In the full example above I put some of that original code in comments.
On Linux I have
/usr/local/lib/python3.7/dist-packages/scrapy/
and
/usr/local/lib/python3.7/dist-packages/scrapy/pipelines/images.py
/usr/local/lib/python3.7/dist-packages/scrapy/pipelines/files.py
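If you want to print these locations on your own system (they will differ between installations), a small sketch like this works:

import os
import scrapy

# the pipelines live next to the installed scrapy package
pipelines_dir = os.path.join(os.path.dirname(scrapy.__file__), 'pipelines')
print(os.path.join(pipelines_dir, 'images.py'))
print(os.path.join(pipelines_dir, 'files.py'))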
BTW: ImagesPipeline converts all images to JPG - even if you download a JPG. If you want to keep the original image then you may need FilesPipeline instead of ImagesPipeline, and FILES_STORE instead of IMAGES_STORE.
BTW: Sometimes there is a problem with a pipeline because it doesn't display error messages (Scrapy catches the errors and doesn't display them), so it is hard to recognize when there is a mistake in the pipeline code.
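One way to surface at least the download failures is to add an item_completed() override to the custom pipeline - a small sketch building on RenameImagePipeline above (not part of the original example); results is a list of (success, info-or-failure) tuples:

def item_completed(self, results, item, info):
    for ok, value in results:
        if not ok:
            # `value` is a Failure describing why the download did not succeed
            print('download failed:', value)
    return super().item_completed(results, item, info)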
EDIT: The same example but with FilesPipeline (and FILES_STORE and item['file_urls']).
I put comments with the phrase "instead of" to show the differences.
import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
#from scrapy.commands.view import open_in_browser
#import json


class MySpider(scrapy.Spider):

    name = 'myspider'

    #allowed_domains = []

    # see page created for scraping: http://toscrape.com/
    start_urls = ['http://books.toscrape.com/'] #'http://quotes.toscrape.com']

    def parse(self, response):
        print('url:', response.url)
        #open_in_browser(response)  # to see the url in a web browser

        # download all types of files (without converting images to JPG)
        for url in response.css('img::attr(src)').extract():
            url = response.urljoin(url)
            image = url.rsplit('/')[-1]  # filename (its first char becomes the album)
            #yield {'image_urls': [url], 'name': 'books', 'album': image[0]}
            yield {'file_urls': [url], 'name': 'books', 'album': image[0]}  # <--- file_urls instead of image_urls


# --- pipelines ---

import os

#class RenameImagesPipeline(ImagesPipeline):
class RenameFilesPipeline(FilesPipeline):  # <--- FilesPipeline instead of ImagesPipeline
    '''Pipeline to change file names - to add folder names'''

    def get_media_requests(self, item, info):
        #for image_url in item['image_urls']:
        for image_url in item['file_urls']:  # <--- file_urls instead of image_urls
            # send `meta` to `file_path()`
            yield scrapy.Request(image_url, meta={'name': item['name'], 'album': item['album']})

    def file_path(self, request, response=None, info=None):
        # get `meta`
        name = request.meta['name']
        album = request.meta['album']
        image = request.url.rsplit('/')[-1]

        print('file_path:', request.url, request.meta, image)

        return '%s/%s/%s' % (name, album, image)


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',      # csv, json, xml
    'FEED_URI': 'output.csv',

    # --- images ---
    # download images to `IMAGES_STORE/full` (standard folder) and convert to JPG (even if it is already JPG)
    # it needs `yield {'image_urls': [url]}` in `parse()` and both ITEM_PIPELINES and IMAGES_STORE to work
    #'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},  # standard ImagesPipeline (download to IMAGES_STORE/full)
    #'ITEM_PIPELINES': {'__main__.RenameImagesPipeline': 1},
    #'IMAGES_STORE': '/full/path/to/valid/dir',  # this folder has to exist before downloading

    # --- files ---
    # download files to `FILES_STORE/full` (standard folder) (without converting images)
    # it needs `yield {'file_urls': [url]}` in `parse()` and both ITEM_PIPELINES and FILES_STORE to work
    #'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},  # standard FilesPipeline (download to FILES_STORE/full)
    'ITEM_PIPELINES': {'__main__.RenameFilesPipeline': 1},  # <--- RenameFilesPipeline instead of RenameImagesPipeline
    'FILES_STORE': 'Master',  # this folder has to exist before downloading  # <--- FILES_STORE instead of IMAGES_STORE
})

c.crawl(MySpider)
c.start()

Related

How to use Scrapy to parse PDFs?

I would like to download all PDFs found on a site, e.g. https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html. I also tried to use rules but I think they are not necessary here.
This is my approach:
import scrapy
from scrapy.linkextractors import IGNORED_EXTENSIONS, LinkExtractor
from scrapy.spiders import Rule

CUSTOM_IGNORED_EXTENSIONS = IGNORED_EXTENSIONS.copy()
CUSTOM_IGNORED_EXTENSIONS.remove('pdf')


class PDFParser(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'

    # URL of the page listing the pdf files
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']

    rules = (
        Rule(LinkExtractor(allow=r'.*\.pdf', deny_extensions=CUSTOM_IGNORED_EXTENSIONS), callback='parse', follow=True),
    )

    def parse(self, response):
        # selector of pdf file
        for pdf in response.xpath("//a[contains(@href, 'pdf')]"):
            yield scrapy.Request(
                url=response.urljoin(pdf),
                callback=self.save_pdf
            )

    def save_pdf(self, response):
        path = response.url.split('/')[-1]
        self.logger.info('Saving PDF %s', path)
        with open(path, 'wb') as f:
            f.write(response.body)
It seems there are two problems. The first one occurs when extracting all the PDF links with XPath:
TypeError: Cannot mix str and non-str arguments
and the second problem is about handling the PDF file itself. I just want to store it locally in a specific folder or similar. It would be really great if someone had a working example for this kind of site.
To download files you need to use the FilesPipeline. This requires that you enable it in ITEM_PIPELINES and then provide a field named file_urls in your yielded item. In the example below, I have created an extension of the FilesPipeline in order to retain the filename of the pdf as provided on the website. The files will be saved in a folder named downloaded_files in the current directory.
Read more about the FilesPipeline in the docs.
import scrapy
from scrapy.pipelines.files import FilesPipeline


class PdfPipeline(FilesPipeline):
    # to save with the name of the pdf from the website instead of a hash
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name


class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://www.stadt-koeln.de/politik-und-verwaltung/bekanntmachungen/amtsblatt/index.html']

    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files"
    }

    def parse(self, response):
        links = response.xpath("//a[@class='download pdf pdf']/@href").getall()
        links = [response.urljoin(link) for link in links]  # to make them absolute urls
        yield {
            "file_urls": links
        }
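To try the spider above without creating a Scrapy project, a run block along the lines of the CrawlerProcess examples earlier on this page should work (a sketch, not part of the original answer):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(StadtKoelnAmtsblattSpider)
process.start()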

How to dynamically change download folder in scrapy?

I am downloading some HTML files from a website using scrapy, but all the downloads are being stored under one folder. I would rather like to store them in different folders dynamically, say HTML files from page 1 go into folder_1 and so on...
This is what my spider looks like:
import scrapy


class LearnSpider(scrapy.Spider):
    name = "learn"
    start_urls = ["someUrlWithIndexstart=" + chr(i) for i in range(ord('a'), ord('z')+1)]

    def parse(self, response):
        for song in response.css('.entity-title'):
            songs = song.css('a ::attr(href)').get()
            yield {
                'file_urls': [songs + ".html"]
            }
Ideally, the HTML files scraped for each letter would go into a subfolder for that letter.
The following is my settings file:
BOT_NAME = 'learn'
SPIDER_MODULES = ['learn.spiders']
NEWSPIDER_MODULE = 'learn.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = 'downloaded_files'
Any solution/idea will be helpful, thank you.
Create a pipeline:
pipelines.py:
import os
from itemadapter import ItemAdapter
from urllib.parse import unquote
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request


class ProcessPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        return [Request(u) for u in urls]

    def file_path(self, request, response=None, info=None, *, item=None):
        file_name = os.path.basename(unquote(request.url))
        return item['path'] + file_name
Change ITEM_PIPELINES in the settings to this class (ITEM_PIPELINES = {'projectsname.pipelines.ProcessPipeline': 1})
When you yield the item also add the path to the directory you want to download to:
yield {
    'file_urls': [songs + ".html"],
    'path': f'folder{page}/'  # of course you'll need to provide the `page` variable
}
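For completeness, a possible version of the spider's parse() that supplies the path field, assuming the index letter can be taken from the last character of the page URL (as in the start_urls shown in the question):

def parse(self, response):
    letter = response.url[-1]  # 'a'..'z' from "someUrlWithIndexstart=<letter>"
    for song in response.css('.entity-title'):
        songs = song.css('a ::attr(href)').get()
        yield {
            'file_urls': [songs + ".html"],
            'path': f'folder{letter}/'
        }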

List with several absolute urls "urljoin'ed"

I wish to download all the files from the first post of several forum topics on a specific forum page. I have my own files pipeline set up to take the item's file_url, file_name and source (topic name) fields, in order to save the files to the folder ./source/file_name.
However, the file links are relative and I need to use the absolute path. I tried response.urljoin and it gives me a string with the absolute url, but only for the last file of the post.
Running the spider gives me the error ValueError: Missing scheme in request url: h
This happens because the absolute url is a string and not a list.
Here is my code:
import scrapy
from ..items import FilespipelineItem


class MTGSpider(scrapy.Spider):
    name = 'mtgd'
    base_url = 'https://www.slightlymagic.net/forum'
    subforum_url = '/viewforum.php?f=48'
    start_urls = [base_url + subforum_url]

    def parse(self, response):
        for topic_url in response.css('.row dl dt a.topictitle::attr(href)').extract():
            yield response.follow(topic_url, callback=self.parse_topic)

    def parse_topic(self, response):
        item = FilespipelineItem()
        item['source'] = response.xpath('//h2/a/text()').get()
        item['file_name'] = response.css('.postbody')[0].css('.file .postlink::text').extract()

        # Problematic code
        for file_url in response.css('.postbody')[0].css('.file .postlink::attr(href)').extract():
            item['file_url'] = response.urljoin(file_url)

        yield item
If it helps here's the pipeline code:
import re
from scrapy.pipelines.files import FilesPipeline
from scrapy import Request


class MyFilesPipeline(FilesPipeline):

    def get_media_requests(self, item, info):
        for file_url in item['file_url']:
            yield Request(file_url,
                          meta={
                              'name': item['file_name'],
                              'source': item['source']
                          })

    # Rename files to their original name and not the hash
    def file_path(self, request, response=None, info=None):
        # Get names from the meta set in get_media_requests()
        file = request.meta['name']
        source = request.meta['source']

        # Clean source name for a Windows-compatible folder name
        source = re.sub(r'[?\\*|"<>:/]', '', source)

        # Folder storage key: {0} corresponds to topic name; {1} corresponds to filename
        filename = u'{0}/{1}'.format(source, file)
        return filename
So my question is:
In a topic with more than one file to be downloaded, how can I save the several absolute urls into the file_url item? The for loop is not working as intended since it only saves the last file's url.
Do I need a for loop for this problem? If so, what should it be?
In:
for file_url in response.css('.postbody')[0].css('.file .postlink::attr(href)').extract():
    item['file_url'] = response.urljoin(file_url)
You are overwriting item['file_url'] every time with a new URL, and as a result the value of the last one is the value that stays.
Use a Python list comprehension instead of a for loop:
file_urls = response.css('.postbody')[0].css('.file .postlink::attr(href)').extract()
item['file_urls'] = [response.urljoin(file_url) for file_url in file_urls]
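Note that the item field is now file_urls (a list), so the pipeline's get_media_requests() should iterate that field as well - a sketch mirroring the question's pipeline code:

def get_media_requests(self, item, info):
    for file_url in item['file_urls']:
        yield Request(file_url,
                      meta={
                          'name': item['file_name'],
                          'source': item['source']
                      })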

Unable to rename downloaded images through pipelines without the usage of item.py

I've created a script using Python's scrapy module to download and rename movie images from multiple pages of a torrent site and store them in a desktop folder. When it comes to downloading and storing those images in a desktop folder, my script does that errorlessly. However, what I'm struggling with now is renaming those files on the fly. As I didn't make use of an item.py file, and I do not wish to either, I hardly understand how the logic of the pipelines.py file should look to handle the renaming process.
My spider (it downloads the images flawlessly):
from scrapy.crawler import CrawlerProcess
import scrapy, os


class YifySpider(scrapy.Spider):
    name = "yify"
    allowed_domains = ["www.yify-torrent.org"]
    start_urls = ["https://www.yify-torrent.org/search/1080p/p-{}/".format(page) for page in range(1,5)]

    custom_settings = {
        'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
        'IMAGES_STORE': r"C:\Users\WCS\Desktop\Images",
    }

    def parse(self, response):
        for link in response.css("article.img-item .poster-thumb::attr(src)").extract():
            img_link = response.urljoin(link)
            yield scrapy.Request(img_link, callback=self.get_images)

    def get_images(self, response):
        yield {
            'image_urls': [response.url],
        }


if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    })
    c.crawl(YifySpider)
    c.start()
pipelines.py contains (the following lines are placeholders to let you know I at least tried):
from scrapy.http import Request


class YifyPipeline(object):
    def file_path(self, request, response=None, info=None):
        image_name = request.url.split('/')[-1]
        return image_name

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
How can I rename the images through pipelines.py without the usage of item.py?
You need to subclass the original ImagesPipeline:
from scrapy.pipelines.images import ImagesPipeline


class YifyPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        image_name = request.url.split('/')[-1]
        return image_name
And then refer to it in your settings:
custom_settings = {
    'ITEM_PIPELINES': {'my_project.pipelines.YifyPipeline': 1},
}
But be aware that a simple "use the exact filename" idea will cause issues when different files have the same name, unless you add a unique folder structure or an additional component to the filename. That's one reason checksum-based filenames are used by default. Refer to the original file_path, in case you want to include some of the original logic to prevent that.
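For example, one possible way (an illustration, not part of the original answer) is to mix the readable filename with a short hash of the URL so that different files sharing a name do not overwrite each other:

import hashlib

from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.python import to_bytes


class YifyPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        image_name = request.url.split('/')[-1]
        url_hash = hashlib.sha1(to_bytes(request.url)).hexdigest()[:8]  # short unique prefix per URL
        return '%s_%s' % (url_hash, image_name)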

Images downloaded from scrapy smaller than expected (jpegs) or unreadable (tifs)

I'm not sure how best to pose this question. I'm new to both python and scrapy.
Essentially, the files I download using my scrapy script do not match the files I would download manually. All the files (even the smallest jpeg image) are reduced in size. When I open the images in Photoshop, the 'tif' files are in an unrecognizable format. The jpegs open fine. Further, the files I download manually are grayscale, while the ones my scrapy script pulls are RGB.
As far as I can tell, the documentation on the image pipeline is pretty much all there is for processing images with scrapy, but it does mention that it uses the pillow library for processing.
My thinking is that it's doing something under the hood by default to adjust the images and/or limit the size of the downloads. But I don't know what that could be or how to disable it. I'd like to download the images 'as is', i.e., with as little processing as possible (read: none).
If it helps, below are the relevant files. I've omitted some of the code for my spider for brevity; the omitted parts only relate to scraping metadata such as titles and reference numbers.
items.py
import scrapy


class FsaImageData(scrapy.Item):
    title = scrapy.Field()
    digital_id = scrapy.Field()
    source_url = scrapy.Field()
    project = scrapy.Field()
    call_nums = scrapy.Field()
    next_url = scrapy.Field()
    image_sizes = scrapy.Field()
    image_names = scrapy.Field()
    # fields also used for downloading
    image_urls = scrapy.Field()
    image = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class GetFsaImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for i, url in enumerate(item['image_urls']):
            image_name = item['image_names'][i]
            yield scrapy.Request(url, meta={'image_name': image_name})

    def file_path(self, request, response=None, info=None):
        return request.meta['image_name']
settings.py
BOT_NAME = 'LOC_FSA_1935'
SPIDER_MODULES = ['LOC_FSA_1935.spiders']
NEWSPIDER_MODULE = 'LOC_FSA_1935.spiders'
# Files Pipeline:
ITEM_PIPELINES = {'LOC_FSA_1935.pipelines.GetFsaImagesPipeline':1}
IMAGES_STORE = '/Volumes/FSA_IMAGES/1935/'
# Probably just for testing for now:
IMAGES_EXPIRES = 0
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# AUTOTHROTTLE (BE NICE!!)
AUTOTHROTTLE_ENABLED = True
spider.py
import scrapy
from LOC_FSA_1935.items import FsaImageData
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin


class FSA_1935_Spider(scrapy.Spider):
    name = "fsa1935"
    start_urls = ['http://www.loc.gov/pictures/']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        # TODO: include below in FINAL version of spider!
        #'LOG_FILE': '.fsa1935.log',
        #'LOG_STDOUT': 'True',
    }

    def parse(self, response):
        # navigate to search results page 1
        results = BeautifulSoup(response.text, 'lxml').find('div',
            class_='results_item ')
        return scrapy.Request(url=urljoin(response.url, results.a['href']),
                              callback=self.parseFirst)

    def parseFirst(self, response):
        # navigate to the first image returned by the FSA search
        detail = BeautifulSoup(response.text, 'lxml').find('a',
            text='View Larger').parent
        return scrapy.Request(url=urljoin(response.url, detail.a['href']),
                              callback=self.parsePage)

    def parsePage(self, response):
        # pull metadata and image_urls for each page entry in the search,
        # pass the url of the next entry in the search to the next iteration
        data = FsaImageData()
        ex_msg = ('EXCEPTION: Unable to gather {} for {}.'
                  '\n\tException Type: {}:{}')

        soup = BeautifulSoup(response.text, "lxml")

        # get digital_id, project, & source_url
        description = soup.find('div', {'id': 'description'})
        if description != None:
            # get image_urls, _sizes, and _names:
            img_urls = []
            img_sizes = []
            img_names = []
            for img in description.find_all(
                    'a', text=re.compile('JPEG|TIFF \([0-9.a-zA-Z]*\)')):
                img_urls.append(urljoin(response.url, img['href']))
                img_sizes.append(img.get_text())
                img_names.append(img['href'].split('/')[-1])
            data['image_urls'] = img_urls
            data['image_sizes'] = img_sizes
            data['image_names'] = img_names
        else:
            print('WARNING: Item description does not exist!')

        # scrape image data:
        yield data
Looks like I figured it out and solved my own issue! I poked around in the source code for the ImagesPipeline and discovered that, by default, scrapy calls a convert_image method from get_images. That conversion was the issue, as it changes both the file type and the colorspace for non-JPEG and BMP images.
I re-wrote get_images to handle both the tiff and jpeg formats I was interested in scraping:
from io import BytesIO   # needed at the top of pipelines.py
from PIL import Image    # needed at the top of pipelines.py

def get_images(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    image = Image.open(BytesIO(response.body))
    buf = BytesIO()
    ext = response.url.split('.')[-1]
    if ext == 'tif':
        # keep TIFF files as TIFF and preserve their EXIF tags
        exif = image.tag_v2
        image.save(buf, 'TIFF', tiffinfo=exif)
    else:
        image.save(buf, 'JPEG')
    yield path, image, buf
Hope that helps others in the future!
