Download all Images in a Web directory - python

I am trying to gather all images in a specific directory on my web server, using BeautifulSoup4.
So far I have this code:
from init import *
from bs4 import BeautifulSoup
import urllib
import urllib.request

# use this image scraper from the location that
# you want to save scraped images to
def make_soup(url):
    html = urllib.request.urlopen(url)
    return BeautifulSoup(html, features="html.parser")

def get_images(url):
    soup = make_soup(url)
    # this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print(str(len(images)) + "images found.")
    print('Downloading images to current working directory.')
    # compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename = each.split('/')[-1]
        urllib.request.Request(each, filename)
    return image_links

# a standard call looks like this
get_images('https://omabilder.000webhostapp.com/img/')
This, however, spits out the following error:
7images found.
Downloading images to current working directory.
Traceback (most recent call last):
  File "C:\Users\MyPC\Desktop\oma projekt\getpics.py", line 1, in <module>
    from init import *
  File "C:\Users\MyPC\Desktop\oma projekt\init.py", line 9, in <module>
    from getpics import *
  File "C:\Users\MyPC\Desktop\oma projekt\getpics.py", line 26, in <module>
    get_images('https://omabilder.000webhostapp.com/img/')
  File "C:\Users\MyPC\Desktop\oma projekt\getpics.py", line 22, in get_images
    urllib.request.Request(each, filename)
  File "C:\Users\MyPC\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 328, in __init__
    self.full_url = url
  File "C:\Users\MyPC\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 354, in full_url
    self._parse()
  File "C:\Users\MyPC\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 383, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '/icons/blank.gif'
What I do not understand is the following:
There is no GIF in the directory and no /icons/ subdirectory.
Furthermore, it reports that 7 images were found, when only about 3 have been uploaded to the website.

The GIFs are the icons next to the links on your website (tiny ~20x20 px images); they are actually shown on the page. If I understand correctly, you want to download the PNG images -- at the URL you've provided these appear as links, rather than as images.
If you want to download the png images from the links, then you can use something like this:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import os

# use this image scraper from the location that
# you want to save scraped images to
def make_soup(url):
    html = urllib.request.urlopen(url)
    return BeautifulSoup(html, features="html.parser")

def get_images(url):
    soup = make_soup(url)
    # get all links (start with "a")
    images = [link["href"] for link in soup.find_all('a', href=True)]
    # keep ones that end with png
    images = [im for im in images if im.endswith(".png")]
    print(str(len(images)) + " images found.")
    print('Downloading images to current working directory.')
    # compile our unicode list of image links
    for each in images:
        urllib.request.urlretrieve(os.path.join(url, each), each)
    return images

# a standard call looks like this
get_images('https://omabilder.000webhostapp.com/img/')
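One caveat worth noting (not part of the original answer): os.path.join happens to produce a usable URL here because the hrefs in the directory listing are bare file names, but urllib.parse.urljoin is the more robust way to combine a base URL with a relative link. A minimal sketch, with hypothetical file names:

from urllib.parse import urljoin
import urllib.request

base_url = 'https://omabilder.000webhostapp.com/img/'
for href in ['oma1.png', 'sub/oma2.png']:      # hypothetical hrefs from the listing
    full_url = urljoin(base_url, href)         # resolves relative paths against the base URL
    urllib.request.urlretrieve(full_url, href.split('/')[-1])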

Related

Web scraping images are in an unsupported format

I have been trying to scrape some images using BeautifulSoup in Python and I am facing some problems: I am able to scrape the links and store them in a folder, but the images themselves are in an unsupported format.
import os
import requests
from bs4 import BeautifulSoup

res = requests.get('https://books.toscrape.com/')
res.raise_for_status()
file = open('op.html', 'wb')
for i in res.iter_content(10000):
    file.write(i)

os.makedirs('images', exist_ok=True)
newfile = open("op.html", 'rb')
data = newfile.read()
soup = BeautifulSoup(data, 'html.parser')
for link in soup.find_all('img'):
    ll = link.get('src')
    ima = open(os.path.join('images', os.path.basename(ll)), 'wb')
    for down in res.iter_content(1000):
        ima.write(down)
It says the file format is not supported, even though it is supposedly JPEG.
(screenshot: output image in a folder)
This line, for down in res.iter_content(1000):, is not iterating over the image from ll - it is re-iterating the HTML result. Your OS may recognize the file from the extension (.jpeg), but that is only because of the filename, not the content (which is not JPEG but HTML, hence the error).
You should make another request for the image itself, so it can be fetched and stored:
for link in soup.find_all('img'):
    ll = link.get('src')
    img_rs = requests.get(os.path.join('https://books.toscrape.com/', ll))  # <-- this line
    ima = open(os.path.join('images', os.path.basename(ll)), 'wb')
    for down in img_rs.iter_content(1000):  # <-- and iterate on the result
        ima.write(down)
The reason for saving the HTML is obscure. So, ignoring that part of the code in question, it comes down to this:
import requests
from os.path import join, basename
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin

URL = 'https://books.toscrape.com'
TARGET_DIR = '/tmp'

with requests.Session() as session:
    (r := session.get(URL)).raise_for_status()
    for image in BS(r.text, 'lxml').find_all('img'):
        src = image['src']
        (r := session.get(urljoin(URL, src), stream=True)).raise_for_status()
        with open(join(TARGET_DIR, basename(src)), 'wb') as t:
            for chunk in r.iter_content(chunk_size=8192):
                t.write(chunk)
In terms of performance, this can be significantly enhanced by multithreading; a sketch follows below.
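A minimal sketch of that idea (not part of the original answer), using concurrent.futures to fetch the images in parallel; the worker count and target directory are arbitrary choices:

import requests
from os.path import join, basename
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup as BS

URL = 'https://books.toscrape.com'
TARGET_DIR = '/tmp'

def download(src):
    # fetch a single image and stream it into TARGET_DIR
    r = requests.get(urljoin(URL, src), stream=True)
    r.raise_for_status()
    with open(join(TARGET_DIR, basename(src)), 'wb') as t:
        for chunk in r.iter_content(chunk_size=8192):
            t.write(chunk)

resp = requests.get(URL)
resp.raise_for_status()
srcs = [img['src'] for img in BS(resp.text, 'lxml').find_all('img')]

# run the downloads on a small thread pool; list() forces any worker exception to surface
with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(download, srcs))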
Your problem is that after you find the URL of the image you don't do anything with it; instead you try to save the whole initial request, which is just the HTML file of the whole website. Try something like this instead:
import os
import shutil
import requests
from bs4 import BeautifulSoup

base_url = 'https://books.toscrape.com/'
res = requests.get('https://books.toscrape.com/')
res.raise_for_status()
file = open('op.html', 'wb')
for i in res.iter_content(10000):
    file.write(i)

os.makedirs('images', exist_ok=True)
newfile = open("op.html", 'rb')
data = newfile.read()
soup = BeautifulSoup(data, 'html.parser')
for link in soup.find_all('img'):
    ll = link.get('src')
    ima = os.path.join('images', os.path.basename(ll))
    current_img = os.path.join(base_url, ll)
    img_res = requests.get(current_img, stream=True)
    with open(ima, 'wb') as f:
        shutil.copyfileobj(img_res.raw, f)
    del img_res

Python - Downloading images using Wget. How to add a string to each file?

I'm using the following Python code to download images from a certain website. It's part of a script that I'm using to build a web scraper.
for url in links:
    # Invoke wget download method to download specified url image.
    local_image_filename = wget.download(url)
    # Print out local image file name.
    local_image_filename
    continue
It's working well, but I want to know if it's possible to add a string as a prefix to each file...
My idea is to get the page title via XPath and add it as a prefix to each file.
I don't know where to add a string in this code. Can someone help me?
For example, I'm downloading these files:
logo.jpg, plans.jpg, circle.jpg
And I need to add a prefix, like these:
Beautiful_Plan_logo.jpg, Beautiful_Plan_plans.jpg, Beautiful_Plan_circle.jpg
Below is the entire code:
import requests
import bs4 as bs
import urllib.request
import wget

##################################################
# getting url images #
##################################################
url = "https://tyreehouseplans.com/shop/house-plans/blackberry-blossom/"

opener = urllib.request.build_opener()
opener.add_headers = [{'User-Agent' : 'Mozilla'}]
urllib.request.install_opener(opener)

raw = requests.get(url).text
soup = bs.BeautifulSoup(raw, 'html.parser')
imgs = soup.find_all('img')

links = []
for img in imgs:
    link = img.get('src')
    links.append(link)
print(links)

################################################
# downloading images #
################################################
for url in links:
    # Invoke wget download method to download specified url image.
    local_image_filename = wget.download(url)
    # Print out local image file name.
    local_image_filename
    continue
Thank you for any help!
The Python module wget has an option out, which determines the name of the output file. For example, the following script downloads 3 images, adding the prefix Beautiful_Plan_.
import wget

base_url = 'https://homepages.cae.wisc.edu/~ece533/images/'
image_names = ['airplane.png', 'arctichare.png', 'baboon.png']
prefix = 'Beautiful_Plan_'

for image_name in image_names:
    wget.download(base_url + image_name, out=prefix + image_name)
You can use shutil for this:
import shutil

prefix = "prefix_"
# your piece of code
for url in links:
    # Invoke wget download method to download specified url image.
    local_image_filename = wget.download(url)
    # Print out local image file name.
    local_image_filename
    shutil.copy(local_image_filename, prefix + local_image_filename)
Use os.rename, as per this documentation.
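For example, a minimal sketch of that approach (the loop and the links list are taken from the question's code; the prefix value is just an illustration):

import os
import wget

prefix = 'Beautiful_Plan_'
for url in links:
    # download the image first, then rename it in place with the prefix
    local_image_filename = wget.download(url)
    os.rename(local_image_filename, prefix + local_image_filename)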
I wrote code for making a separate file with the extra information up front, followed by a separator.
import requests
import bs4 as bs
import urllib.request
import wget

##################################################
# getting url images #
##################################################
url = "https://tyreehouseplans.com/shop/house-plans/blackberry-blossom/"

opener = urllib.request.build_opener()
opener.add_headers = [{'User-Agent': 'Mozilla'}]
urllib.request.install_opener(opener)

raw = requests.get(url).text
soup = bs.BeautifulSoup(raw, 'html.parser')
imgs = soup.find_all('img')

links = []
for img in imgs:
    link = img.get('src')
    links.append(link)
# print(links)

################################################
# downloading images #
################################################
for url in links:
    # Invoke wget download method to download specified url image.
    try:
        local_image_filename = wget.download(url)
    except ValueError:
        break
    # Print out local image file name.
    print(local_image_filename)

    with open(local_image_filename, 'r') as myFile:
        try:
            data = myFile.read()
        except UnicodeDecodeError:
            data = "UNICODE DECODE ERROR"
        except ValueError:
            data = "VALUE ERROR"
        print(data)
        print(type(data))
        myFile.close()

    newSaveString = str(local_image_filename) + "SeperatorOfSomeKind" + str(data)
    newFileName = "NEW_" + local_image_filename
    with open(newFileName, 'w') as myFile:
        myFile.write(newSaveString)
        myFile.close()
    continue

"File Does Not Exist" when dynamically creating files for PDF download with Requests in Python 2.7

I'm trying to dynamically download PDFs from a web site. I am sure I'm listing them correctly, but I am not sure I'm doing the actual file I/O correctly. I get the following error:
File "download.py", line 22, in <module>
with open("'"+url+"'", "wb") as pdf:
IOError: [Errno 2] No such file or directory: "'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf'"
Here is my code:
import requests
import re
from bs4 import BeautifulSoup

origin = requests.get("http://freehaven.net/anonbib")
soup = BeautifulSoup(origin.text)
results = soup.find_all(href=re.compile("(http).*(pdf)"))

for link in results:
    url = (link.get('href'))
    r = requests.get(url)
    with open("'"+url+"'", "wb") as pdf:
        try:
            pdf.write(r.content)
        finally:
            pdf.close
If url is set to 'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf', your code fails because it is trying to open a file with that name on your filesystem.
Instead, try something like this:
fileForUrl = '/tmp/' + url.split('/')[-1]

with open(fileForUrl, 'wb') as pdf:
    # Rest of the code as before
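Putting that together with the rest of the loop from the question, a minimal sketch might look like this (writing to /tmp/ is just an example; the href filter is copied from the question):

import re
import requests
from bs4 import BeautifulSoup

origin = requests.get("http://freehaven.net/anonbib")
soup = BeautifulSoup(origin.text, "html.parser")

for link in soup.find_all(href=re.compile("(http).*(pdf)")):
    url = link.get('href')
    r = requests.get(url)
    # use only the last path component of the URL as the local file name
    file_for_url = '/tmp/' + url.split('/')[-1]
    with open(file_for_url, 'wb') as pdf:
        pdf.write(r.content)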

How to save "complete webpage" not just basic html using Python

I am using the following code to save a webpage using Python:
import urllib
import sys
from bs4 import BeautifulSoup

url = 'http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html'
f = urllib.urlretrieve(url, 'test.html')
Problem: This code saves the HTML as basic HTML, without JavaScript, images, etc. I want to save the webpage as complete (like the option we have in a browser).
Update:
I am now using the following code to save all the JS/images/CSS files of the webpage so that it can be saved as a complete webpage, but my output HTML still gets saved as basic HTML:
import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html")

b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
html = b.getvalue()
# print html

fh = open("file.html", "w")
fh.write(html)
fh.close()
Try emulating your browser with Selenium. This script will pop up the "save as" dialog for the webpage. You will still have to figure out how to emulate pressing Enter for the download to start, as the file dialog is out of Selenium's reach (how you do it is also OS dependent).
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

br = webdriver.Firefox()
br.get('http://www.google.com/')

save_me = ActionChains(br).key_down(Keys.CONTROL)\
                          .key_down('s').key_up(Keys.CONTROL).key_up('s')
save_me.perform()
Also, I think following #Amber's suggestion of grabbing the linked resources may be a simpler, and thus better, solution. Still, I think using Selenium is a good starting point, as br.page_source will get you the entire DOM along with the dynamic content generated by JavaScript.
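If the DOM after JavaScript has run is all you need (rather than a browser-style "complete" save with assets), a minimal sketch of the page_source idea could look like this (the output file name is arbitrary):

from selenium import webdriver

br = webdriver.Firefox()
br.get('http://www.google.com/')

# page_source holds the DOM after JavaScript has executed
with open('page.html', 'w', encoding='utf-8') as f:
    f.write(br.page_source)

br.quit()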
You can easily do that with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage

url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}

save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS, and JS all in your download_folder, working just like the original site.
To get the script above by #rajatomar788 to run, I first had to install the following packages:
pip install pywebcopy
pip install pyquery
pip install w3lib
pip install parse
pip install lxml
After that it worked with a few errors, but I did get the folder filled with the files that make up the webpage.
webpage - INFO - Starting save_assets Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
webpage - Level 100 - Queueing download of <89> asset files.
Exception in thread <Element(LinkTag, file:///++resource++images/favicon2.ico)>:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 312, in run
    super(LinkTag, self).run()
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 58, in run
    self.download_file()
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 107, in download_file
    req = SESSION.get(url, stream=True)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\configs.py", line 244, in get
    return super(AccessAwareSession, self).get(url, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 546, in get
    return self.request('GET', url, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 640, in send
    adapter = self.get_adapter(url=request.url)
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 731, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'file:///++resource++images/favicon2.ico'
webpage - INFO - Starting save_html Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
Try saveFullHtmlPage below, or adapt it.
It will save a modified *.html and save the JavaScript, CSS and images based on the tags script, link and img (tags_inner dict keys) into a folder suffixed _files.
import os, sys, re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def saveFullHtmlPage(url, pagepath='page', session=requests.Session(), html=None):
    """Save web page html and supported contents
        * pagepath : path-to-page
        It will create a file `'path-to-page'.html` and a folder `'path-to-page'_files`
    """
    def savenRename(soup, pagefolder, session, url, tag, inner):
        if not os.path.exists(pagefolder):  # create only once
            os.mkdir(pagefolder)
        for res in soup.findAll(tag):  # images, css, etc..
            if res.has_attr(inner):  # check inner tag (file object) MUST exists
                try:
                    filename, ext = os.path.splitext(os.path.basename(res[inner]))  # get name and extension
                    filename = re.sub(r'\W+', '', filename) + ext  # clean special chars from name
                    fileurl = urljoin(url, res.get(inner))
                    filepath = os.path.join(pagefolder, filename)
                    # rename html ref so can move html and folder of files anywhere
                    res[inner] = os.path.join(os.path.basename(pagefolder), filename)
                    if not os.path.isfile(filepath):  # was not downloaded
                        with open(filepath, 'wb') as file:
                            filebin = session.get(fileurl)
                            file.write(filebin.content)
                except Exception as exc:
                    print(exc, file=sys.stderr)

    if not html:
        html = session.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    path, _ = os.path.splitext(pagepath)
    pagefolder = path + '_files'  # page contents folder
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src'}  # tag & inner tags to grab
    for tag, inner in tags_inner.items():  # saves resource files and rename refs
        savenRename(soup, pagefolder, session, url, tag, inner)
    with open(path + '.html', 'wb') as file:  # saves modified html doc
        file.write(soup.prettify('utf-8'))
Example: saving google.com as google.html with its contents in a google_files folder (in the current folder):
saveFullHtmlPage('https://www.google.com', 'google')
