How can I download all PDFs from aspx webpage? - python

The task is easy: use Python to download all PDFs from:
https://www.electroimpact.com/Company/Patents.aspx
I am just a beginner in Python. I have read about Python crawlers, but the samples deal with HTML pages, not ASPX, and all I got was a blank downloaded file.
Following is my code:
import urllib2
import re

def saveFile(url, fileName):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    with open(fileName, 'wb') as handle:
        handle.write(response.read())

def main():
    base_url = 'https://www.electroimpact.com/Company/Patents/'
    page = 'https://www.electroimpact.com/Company/Patents.aspx'
    request = urllib2.Request(page)
    response = urllib2.urlopen(request)
    url_lst = re.findall('href.*(US.*\.pdf)', response.read())
    print url_lst
Result:
['US5201205.pdf', 'US5279024.pdf', 'US5339598.pdf', 'US9021688B2.pdf']
Only four PDFs were found by my regular expression, but there are many more PDFs to extract. Why?

With lxml.html and cssselect instead of re you will get all linked patent document paths:
#!/usr/bin/env python
# coding: utf8
from __future__ import absolute_import, division, print_function
import urllib2
from lxml import html

def main():
    url = 'https://www.electroimpact.com/Company/Patents.aspx'
    source = urllib2.urlopen(url).read()
    document = html.fromstring(source)
    patent_paths = [
        a.attrib['href'] for a in document.cssselect('div.PatentNumber a')
    ]
    print(patent_paths)

if __name__ == '__main__':
    main()
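To actually download each PDF, a minimal follow-up sketch (assuming the extracted href values are paths relative to the Patents.aspx page, and reusing the saveFile() helper from the question; download_all is just a hypothetical name):
import os
import urlparse  # Python 2; use urllib.parse on Python 3

def download_all(patent_paths, page_url, out_dir='patents'):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for path in patent_paths:
        # resolve the (possibly relative) href against the page URL
        pdf_url = urlparse.urljoin(page_url, path)
        file_name = os.path.join(out_dir, pdf_url.split('/')[-1])
        saveFile(pdf_url, file_name)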

Related

Scrape YouTube video url from a specific channel to Json

I am trying to save the URL obtained with this script in a JSON file, but I couldn't get it to work.
from bs4 import BeautifulSoup
from lxml import etree
import urllib.request
import requests
import sys

def fetch_titles(url):
    video_titles = []
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    for entry in soup.find_all("entry"):
        for link in entry.find_all("link"):
            youtube = etree.HTML(urllib.request.urlopen(link["href"]).read())
            video_title = youtube.xpath("//span[@id='eow-title']/@title")
            if len(video_title) > 0:
                video_titles.append({"title": video_title[0], "url": link.attrs["href"]})
    return video_titles

def main():
    if sys.argv.__len__() == 1:
        print("Error: You should specify a keyword")
        print("eg: python3 ./main.py KEYWORD")
        return
    url = "https://www.youtube.com/feeds/videos.xml?user=LinusTechTips"
    keyword = sys.argv[1]
    video_titles = fetch_titles(url)
    for video in video_titles:
        if video["title"].__contains__(keyword):
            print(video["url"])
            break  # add this line if you want to print the first match only

if __name__ == "__main__":
    main()
My JSON file has this simple structure:
{"url": "https://www.youtube.com/watch?v=xxx"}
Since you print the first match and skip the others, the whole main.py will look like this:
from bs4 import BeautifulSoup
from lxml import etree
import urllib.request
import requests
import sys
import json

def fetch_titles(url):
    video_titles = []
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    for entry in soup.find_all("entry"):
        for link in entry.find_all("link"):
            youtube = etree.HTML(urllib.request.urlopen(link["href"]).read())
            video_title = youtube.xpath("//span[@id='eow-title']/@title")
            if len(video_title) > 0:
                video_titles.append({"title": video_title[0], "url": link.attrs["href"]})
    return video_titles

def save_as_json(result, json_file_path):  # I've added this function to save the result as a JSON file
    data = json.dumps(result)
    print(data)
    with open(json_file_path, 'w') as file:
        file.write(data)

def main():
    if len(sys.argv) == 1:
        print("Error: You should specify a keyword")
        print("eg: python3 ./main.py KEYWORD")
        return
    json_file_path = "file.json"  # JSON file path
    url = "https://www.youtube.com/feeds/videos.xml?user=LinusTechTips"
    keyword = sys.argv[1]
    video_titles = fetch_titles(url)
    result = {"url": video["url"] for video in list(reversed(video_titles)) if keyword in video['title']}
    save_as_json(result, json_file_path)

if __name__ == "__main__":
    main()
You know what? I wrote the Python code which you mention in your question; it was the answer to that question!
I've replaced the foreach loop with a single-line dict comprehension, and I reversed the list with list(reversed(video_titles)) so that the first match wins.
Happy coding!
Use this piece of code after you call fetch_titles(url); the import should obviously be at the beginning.
import json
# skip
urls = {'url': video['url'] for video in video_titles if keyword in video['title']}
with open('results.json', 'w') as f:
    f.write(json.dumps(urls))
It builds the urls dictionary using a dict comprehension, in a single operation. The call to __contains__ is done via the in operator. It then writes the output to a results.json file.
You should also replace sys.argv.__len__() with len(sys.argv); that is the correct, Pythonic way to do it.
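For illustration, a tiny self-contained snippet showing the two idioms this refers to:
import sys

# instead of sys.argv.__len__() == 1
if len(sys.argv) == 1:
    print("eg: python3 ./main.py KEYWORD")

# instead of video["title"].__contains__(keyword)
video = {"title": "Example video", "url": "https://www.youtube.com/watch?v=xxx"}
keyword = "Example"
if keyword in video["title"]:
    print(video["url"])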

Unable to save downloaded images into a folder on the desktop using python

I have made a scraper which at this moment parses image links and saves the downloaded images into the Python directory by default. The only thing I want to do now is choose a folder on the desktop to save those images in, but I can't. Here is what I'm up to:
import requests
import os.path
import urllib.request
from lxml import html

def Startpoint():
    url = "https://www.aliexpress.com/"
    response = requests.get(url)
    tree = html.fromstring(response.text)
    titles = tree.xpath('//div[@class="item-inner"]')
    for title in titles:
        Pics = "https:" + title.xpath('.//span[@class="pic"]//img/@src')[0]
        endpoint(Pics)

def endpoint(images):
    sdir = (r'C:\Users\ar\Desktop\mth')
    testfile = urllib.request.URLopener()
    xx = testfile.retrieve(images, images.split('/')[-1])
    filename = os.path.join(sdir, xx)
    print(filename)

Startpoint()
Upon execution the above code throws an error showing: "join() argument must be str or bytes, not 'tuple'"
You can download images with Python's urllib. You can see the official Python documentation here: urllib documentation for Python 2.7. If you want to use Python 3, then follow this documentation: urllib for Python 3.
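For the concrete problem in the question, a minimal sketch (assuming the desktop folder C:\Users\ar\Desktop\mth from the question already exists): build the target path yourself and pass it to urlretrieve, rather than joining the tuple that URLopener.retrieve() returns:
import os
import urllib.request

def save_image(image_url, sdir=r'C:\Users\ar\Desktop\mth'):
    # join the folder and the file name taken from the URL, then download straight to it
    filename = os.path.join(sdir, image_url.split('/')[-1])
    urllib.request.urlretrieve(image_url, filename)
    print(filename)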
You could use urllib.request, BytesIO from io, and PIL's Image (if you have a direct URL to the image):
from PIL import Image
from io import BytesIO
import urllib.request

def download_image(url):
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    content = response.read()
    img = Image.open(BytesIO(content))
    img.filename = url
    return img
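A possible usage of the function above, saving the returned Pillow image to disk (the URL and file name here are just placeholders; Image.save() is standard Pillow):
img = download_image("https://www.example.com/some-image.jpg")
img.save(r'C:\Users\ar\Desktop\mth\some-image.jpg')  # writes the image to the given path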
The images are dynamic now, so I thought I'd update this post:
import os
from selenium import webdriver
import urllib.request
from lxml.html import fromstring

url = "https://www.aliexpress.com/"

def get_data(link):
    driver.get(link)
    tree = fromstring(driver.page_source)
    for title in tree.xpath('//li[@class="item"]'):
        pics = "https:" + title.xpath('.//*[contains(@class,"img-wrapper")]//img/@src')[0]
        os.chdir(r"C:\Users\WCS\Desktop\test")
        urllib.request.urlretrieve(pics, pics.split('/')[-1])

if __name__ == '__main__':
    driver = webdriver.Chrome()
    get_data(url)
    driver.quit()
This is the code to download an HTML file from the web:
import random
import urllib.request

def download(url):
    # this is the random function to give the name to the file
    name = random.randrange(1, 1000)
    full_name = str(name) + ".html"  # compatible data type
    urllib.request.urlretrieve(url, full_name)  # main function

download("any url")
This is the code for downloading any HTML file from the internet; you just have to provide the link to the function.
As in your case, you have said that you retrieved the image links from the web page, so you can change the extension from ".html" to a compatible type. The problem is that the images can have different extensions, maybe ".jpg", ".png", etc.
So what you can do is match the ending of the link using if/else string matching and then assign the extension at the end.
Here is an example for illustration:
import random
import urllib.request

def download(url):
    # pick the extension by matching the end of the link
    if url.endswith(".png"):
        extension = ".png"  # compatible extension for .png links
    elif url.endswith(".jpg"):
        extension = ".jpg"  # compatible extension for .jpg links
    else:
        extension = ".html"
    # this is the random function to give the name to the file
    name = random.randrange(1, 1000)
    full_name = str(name) + extension
    urllib.request.urlretrieve(url, full_name)  # main function

download("any url")
You can use multiple if/else branches for the various types of extension.
If it helps for your situation, give a thumbs up, buddy.
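As an aside, a tidier variant of the same idea keeps whatever extension the link already has instead of enumerating the cases; a small sketch using os.path.splitext from the standard library:
import os
import random
import urllib.request

def download(url):
    # keep the link's own extension; fall back to .html if there is none
    extension = os.path.splitext(url)[1] or ".html"
    full_name = str(random.randrange(1, 1000)) + extension
    urllib.request.urlretrieve(url, full_name)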

How to download specific GIF images (condition: phd*.gif) from a website using Python's BeautifulSoup?

I have the following code that downloads all images from a web-link.
from BeautifulSoup import BeautifulSoup as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys

def main(url, out_folder="/test/"):
    """Downloads all the images at 'url' to /test/"""
    soup = bs(urlopen(url))
    parsed = list(urlparse.urlparse(url))
    for image in soup.findAll("img"):
        print "Image: %(src)s" % image
        filename = image["src"].split("/")[-1]
        parsed[2] = image["src"]
        outpath = os.path.join(out_folder, filename)
        if image["src"].lower().startswith("http"):
            urlretrieve(image["src"], outpath)
        else:
            urlretrieve(urlparse.urlunparse(parsed), outpath)

def _usage():
    print "usage: python dumpimages.py http://example.com [outpath]"

if __name__ == "__main__":
    url = sys.argv[-1]
    out_folder = "/test/"
    if not url.lower().startswith("http"):
        out_folder = sys.argv[-1]
        url = sys.argv[-2]
        if not url.lower().startswith("http"):
            _usage()
            sys.exit(-1)
    main(url, out_folder)
I want to modify it so that it downloads only images named as 'phd210223.gif' (for example), that is, images satisfying the condition: 'phd*.gif'
And I want to put it in a loop, so that after fetching such images from one webpage, it increments the page ID by 1 and downloads the same from the next page: 'http://www.example.com/phd.php?id=2'
How can I do this?
Instead of checking the name in the loop, you can use BeautifulSoup's built-in support for regular expressions. Provide a compiled regular expression as the value of the src argument:
import re
from bs4 import BeautifulSoup as bs  # note, you should use beautifulsoup4

for image in soup.find_all("img", src=re.compile(r'phd\d+\.gif$')):
    ...
The phd\d+\.gif$ regular expression searches for text containing phd, followed by one or more digits, followed by a dot, followed by gif at the end of the string.
Note that you are using the outdated and unmaintained BeautifulSoup 3; switch to beautifulsoup4:
pip install beautifulsoup4
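The second part of the question, incrementing the page ID and fetching the matching GIFs from each page, is not covered above; here is a minimal sketch in the same Python 2 style as the question (the phd.php?id=N URL and the page range are the question's example values, not a real site):
import os
import re
from urllib import urlretrieve
from urllib2 import urlopen
from urlparse import urljoin
from bs4 import BeautifulSoup

pattern = re.compile(r'phd\d+\.gif$')
out_folder = "/test/"

for page_id in range(1, 11):  # assumed number of pages
    page_url = 'http://www.example.com/phd.php?id=%d' % page_id
    soup = BeautifulSoup(urlopen(page_url), 'html.parser')
    for image in soup.find_all("img", src=pattern):
        # resolve relative src values against the page URL before downloading
        src = urljoin(page_url, image["src"])
        urlretrieve(src, os.path.join(out_folder, src.split('/')[-1]))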
A regular expression can help solve this! When the pattern is found in the string/URL, a match object is returned, otherwise None:
import re

reg = re.compile(r'phd.*\.gif$')
str1 = 'path/phd12342343.gif'
str2 = 'path/dhp12424353153.gif'
print re.search(reg, str1)
print re.search(reg, str2)
I personally prefer using Python's default tools, so I use html.parser; what you need is something like this:
import re, urllib.request, html.parser

class LinksHTMLParser(html.parser.HTMLParser):
    def __init__(self):
        super().__init__()
        self.gifs = list()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href":
                    gifName = re.split("/", value)[-1]
                    if re.match(r'phd.*\.gif$', gifName):  # the phd*.gif condition
                        self.gifs.append(value)

parser = LinksHTMLParser()
parser.feed(urllib.request.urlopen("YOUR URL HERE").read().decode("utf-8"))
for gif in parser.gifs:
    urllib.request.urlretrieve(gif, "LOCAL PATH TO DOWNLOAD GIF TO")

Python BeautifulSoup web image crawler IOError: [Errno 2] No such file or directory

I wrote the following Python code to crawl the images from the website www.style.com
import urllib2, urllib, random, threading
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class Images(threading.Thread):
    def __init__(self, lock, src):
        threading.Thread.__init__(self)
        self.src = src
        self.lock = lock

    def run(self):
        self.lock.acquire()
        urllib.urlretrieve(self.src, './img/' + str(random.choice(range(9999))))
        print self.src + ' get'
        self.lock.release()

def imgGreb():
    lock = threading.Lock()
    site_url = "http://www.style.com"
    html = urllib2.urlopen(site_url).read()
    soup = BeautifulSoup(html)
    img = soup.findAll(['img'])
    for i in img:
        print i.get('src')
        Images(lock, i.get('src')).start()

if __name__ == '__main__':
    imgGreb()
But I got this error:
IOError: [Errno 2] No such file or directory: '/images/homepage-2013-october/header/logo.png'
How can it be solved?
Also, can this recursively find all the images on the website? I mean other images that are not on the homepage.
Thanks!
You are using the relative path without the domain when you try to retrieve the URL.
Some of the images are JavaScript based, and their "path" turns out to be javascript:void(0);, for which you will never get a page. I added the try/except to get around that error. Alternatively, you can detect whether the URL ends with jpg/gif/png or not; I will leave that work to you :)
By the way, not all the images are included in the page source; some of the pictures are loaded using JavaScript, and there is nothing we can do about those using urllib and BeautifulSoup only. If you really want to challenge yourself, maybe you can try to learn Selenium, which is a more powerful tool.
Try the code below directly:
import urllib2
from bs4 import BeautifulSoup
import sys
from urllib import urlretrieve
reload(sys)

def imgGreb():
    site_url = "http://www.style.com"
    html = urllib2.urlopen(site_url).read()
    soup = BeautifulSoup(html)
    img = soup.findAll(['img'])
    for i in img:
        try:
            # build the complete URL using the domain and relative url you scraped
            url = site_url + i.get('src')
            # get the file name
            name = "result_" + url.split('/')[-1]
            # detect if that is a type of pictures you want
            type = name.split('.')[-1]
            if type in ['jpg', 'png', 'gif']:
                # if so, retrieve the pictures
                urlretrieve(url, name)
        except:
            pass

if __name__ == '__main__':
    imgGreb()

How to save an image locally using Python whose URL address I already know?

I know the URL of an image on the Internet.
e.g. http://www.digimouth.com/news/media/2011/09/google-logo.jpg, which contains the logo of Google.
Now, how can I download this image using Python, without actually opening the URL in a browser and saving the file manually?
Python 2
Here is a more straightforward way if all you want to do is save it as a file:
import urllib
urllib.urlretrieve("http://www.digimouth.com/news/media/2011/09/google-logo.jpg", "local-filename.jpg")
The second argument is the local path where the file should be saved.
Python 3
As SergO suggested the code below should work with Python 3.
import urllib.request
urllib.request.urlretrieve("http://www.digimouth.com/news/media/2011/09/google-logo.jpg", "local-filename.jpg")
import urllib
resource = urllib.urlopen("http://www.digimouth.com/news/media/2011/09/google-logo.jpg")
output = open("file01.jpg","wb")
output.write(resource.read())
output.close()
file01.jpg will contain your image.
I wrote a script that does just this, and it is available on my GitHub for your use.
I utilized BeautifulSoup to allow me to parse any website for images. If you will be doing much web scraping (or intend to use my tool) I suggest you sudo pip install BeautifulSoup. Information on BeautifulSoup is available here.
For convenience, here is my code:
from bs4 import BeautifulSoup
from urllib2 import urlopen
import urllib

# use this image scraper from the location that
# you want to save scraped images to

def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    # this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print (str(len(images)) + " images found.")
    print 'Downloading images to current working directory.'
    # compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename = each.split('/')[-1]
        urllib.urlretrieve(each, filename)
    return image_links

# a standard call looks like this
# get_images('http://www.wookmark.com')
This can be done with requests. Load the page and dump the binary content to a file.
import os
import requests

url = 'https://apod.nasa.gov/apod/image/1701/potw1636aN159_HST_2048.jpg'
page = requests.get(url)

f_ext = os.path.splitext(url)[-1]
f_name = 'img{}'.format(f_ext)
with open(f_name, 'wb') as f:
    f.write(page.content)
Python 3
urllib.request — Extensible library for opening URLs
from urllib.error import HTTPError
from urllib.request import urlretrieve

try:
    urlretrieve(image_url, image_local_path)
except FileNotFoundError as err:
    print(err)  # something wrong with local path
except HTTPError as err:
    print(err)  # something wrong with url
I made a script expanding on Yup.'s script. I fixed some things. It will now bypass 403: Forbidden problems. It won't crash when an image fails to be retrieved. It tries to avoid corrupted previews. It gets the right absolute URLs. It gives out more information. It can be run with an argument from the command line.
# getem.py
# python2 script to download all images in a given url
# use: python getem.py http://url.where.images.are

from bs4 import BeautifulSoup
import urllib2
import shutil
import requests
from urlparse import urljoin
import sys
import time

def make_soup(url):
    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    html = urllib2.urlopen(req)
    return BeautifulSoup(html, 'html.parser')

def get_images(url):
    soup = make_soup(url)
    images = [img for img in soup.findAll('img')]
    print (str(len(images)) + " images found.")
    print 'Downloading images to current working directory.'
    image_links = [each.get('src') for each in images]
    for each in image_links:
        try:
            filename = each.strip().split('/')[-1].strip()
            src = urljoin(url, each)
            print 'Getting: ' + filename
            response = requests.get(src, stream=True)
            # delay to avoid corrupted previews
            time.sleep(1)
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        except:
            print '  An error occurred. Continuing.'
    print 'Done.'

if __name__ == '__main__':
    url = sys.argv[1]
    get_images(url)
A solution which works with Python 2 and Python 3:
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

url = "http://www.digimouth.com/news/media/2011/09/google-logo.jpg"
urlretrieve(url, "local-filename.jpg")
or, if the additional requirement of requests is acceptable and it is an http(s) URL:
def load_requests(source_url, sink_path):
    """
    Load a file from an URL (e.g. http).

    Parameters
    ----------
    source_url : str
        Where to load the file from.
    sink_path : str
        Where the loaded file is stored.
    """
    import requests
    r = requests.get(source_url, stream=True)
    if r.status_code == 200:
        with open(sink_path, 'wb') as f:
            for chunk in r:
                f.write(chunk)
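A possible call of the function above, using the image URL from the question (the local file name is just an example):
load_requests(
    "http://www.digimouth.com/news/media/2011/09/google-logo.jpg",
    "local-filename.jpg",
)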
Using the requests library:
import requests
import shutil, os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

currentDir = os.getcwd()
path = os.path.join(currentDir, 'Images')  # saving images to Images folder
if not os.path.exists(path):
    os.makedirs(path)  # create the folder if it does not exist yet

def ImageDl(url):
    attempts = 0
    while attempts < 5:  # retry 5 times
        try:
            filename = url.split('/')[-1]
            r = requests.get(url, headers=headers, stream=True, timeout=5)
            if r.status_code == 200:
                with open(os.path.join(path, filename), 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
                print(filename)
                break
        except Exception as e:
            attempts += 1
            print(e)

ImageDl(url)  # 'url' should be the direct link to the image you want to download
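For example, called with the image URL from the question:
ImageDl('http://www.digimouth.com/news/media/2011/09/google-logo.jpg')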
Use the simple Python wget module to download the link. Usage below:
import wget
wget.download('http://www.digimouth.com/news/media/2011/09/google-logo.jpg')
This is a very short answer.
import urllib
urllib.urlretrieve("http://photogallery.sandesh.com/Picture.aspx?AlubumId=422040", "Abc.jpg")
Version for Python 3
I adjusted the code of @madprops for Python 3.
# getem.py
# python3 script to download all images in a given url
# use: python getem.py http://url.where.images.are

from bs4 import BeautifulSoup
import urllib.request
import shutil
import requests
from urllib.parse import urljoin
import sys
import time

def make_soup(url):
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    html = urllib.request.urlopen(req)
    return BeautifulSoup(html, 'html.parser')

def get_images(url):
    soup = make_soup(url)
    images = [img for img in soup.findAll('img')]
    print(str(len(images)) + " images found.")
    print('Downloading images to current working directory.')
    image_links = [each.get('src') for each in images]
    for each in image_links:
        try:
            filename = each.strip().split('/')[-1].strip()
            src = urljoin(url, each)
            print('Getting: ' + filename)
            response = requests.get(src, stream=True)
            # delay to avoid corrupted previews
            time.sleep(1)
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        except:
            print('  An error occurred. Continuing.')
    print('Done.')

if __name__ == '__main__':
    get_images('http://www.wookmark.com')
Late answer, but for python>=3.6 you can use dload, i.e.:
import dload
dload.save("http://www.digimouth.com/news/media/2011/09/google-logo.jpg")
if you need the image as bytes, use:
img_bytes = dload.bytes("http://www.digimouth.com/news/media/2011/09/google-logo.jpg")
install using pip3 install dload
Something fresh for Python 3 using Requests:
Comments in the code. Ready to use function.
import requests
from os import path

def get_image(image_url):
    """
    Get image based on url.
    :return: Image name if everything OK, False otherwise
    """
    image_name = path.split(image_url)[1]
    try:
        image = requests.get(image_url)
    except OSError:  # A little too wide, but works OK; no additional imports needed. Catches all connection problems.
        return False
    if image.status_code == 200:  # we could have retrieved an error page
        base_dir = path.join(path.dirname(path.realpath(__file__)), "images")  # Use your own path or "" to use the current working directory. Folder must exist.
        with open(path.join(base_dir, image_name), "wb") as f:
            f.write(image.content)
        return image_name

get_image("https://apod.nasddfda.gov/apod/image/2003/S106_Mishra_1947.jpg")
This is the easiest method to download images.
import requests
from slugify import slugify
img_url = 'https://apod.nasa.gov/apod/image/1701/potw1636aN159_HST_2048.jpg'
img = requests.get(img_url).content
img_file = open(slugify(img_url) + '.' + str(img_url).split('.')[-1], 'wb')
img_file.write(img)
img_file.close()
If you don't already have the url for the image, you could scrape it with gazpacho:
from gazpacho import Soup
base_url = "http://books.toscrape.com"
soup = Soup.get(base_url)
links = [img.attrs["src"] for img in soup.find("img")]
And then download the asset with urllib as mentioned:
from pathlib import Path
from urllib.request import urlretrieve as download
directory = "images"
Path(directory).mkdir(exist_ok=True)
link = links[0]
name = link.split("/")[-1]
download(f"{base_url}/{link}", f"{directory}/{name}")
# import the required libraries from Python
import os, pathlib, urllib.request

# Using pathlib, specify where the image is to be saved
downloads_path = str(pathlib.Path.home() / "Downloads")

# Form a full image path by joining the path to the
# image's new name
picture_path = os.path.join(downloads_path, "new-image.png")
# "/home/User/Downloads/new-image.png"

# Using "urlretrieve()" from urllib.request, save the image
urllib.request.urlretrieve("//example.com/image.png", picture_path)

# urlretrieve() takes in 2 arguments
# 1. The URL of the image to be downloaded
# 2. The image's new name after download. By default, the image is saved
#    inside your current working directory
OK, so this is my rudimentary attempt, and probably total overkill.
Update if needed, as this doesn't handle any timeouts, but I got this working for fun.
Code listed here: https://github.com/JayRizzo/JayRizzoTools/blob/master/pyImageDownloader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# Created Syst: MAC OSX High Sierra 21.5.0 (17G65)
# Created Plat: Python 3.9.5 ('v3.9.5:0a7dcbdb13', 'May 3 2021 13:17:02')
# Created By  : Jeromie Kirchoff
# Created Date: Thu Jun 15 23:31:01 2022 CDT
# Last ModDate: Thu Jun 16 01:41:01 2022 CDT
# =============================================================================
# NOTE: Doesn't work on SVG images at this time.
# I will look into this further: https://stackoverflow.com/a/6599172/1896134
# =============================================================================
import requests                              # to get image from the web
import shutil                                # to save it locally
import os                                    # needed
from os.path import exists as filepathexist  # check if file paths exist
from os.path import join                     # joins path for different os
from os.path import expanduser               # expands current home
from pyuser_agent import UA                  # generates random UserAgent

class ImageDownloader(object):
    """URL ImageDownloader.
    Input : Full Image URL
    Output: Image saved to your ~/Pictures/JayRizzoDL folder.
    """

    def __init__(self, URL: str):
        self.url = URL
        self.headers = {"User-Agent": UA().random}
        self.currentHome = expanduser('~')
        self.desktop = join(self.currentHome + "/Desktop/")
        self.download = join(self.currentHome + "/Downloads/")
        self.pictures = join(self.currentHome + "/Pictures/JayRizzoDL/")
        self.outfile = ""
        self.filename = ""
        self.response = ""
        self.rawstream = ""
        self.createdfilepath = ""
        self.imgFileName = ""
        # Check if the JayRizzoDL exists in the pictures folder.
        # if it doesn't exist create it.
        if not filepathexist(self.pictures):
            os.mkdir(self.pictures)
        self.main()

    def getFileNameFromURL(self, URL: str):
        """Parse the URL for the name after the last forward slash."""
        NewFileName = self.url.strip().split('/')[-1].strip()
        return NewFileName

    def getResponse(self, URL: str):
        """Try streaming the URL for the raw data."""
        self.response = requests.get(self.url, headers=self.headers, stream=True)
        return self.response

    def gocreateFile(self, name: str, response):
        """Try creating the file with the raw data in a custom folder."""
        self.outfile = join(self.pictures, name)
        with open(self.outfile, 'wb') as outFilePath:
            shutil.copyfileobj(response.raw, outFilePath)
        return self.outfile

    def main(self):
        """Combine Everything and use in for loops."""
        self.filename = self.getFileNameFromURL(self.url)
        self.rawstream = self.getResponse(self.url)
        self.createdfilepath = self.gocreateFile(self.filename, self.rawstream)
        print(f"File was created: {self.createdfilepath}")
        return

if __name__ == '__main__':
    # Example when calling the file directly.
    ImageDownloader("https://stackoverflow.design/assets/img/logos/so/logo-stackoverflow.png")
Download an image file, avoiding all possible errors:
import requests
import validators
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

def is_downloadable(url):
    valid = validators.url(url)
    if valid == False:
        return False
    req = Request(url)
    try:
        response = urlopen(req)
    except HTTPError as e:
        return False
    except URLError as e:
        return False
    else:
        return True

for i in range(len(File_data)):  # File_data contains the list of image file addresses
    url = File_data[i][1]
    try:
        if is_downloadable(url):
            try:
                r = requests.get(url, allow_redirects=True)
                if url.find('/'):
                    fname = url.rsplit('/', 1)[1]
                    fname = pth + File_data[i][0] + "$" + fname  # destination (pth) to save the image file
                    open(fname, 'wb').write(r.content)
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)
