I am trying to download an image from an Instagram media URL:
https://instagram.fybz2-1.fna.fbcdn.net/v/t51.2885-15/fr/e15/p1080x1080/106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE
Pasting this into my browser brings up the image, but when I run the following code I get the error below, which I suspect is due to the URL containing a query string (running this on a simple URL ending in .jpg works without issue).
File "C:/Users/19053/InstagramImageDownloader/downloadImage.py", line 18, in <module>
with open(filename, 'wb') as f:
OSError: [Errno 22] Invalid argument: '106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE'
The full code is as follows:
## Importing necessary modules
import requests  # to get the image from the web
import shutil    # to save it locally

## Set up the image URL and filename
image_url = "https://instagram.fybz2-1.fna.fbcdn.net/v/t51.2885-15/fr/e15/p1080x1080/106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE"
filename = image_url.split("/")[-1]

# Open the URL image; stream=True returns the stream content.
r = requests.get(image_url, stream=True)

# Check if the image was retrieved successfully
if r.status_code == 200:
    # Set decode_content to True, otherwise the downloaded image file's size will be zero.
    r.raw.decode_content = True
    # Open a local file with wb (write binary) permission.
    with open(filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
    print('Image successfully downloaded:', filename)
else:
    print('Image couldn\'t be retrieved')
The problem is with the filename. You need to split on ? first, take the first element, and then split on /:
import requests  # to get the image from the web
import shutil    # to save it locally

## Set up the image URL and filename
image_url = "https://instagram.fybz2-1.fna.fbcdn.net/v/t51.2885-15/fr/e15/p1080x1080/106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE"
filename = image_url.split("?")[0].split("/")[-1]

# Open the URL image; stream=True returns the stream content.
r = requests.get(image_url, stream=True)

# Check if the image was retrieved successfully
if r.status_code == 200:
    # Set decode_content to True, otherwise the downloaded image file's size will be zero.
    r.raw.decode_content = True
    # Open a local file with wb (write binary) permission.
    with open(filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
    print('Image successfully downloaded:', filename)
else:
    print('Image couldn\'t be retrieved')
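Alternatively, the standard library can strip the query string for you. Here is a minimal sketch using urllib.parse.urlparse (assuming the same image_url as above), which separates the path from the query before the filename is taken:

from urllib.parse import urlparse
import os

# urlparse splits the URL into components; .path excludes the query string,
# so basename() returns a clean filename.
filename = os.path.basename(urlparse(image_url).path)
print(filename)  # 106602453_613520712600632_6255422472318530180_n.jpg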
I have been trying to scrape some images using BeautifulSoup in Python. I can scrape the links and store the files in a folder successfully, but the images end up in an unsupported format.
import os
import requests
from bs4 import BeautifulSoup

res = requests.get('https://books.toscrape.com/')
res.raise_for_status()

file = open('op.html', 'wb')
for i in res.iter_content(10000):
    file.write(i)

os.makedirs('images', exist_ok=True)
newfile = open('op.html', 'rb')
data = newfile.read()
soup = BeautifulSoup(data, 'html.parser')
for link in soup.find_all('img'):
    ll = link.get('src')
    ima = open(os.path.join('images', os.path.basename(ll)), 'wb')
    for down in res.iter_content(1000):
        ima.write(down)
It says the file format is not supported, even though the files are saved as JPEG.
(Screenshot: the output images in the folder.)
The line for down in res.iter_content(1000): is not iterating over the image from ll - it is re-iterating the HTML result. Your OS may recognize the file by its extension (.jpeg), but that is only because of the filename, not the content (which is HTML, not JPEG, hence the error).
You should make another request for the image itself, so it can be fetched and stored:
for link in soup.find_all('img'):
    ll = link.get('src')
    img_rs = requests.get(os.path.join('https://books.toscrape.com/', ll))  # <-- this line
    ima = open(os.path.join('images', os.path.basename(ll)), 'wb')
    for down in img_rs.iter_content(1000):  # <-- and iterate on the result
        ima.write(down)
The reason for saving the HTML is obscure. So, ignoring that part of the code in question, it comes down to this:
import requests
from os.path import join, basename
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin

URL = 'https://books.toscrape.com'
TARGET_DIR = '/tmp'

with requests.Session() as session:
    (r := session.get(URL)).raise_for_status()
    for image in BS(r.text, 'lxml').find_all('img'):
        src = image['src']
        (r := session.get(urljoin(URL, src), stream=True)).raise_for_status()
        with open(join(TARGET_DIR, basename(src)), 'wb') as t:
            for chunk in r.iter_content(chunk_size=8192):
                t.write(chunk)
In terms of performance, this can be significantly enhanced with multithreading, as in the sketch below.
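For example, a minimal multithreaded sketch (same site and target directory as above; the worker count of 8 is an arbitrary choice, not something from the original code) could hand each image to a thread pool:

import requests
from concurrent.futures import ThreadPoolExecutor
from os.path import join, basename
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin

URL = 'https://books.toscrape.com'
TARGET_DIR = '/tmp'

def fetch(src):
    # Each worker downloads one image and streams it to TARGET_DIR.
    r = requests.get(urljoin(URL, src), stream=True)
    r.raise_for_status()
    with open(join(TARGET_DIR, basename(src)), 'wb') as t:
        for chunk in r.iter_content(chunk_size=8192):
            t.write(chunk)

(r := requests.get(URL)).raise_for_status()
sources = [img['src'] for img in BS(r.text, 'html.parser').find_all('img')]
with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(fetch, sources))  # consume the iterator so worker exceptions surface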
Your problem is that after you find the URL of the image you don't do anything with it; instead you try to save the whole initial request, which is just the HTML of the whole website. Try something like this instead:
import os
import shutil
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = 'https://books.toscrape.com/'
res = requests.get(base_url)
res.raise_for_status()

file = open('op.html', 'wb')
for i in res.iter_content(10000):
    file.write(i)

os.makedirs('images', exist_ok=True)
newfile = open('op.html', 'rb')
data = newfile.read()
soup = BeautifulSoup(data, 'html.parser')
for link in soup.find_all('img'):
    ll = link.get('src')
    ima = os.path.join('images', os.path.basename(ll))
    # Build the image URL with urljoin (os.path.join would use the OS path separator).
    current_img = urljoin(base_url, ll)
    img_res = requests.get(current_img, stream=True)
    with open(ima, 'wb') as f:
        shutil.copyfileobj(img_res.raw, f)
    del img_res
I use this code to download an image into a directory:
import urllib.request

URL = ...
FILENAME = r'c:\networks\work\image1.jpg'

with urllib.request.urlopen(URL) as response:
    image = response.read()
with open(FILENAME, 'wb') as output_file:
    output_file.write(image)
and it works for every image, but when I try to download this type of URL:
https://data.cyber.org.il/python/logpuzzle/p-bbjb-bbia.jpg
the download takes something like 3 minutes, which is very slow. Why does this happen?
I'm trying to download 200k images using their URLs.
This is my code:
import os
import requests  # to get the image from the web
import shutil    # to save it locally

# image_url and filename are assumed to be set for each image
r = requests.get(image_url, stream=True)
# Check if the image was retrieved successfully
if r.status_code == 200:
    # Set decode_content to True, otherwise the downloaded image file's size will be zero.
    r.raw.decode_content = True
    if not os.path.isdir('images/' + filename.rsplit('/', 1)[0] + '/'):
        os.makedirs('images/' + filename.rsplit('/', 1)[0] + '/')
    with open('images/' + filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
When I run it, it downloads some of the images but not the rest. It gives this error:
urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead
I have no idea why or when this happens. Maybe when a URL is unreachable? How can I ensure that every URL that is up gets downloaded, and that the exceptions are skipped?
What about using a try/except?
import os
import urllib3
import requests  # to get the image from the web
import shutil    # to save it locally

try:
    r = requests.get(image_url, stream=True)
    # Check if the image was retrieved successfully
    if r.status_code == 200:
        # Set decode_content to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        if not os.path.isdir('images/' + filename.rsplit('/', 1)[0] + '/'):
            os.makedirs('images/' + filename.rsplit('/', 1)[0] + '/')
        with open('images/' + filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
except urllib3.exceptions.ProtocolError as error:
    print("skipped error:", error)
Perhaps to download such a large number of images you would be interested in an asynchronous web framework like aiohttp. This would save you from having to wait on a slow site before downloading the next image.
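For instance, a rough aiohttp sketch (the (url, filename) pairs and the aiohttp dependency are assumptions here, not part of the question's code):

import asyncio
import aiohttp

async def fetch(session, url, filename):
    # Download one image; failures are reported and skipped.
    try:
        async with session.get(url) as resp:
            resp.raise_for_status()
            data = await resp.read()
        with open(filename, 'wb') as f:
            f.write(data)
    except aiohttp.ClientError as error:
        print("skipped:", url, error)

async def main(pairs):
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch(session, url, name) for url, name in pairs))

asyncio.run(main([('https://example.com/a.jpg', 'a.jpg')]))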
I have 1k image URLs in a CSV file and I am trying to download all of the images. I don't know why I am not able to download all of them. Here is my code:
import time
import requests
from tqdm import tqdm

# dataset is assumed to be a DataFrame-like object with a 'url' column
print('Beginning file download with requests')
path = '/home/tt/image_scrap/image2'
for idx, url in tqdm(enumerate(dataset['url']), total=len(dataset['url'])):
    response = requests.get(url, stream=True)
    time.sleep(2)
    filename = url.split("/")[-1]
    with open(path + '/' + filename, 'wb') as f:
        f.write(response.content)
Try/except statements are really good for this type of error. Try this:
try:
    with open(path + '/' + filename, 'wb') as f:
        f.write(response.content)
except Exception as error:
    print(error)
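The write is rarely the step that fails, though; a slightly wider sketch (reusing the question's url, path, and filename variables) also guards the request itself and turns HTTP errors into exceptions:

try:
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()  # raise on HTTP 4xx/5xx
    with open(path + '/' + filename, 'wb') as f:
        f.write(response.content)
except Exception as error:
    print('skipped', url, '->', error)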
I have a lot of URLs with file types .docx and .pdf, and I want a Python script that downloads them and saves them in a folder. Here is what I've done for a single file; I'll wrap it in a for loop:
response = requests.get('http://wbesite.com/Motivation-Letter.docx')
with open("my_file.docx", 'wb') as f:
    f.write(response.content)
but the my_file.docx it saves is only 266 bytes and is corrupt, even though the URL is fine.
UPDATE:
Added this code and it works, but I want to save the file in a new folder.
import os
import shutil
import requests

def download_file(url, folder_name):
    local_filename = url.split('/')[-1]
    path = os.path.join("/{}/{}".format(folder_name, local_filename))
    with requests.get(url, stream=True) as r:
        with open(path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
Try using the stream option:
import os
import requests

def download(url: str, dest_folder: str):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)  # create folder if it does not exist

    filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names
    file_path = os.path.join(dest_folder, filename)

    r = requests.get(url, stream=True)
    if r.ok:
        print("saving to", os.path.abspath(file_path))
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 8):
                if chunk:
                    f.write(chunk)
                    f.flush()
                    os.fsync(f.fileno())
    else:  # HTTP status code 4XX/5XX
        print("Download failed: status code {}\n{}".format(r.status_code, r.text))

download("http://website.com/Motivation-Letter.docx", dest_folder="mydir")
Note that mydir in the example above is the name of a folder in the current working directory. If mydir does not exist, the script will create it in the current working directory and save the file there. Your user must have permission to create directories and files in the current working directory.
You can also pass an absolute path in dest_folder, but check permissions first.
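For instance (the path below is purely illustrative):

download("http://website.com/Motivation-Letter.docx", dest_folder="/home/user/letters")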
P.S.: avoid asking multiple questions in one post
Try:

import urllib.request

urllib.request.urlretrieve(url, filename)
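urlretrieve writes straight to the given path, so saving into a new folder only needs the folder to exist first. A minimal sketch (the folder name and URL here are illustrative):

import os
import urllib.request

url = 'http://website.com/Motivation-Letter.docx'
dest_folder = 'mydir'  # illustrative folder name

os.makedirs(dest_folder, exist_ok=True)  # create the folder if needed
filename = os.path.join(dest_folder, url.split('/')[-1])
urllib.request.urlretrieve(url, filename)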