Not able to download all the images using request python - python

I have 1k of image urls in a csv file and I am trying to download all the images from the urls. I don't know why I am not able to download all the images. Here is my code:
print('Beginning file download with requests')
path = '/home/tt/image_scrap/image2'
for idx, url in tqdm(enumerate(dataset['url']), total=len(dataset['url'])):
response = requests.get(url,stream=True)
time.sleep(2)
filename = url.split("/")[-1]
with open(path+'/'+filename, 'wb') as f:
f.write(response.content)

Try / Except statements are really good for these type of 'errors':
Try this:
try:
with open(path+'/'+filename, 'wb') as f:
f.write(response.content)
except Exception as error:
print(error)

Related

python uploading an image to a directory

I use this code to upload an image to a directory:
import urllib.request
URL = ...
FILENAME = r'c:\networks\work\image1.jpg'
with urllib.request.urlopen(URL) as response:
image = response.read()
with open(FILENAME, 'wb') as output_file:
output_file.write(image)
and it works for every image, but when I try to upload this type of URL:
https://data.cyber.org.il/python/logpuzzle/p-bbjb-bbia.jpg
the uploading takes something like 3 minutes which is very slow, why does it happen?

How to download a file using requests

I am using the requests library to download a file from a URL. This is my code
for tag in soup.find_all('a'):
if '.zip' in str(tag):
file_name = str(tag).strip().split('>')[-2].split('<')[0]
link = link_name+tag.get('href')
r = requests.get(link, stream=True)
with open(os.path.join(download_path, file_name), 'wb') as fd:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
fd.write(chunk)
And then I unzip the file using this code
unzip_path = os.path.join(download_path, file_name.split('.')[0])
with zipfile.ZipFile(os.path.join(download_path, file_name), 'r') as zip_ref:
zip_ref.extractall(unzip_path)
This code looks if there is a zip file in the provided page and then downloads the zipped file in a directory. Then it will unzip the file using the zipFile library.
The problem with this code is that sometimes the download is not complete. So for example if the zipped file is 312KB long only parts of it is downloaded. And then I get a BadZipFile error. But sometimes the entire file is downloaded correctly.
I tried the same without streaming and even that results in the same problem.
How do I check if all the chunks are downloaded properly.
Maybe this works:
r = requests.get(link)
with open(os.path.join(download_path, file_name), 'wb') as fd:
fd.write(r.content)

Download image using requests that contains a query string

I am trying to download an image from an instagram media URL:
https://instagram.fybz2-1.fna.fbcdn.net/v/t51.2885-15/fr/e15/p1080x1080/106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE
Pasting this into my browser will bring up the image, but when I run the following code I get the following error which i suspect is due to issues with the URL containing a query string (running this on a simple url ending in .jpg works without issue
File "C:/Users/19053/InstagramImageDownloader/downloadImage.py", line 18, in <module>
with open(filename, 'wb') as f:
OSError: [Errno 22] Invalid argument: '106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE'
Full code as follows:
## Importing Necessary Modules
import requests # to get image from the web
import shutil # to save it locally
## Set up the image URL and filename
image_url = "https://instagram.fybz2-1.fna.fbcdn.net/v/t51.2885-15/fr/e15/p1080x1080/106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE"
filename = image_url.split("/")[-1]
# Open the url image, set stream to True, this will return the stream content.
r = requests.get(image_url, stream=True)
# Check if the image was retrieved successfully
if r.status_code == 200:
# Set decode_content value to True, otherwise the downloaded image file's size will be zero.
r.raw.decode_content = True
# Open a local file with wb ( write binary ) permission.
with open(filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
print('Image sucessfully Downloaded: ', filename)
else:
print('Image Couldn\'t be retreived')
The problem is with the filename. You need to first split by ? then take the first element then split by /
import requests # to get image from the web
import shutil # to save it locally
## Set up the image URL and filename
image_url = "https://instagram.fybz2-1.fna.fbcdn.net/v/t51.2885-15/fr/e15/p1080x1080/106602453_613520712600632_6255422472318530180_n.jpg?_nc_ht=instagram.fybz2-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=WQizf6rhDmQAX883HrQ&oh=140f221889178fd03bf654cf18a9d9a2&oe=5F4D2AFE"
filename = image_url.split("?")[0].split("/")[-1]
# Open the url image, set stream to True, this will return the stream content.
r = requests.get(image_url, stream=True)
# Check if the image was retrieved successfully
if r.status_code == 200:
# Set decode_content value to True, otherwise the downloaded image file's size will be zero.
r.raw.decode_content = True
# Open a local file with wb ( write binary ) permission.
with open(filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
print('Image sucessfully Downloaded: ', filename)
else:
print('Image Couldn\'t be retreived')

How to get response data of a pdf file in python

I am trying to fetch the data of a pdf file available online
I have tried
import requests
response = requests.get("http://imdagrimet.gov.in/sites/default/files/daas_bulletin/District%20Advisory%20patna_17.pdf")
print(response.content)
but it gives a byte object as a response, and I am not able to decode that
You should write the data inside a file in order to be able to get it.
Like this:
with open('/District_Advisory_patna_17.pdf', 'wb') as f:
f.write(response.content)
Try to write your data to file:
import requests
import shutil
url = 'your url'
r = requests.get(url, stream=True)
if r.status_code == 200:
with open(file_path, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)

Download file from URL and save it in a folder Python

I've a lot of URL with file types .docx and .pdf I want to run a python script that downloads them from the URL and saves it in a folder. Here is what I've done for a single file I'll add them to a for loop:
response = requests.get('http://wbesite.com/Motivation-Letter.docx')
with open("my_file.docx", 'wb') as f:
f.write(response.content)
but the my_file.docx that it is saving is only 266 bytes and is corrupt but the URL is fine.
UPDATE:
Added this code and it works but I want to save it in a new folder.
import os
import shutil
import requests
def download_file(url, folder_name):
local_filename = url.split('/')[-1]
path = os.path.join("/{}/{}".format(folder_name, local_filename))
with requests.get(url, stream=True) as r:
with open(path, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
Try using stream option:
import os
import requests
def download(url: str, dest_folder: str):
if not os.path.exists(dest_folder):
os.makedirs(dest_folder) # create folder if it does not exist
filename = url.split('/')[-1].replace(" ", "_") # be careful with file names
file_path = os.path.join(dest_folder, filename)
r = requests.get(url, stream=True)
if r.ok:
print("saving to", os.path.abspath(file_path))
with open(file_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024 * 8):
if chunk:
f.write(chunk)
f.flush()
os.fsync(f.fileno())
else: # HTTP status code 4XX/5XX
print("Download failed: status code {}\n{}".format(r.status_code, r.text))
download("http://website.com/Motivation-Letter.docx", dest_folder="mydir")
Note that mydir in example above is the name of folder in current working directory. If mydir does not exist script will create it in current working directory and save file in it. Your user must have permissions to create directories and files in current working directory.
You can pass an absolute file path in dest_folder, but check permissions first.
P.S.: avoid asking multiple questions in one post
try:
import urllib.request
urllib.request.urlretrieve(url, filename)

Categories

Resources