I am using the requests library to download a file from a URL. This is my code:
for tag in soup.find_all('a'):
    if '.zip' in str(tag):
        file_name = str(tag).strip().split('>')[-2].split('<')[0]
        link = link_name + tag.get('href')
        r = requests.get(link, stream=True)
        with open(os.path.join(download_path, file_name), 'wb') as fd:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    fd.write(chunk)
And then I unzip the file using this code:
unzip_path = os.path.join(download_path, file_name.split('.')[0])
with zipfile.ZipFile(os.path.join(download_path, file_name), 'r') as zip_ref:
    zip_ref.extractall(unzip_path)
This code looks for a zip file linked on the provided page, downloads the zip file to a directory, and then unzips it using the zipfile library.
The problem with this code is that sometimes the download is not complete. For example, if the zip file is 312 KB, only part of it is downloaded, and I then get a BadZipFile error. But sometimes the entire file is downloaded correctly.
I tried the same thing without streaming, and even that results in the same problem.
How do I check whether all the chunks are downloaded properly?
Maybe this works:
r = requests.get(link)
with open(os.path.join(download_path, file_name), 'wb') as fd:
    fd.write(r.content)
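If you want to actually verify that the whole body arrived, one option is to compare the Content-Length header against the number of bytes written. This is only a sketch, assuming the server sends that header; link, download_path and file_name come from the question's code:

import os
import requests

r = requests.get(link, stream=True)
r.raise_for_status()  # fail early on HTTP 4xx/5xx instead of saving an error page

expected = int(r.headers.get('Content-Length', 0))
path = os.path.join(download_path, file_name)
with open(path, 'wb') as fd:
    for chunk in r.iter_content(chunk_size=1024):
        fd.write(chunk)

# If the server reported a length, make sure we got all of it
actual = os.path.getsize(path)
if expected and actual != expected:
    raise IOError('Incomplete download: {} of {} bytes'.format(actual, expected))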
Related
I am trying to download a zip file from a URL with a Python script.
The zip file is downloaded, but it can't be opened since it's corrupted.
This is my script:
zipUrl = "https://data.gov.il/dataset/dff8a168-af6c-4e0f-bbe3-c4bd3646084c/resource/c68b4df6-c809-4bb5-a546-61fa1528fed5/download/parcel_all.zip"
targetFile = "parcel_all_from_web.zip"
import requests
try:
    r = requests.get(zipUrl)
    with open(targetFile, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
except:
    print("error in download")
I have a lot of URLs with file types .docx and .pdf, and I want to run a Python script that downloads them from the URLs and saves them in a folder. Here is what I've done for a single file; I'll add them to a for loop:
response = requests.get('http://wbesite.com/Motivation-Letter.docx')
with open("my_file.docx", 'wb') as f:
    f.write(response.content)
but the my_file.docx it saves is only 266 bytes and is corrupt, even though the URL is fine.
UPDATE:
I added this code and it works, but I want to save the file in a new folder.
import os
import shutil
import requests
def download_file(url, folder_name):
    local_filename = url.split('/')[-1]
    path = os.path.join("/{}/{}".format(folder_name, local_filename))
    with requests.get(url, stream=True) as r:
        with open(path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
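Note that os.path.join with a single pre-formatted "/{}/{}" string does not join anything; it just returns an absolute path starting at the filesystem root. A sketch of what was probably intended, assuming folder_name is meant to be relative to the current directory:

import os
import shutil
import requests

def download_file(url, folder_name):
    local_filename = url.split('/')[-1]
    os.makedirs(folder_name, exist_ok=True)           # create the target folder if needed
    path = os.path.join(folder_name, local_filename)  # let os.path.join do the joining
    with requests.get(url, stream=True) as r:
        with open(path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename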
Try using the stream option:
import os
import requests
def download(url: str, dest_folder: str):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)  # create folder if it does not exist

    filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names
    file_path = os.path.join(dest_folder, filename)

    r = requests.get(url, stream=True)
    if r.ok:
        print("saving to", os.path.abspath(file_path))
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 8):
                if chunk:
                    f.write(chunk)
                    f.flush()
                    os.fsync(f.fileno())
    else:  # HTTP status code 4XX/5XX
        print("Download failed: status code {}\n{}".format(r.status_code, r.text))

download("http://website.com/Motivation-Letter.docx", dest_folder="mydir")
Note that mydir in the example above is the name of a folder in the current working directory. If mydir does not exist, the script will create it in the current working directory and save the file there. Your user must have permission to create directories and files in the current working directory.
You can pass an absolute file path in dest_folder, but check permissions first.
P.S.: avoid asking multiple questions in one post
Try:
import urllib.request
urllib.request.urlretrieve(url, filename)
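Note that urlretrieve will not create missing directories for you, so create the destination folder first if you want the file somewhere other than the current directory. A sketch; the URL and folder name are just placeholders:

import os
import urllib.request

os.makedirs("mydir", exist_ok=True)  # urlretrieve itself won't create the folder
urllib.request.urlretrieve("http://website.com/Motivation-Letter.docx",
                           os.path.join("mydir", "my_file.docx"))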
I'm new to Python and attempting to download the NIST NVD JSON files. I have tried several methods, but they only write a file of about 324 bytes. If I do one file, that does in fact work, but there are several files to download for this.
I did try adjusting the chunk_size, but I still can't get a 1 to 6 MB zip file to download.
from requests import get
def download(url, filename):
    response = get(url, stream=True)
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print('Downloaded! ', filename)

with open('NVD_JSON_SOURCE_URLS.txt') as f:
    for line in f:
        filename = line.split('/')[-1]
        url = line
        download(url, filename)
The input works and it starts the downloads, it just never completes them. Clearly I am missing something frustratingly simple here, but after 2 days I am not getting any closer. Thanks.
I find chunking to be painful for instances like this in Python. This approach has worked for me frequently:
import requests
import shutil
def download_file(url, filename):
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
This copies the response to disk in fixed-size chunks rather than reading the whole file into memory at once, so it works for large files too; the couple of MB you are talking about will be no problem.
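One caveat with r.raw: it is the undecoded socket stream, so if the server sends the body gzip- or deflate-compressed, the file on disk will contain the compressed bytes. Setting decode_content=True asks urllib3 to decompress while copying; a variant of the same approach with that flag and an HTTP status check:

import requests
import shutil

def download_file(url, filename):
    with requests.get(url, stream=True) as r:
        r.raise_for_status()          # fail on HTTP 4xx/5xx
        r.raw.decode_content = True   # decompress gzip/deflate while copying
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)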
I think line has trailing whitespace characters (the newline at the end of each line), so if you remove them from line using strip(), the code should work.
for line in f:
    line = line.strip()
    ...
I tested it and it works for me.
That is because there is a line break at the end of each line when you read the data from the .txt file, so you should strip the line break first.
from requests import get
def download(url, filename):
    response = get(url, stream=True)
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print('Downloaded! ', filename)

with open('NVD_JSON_SOURCE_URLS.txt') as f:
    for line in f:
        line = line.strip()
        filename = line.split('/')[-1]
        url = line
        download(url, filename)
Currently, I am using this code to save a downloaded file, but it places the file in the same folder the script is run from.
r = requests.get(url)
with open('file_name.pdf', 'wb') as f:
    f.write(r.content)
How would I save the downloaded file to another directory of my choice?
Or, if on Linux, try:
# To save to an absolute path.
r = requests.get(url)
with open('/path/I/want/to/save/file/to/file_name.pdf', 'wb') as f:
    f.write(r.content)

# To save to a relative path.
r = requests.get(url)
with open('folder1/folder2/file_name.pdf', 'wb') as f:
    f.write(r.content)
See open() function docs for more details.
You can just give open a full file path or a relative file path
r = requests.get(url)
with open(r'C:\path\to\save\file_name.pdf', 'wb') as f:
    f.write(r.content)
As long as you have access to the directory, you can simply change 'file_name.pdf' to '/path_to_directory_you_want_to_save/file_name.pdf' and that should do what you want.
Here is a quicker solution:
r = requests.get(url)
open('/path/to/directory/file_name.pdf', 'wb').write(r.content)
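If the target directory might not exist yet, pathlib can create it before writing. A sketch; the paths are placeholders:

from pathlib import Path
import requests

r = requests.get(url)
dest = Path('/path/to/directory') / 'file_name.pdf'
dest.parent.mkdir(parents=True, exist_ok=True)  # create the folder if needed
dest.write_bytes(r.content)                     # write and close in one call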
Trying to download a file from this URL. It is an Excel file, and it downloads only 1 KB of it, while the file size is actually 7 MB. I don't understand what has to be done here.
But if I copy and paste the URL into IE, the entire file is downloaded.
res = requests.get('http://fescodoc.***.com/fescodoc/component/getcontent?objectId=09016fe98f2b59bb&current=true')
res.raise_for_status()
playFile = open('DC_Estimation Form.xlsm', 'wb')
for chunk in res.iter_content(1024):
    playFile.write(chunk)
You should set stream to True in the get():
res = requests.get('http://fescodoc.***.com/fescodoc/component/getcontent?objectId=09016fe98f2b59bb&current=true', stream=True)
res.raise_for_status()
with open('DC_Estimation Form.xlsm', 'wb') as playFile:
    for chunk in res.iter_content(1024):
        playFile.write(chunk)
See here: http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
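As those docs note, with stream=True the connection is only released back to the pool once the body is consumed or the response is closed, so it is safest to use the response as a context manager. The same code as above, just wrapped:

import requests

url = 'http://fescodoc.***.com/fescodoc/component/getcontent?objectId=09016fe98f2b59bb&current=true'
with requests.get(url, stream=True) as res:
    res.raise_for_status()
    with open('DC_Estimation Form.xlsm', 'wb') as playFile:
        for chunk in res.iter_content(1024):
            playFile.write(chunk)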
It is easier to use the built-in urllib module for such cases: https://docs.python.org/2/library/urllib.html#urllib.urlretrieve
urllib.urlretrieve('http://fescodoc/component/.', 'DC_Estimation_Form.xslm')
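That link is for Python 2; in Python 3 the same function lives in urllib.request:

import urllib.request

urllib.request.urlretrieve('http://fescodoc/component/.', 'DC_Estimation_Form.xslm')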