I am using requests to download files, but for large files I have to check the size of the file on disk every time, because I can't display the progress as a percentage and I would also like to know the download speed. How can I go about doing this? Here's my code:
import requests
import sys
import time
import os

def downloadFile(url, directory):
    localFilename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    start = time.clock()
    f = open(directory + '/' + localFilename, 'wb')
    for chunk in r.iter_content(chunk_size=512 * 1024):
        if chunk:
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())
    f.close()
    return (time.clock() - start)

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = raw_input("Enter the URL : ")
    directory = raw_input("Where would you want to save the file ?")
    time_elapsed = downloadFile(url, directory)
    print "Download complete..."
    print "Time Elapsed: " + str(time_elapsed)

if __name__ == "__main__":
    main()
I think one way to do it would be to read the file every time in the for loop and calculate the percentage of progress based on the Content-Length header. But that would again be an issue for large files (around 500 MB). Is there any other way to do it?
see here: Python progress bar and downloads
I think the code would be something like this; it should show the average speed since the start, in bytes per second:
import requests
import sys
import time

def downloadFile(url, directory):
    localFilename = url.split('/')[-1]
    with open(directory + '/' + localFilename, 'wb') as f:
        start = time.clock()
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
        dl = 0
        if total_length is None:  # no content length header
            f.write(r.content)
        else:
            total_length = int(total_length)  # the header value is a string
            for chunk in r.iter_content(1024):
                dl += len(chunk)
                f.write(chunk)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s] %s bps" % ('=' * done, ' ' * (50 - done), dl // (time.clock() - start)))
    print ''
    return (time.clock() - start)

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = raw_input("Enter the URL : ")
    directory = raw_input("Where would you want to save the file ?")
    time_elapsed = downloadFile(url, directory)
    print "Download complete..."
    print "Time Elapsed: " + str(time_elapsed)

if __name__ == "__main__":
    main()
An improved version of the accepted answer for Python 3, using io.BytesIO (writing to memory), reporting the result in Mbps, with support for IPv4/IPv6 and size and port arguments.
import sys, time, io, requests

def speed_test(size=5, ipv="ipv4", port=80):
    if size == 1024:
        size = "1GB"
    else:
        size = f"{size}MB"

    url = f"http://{ipv}.download.thinkbroadband.com:{port}/{size}.zip"
    with io.BytesIO() as f:
        start = time.perf_counter()
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
        dl = 0
        if total_length is None:  # no content length header
            f.write(r.content)
        else:
            for chunk in r.iter_content(1024):
                dl += len(chunk)
                f.write(chunk)
                done = int(30 * dl / int(total_length))
                sys.stdout.write("\r[%s%s] %s Mbps" % ('=' * done, ' ' * (30 - done), dl // (time.perf_counter() - start) / 100000))
    print(f"\n{size} = {(time.perf_counter() - start):.2f} seconds")
Usage Examples:
speed_test()
speed_test(10)
speed_test(50, "ipv6")
speed_test(1024, port=8080)
Output Sample:
[==============================] 61.34037 Mbps
100MB = 17.10 seconds
Available Options:
size: 5, 10, 20, 50, 100, 200, 512, 1024
ipv: ipv4, ipv6
port: 80, 81, 8080
Updated on 20221011:
time.perf_counter() replaced time.clock(), which was deprecated in Python 3.3 (kudos to shiro)
I had a problem downloading a big file from a specific slow server:
no Content-Length header,
a big file (42 GB),
no compression,
a slow server (<1 MB/s).
Because the file is that big, I also had a problem with memory usage during the request. Requests doesn't write the output to a file the way urllib does; it looks like it keeps it in memory.
With no Content-Length header, the accepted answer ends up not monitoring anything.
So I wrote this basic method to monitor speed during the CSV download, following just the requests documentation.
It needs fname (the complete output path) and link (http or https), and you can specify custom headers.
import time
import requests

BLOCK = 5 * 1024 * 1024

try:
    with open(fname, 'wb') as f:
        r = requests.get(link, headers=headers, stream=True)
        ## iter_lines is used because the official documentation suggests it,
        ## saying it's more reliable than cycling directly over the stream, so data isn't lost
        lines = r.iter_lines()
        ## Init the base vars for monitoring and block management.
        ## obj is a byte object, because iter_lines returns bytes objects
        tsize = 0; obj = bytearray(); t0 = time.time(); i = 0
        for line in lines:
            ## calculate the line size, in bytes, and add it to the byte object
            tsize += len(line)
            obj.extend(line)
            ## when the block size is reached,
            if tsize > BLOCK:
                ## increment the block number
                i += 1
                ## calculate the speed.. this is in MB/s,
                ## but you can easily change it to KB/s, or blocks/s
                t1 = time.time()
                t = t1 - t0
                speed = round(5 / t, 2)
                ## write the block to the file
                f.write(obj)
                ## write stats
                print('got', i * 5, 'MB ', 'block', i, ' #', speed, 'MB/s')
                ## reinit all the base vars, for a new block
                obj = bytearray(); tsize = 0; t0 = time.time()
        ## write the last block part to the file
        f.write(obj)
except Exception as e:
    print("Error: ", e, 0)
Related
I am trying to download a file from a server using Python. Sometimes the file is very large and I would like to have some progress bar; one way to do this that I can come up with is to download it as a stream, so that I can print the progress. Currently I have tried the standard urlopen, urlretrieve, and the requests module (with stream on).
Obviously, urlopen cannot download a file as a stream; the requests module supports this, but the server has a limit on the number of files I can download at one time (its limit is 1). So every time I try to use requests, it only gets the web page telling me to wait. Is there any other way to do this?
I have very recently downloaded many types of media with this function:
import sys
import requests
import time

def download_resource(domain, url, file_name = None, download = True):
    cookies = {}
    s = requests.Session()
    s.config['keep_alive'] = True  # note: Session.config only exists in very old requests versions (pre-1.0)
    #add your own cookies here, I have a specific function I call
    #for my application but yours is different
    r = s.get(url, cookies = cookies, stream = True)

    if not r.ok:
        print "error in downloading"
        return -1

    file_size = int(r.headers['content-length'])
    if not file_name:
        try:
            temp = r.headers['content-disposition']
        except Exception as e:
            pass
            #failing download
            return -1
        else:
            if not temp:
                return -1
            else:
                file_name = temp.split("filename=")[-1]
                return_obj["filename"] = file_name  # return_obj is defined elsewhere in my application

    #print "File size:", file_size
    #print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
    print "Downloading:", file_name

    if download:
        with open(file_name, "wb") as fh:
            count = 1
            chunk_size = 1048576
            start_time = time.time()
            try:
                for block in r.iter_content(chunk_size):
                    total_time = time.time() - start_time
                    percent = count*chunk_size/float(file_size) * 100.0
                    fraction = int(percent/5)
                    download_speed = 1.0 / total_time
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('='* fraction , percent, download_speed))
                    sys.stdout.flush()
                    if not block:
                        break
                    fh.write(block)
                    count += 1
                    start_time = time.time()
            except Exception as e:
                print e
            finally:
                #close up the stream
                r.close()
I am trying to download a file's buffer in 5 threads, but it seems like it's getting garbled.
from numpy import arange
import requests
from threading import Thread
import urllib2

url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = r = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers['content-length']
splitBy = 5

splits = arange(splitBy + 1) * (float(sizeInBytes)/splitBy)

dataLst = []

def bufferSplit(url, idx, splits):
    req = urllib2.Request(url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
    print {'bytes=%d-%d' % (splits[idx], splits[idx+1])}
    dataLst.append(urllib2.urlopen(req).read())

for idx in range(splitBy):
    dlth = Thread(target=bufferSplit, args=(url, idx, splits))
    dlth.start()

print dataLst

with open('page.html', 'w') as fh:
    fh.write(''.join(dataLst))
Update:
So I worked it over and made a little progress; however, if I download a jpg it seems to be corrupted:
from numpy import arange
import os
import requests
import threading
import urllib2

# url = 'http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3'
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
# url = 'http://pymotw.com/2/urllib/index.html'

sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
splitBy = 5

dataLst = []

class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, url, fileName, splitBy=5):
        super(ThreadedFetch, self).__init__()
        self.__url = url
        self.__spl = splitBy
        self.__dataLst = []
        self.__fileName = fileName

    def run(self):
        if not sizeInBytes:
            print "Size cannot be determined."
            return
        splits = arange(self.__spl + 1) * (float(sizeInBytes)/self.__spl)
        for idx in range(self.__spl):
            req = urllib2.Request(self.__url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
            self.__dataLst.append(urllib2.urlopen(req).read())

    def getFileData(self):
        return ''.join(self.__dataLst)

fileName = url.split('/')[-1]

dl = ThreadedFetch(url, fileName)
dl.start()
dl.join()
content = dl.getFileData()
if content:
    with open(fileName, 'w') as fh:
        fh.write(content)
    print "Finished Writing file %s" % fileName
Below is how the image looks after being downloaded.
Here's another version of the project. Differences:
thread code is a single small function
each thread downloads a chunk, then stores it in a global threadsafe dictionary
threads are started, then join()ed -- they're all running at once
when all done, data is reassembled in correct order then written to disk
extra printing, to verify everything's correct
output file size is calculated, for an extra comparison
source
import os, requests
import threading
import urllib2
import time

URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum((
            len(chunk) for chunk in dataDict.values()
        ))
    )

    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
output
102331 bytes to download.
done: got 3 chunks, total 102331 bytes
--- 0.380599021912 seconds ---
Finished Writing file 607800main_kepler1200_1600-1200.jpg
file size 102331 bytes
Here is how I got it working. If anyone has any suggestions for possible improvement, you are most welcome.
import os
import requests
import threading
import urllib2
import time

url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

class SplitBufferThreads(threading.Thread):
    """ Splits the buffer into any number of threads,
    thereby concurrently downloading through
    any number of threads.
    """
    def __init__(self, url, byteRange):
        super(SplitBufferThreads, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.req = None

    def run(self):
        self.req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})

    def getFileData(self):
        return urllib2.urlopen(self.req).read()

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataLst = []
    for idx in range(splitBy):
        byteRange = buildRange(int(sizeInBytes), splitBy)[idx]
        bufTh = SplitBufferThreads(url, byteRange)
        bufTh.start()
        bufTh.join()
        dataLst.append(bufTh.getFileData())

    content = ''.join(dataLst)

    if dataLst:
        if os.path.exists(fileName):
            os.remove(fileName)
        print "--- %s seconds ---" % str(time.time() - start_time)
        with open(fileName, 'w') as fh:
            fh.write(content)
        print "Finished Writing file %s" % fileName

if __name__ == '__main__':
    main(url)
This is the first bare-bones code I have gotten working. I discovered that if I set the bufTh buffer thread's daemon flag to False, the process takes more time to finish.
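One thing worth noting about the class above: run() only builds the Request object, and getFileData() does the actual urlopen().read() in the calling thread; on top of that, each thread is join()ed right after start(), so the ranges are fetched one after another rather than in parallel. A minimal sketch of a variant (assuming the same buildRange, url, sizeInBytes, and splitBy as above) that does the read inside run() and starts all threads before joining any of them:

import threading
import urllib2

class RangeDownloadThread(threading.Thread):
    def __init__(self, url, byteRange):
        super(RangeDownloadThread, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.data = ''

    def run(self):
        # do the network read inside the thread, not in a separate method
        req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})
        self.data = urllib2.urlopen(req).read()

ranges = buildRange(int(sizeInBytes), splitBy)
threads = [RangeDownloadThread(url, r) for r in ranges]
for th in threads:
    th.start()    # start every thread first...
for th in threads:
    th.join()     # ...then wait for all of them
content = ''.join(th.data for th in threads)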
I'm working on making a progress bar for my Python Dropbox app using the Dropbox API. The problem is I can't figure out how to get the number of bytes written so far so that I can build it. Is there any way to do this using the Python Dropbox API? If not, could I measure the bytes being sent out from my system instead, possibly using the os module?
I'm trying to get it to work with get_chunked_uploader(), but it would be great if I could get the bytes written for put_file() as well as file_copy() and file_move(). My code so far is something like this:
if (file_size >= 4194304):
    big_file = open(path_to_file, 'rb')
    uploader = client.get_chunked_uploader(big_file)
    print "uploading " + path_to_file
    while uploader.offset < file_size:
        percent_complete = bytes_written / file_size * 100
        clearscreen()
        print "%.2f" % percent_complete + "%"
Thanks!
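As a side note on the snippet above: nothing inside the while loop actually sends data, so uploader.offset never advances. A minimal sketch of the underlying idea, sending one chunk per iteration with the same client.upload_chunk() call that the answer below relies on, and using the returned offset as the bytes written so far (the chunk size and the reuse of path_to_file, file_size, client, and clearscreen() from the snippet above are assumptions):

from StringIO import StringIO

chunk_size = 4194304  # 4 MB
offset = 0
upload_id = None
with open(path_to_file, 'rb') as big_file:
    while offset < file_size:
        chunk = big_file.read(chunk_size)
        # upload one chunk; the server returns the new offset, i.e. the bytes written so far
        offset, upload_id = client.upload_chunk(StringIO(chunk), len(chunk), offset, upload_id)
        percent_complete = offset / float(file_size) * 100
        clearscreen()
        print "%.2f" % percent_complete + "%"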
I recreated ChunkedUploader:
import dropbox
from StringIO import StringIO

f = open(filetoupload, 'rb')
uploader = MMChunkedUploader(self.client, f, file_size, 1024*200)
uploader.upload_chunked()
uploader.finish(dropboxfilename)

class MMChunkedUploader(object):
    """Contains the logic around a chunked upload, which uploads a
    large file to Dropbox via the /chunked_upload endpoint.
    """
    def __init__(self, client, file_obj, length, chunk_size=4 * 1024 * 1024):
        self.client = client
        self.offset = 0
        self.upload_id = None
        self.last_block = None
        self.file_obj = file_obj
        self.target_length = length
        self.chunk_size = chunk_size
        self.clocknumber = 0
        dec = float(self.target_length)/chunk_size - self.target_length//chunk_size
        if dec > 0:
            self.totalblock = self.target_length/chunk_size + 1
        else:
            self.totalblock = self.target_length/chunk_size

    def upload_chunked(self, chunk_size=0):
        """Uploads data from this ChunkedUploader's file_obj in chunks, until
        an error occurs. Throws an exception when an error occurs, and can
        be called again to resume the upload.

        Parameters
            chunk_size
                The number of bytes to put in each chunk. (Default 4 MB.)
        """
        if chunk_size == 0:
            chunk_size = self.chunk_size
        self.clocknumber = 0

        while self.offset < self.target_length:
            self.clocknumber += 1
            print "Block n.", repr(self.clocknumber), " of ", repr(self.totalblock), " %", round((float(self.clocknumber) * 100) / self.totalblock, 0)
            next_chunk_size = min(chunk_size, self.target_length - self.offset)  # pick the smaller of chunk_size and what is left
            if self.last_block == None:
                self.last_block = self.file_obj.read(next_chunk_size)
                print "Reading a file block"

            try:
                (self.offset, self.upload_id) = self.client.upload_chunk(
                    StringIO(self.last_block), next_chunk_size, self.offset, self.upload_id)
                self.last_block = None
            except dropbox.rest.ErrorResponse as e:
                # Handle the case where the server tells us our offset is wrong.
                must_reraise = True
                if e.status == 400:
                    reply = e.body
                    if "offset" in reply and reply['offset'] != 0 and reply['offset'] > self.offset:
                        self.last_block = None
                        self.offset = reply['offset']
                        must_reraise = False
                if must_reraise:
                    raise

    def finish(self, path, overwrite=False, parent_rev=None):
        path = "/commit_chunked_upload/%s%s" % (self.client.session.root, dropbox.client.format_path(path))

        params = dict(
            overwrite=bool(overwrite),
            upload_id=self.upload_id
        )

        if parent_rev is not None:
            params['parent_rev'] = parent_rev

        url, params, headers = self.client.request(path, params, content_server=True)
        return self.client.rest_client.POST(url, params, headers)
I am developing a script to download online live-streaming videos.
My Script:
print "Recording video..."
response = urllib2.urlopen("streaming online video url")
filename = time.strftime("%Y%m%d%H%M%S",time.localtime())+".avi"
f = open(filename, 'wb')
video_file_size_start = 0
video_file_size_end = 1048576 * 7 # end in 7 mb
block_size = 1024
while True:
try:
buffer = response.read(block_size)
if not buffer:
break
video_file_size_start += len(buffer)
if video_file_size_start > video_file_size_end:
break
f.write(buffer)
except Exception, e:
logger.exception(e)
f.close()
The above script works fine for downloading 7 MB of video from live-streaming content and storing it into *.avi files.
However, I would like to download just 10 seconds of video regardless of the file size and store it in an .avi file.
I tried different possibilities, but without success.
Could anyone please share your knowledge here to fix my issue?
Thanks in advance.
I don't think there is any way of doing that without constantly analysing the video, which would be way too costly. So you could take a guess at how many MB you need and, once done, check that it's long enough; if it's too long, just cut it (see the trimming sketch after this answer). Instead of guessing, you could also build up some statistics on how much you need to retrieve. You could also replace the while True with:
start_time_in_seconds = time.time()
time_limit = 10
while time.time() - start_time_in_seconds < time_limit:
    ...
This should give you at least 10 seconds of video, unless connecting takes too much time (you'd get less than 10 seconds then) or the server sends more for buffering (but that's unlikely for live streams).
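For the "if it's too long, just cut it" step, one option (an assumption on my part, not something from this thread) is to trim the saved file with ffmpeg instead of analysing it in Python:

import subprocess

# Keep only the first 10 seconds of the recorded file; "-c copy" avoids re-encoding.
# Assumes ffmpeg is installed; recorded.avi and trimmed.avi are placeholder names.
subprocess.check_call([
    "ffmpeg", "-y",
    "-i", "recorded.avi",
    "-t", "10",
    "-c", "copy",
    "trimmed.avi",
])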
You can use the 'Content-Length' header to retrieve the video filesize if it exists.
video_file_size_end = response.info().getheader('Content-Length')
response.read() does not work; response.iter_content() seems to do the trick.
import time
import requests

print("Recording video...")
filename = time.strftime("/tmp/" + "%Y%m%d%H%M%S", time.localtime()) + ".avi"
file_handle = open(filename, 'wb')
chunk_size = 1024
start_time_in_seconds = time.time()
time_limit = 10  # time in seconds, for recording
time_elapsed = 0
url = "http://demo.codesamplez.com/html5/video/sample"
with requests.Session() as session:
    response = session.get(url, stream=True)
    for chunk in response.iter_content(chunk_size=chunk_size):
        if time_elapsed > time_limit:
            break
        # to print time elapsed
        if int(time.time() - start_time_in_seconds) - time_elapsed > 0:
            time_elapsed = int(time.time() - start_time_in_seconds)
            print(time_elapsed, end='\r', flush=True)
        if chunk:
            file_handle.write(chunk)
file_handle.close()
I am trying to create a download progress bar in Python using the urllib2 HTTP client. I've looked through the API (and on Google) and it seems that urllib2 does not allow you to register progress hooks. However, the older deprecated urllib does have this functionality.
Does anyone know how to create a progress bar or reporting hook using urllib2? Or are there some other hacks to get similar functionality?
Here's a fully working example that builds on Anurag's approach of chunking in a response. My version allows you to set the chunk size and attach an arbitrary reporting function:
import urllib2, sys

def chunk_report(bytes_so_far, chunk_size, total_size):
    percent = float(bytes_so_far) / total_size
    percent = round(percent*100, 2)
    sys.stdout.write("Downloaded %d of %d bytes (%0.2f%%)\r" %
        (bytes_so_far, total_size, percent))

    if bytes_so_far >= total_size:
        sys.stdout.write('\n')

def chunk_read(response, chunk_size=8192, report_hook=None):
    total_size = response.info().getheader('Content-Length').strip()
    total_size = int(total_size)
    bytes_so_far = 0

    while 1:
        chunk = response.read(chunk_size)
        bytes_so_far += len(chunk)

        if not chunk:
            break

        if report_hook:
            report_hook(bytes_so_far, chunk_size, total_size)

    return bytes_so_far

if __name__ == '__main__':
    response = urllib2.urlopen('http://www.ebay.com')
    chunk_read(response, report_hook=chunk_report)
Why not just read the data in chunks and do whatever you want to do in between, e.g. run it in a thread, hook it into a UI, etc.?
import urllib2

urlfile = urllib2.urlopen("http://www.google.com")
data_list = []
chunk = 4096
while 1:
    data = urlfile.read(chunk)
    if not data:
        print "done."
        break
    data_list.append(data)
    print "Read %s bytes" % len(data)
output:
Read 4096 bytes
Read 3113 bytes
done.
urlgrabber has built-in support for progress notification.
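A minimal sketch of what that can look like, assuming urlgrabber is installed (this mirrors the example from urlgrabber's own documentation, not code from this thread):

from urlgrabber.grabber import urlgrab
from urlgrabber.progress import text_progress_meter

# download a file while urlgrabber prints a text progress meter to the terminal
urlgrab('http://example.com/big.file', 'big.file', progress_obj=text_progress_meter())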
Simplified version:
import sys
import urllib2

temp_filename = "/tmp/" + file_url.split('/')[-1]
f = open(temp_filename, 'wb')
remote_file = urllib2.urlopen(file_url)

try:
    total_size = remote_file.info().getheader('Content-Length').strip()
    header = True
except AttributeError:
    header = False  # a response doesn't always include the "Content-Length" header

if header:
    total_size = int(total_size)

bytes_so_far = 0

while True:
    buffer = remote_file.read(8192)
    if not buffer:
        sys.stdout.write('\n')
        break

    bytes_so_far += len(buffer)
    f.write(buffer)

    if not header:
        total_size = bytes_so_far  # unknown size

    percent = float(bytes_so_far) / total_size
    percent = round(percent*100, 2)
    sys.stdout.write("Downloaded %d of %d bytes (%0.2f%%)\r" % (bytes_so_far, total_size, percent))
Minor modification to Triptych's response to allow for actually writing out the file (python3):
import sys
from urllib.request import urlopen

def chunk_report(bytes_so_far, chunk_size, total_size):
    percent = float(bytes_so_far) / total_size
    percent = round(percent*100, 2)
    sys.stdout.write("Downloaded %d of %d bytes (%0.2f%%)\r" %
        (bytes_so_far, total_size, percent))

    if bytes_so_far >= total_size:
        sys.stdout.write('\n')

def chunk_read(response, chunk_size=8192, report_hook=None):
    total_size = response.info().get("Content-Length").strip()
    total_size = int(total_size)
    bytes_so_far = 0
    data = b""

    while 1:
        chunk = response.read(chunk_size)
        bytes_so_far += len(chunk)

        if not chunk:
            break

        if report_hook:
            report_hook(bytes_so_far, chunk_size, total_size)

        data += chunk

    return data
Usage:
with open(out_path, "wb") as f:
    response = urlopen(filepath)
    data_read = chunk_read(response, report_hook=chunk_report)
    f.write(data_read)