Python urllib2 Progress Hook

I am trying to create a download progress bar in python using the urllib2 http client. I've looked through the API (and on google) and it seems that urllib2 does not allow you to register progress hooks. However the older deprecated urllib does have this functionality.
Does anyone know how to create a progress bar or reporting hook using urllib2? Or are there some other hacks to get similar functionality?
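For reference, the hook in the older urllib that the question mentions is the reporthook argument of urlretrieve, which is called with the block number, block size, and total size; a minimal Python 2 sketch (the output filename is arbitrary, and totalsize is -1 when the server sends no Content-Length):
import urllib

def report(blocknum, blocksize, totalsize):
    # roughly blocknum * blocksize bytes have arrived so far
    if totalsize > 0:
        percent = min(100.0, blocknum * blocksize * 100.0 / totalsize)
        print "%.2f%%" % percent

urllib.urlretrieve("http://www.ebay.com", "out.html", reporthook=report)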

Here's a fully working example that builds on Anurag's approach of reading the response in chunks. My version lets you set the chunk size and attach an arbitrary reporting function:
import urllib2, sys

def chunk_report(bytes_so_far, chunk_size, total_size):
    percent = float(bytes_so_far) / total_size
    percent = round(percent * 100, 2)
    sys.stdout.write("Downloaded %d of %d bytes (%0.2f%%)\r" %
                     (bytes_so_far, total_size, percent))
    if bytes_so_far >= total_size:
        sys.stdout.write('\n')

def chunk_read(response, chunk_size=8192, report_hook=None):
    total_size = response.info().getheader('Content-Length').strip()
    total_size = int(total_size)
    bytes_so_far = 0
    while 1:
        chunk = response.read(chunk_size)
        bytes_so_far += len(chunk)
        if not chunk:
            break
        if report_hook:
            report_hook(bytes_so_far, chunk_size, total_size)
    return bytes_so_far

if __name__ == '__main__':
    response = urllib2.urlopen('http://www.ebay.com')
    chunk_read(response, report_hook=chunk_report)

Why not just read the data in chunks and do whatever you want in between each read, e.g. run it in a thread, hook it into a UI, and so on?
import urllib2

urlfile = urllib2.urlopen("http://www.google.com")
data_list = []
chunk = 4096
while 1:
    data = urlfile.read(chunk)
    if not data:
        print "done."
        break
    data_list.append(data)
    print "Read %s bytes" % len(data)
output:
Read 4096 bytes
Read 3113 bytes
done.
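The collected chunks can then be reassembled afterwards, e.g. data = ''.join(data_list).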

urlgrabber has built-in support for progress notification.
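Usage is roughly the following (a sketch from memory of the urlgrabber API -- urlgrab() and TextMeter are the names I recall; verify against the urlgrabber documentation before relying on this):
from urlgrabber.grabber import urlgrab
from urlgrabber.progress import TextMeter

# TextMeter prints a console progress meter while the file downloads
urlgrab("http://www.ebay.com/somefile", "somefile", progress_obj=TextMeter())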

Simplified version:
import sys
import urllib2

temp_filename = "/tmp/" + file_url.split('/')[-1]
f = open(temp_filename, 'wb')
remote_file = urllib2.urlopen(file_url)
try:
    total_size = remote_file.info().getheader('Content-Length').strip()
    header = True
except AttributeError:
    header = False  # a response doesn't always include the "Content-Length" header
if header:
    total_size = int(total_size)
bytes_so_far = 0
while True:
    buffer = remote_file.read(8192)
    if not buffer:
        sys.stdout.write('\n')
        break
    bytes_so_far += len(buffer)
    f.write(buffer)
    if not header:
        total_size = bytes_so_far  # unknown size
    percent = float(bytes_so_far) / total_size
    percent = round(percent * 100, 2)
    sys.stdout.write("Downloaded %d of %d bytes (%0.2f%%)\r" % (bytes_so_far, total_size, percent))

Minor modification to Triptych's response to allow actually writing the file out (Python 3):
import sys
from urllib.request import urlopen

def chunk_report(bytes_so_far, chunk_size, total_size):
    percent = float(bytes_so_far) / total_size
    percent = round(percent * 100, 2)
    sys.stdout.write("Downloaded %d of %d bytes (%0.2f%%)\r" %
                     (bytes_so_far, total_size, percent))
    if bytes_so_far >= total_size:
        sys.stdout.write('\n')

def chunk_read(response, chunk_size=8192, report_hook=None):
    total_size = response.info().get("Content-Length").strip()
    total_size = int(total_size)
    bytes_so_far = 0
    data = b""
    while 1:
        chunk = response.read(chunk_size)
        bytes_so_far += len(chunk)
        if not chunk:
            break
        if report_hook:
            report_hook(bytes_so_far, chunk_size, total_size)
        data += chunk
    return data
Usage:
with open(out_path, "wb") as f:
    response = urlopen(filepath)
    data_read = chunk_read(response, report_hook=chunk_report)
    f.write(data_read)
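For a large file, building up data in memory can be avoided entirely; here is a minimal variation of chunk_read above (not from the original answer) that writes each chunk straight to an open file object instead of concatenating:
def chunk_read_to_file(response, out_file, chunk_size=8192, report_hook=None):
    # Same loop as chunk_read, but streams each chunk to out_file.
    total_size = int(response.info().get("Content-Length").strip())
    bytes_so_far = 0
    while True:
        chunk = response.read(chunk_size)
        if not chunk:
            break
        out_file.write(chunk)
        bytes_so_far += len(chunk)
        if report_hook:
            report_hook(bytes_so_far, chunk_size, total_size)
    return bytes_so_far
Usage mirrors the snippet above:
with open(out_path, "wb") as f:
    response = urlopen(filepath)
    chunk_read_to_file(response, f, report_hook=chunk_report)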

Related

Streaming (chunk-by-chunk) reading using python urllib2.urlopen gets only a partial result

I found a way to do streaming reading in Python in the most-voted answer of this post:
Stream large binary files with urllib2 to file.
But it goes wrong: I only get the front part of the data when I do some time-consuming work after each chunk has been read.
from urllib2 import urlopen
from urllib2 import HTTPError
import sys
import time

CHUNK = 1024 * 1024 * 16
try:
    response = urlopen("XXX_domain/XXX_file_in_net.gz")
except HTTPError as e:
    print e
    sys.exit(1)
while True:
    chunk = response.read(CHUNK)
    print 'CHUNK:', len(chunk)
    # some time-consuming work, just as an example
    time.sleep(60)
    if not chunk:
        break
Without the sleep, the output is right (the chunk sizes add up to the actual file size):
CHUNK: 16777216
CHUNK: 16777216
CHUNK: 6888014
CHUNK: 0
With the sleep:
CHUNK: 16777216
CHUNK: 766580
CHUNK: 0
I decompressed these chunks and found that only the front part of the gz file had been read. (Presumably the server drops the idle connection during the long pause, so later reads return nothing instead of raising an error.)
Try supporting resume of the download ("breakpoint" resuming), in case the server closes the connection before sending all the data.
import socket
from urllib2 import Request, urlopen, HTTPError

# Excerpt from a larger function: the_url, handled_size and
# DoSomeTimeConsumingJob() are defined elsewhere.
try:
    request = Request(the_url, headers={'Range': 'bytes=0-'})
    response = urlopen(request, timeout=60)
except HTTPError as e:
    print e
    return 'Connection Error'
print dict(response.info())
header_dict = dict(response.info())
global content_size
if 'content-length' in header_dict:
    content_size = int(header_dict['content-length'])
CHUNK = 16 * 1024 * 1024
while True:
    while True:
        try:
            chunk = response.read(CHUNK)
        except socket.timeout as e:
            print 'time_out'
            break
        if not chunk:
            break
        DoSomeTimeConsumingJob()
        global handled_size
        handled_size = handled_size + len(chunk)
    if handled_size == content_size and content_size != 0:
        break
    else:
        try:
            request = Request(the_url, headers={'Range': 'bytes=' + str(handled_size) + '-'})
            response = urlopen(request, timeout=60)
        except HTTPError as e:
            print e
response.close()
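Resuming this way only works if the server honors Range requests. A quick check (a sketch, not part of the original answer) is to ask for a single byte and look at the status code before relying on the resume logic:
# A 206 (Partial Content) response indicates the server supports byte ranges;
# a plain 200 means it ignored the Range header and returned the full body.
probe = Request(the_url, headers={'Range': 'bytes=0-0'})
probe_response = urlopen(probe, timeout=60)
print probe_response.getcode()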

Trying to split the file download buffer into separate threads

I am trying to download a file's buffer in 5 threads, but the result seems to be garbled.
from numpy import arange
import requests
from threading import Thread
import urllib2

url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = r = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers['content-length']
splitBy = 5
splits = arange(splitBy + 1) * (float(sizeInBytes) / splitBy)
dataLst = []

def bufferSplit(url, idx, splits):
    req = urllib2.Request(url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
    print {'bytes=%d-%d' % (splits[idx], splits[idx+1])}
    dataLst.append(urllib2.urlopen(req).read())

for idx in range(splitBy):
    dlth = Thread(target=bufferSplit, args=(url, idx, splits))
    dlth.start()

print dataLst
with open('page.html', 'w') as fh:
    fh.write(''.join(dataLst))
Update:
So I kept working on it and made a little progress; however, if I download a jpg it comes out corrupted:
from numpy import arange
import os
import requests
import threading
import urllib2

# url = 'http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3'
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
# url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
splitBy = 5
dataLst = []

class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, url, fileName, splitBy=5):
        super(ThreadedFetch, self).__init__()
        self.__url = url
        self.__spl = splitBy
        self.__dataLst = []
        self.__fileName = fileName

    def run(self):
        if not sizeInBytes:
            print "Size cannot be determined."
            return
        splits = arange(self.__spl + 1) * (float(sizeInBytes) / self.__spl)
        for idx in range(self.__spl):
            req = urllib2.Request(self.__url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
            self.__dataLst.append(urllib2.urlopen(req).read())

    def getFileData(self):
        return ''.join(self.__dataLst)

fileName = url.split('/')[-1]
dl = ThreadedFetch(url, fileName)
dl.start()
dl.join()
content = dl.getFileData()
if content:
    with open(fileName, 'w') as fh:
        fh.write(content)
    print "Finished Writing file %s" % fileName
Below is how the image looks after being downloaded. (Screenshot omitted; it comes out corrupted.)
Here's another version of the project. Differences:
- thread code is a single small function
- each thread downloads a chunk, then stores it in a global threadsafe dictionary
- threads are started, then join()ed -- they're all running at once
- when all done, data is reassembled in correct order then written to disk
- extra printing, to verify everything's correct
- output file size is calculated, for an extra comparison
Source:
import os, requests
import threading
import urllib2
import time

URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return
    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return
    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum((
            len(chunk) for chunk in dataDict.values()
        ))
    )
    print "--- %s seconds ---" % str(time.time() - start_time)
    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)
    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
Output:
102331 bytes to download.
done: got 3 chunks, total 102331 bytes
--- 0.380599021912 seconds ---
Finished Writing file 607800main_kepler1200_1600-1200.jpg
file size 102331 bytes
Here is how I got it working. If anyone has any suggestions for possible improvements, you are most welcome.
import os
import requests
import threading
import urllib2
import time

url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

class SplitBufferThreads(threading.Thread):
    """ Splits the buffer across any number of threads,
    thereby downloading concurrently through
    any number of threads.
    """
    def __init__(self, url, byteRange):
        super(SplitBufferThreads, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.req = None

    def run(self):
        self.req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})

    def getFileData(self):
        return urllib2.urlopen(self.req).read()

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return
    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return
    dataLst = []
    for idx in range(splitBy):
        byteRange = buildRange(int(sizeInBytes), splitBy)[idx]
        bufTh = SplitBufferThreads(url, byteRange)
        bufTh.start()
        bufTh.join()
        dataLst.append(bufTh.getFileData())
    content = ''.join(dataLst)
    if dataLst:
        if os.path.exists(fileName):
            os.remove(fileName)
        print "--- %s seconds ---" % str(time.time() - start_time)
        with open(fileName, 'w') as fh:
            fh.write(content)
        print "Finished Writing file %s" % fileName

if __name__ == '__main__':
    main(url)
This is the first bare-bones version I got working. I found that if I set the bufTh buffer thread's daemon flag to False, the process takes more time to finish.
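One likely reason this version is slow: run() only builds the Request object, the actual network read happens in getFileData() on the calling thread, and join() immediately after start() serializes whatever work the threads do have. A hedged sketch of one way to make the reads actually overlap (the class below is a variation for illustration, not the original):
class SplitBufferThread(threading.Thread):
    def __init__(self, url, byteRange):
        super(SplitBufferThread, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.data = None

    def run(self):
        # do the download inside run(), so each thread reads concurrently
        req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})
        self.data = urllib2.urlopen(req).read()

threads = [SplitBufferThread(url, r) for r in buildRange(int(sizeInBytes), splitBy)]
for th in threads:
    th.start()
for th in threads:
    th.join()
content = ''.join(th.data for th in threads)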

Progress bar with Python Dropbox API

I'm working on making a progress bar for my python Dropbox app using the Dropbox API. The problem is I can't figure out how to get the number of bytes written so far, which I need in order to build it. Is there any way to do this using the python Dropbox API? If not, could I measure the bytes being sent out from my system instead, possibly using the os module?
I'm trying to get it to work with get_chunked_uploader(), but it would be great if I could get the bytes written for put_file() as well as file_copy() and file_move(). My code so far is something like this:
if file_size >= 4194304:
    big_file = open(path_to_file, 'rb')
    uploader = client.get_chunked_uploader(big_file)
    print "uploading " + path_to_file
    while uploader.offset < file_size:
        percent_complete = bytes_written / file_size * 100
        clearscreen()
        print "%.2f" % percent_complete + "%"
Thanks!
I recreated the ChunkedUploader:
f = open(filetoupload, 'rb')
uploader = MMChunkedUploader(self.client, f, file_size, 1024*200)
uploader.upload_chunked()
uploader.finish(dropboxfilename)

import dropbox
from StringIO import StringIO

class MMChunkedUploader(object):
    """Contains the logic around a chunked upload, which uploads a
    large file to Dropbox via the /chunked_upload endpoint.
    """
    def __init__(self, client, file_obj, length, chunk_size=4 * 1024 * 1024):
        self.client = client
        self.offset = 0
        self.upload_id = None
        self.last_block = None
        self.file_obj = file_obj
        self.target_length = length
        self.chunk_size = chunk_size
        self.clocknumber = 0
        dec = float(self.target_length) / chunk_size - self.target_length // chunk_size
        if dec > 0:
            self.totalblock = self.target_length / chunk_size + 1
        else:
            self.totalblock = self.target_length / chunk_size

    def upload_chunked(self, chunk_size=0):
        """Uploads data from this ChunkedUploader's file_obj in chunks, until
        an error occurs. Throws an exception when an error occurs, and can
        be called again to resume the upload.

        Parameters
            chunk_size
                The number of bytes to put in each chunk. (Default 4 MB.)
        """
        if chunk_size == 0:
            chunk_size = self.chunk_size
        self.clocknumber = 0
        while self.offset < self.target_length:
            self.clocknumber += 1
            print "Block n.", repr(self.clocknumber), " of ", repr(self.totalblock), " %", round((float(self.clocknumber) * 100) / self.totalblock, 0)
            next_chunk_size = min(chunk_size, self.target_length - self.offset)  # choose the smaller of chunk size and bytes remaining
            if self.last_block == None:
                self.last_block = self.file_obj.read(next_chunk_size)
                print "Reading file block"
            try:
                (self.offset, self.upload_id) = self.client.upload_chunk(
                    StringIO(self.last_block), next_chunk_size, self.offset, self.upload_id)
                self.last_block = None
            except dropbox.rest.ErrorResponse as e:
                # Handle the case where the server tells us our offset is wrong.
                must_reraise = True
                if e.status == 400:
                    reply = e.body
                    if "offset" in reply and reply['offset'] != 0 and reply['offset'] > self.offset:
                        self.last_block = None
                        self.offset = reply['offset']
                        must_reraise = False
                if must_reraise:
                    raise

    def finish(self, path, overwrite=False, parent_rev=None):
        path = "/commit_chunked_upload/%s%s" % (self.client.session.root, dropbox.client.format_path(path))
        params = dict(
            overwrite=bool(overwrite),
            upload_id=self.upload_id
        )
        if parent_rev is not None:
            params['parent_rev'] = parent_rev
        url, params, headers = self.client.request(path, params, content_server=True)
        return self.client.rest_client.POST(url, params, headers)
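Since upload_chunk returns the confirmed server offset, the byte-level percentage the question asks for can be derived from self.offset inside the upload loop; a one-line sketch (not part of the original class), placed right after the upload_chunk call:
print "%.2f%% uploaded" % (float(self.offset) * 100 / self.target_length)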

How to measure download speed and progress using requests?

I am using requests to download files, but for large files I need to check the size of the file on disk every time, because I can't display the progress as a percentage, and I would also like to know the download speed. How can I go about doing this? Here's my code:
import requests
import sys
import time
import os

def downloadFile(url, directory):
    localFilename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    start = time.clock()
    f = open(directory + '/' + localFilename, 'wb')
    for chunk in r.iter_content(chunk_size=512 * 1024):
        if chunk:
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())
    f.close()
    return time.clock() - start

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = raw_input("Enter the URL : ")
    directory = raw_input("Where would you want to save the file ?")
    time_elapsed = downloadFile(url, directory)
    print "Download complete..."
    print "Time Elapsed: " + str(time_elapsed)

if __name__ == "__main__":
    main()
I think one way to do it would be to read the file every time in the for loop and calculate the percentage of progress based on the Content-Length header. But that would again be an issue for large files (around 500MB). Is there any other way to do it?
See here: Python progress bar and downloads
I think the code would be something like this; it should show the average speed since start, in bytes per second:
import requests
import sys
import time

def downloadFile(url, directory):
    localFilename = url.split('/')[-1]
    with open(directory + '/' + localFilename, 'wb') as f:
        start = time.clock()
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
        dl = 0
        if total_length is None:  # no content length header
            f.write(r.content)
        else:
            total_length = int(total_length)  # header value is a string
            for chunk in r.iter_content(1024):
                dl += len(chunk)
                f.write(chunk)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s] %s bps" % ('=' * done, ' ' * (50 - done), dl // (time.clock() - start)))
            print ''
    return time.clock() - start

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = raw_input("Enter the URL : ")
    directory = raw_input("Where would you want to save the file ?")
    time_elapsed = downloadFile(url, directory)
    print "Download complete..."
    print "Time Elapsed: " + str(time_elapsed)

if __name__ == "__main__":
    main()
An improved version of the accepted answer for Python 3, using io.BytesIO (write to memory), reporting the result in Mbps, with support for ipv4/ipv6 and size and port arguments.
import sys, time, io, requests

def speed_test(size=5, ipv="ipv4", port=80):
    if size == 1024:
        size = "1GB"
    else:
        size = f"{size}MB"
    url = f"http://{ipv}.download.thinkbroadband.com:{port}/{size}.zip"
    with io.BytesIO() as f:
        start = time.perf_counter()
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
        dl = 0
        if total_length is None:  # no content length header
            f.write(r.content)
        else:
            for chunk in r.iter_content(1024):
                dl += len(chunk)
                f.write(chunk)
                done = int(30 * dl / int(total_length))
                sys.stdout.write("\r[%s%s] %s Mbps" % ('=' * done, ' ' * (30 - done), dl // (time.perf_counter() - start) / 100000))
    print(f"\n{size} = {(time.perf_counter() - start):.2f} seconds")
Usage Examples:
speed_test()
speed_test(10)
speed_test(50, "ipv6")
speed_test(1024, port=8080)
Output Sample:
[==============================] 61.34037 Mbps
100MB = 17.10 seconds
Available Options:
size: 5, 10, 20, 50, 100, 200, 512, 1024
ipv: ipv4, ipv6
port: 80, 81, 8080
Updated on 2022-10-11:
time.perf_counter() replaced time.clock(), which was deprecated in Python 3.3 (kudos to shiro)
I had a problem downloading a big file from a specific slow server:
- no Content-Length header,
- big file (42GB),
- no compression,
- slow server (<1MB/s).
Being that big, the download also caused memory problems during the request: requests doesn't stream its output to a file the way urllib does; it looks like it keeps the whole body in memory. And with no Content-Length header, the accepted answer has nothing to monitor.
So I wrote this basic method to monitor speed during the csv download, following just the requests documentation.
It needs an fname (complete output path) and a link (http or https), and you can specify custom headers.
import time
import requests

BLOCK = 5 * 1024 * 1024

try:
    with open(fname, 'wb') as f:
        r = requests.get(link, headers=headers, stream=True)
        ## The official documentation suggests iter_lines(), saying it's more
        ## reliable than cycling directly on the raw stream, so you don't lose data
        lines = r.iter_lines()
        ## Init the base vars, for monitoring and block management
        ## obj is a byte object, because iter_lines returns byte objects
        tsize = 0; obj = bytearray(); t0 = time.time(); i = 0
        for line in lines:
            ## calculate the line size, in bytes, and add to the byte object
            tsize += len(line)
            obj.extend(line)
            ## When the block threshold is reached,
            if tsize > BLOCK:
                ## increment the block number
                i += 1
                ## Calculate the speed.. this is in MB/s,
                ## but you can easily change to KB/s, or Blocks/s
                t1 = time.time()
                t = t1 - t0
                speed = round(5 / t, 2)
                ## Write the block to the file.
                f.write(obj)
                ## Write stats
                print('got', i * 5, 'MB ', 'block', i, ' #', speed, 'MB/s')
                ## Reinit all the base vars, for a new block
                obj = bytearray(); tsize = 0; t0 = time.time()
        ## Write the last partial block to the file.
        f.write(obj)
except Exception as e:
    print("Error: ", e, 0)
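Note that iter_lines() strips the line terminators, so the bytes written this way may not be byte-identical to the source file. A similar monitor can be built on iter_content(), which yields raw bytes; a minimal Python 3 sketch (link, fname and headers are placeholders, as above):
import time
import requests

BLOCK = 5 * 1024 * 1024  # 5 MB

def download_with_speed(link, fname, headers=None):
    # Stream raw bytes and report each block's speed in MB/s.
    with requests.get(link, headers=headers, stream=True) as r, open(fname, 'wb') as f:
        t0 = time.time()
        got = 0
        for chunk in r.iter_content(chunk_size=BLOCK):
            f.write(chunk)
            got += len(chunk)
            t1 = time.time()
            speed = round(len(chunk) / (1024.0 * 1024.0) / (t1 - t0), 2)
            print('got', got // (1024 * 1024), 'MB #', speed, 'MB/s')
            t0 = t1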

Upload a file-like object with Paramiko?

I have a bunch of code that looks like this:
with tempfile.NamedTemporaryFile() as tmpfile:
    tmpfile.write(fileobj.read())  # fileobj is some file-like object
    tmpfile.flush()
    try:
        self.sftp.put(tmpfile.name, path)
    except IOError:
        # error handling removed for ease of reading
        pass
Is it possible to do an upload like this without having to write the file out somewhere?
Update: As of Paramiko 1.10, you can use putfo:
self.sftp.putfo(fileobj, path)
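putfo accepts the same progress callback that put does; the callback receives the bytes transferred so far and the total, which fits the progress-bar theme of this page (expected_size here is a placeholder you would supply, since a bare file object has no known length):
def progress(transferred, total):
    # called periodically by paramiko during the transfer
    print("%d of %d bytes" % (transferred, total))

self.sftp.putfo(fileobj, path, file_size=expected_size, callback=progress)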
Instead of using paramiko.SFTPClient.put, you can use paramiko.SFTPClient.open, which opens a file-like object. You can write to that. Something like this:
f = self.sftp.open(path, 'wb')
f.write(fileobj.read())
f.close()
Note that it may be worthwhile to feed paramiko data in 32 KiB chunks, since that's the largest chunk the underlying SSH protocol can handle without breaking it into multiple packets.
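Putting that note together with the sftp.open approach above, a minimal sketch that copies the file-like object in 32 KiB chunks rather than one big read:
f = self.sftp.open(path, 'wb')
try:
    while True:
        data = fileobj.read(32768)  # 32 KiB, per the note above
        if not data:
            break
        f.write(data)
finally:
    f.close()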
Is StringIO what you're looking for?
SFTPClient's get() and put() functions take paths and not file-handles, which makes things a bit awkward.
You could write a wrapper for paramiko.SFTPClient to give it the functionality that you want.
Here's my best untested attempt:
import os
from paramiko import SFTPClient, SFTPAttributes

class SFTPClient2(SFTPClient):
    def put(self, local_file, remotepath, callback=None, confirm=True):
        fl = local_file
        file_size = os.fstat(fl.fileno()).st_size
        try:
            fr = self.file(remotepath, 'wb')
            fr.set_pipelined(True)
            size = 0
            try:
                while True:
                    data = fl.read(32768)
                    if len(data) == 0:
                        break
                    fr.write(data)
                    size += len(data)
                    if callback is not None:
                        callback(size, file_size)
            finally:
                fr.close()
        finally:
            fl.close()
        if confirm:
            s = self.stat(remotepath)
            if s.st_size != size:
                raise IOError('size mismatch in put! %d != %d' % (s.st_size, size))
        else:
            s = SFTPAttributes()
        return s

    def get(self, remotepath, local_file, callback=None):
        fr = self.file(remotepath, 'rb')
        file_size = self.stat(remotepath).st_size
        fr.prefetch()
        try:
            fl = local_file
            try:
                size = 0
                while True:
                    data = fr.read(32768)
                    if len(data) == 0:
                        break
                    fl.write(data)
                    size += len(data)
                    if callback is not None:
                        callback(size, file_size)
            finally:
                # stat the local handle before it is closed
                fl.flush()
                local_size = os.fstat(fl.fileno()).st_size
                fl.close()
        finally:
            fr.close()
        if local_size != size:
            raise IOError('size mismatch in get! %d != %d' % (local_size, size))
If it works, the get and put functions should now take local file-handles rather than paths.
All I had to do was get rid of the code that opens the file from the path, and change the code that gets the size of the file to use os.fstat instead of os.stat.
