Trying to split the file download buffer into separate threads - python

I am trying to download a file's buffer across 5 threads, but the result seems to be getting garbled.
from numpy import arange
import requests
from threading import Thread
import urllib2

url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = r = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers['content-length']

splitBy = 5

splits = arange(splitBy + 1) * (float(sizeInBytes)/splitBy)

dataLst = []

def bufferSplit(url, idx, splits):
    req = urllib2.Request(url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
    print {'bytes=%d-%d' % (splits[idx], splits[idx+1])}
    dataLst.append(urllib2.urlopen(req).read())

for idx in range(splitBy):
    dlth = Thread(target=bufferSplit, args=(url, idx, splits))
    dlth.start()

print dataLst

with open('page.html', 'w') as fh:
    fh.write(''.join(dataLst))
Update:
So I worked on it some more and made a little progress; however, if I download a jpg it seems to be corrupted:
from numpy import arange
import os
import requests
import threading
import urllib2

# url = 'http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3'
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
# url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)

splitBy = 5

dataLst = []

class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, url, fileName, splitBy=5):
        super(ThreadedFetch, self).__init__()
        self.__url = url
        self.__spl = splitBy
        self.__dataLst = []
        self.__fileName = fileName

    def run(self):
        if not sizeInBytes:
            print "Size cannot be determined."
            return
        splits = arange(self.__spl + 1) * (float(sizeInBytes)/self.__spl)
        for idx in range(self.__spl):
            req = urllib2.Request(self.__url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
            self.__dataLst.append(urllib2.urlopen(req).read())

    def getFileData(self):
        return ''.join(self.__dataLst)

fileName = url.split('/')[-1]

dl = ThreadedFetch(url, fileName)
dl.start()
dl.join()
content = dl.getFileData()
if content:
    with open(fileName, 'w') as fh:
        fh.write(content)
    print "Finished Writing file %s" % fileName
Below is how the image looked after being downloaded (corrupted image omitted).

Here's another version of the project. Differences:
- thread code is a single small function
- each thread downloads a chunk, then stores it in a global thread-safe dictionary
- threads are started, then join()ed -- they're all running at once
- when all done, data is reassembled in correct order then written to disk
- extra printing, to verify everything's correct
- output file size is calculated, for an extra comparison
source
import os, requests
import threading
import urllib2
import time

URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx,irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum( (
            len(chunk) for chunk in dataDict.values()
        ) )
    )

    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx,chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
output
102331 bytes to download.
done: got 3 chunks, total 102331 bytes
--- 0.380599021912 seconds ---
Finished Writing file 607800main_kepler1200_1600-1200.jpg
file size 102331 bytes
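For what it's worth, HTTP byte ranges are inclusive on both ends, so adjacent chunks must not share a boundary byte; that is likely why the question's float-based splits (where each chunk's end equals the next chunk's start) produced a corrupted jpg. The buildRange arithmetic above avoids that, but it is harder to read than it needs to be; a simpler equivalent is sketched below (my code, not from the answer):

def build_range(total_bytes, num_splits):
    """Split total_bytes into inclusive, non-overlapping HTTP Range strings,
    e.g. 102331 bytes split three ways gives
    '0-34109', '34110-68219', '68220-102330'."""
    ranges = []
    chunk = total_bytes // num_splits
    for i in range(num_splits):
        start = i * chunk
        # the last chunk absorbs any remainder left by the integer division
        end = total_bytes - 1 if i == num_splits - 1 else (i + 1) * chunk - 1
        ranges.append('%d-%d' % (start, end))
    return ranges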

Here is how I got it working. If anyone has any suggestions for possible improvements, you are most welcome.
import os
import requests
import threading
import urllib2
import time

url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

class SplitBufferThreads(threading.Thread):
    """ Splits the buffer across any number of threads,
    thereby downloading concurrently through
    any number of threads.
    """
    def __init__(self, url, byteRange):
        super(SplitBufferThreads, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.req = None

    def run(self):
        self.req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})

    def getFileData(self):
        return urllib2.urlopen(self.req).read()

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataLst = []
    for idx in range(splitBy):
        byteRange = buildRange(int(sizeInBytes), splitBy)[idx]
        bufTh = SplitBufferThreads(url, byteRange)
        bufTh.start()
        bufTh.join()
        dataLst.append(bufTh.getFileData())

    content = ''.join(dataLst)

    if dataLst:
        if os.path.exists(fileName):
            os.remove(fileName)
        print "--- %s seconds ---" % str(time.time() - start_time)
        with open(fileName, 'w') as fh:
            fh.write(content)
        print "Finished Writing file %s" % fileName

if __name__ == '__main__':
    main(url)
This is the first bare-bones version I have gotten working. I discovered that if I set the bufTh buffer thread's daemon flag to False, the process takes more time to finish.
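Since suggestions were invited: two things keep the version above from downloading in parallel. start() followed immediately by join() inside the loop waits for each thread before creating the next, and run() only builds the Request object, so the actual network read happens later in getFileData() on the main thread. A sketch of the same classes reworked so all chunks download at once (my code, reusing buildRange from above):

class ParallelFetch(threading.Thread):
    def __init__(self, url, byteRange):
        super(ParallelFetch, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.data = ''

    def run(self):
        # the network read now happens on this thread, not the caller's
        req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})
        self.data = urllib2.urlopen(req).read()

def parallel_fetch(url, sizeInBytes, splitBy=3):
    fetchers = [ParallelFetch(url, byteRange)
                for byteRange in buildRange(int(sizeInBytes), splitBy)]
    for th in fetchers:     # start every thread first...
        th.start()
    for th in fetchers:     # ...then wait for all of them
        th.join()
    return ''.join(th.data for th in fetchers)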

Related

Parallel downloading doesn't work in python threading

I'm building a parallel download library using the threading module.
When I use my library, it downloads the file without error, but the video file doesn't have the same content as when I download it through the browser.
I use threading for parallel downloading, and I think I have a problem with threading.Lock and file.seek, but I can't figure out how to fix it.
This is my code:
import requests
import threading
from tqdm import tqdm

DOWNLOAD_CHUNK_SIZE = 1 << 20  # 1 MiB

class DownloadPart:
    def __init__(self, url, byte_range) -> None:
        self.url = url
        self.byte_range = byte_range
        self.lock = threading.Lock()

    def download(self, file, pbar=None):
        response = requests.get(
            self.url,
            headers={"Range": "bytes={}-{}".format(*self.byte_range)},
            allow_redirects=True,
            stream=True,
        )
        written = 0
        for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
            if chunk:
                self.lock.acquire()
                file.seek(self.byte_range[0] + written)
                length = file.write(chunk)
                file.flush()
                written += length
                pbar.update(length)
                self.lock.release()

class Downloader:
    def __init__(self, url, parts=10):
        self.url = url
        self.parts = parts

    def _get_file_size(self) -> int:
        info = requests.head(self.url, allow_redirects=True)
        info.raise_for_status()
        size = info.headers.get("content-length", None)
        assert size
        return int(size)

    def download(self, filename):
        file_size = self._get_file_size()
        # file_size = 1024
        size_per_part = file_size // self.parts
        print(file_size, size_per_part)

        file = open(filename, "wb")
        pbar = tqdm(total=file_size)
        threads = []
        for index in range(self.parts):
            # fix last part have more bytes
            if index + 1 == self.parts:
                byte_range = (size_per_part * index, file_size - 1)
            else:
                byte_range = (size_per_part * index, size_per_part * (index + 1) - 1)
            thread = threading.Thread(
                target=DownloadPart(self.url, byte_range).download, args=(file,), kwargs={"pbar": pbar}
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        file.close()

URL = "https://s-delivery38.mxdcontent.net/v/8a5f59673042ed97c402be84ceeb20d9.mp4?s=TfiDzO2oBLrhub_GhToCiQ&e=1676489987&_t=1676476332"
d = Downloader(URL)
d.download("video.mp4")
How can I solve the problem with my library and get the same data in the file? Thank you for any help.
There were two problems with my code:
I found a solution to the first problem here: https://stackoverflow.com/a/25165183/14900791:
The Lock() function creates an entirely new lock - one that only the
thread calling the function can use. That's why it doesn't work,
because each thread is locking an entirely different lock.
Mixdrop (mxdcontent.net) only allows two videos from the same IP, so the code only worked for two parts; the others got status code 509 (I didn't check the status code, so I didn't get an error).
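The difference in miniature (my sketch, not from the original post): every DownloadPart constructed its own Lock, so no two threads ever contended for the same lock object, and seek/write pairs from different threads could interleave freely.

import threading

# Broken: each worker gets a private lock, so "locking" excludes nobody
class Worker:
    def __init__(self):
        self.lock = threading.Lock()   # a brand-new lock per instance

# Fixed: one module-level lock shared by all workers guards the file
shared_lock = threading.Lock()

With a single shared lock, the seek/write/flush sequence becomes atomic across threads again. The corrected program: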
import requests
import threading
from tqdm import tqdm

DOWNLOAD_CHUNK_SIZE = 1 << 20  # 1 MiB

# global lock instance
lock = threading.Lock()

class DownloadPart:
    def __init__(self, url, byte_range) -> None:
        self.url = url
        self.byte_range = byte_range

    def download(self, file, pbar=None):
        response = requests.get(
            self.url,
            headers={"Range": "bytes={}-{}".format(*self.byte_range)},
            allow_redirects=True,
            stream=True,
        )
        written = 0
        for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
            if chunk:
                lock.acquire()
                file.seek(self.byte_range[0] + written)
                length = file.write(chunk)
                file.flush()
                written += length
                pbar.update(length)
                lock.release()

class Downloader:
    def __init__(self, url, parts=10):
        self.url = url
        self.parts = parts

    def _get_file_size(self) -> int:
        info = requests.head(self.url, allow_redirects=True)
        info.raise_for_status()
        size = info.headers.get("content-length", None)
        assert size
        return int(size)

    def download(self, filename):
        file_size = self._get_file_size()
        # file_size = 1024
        size_per_part = file_size // self.parts
        print(file_size, size_per_part)

        file = open(filename, "wb")
        pbar = tqdm(total=file_size)
        threads = []
        for index in range(self.parts):
            # fix last part have more bytes
            if index + 1 == self.parts:
                byte_range = (size_per_part * index, file_size - 1)
            else:
                byte_range = (size_per_part * index, size_per_part * (index + 1) - 1)
            thread = threading.Thread(
                target=DownloadPart(self.url, byte_range).download, args=(file,), kwargs={"pbar": pbar}
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        file.close()

URL = "https://s-delivery38.mxdcontent.net/v/8a5f59673042ed97c402be84ceeb20d9.mp4?s=TfiDzO2oBLrhub_GhToCiQ&e=1676489987&_t=1676476332"
d = Downloader(URL)
d.download("video.mp4")
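As a design note (my sketch, not part of the original post): the shared lock can be avoided entirely by giving each thread its own handle to the same preallocated file, since every part writes to a disjoint byte region; the function and parameter names here are hypothetical.

import threading
import requests

def download_part(path, url, byte_range):
    # each thread opens a private handle, so no lock is needed:
    # the parts write to disjoint offsets of the same file
    start, end = byte_range
    response = requests.get(url, headers={"Range": "bytes={}-{}".format(start, end)}, stream=True)
    with open(path, "r+b") as part_file:
        part_file.seek(start)
        for chunk in response.iter_content(chunk_size=1 << 20):
            part_file.write(chunk)

def download(path, url, size, parts=4):
    with open(path, "wb") as f:   # preallocate the full file up front
        f.truncate(size)
    step = size // parts
    ranges = [(i * step, size - 1 if i == parts - 1 else (i + 1) * step - 1)
              for i in range(parts)]
    threads = [threading.Thread(target=download_part, args=(path, url, r)) for r in ranges]
    for t in threads:
        t.start()
    for t in threads:
        t.join()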

Speeding up multithreaded downloads

I've written a Python script that will download files from a website. To speed it up, I've made the downloading of the files multithreaded. Obviously, this is faster than doing the downloads serially, but I've come across some effects that I cannot explain.
The first x files (x seems proportional to the number of threads created) download incredibly fast--the output shows upwards of 40 files per second--but after that, it slows down a lot.
Up to a point (near 200 threads), the maximum speed at which I can download files averages 10 files per second. If I increase the thread count to, say, 700, it still maxes out at 10 files per second. Increasing the thread count to a very large number (over 1,000) seems to limit the download speed based on CPU speed.
So, my questions are:
Why are the first files downloaded so fast compared to the rest, and can I maintain the original speed?
Why does the thread count have such diminishing returns for download speeds?
Here is my script:
#!/usr/bin/python
import inspect
import math
from queue import Queue
from urllib.request import ProxyHandler, build_opener
from ast import literal_eval
from time import time, sleep
from datetime import timedelta
import random
from threading import Thread, activeCount
import os

proxies = Queue()
threads = Queue()
agents = []
total_files = 0
finished_files = 0
downloaded_files = 0
start_time = 0

class Config(object):
    DEBUG = False
    PROXIES_PATH = '/home/shane/bin/proxies.txt'
    AGENTS_PATH = '/home/shane/bin/user-agents.txt'
    DESTINATION_PATH = '/home/shane/images/%d.jpg'
    SOURCE_URL = 'https://example.org/%d.jpg'
    MAX_THREADS = 500
    TIMEOUT = 62
    RETRIES = 1
    RETRIES_TIME = 1

def get_files_per_second():
    return float(downloaded_files) / (time() - start_time)

def get_time_remaining():
    delta = timedelta(seconds=float(total_files - finished_files) / get_files_per_second())
    seconds = delta.total_seconds()
    days, remainder = divmod(seconds, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    days = str(int(days)).zfill(2)
    hours = str(int(hours)).zfill(2)
    minutes = str(int(minutes)).zfill(2)
    seconds = str(int(seconds)).zfill(2)
    return "%s:%s:%s:%s" % (days, hours, minutes, seconds)

def release_proxy(opener):
    if Config.DEBUG:
        print('Releasing proxy')
    for handler in opener.handlers:
        if type(handler) is ProxyHandler:
            proxies.put(handler)
            return
    raise Exception('No proxy found')

def get_new_proxy():
    if Config.DEBUG:
        print('Getting new proxy')
    if proxies.empty():
        raise Exception('No proxies')
    return proxies.get()

def get_new_agent():
    if len(agents) == 0:
        raise Exception('No user agents')
    return random.choice(agents)

def get_new_opener():
    opener = build_opener(get_new_proxy())
    opener.addheaders = [('User-Agent', get_new_agent())]
    return opener

def download(opener, source, destination, tries=0):
    global finished_files, downloaded_files
    if Config.DEBUG:
        print('Downloading %s to %s' % (source, destination))
    try:
        result = opener.open(source, timeout=Config.TIMEOUT).read()
        with open(destination, 'wb') as d:
            d.write(result)
        release_proxy(opener)
        finished_files += 1
        downloaded_files += 1
        to_print = '(%d/%d files) (%d proxies) (%f files/second, %s left) (%d threads) %s'
        print(to_print % (finished_files, total_files, proxies.qsize(), round(get_files_per_second(), 2), get_time_remaining(), activeCount(), source))
    except Exception as e:
        if Config.DEBUG:
            print(e)
        if tries < Config.RETRIES:
            sleep(Config.RETRIES_TIME)
            download(opener, source, destination, tries + 1)
        else:
            if proxies.qsize() < Config.MAX_THREADS * 2:
                release_proxy(opener)
            download(get_new_opener(), source, destination, 0)

class Downloader(Thread):
    def __init__(self, source, destination):
        Thread.__init__(self)
        self.source = source
        self.destination = destination

    def run(self):
        if Config.DEBUG:
            print('Running thread')
        download(get_new_opener(), self.source, self.destination)
        if threads.qsize() > 0:
            threads.get().start()

def populate_proxies():
    if Config.DEBUG:
        print('Populating proxies')
    with open(Config.PROXIES_PATH, 'r') as fh:
        for line in fh:
            line = line.replace('\n', '')
            if Config.DEBUG:
                print('Adding %s to proxies' % line)
            proxies.put(ProxyHandler(literal_eval(line)))

def populate_agents():
    if Config.DEBUG:
        print('Populating agents')
    with open(Config.AGENTS_PATH, 'r') as fh:
        for line in fh:
            line = line.replace('\n', '')
            if Config.DEBUG:
                print('Adding %s to agents' % line)
            agents.append(line)

def populate_threads():
    global total_files, finished_files
    if Config.DEBUG:
        print('Populating threads')
    for x in range(0, 100000):
        source = Config.SOURCE_URL % x
        destination = Config.DESTINATION_PATH % x
        # queue threads
        print('Queueing %s' % source)
        threads.put(Downloader(source, destination))
        total_files += 1

def start_work():
    global start_time
    if threads.qsize() == 0:
        raise Exception('No work to be done')
    start_time = time()
    for x in range(0, min(threads.qsize(), Config.MAX_THREADS)):
        if Config.DEBUG:
            print('Starting thread %d' % x)
        threads.get().start()

populate_proxies()
populate_agents()
populate_threads()
start_work()
The number of threads you are using is very high. Because of the GIL, Python does not actually run threads in parallel; it just switches between them frequently, which gives the appearance of parallelism.
If the task is CPU-intensive, use multiprocessing; if the task is I/O-intensive, threads will be useful.
Keep the thread count low (10-70) on a normal quad-core PC with 8 GB RAM, or else the switching overhead will reduce the speed of your code.
Check these 2 links:
Stack Overflow question
Executive Summary on this page.
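A minimal sketch of the bounded thread count recommended above, using Python 3's concurrent.futures instead of hand-managed Thread objects; the fetch helper and the URL list are hypothetical placeholders, not from the question:

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlopen

def fetch(url):
    # hypothetical downloader: returns the url and the number of bytes read
    with urlopen(url, timeout=30) as resp:
        return url, len(resp.read())

urls = ['https://example.org/%d.jpg' % i for i in range(100)]  # placeholder list

# a bounded pool: at most 32 downloads in flight, however many URLs there are
with ThreadPoolExecutor(max_workers=32) as pool:
    futures = [pool.submit(fetch, url) for url in urls]
    for future in as_completed(futures):
        url, size = future.result()
        print('%s: %d bytes' % (url, size))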

Only 1 Thread started in for loop

So I'm trying to code a really simple Internet Download Manager spoof with Python 2.7.
It is supposed to query a file's HTTP header, get the byte range, spread the download among a number of threads (I hard-coded 2 for simplicity) according to the byte range, and later join the file parts back together.
The problem is my console log tells me that only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []

# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)

# define file
file_name = "test.mp4"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    end = stream_size
    return end

start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start':int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    data = urllib2.urlopen(req)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        #percentage = (current_size * 100 / total_size)
        status = str(thread_id) + "_" + str(current_size) + "_" + str(total_size)
        print(status)

#starts 2 threads
def start_threads():
    for i in range(2):
        #if first loop, start thread 1
        if(i==1):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url,i,start,end))
            t.start()
            threads.append(t)
        #if second loop, start thread 1
        if(i==2):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url,i,start,end))
            t.start()
            threads.append(t)

    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()

#start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")

end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
parts = {}
# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)
# define file
file_name = "test.mp3"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
file_size = stream_size
return file_size
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
status = "Thread ID_" +str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==0):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 2
if(i==1):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
# Sort parts and you're done
# result = ''
# for i in range(2):
# result += parts[i*block_sz]
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()
You have:
for i in range(2):
    if(i==1):
        ...
    if(i==2):
        ...
But range(2) iterates over [0, 1], not [1, 2].
Save some trouble and just remove those 3 lines. The code to start the two threads can just run serially.
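For instance (my sketch, built from the question's own functions), the index checks can disappear entirely:

# start both threads without any index checks, then wait for both
ranges = [calculate_no_of_bytes_for_thread1(), calculate_no_of_bytes_for_thread2()]
for i, byte_range in enumerate(ranges):
    t = threading.Thread(target=download_thread,
                         args=(url, i, byte_range['start'], byte_range['end']))
    t.start()
    threads.append(t)
for t in threads:
    t.join()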

Using Python to stream-download a file with a server limit

I tried to download a file from a server using Python. Sometimes the file is very large, and I would like to have a progress bar. One way to do this that I can come up with is to download the file as a stream, so that I can print the progress. Currently I have tried the standard urlopen, urlretrieve, and the requests module (with stream on).
Obviously, urlopen cannot download a file as a stream; requests supports this, however, the server has a limit on the number of files I can download at one time (its limit is 1). So every time I try to use requests, it only gets the web page telling me to wait. Is there any other way to do this?
I have very recently downloaded many types of media with this function:
import sys
import requests
import time

def download_resource(domain, url, file_name = None, download = True):
    cookies = {}
    return_obj = {}  # holds metadata such as the resolved file name
    s = requests.Session()
    # s.config['keep_alive'] = True  # only needed on requests < 1.0; Session keeps connections alive by default

    #add your own cookies here, I have a specific function I call
    #for my application but yours is different
    r = s.get(url, cookies = cookies, stream = True)
    if not r.ok:
        print "error in downloading"
        return -1

    file_size = int(r.headers['content-length'])
    if not file_name:
        try:
            temp = r.headers['content-disposition']
        except Exception as e:
            #failing download
            return -1
        else:
            if not temp:
                return -1
            else:
                file_name = temp.split("filename=")[-1]
                return_obj["filename"] = file_name

    #print "File size:", file_size
    #print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
    print "Downloading:", file_name

    if download:
        with open(file_name, "wb") as fh:
            count = 1
            chunk_size = 1048576
            start_time = time.time()
            try:
                for block in r.iter_content(chunk_size):
                    total_time = time.time() - start_time
                    percent = count*chunk_size/float(file_size) * 100.0
                    fraction = int(percent/5)
                    download_speed = 1.0 / total_time
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('='* fraction, percent, download_speed))
                    sys.stdout.flush()
                    if not block:
                        break
                    fh.write(block)
                    count += 1
                    start_time = time.time()
            except Exception as e:
                print e
            finally:
                #close up the stream
                r.close()
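A hypothetical invocation, for illustration (the URL is a placeholder, and note the domain parameter is unused in the snippet as shown):

# let the server's Content-Disposition header pick the local file name
download_resource("example.org", "https://example.org/media/clip.mp4")

# or force a local file name, skipping the header lookup
download_resource("example.org", "https://example.org/media/clip.mp4", file_name="clip.mp4")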

How to measure download speed and progress using requests?

I am using requests to download files, but for large files I have to check the size of the file on disk every time, because I can't display the progress as a percentage, and I would also like to know the download speed. How can I go about doing this? Here's my code:
import requests
import sys
import time
import os

def downloadFile(url, directory) :
    localFilename = url.split('/')[-1]
    r = requests.get(url, stream=True)

    start = time.clock()
    f = open(directory + '/' + localFilename, 'wb')
    for chunk in r.iter_content(chunk_size = 512 * 1024) :
        if chunk :
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())
    f.close()
    return (time.clock() - start)

def main() :
    if len(sys.argv) > 1 :
        url = sys.argv[1]
    else :
        url = raw_input("Enter the URL : ")
    directory = raw_input("Where would you want to save the file ?")
    time_elapsed = downloadFile(url, directory)
    print "Download complete..."
    print "Time Elapsed: " + str(time_elapsed)

if __name__ == "__main__" :
    main()
I think one way to do it would be to read the file every time in the for loop and calculate the percentage of progress based on the Content-Length header. But that would again be an issue for large files (around 500MB). Is there any other way to do it?
see here: Python progress bar and downloads
I think the code would be something like this; it should show the average speed since start, in bytes per second:
import requests
import sys
import time

def downloadFile(url, directory) :
    localFilename = url.split('/')[-1]
    with open(directory + '/' + localFilename, 'wb') as f:
        start = time.clock()
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
        dl = 0
        if total_length is None: # no content length header
            f.write(r.content)
        else:
            total_length = int(total_length)
            for chunk in r.iter_content(1024):
                dl += len(chunk)
                f.write(chunk)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s] %s bps" % ('=' * done, ' ' * (50-done), dl//(time.clock() - start)))
    print ''
    return (time.clock() - start)

def main() :
    if len(sys.argv) > 1 :
        url = sys.argv[1]
    else :
        url = raw_input("Enter the URL : ")
    directory = raw_input("Where would you want to save the file ?")
    time_elapsed = downloadFile(url, directory)
    print "Download complete..."
    print "Time Elapsed: " + str(time_elapsed)

if __name__ == "__main__" :
    main()
An improved version of the accepted answer for Python 3, using io.BytesIO (the download is written to memory), with the result in Mbps, support for ipv4/ipv6, and size and port arguments.
import sys, time, io, requests

def speed_test(size=5, ipv="ipv4", port=80):
    if size == 1024:
        size = "1GB"
    else:
        size = f"{size}MB"

    url = f"http://{ipv}.download.thinkbroadband.com:{port}/{size}.zip"
    with io.BytesIO() as f:
        start = time.perf_counter()
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
        dl = 0
        if total_length is None: # no content length header
            f.write(r.content)
        else:
            for chunk in r.iter_content(1024):
                dl += len(chunk)
                f.write(chunk)
                done = int(30 * dl / int(total_length))
                sys.stdout.write("\r[%s%s] %s Mbps" % ('=' * done, ' ' * (30-done), dl//(time.perf_counter() - start) / 100000))

    print(f"\n{size} = {(time.perf_counter() - start):.2f} seconds")
Usage Examples:
speed_test()
speed_test(10)
speed_test(50, "ipv6")
speed_test(1024, port=8080)
Output Sample:
[==============================] 61.34037 Mbps
100MB = 17.10 seconds
Available Options:
size: 5, 10, 20, 50, 100, 200, 512, 1024
ipv: ipv4, ipv6
port: 80, 81, 8080
Updated on 2022-10-11:
time.perf_counter() replaced time.clock(), which has been deprecated since Python 3.3 (kudos to shiro)
I had a problem downloading a big file from a specific slow server:
- no Content-Length header,
- big file (42 GB),
- no compression,
- slow server (<1 MB/s).
The file being this big, I also had problems with memory usage during the request; requests doesn't write its output to a file the way urllib does, and it looks like it keeps everything in memory.
With no Content-Length header, the accepted answer can't monitor anything.
So I wrote this basic method to monitor speed during the CSV download, following just the requests documentation.
It needs a fname (complete output path) and a link (http or https), and you can specify custom headers.
import time
import requests

BLOCK = 5*1024*1024

# fname, link and headers are supplied by the caller
try:
    with open(fname, 'wb') as f:
        r = requests.get(link, headers=headers, stream=True)

        ## This is because the official documentation suggests it,
        ## saying it's more reliable than cycling directly on the raw stream, so you don't lose data
        lines = r.iter_lines()

        ## Init the base vars, for monitoring and block management
        ## obj is a byte object, because iter_lines returns byte objects
        tsize = 0; obj = bytearray(); t0 = time.time(); i = 0;

        for line in lines:
            ## calculate the line size, in bytes, and add to the byte object
            tsize += len(line)
            obj.extend(line)

            ## When the condition is reached,
            if tsize > BLOCK:
                ## Increment the block number
                i += 1;
                ## Calculate the speed.. this is in MB/s,
                ## but you can easily change to KB/s, or Blocks/s
                t1 = time.time()
                t = t1 - t0;
                speed = round(5/t, 2);

                ## Write the block to the file.
                f.write(obj)

                ## Write stats
                print('got', i*5, 'MB ', 'block', i, ' #', speed, 'MB/s')

                ## Reinit all the base vars, for a new block
                obj = bytearray(); tsize = 0; t0 = time.time()

        ## Write the last block part to the file.
        f.write(obj)
except Exception as e:
    print("Error: ", e, 0)
