Parallel downloading doesn't work with Python threading - python

I'm building a parallel download library using the threading module.
When I use my library, it downloads the file without error, but the video file doesn't have the same content as the one I get when I download it through the browser.
I use threading for parallel downloading, and I think I have a problem with threading.Lock and file.seek, but I can't figure out how to fix it.
This is my code:
import requests
import threading
from tqdm import tqdm

DOWNLOAD_CHUNK_SIZE = 1 << 20  # 1 MiB


class DownloadPart:
    def __init__(self, url, byte_range) -> None:
        self.url = url
        self.byte_range = byte_range
        self.lock = threading.Lock()

    def download(self, file, pbar=None):
        response = requests.get(
            self.url,
            headers={"Range": "bytes={}-{}".format(*self.byte_range)},
            allow_redirects=True,
            stream=True,
        )
        written = 0
        for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
            if chunk:
                self.lock.acquire()
                file.seek(self.byte_range[0] + written)
                length = file.write(chunk)
                file.flush()
                written += length
                pbar.update(length)
                self.lock.release()


class Downloader:
    def __init__(self, url, parts=10):
        self.url = url
        self.parts = parts

    def _get_file_size(self) -> int:
        info = requests.head(self.url, allow_redirects=True)
        info.raise_for_status()
        size = info.headers.get("content-length", None)
        assert size
        return int(size)

    def download(self, filename):
        file_size = self._get_file_size()
        # file_size = 1024
        size_per_part = file_size // self.parts
        print(file_size, size_per_part)
        file = open(filename, "wb")
        pbar = tqdm(total=file_size)
        threads = []
        for index in range(self.parts):
            # the last part also takes any leftover bytes
            if index + 1 == self.parts:
                byte_range = (size_per_part * index, file_size - 1)
            else:
                byte_range = (size_per_part * index, size_per_part * (index + 1) - 1)
            thread = threading.Thread(
                target=DownloadPart(self.url, byte_range).download, args=(file,), kwargs={"pbar": pbar}
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        file.close()


URL = "https://s-delivery38.mxdcontent.net/v/8a5f59673042ed97c402be84ceeb20d9.mp4?s=TfiDzO2oBLrhub_GhToCiQ&e=1676489987&_t=1676476332"
d = Downloader(URL)
d.download("video.mp4")
How can I solve the problem with my library and get the same data in the file? Thank you for any help.

There were two problems with my code:
I found a solution to the first problem here: https://stackoverflow.com/a/25165183/14900791:
The Lock() function creates an entirely new lock - one that only the
thread calling the function can use. That's why it doesn't work,
because each thread is locking an entirely different lock.
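To make the quoted point concrete (my own illustration, not from the linked answer): in my original code every DownloadPart built its own lock, so no two threads ever contended for the same one.

part_a = DownloadPart(URL, (0, 99))
part_b = DownloadPart(URL, (100, 199))
# Two distinct Lock objects: acquiring part_a.lock never blocks the thread holding part_b.lock.
assert part_a.lock is not part_b.lock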
Mixdrop (mxdcontent.net) only allows two simultaneous downloads from the same IP, so the code only worked for two parts; the others got status code 509 (I didn't check the status code, so I didn't get an error).
import requests
import threading
from tqdm import tqdm

DOWNLOAD_CHUNK_SIZE = 1 << 20  # 1 MiB

# global lock instance
lock = threading.Lock()


class DownloadPart:
    def __init__(self, url, byte_range) -> None:
        self.url = url
        self.byte_range = byte_range

    def download(self, file, pbar=None):
        response = requests.get(
            self.url,
            headers={"Range": "bytes={}-{}".format(*self.byte_range)},
            allow_redirects=True,
            stream=True,
        )
        written = 0
        for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
            if chunk:
                lock.acquire()
                file.seek(self.byte_range[0] + written)
                length = file.write(chunk)
                file.flush()
                written += length
                pbar.update(length)
                lock.release()


class Downloader:
    def __init__(self, url, parts=10):
        self.url = url
        self.parts = parts

    def _get_file_size(self) -> int:
        info = requests.head(self.url, allow_redirects=True)
        info.raise_for_status()
        size = info.headers.get("content-length", None)
        assert size
        return int(size)

    def download(self, filename):
        file_size = self._get_file_size()
        # file_size = 1024
        size_per_part = file_size // self.parts
        print(file_size, size_per_part)
        file = open(filename, "wb")
        pbar = tqdm(total=file_size)
        threads = []
        for index in range(self.parts):
            # the last part also takes any leftover bytes
            if index + 1 == self.parts:
                byte_range = (size_per_part * index, file_size - 1)
            else:
                byte_range = (size_per_part * index, size_per_part * (index + 1) - 1)
            thread = threading.Thread(
                target=DownloadPart(self.url, byte_range).download, args=(file,), kwargs={"pbar": pbar}
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        file.close()


URL = "https://s-delivery38.mxdcontent.net/v/8a5f59673042ed97c402be84ceeb20d9.mp4?s=TfiDzO2oBLrhub_GhToCiQ&e=1676489987&_t=1676476332"
d = Downloader(URL)
d.download("video.mp4")
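A further hardening on top of this fix (my own addition, not part of the accepted solution): checking the response status inside DownloadPart.download would have surfaced the 509 responses instead of silently writing error pages into the output file. A minimal sketch, keeping the rest of the method unchanged:

def download(self, file, pbar=None):
    response = requests.get(
        self.url,
        headers={"Range": "bytes={}-{}".format(*self.byte_range)},
        allow_redirects=True,
        stream=True,
    )
    # Fail fast on 509 (or any other error status) before writing anything.
    response.raise_for_status()
    # 206 means the server honored the Range header; a plain 200 would mean it is
    # sending the whole file, and the part offsets would no longer line up.
    assert response.status_code == 206
    ...  # the chunk-writing loop stays exactly as above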

Related

Python Multi Processing printing statement multiple times

I've written this code that downloads a file from the internet and saves it to my computer.
To make it more efficient, I added multiprocessing to my code to be able to download multiple files at the same time, and it works. However, it keeps printing the progress bar I added again and again.
What I want is for the progress bars to display once and keep updating, like they did before the multiprocessing functionality was added. I've added my code below to reproduce the issue.
from multiprocessing import Process
from alive_progress import alive_bar
import requests
import time
import os

def download(url):
    curr_dir = os.getcwd()
    x = requests.head(url)
    y = requests.head(x.headers['Location'])
    file_size = int(int(y.headers['content-length']) / 1024)
    chunk_size = 1024

    def compute():
        response = requests.get(url, stream=True)
        with open(curr_dir + '\\' + str(time.time()) + '.mp4', 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                yield 1024

    with alive_bar(file_size, bar='classic2', spinner='classic') as bar:
        for i in compute():
            bar()
    print("Downloaded!")

if __name__ == '__main__':
    processess = []
    num_processess = 2
    for i in range(num_processess):
        process = Process(target=download, args=(links[i],))
        processess.append(process)
    for process in processess:
        process.start()
    for process in processess:
        process.join()
Alive-progress doesn't support showing and updating multiple progress bars. You have to use another library, such as tqdm.
The following is an example of using tqdm for your scenario. The key point is to call tqdm.set_lock() to specify a synchronization mechanism for inter-process coordination, and to control the positions of the progress bars via the position argument of tqdm().
import multiprocessing
import tqdm

def download(url, id, tqdm_lock):
    ...
    tqdm.tqdm.set_lock(tqdm_lock)
    with tqdm.tqdm(total=file_size, position=id) as bar:
        for i in compute():
            bar.update(1)
        bar.clear()
    ...

if __name__ == '__main__':
    tqdm_lock = multiprocessing.RLock()
    processess = []
    num_processess = 2
    links = [...]
    for i in range(num_processess):
        process = multiprocessing.Process(target=download, args=(links[i], i, tqdm_lock))
        processess.append(process)
    for process in processess:
        process.start()
    for process in processess:
        process.join()
Update 2
If you want multiple progress bars, then I would use the tqdm package.
This is how I would approach it:
First, find out for each URL how many CHUNK_SIZE chunks there are. CHUNK_SIZE is set at 1024, but consider increasing this for large files. A potential issue is that the 'content-length' header key is not always present. In that case, the URL is considered to consist of a single chunk and the progress bar created for it is updated only once, when the entire file has been downloaded.
Then each submitted task creates a progress bar whose size is the number of chunks computed in the first step, placed at a specific position based on its task number. The chunks are then retrieved and the progress bar is updated. The logic assumes that, when the content-length key is present in the fetched headers, the size of the file does not change between the head and get requests, so the progress bar total taken from the head request matches the actual number of chunks read during the download.
In the code below I have commented out the code that writes the downloaded files to disk and removed the compute generator function, which now seems unnecessary. I have also added a delay between successive chunk fetches so that the progress bars do not advance too quickly:
import requests
from tqdm import tqdm

CHUNK_SIZE = 1024

def get_number_of_chunks(url):
    r = requests.head(url, allow_redirects=True)
    headers = r.headers
    if 'content-length' in headers:
        n_chunks, remainder = divmod(int(headers['content-length']), CHUNK_SIZE)
        if remainder:
            n_chunks += 1
    else:
        n_chunks = 1
    return n_chunks

def download(task_number, url):
    n_chunks = get_number_of_chunks(url)
    response = requests.get(url, stream=True)
    #with open(str(time.time()) + '.mp4', 'wb') as f:
    if True:
        with tqdm(total=n_chunks, position=task_number) as bar:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                #f.write(chunk)
                if n_chunks != 1:
                    bar.update(1)
                # For demo purposes:
                import time
                time.sleep(.1)
            if n_chunks == 1:
                bar.update(1)

if __name__ == '__main__':
    from multiprocessing.pool import ThreadPool

    links = [
        'http://localhost/friends/images/nav.png',
        'http://localhost/friends/images/race.jpg',
    ]
    n_writers = len(links)
    pool = ThreadPool(n_writers)
    pool.starmap(download, enumerate(links))
    pool.close()
    pool.join()
Multiprocessing Version
If you must use multiprocessing, then thanks to relent95, who showed the way:
import requests
from tqdm import tqdm

CHUNK_SIZE = 1024

def init_pool_processes(lock):
    """
    Note: The lock only needs to be set once for each pool process.
    """
    tqdm.set_lock(lock)

def get_number_of_chunks(url):
    r = requests.head(url, allow_redirects=True)
    headers = r.headers
    if 'content-length' in headers:
        n_chunks, remainder = divmod(int(headers['content-length']), CHUNK_SIZE)
        if remainder:
            n_chunks += 1
    else:
        n_chunks = 1
    return n_chunks

def download(task_number, url):
    n_chunks = get_number_of_chunks(url)
    response = requests.get(url, stream=True)
    #with open(str(time.time()) + '.mp4', 'wb') as f:
    if True:
        with tqdm(total=n_chunks, position=task_number) as bar:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                #f.write(chunk)
                if n_chunks != 1:
                    bar.update(1)
                # For demo purposes:
                import time
                time.sleep(.1)
            if n_chunks == 1:
                bar.update(1)

if __name__ == '__main__':
    from multiprocessing import Pool, Lock

    links = [
        'http://localhost/friends/images/nav.png',
        'http://localhost/friends/images/race.jpg',
    ]
    n_writers = len(links)
    pool = Pool(n_writers, initializer=init_pool_processes, initargs=(Lock(),))
    pool.starmap(download, enumerate(links))
    pool.close()
    pool.join()

Only 1 Thread started in for loop

So I'm trying to code a really simple Internet Download Manager spoof with Python 2.7.
It is supposed to query a file's HTTP headers, get the byte range, spread the download among a number of threads (I hard-coded 2 for simplicity) according to the byte range, and later join the file parts together again.
The problem is my console log tells me that only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []

# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)

# define file
file_name = "test.mp4"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    end = stream_size
    return end

start = 0
# get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start':int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    data = urllib2.urlopen(req)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        #percentage = (current_size * 100 / total_size)
        status = str(thread_id) + "_" + str(current_size) + "_" + str(total_size)
        print(status)

# starts 2 threads
def start_threads():
    for i in range(2):
        # if first loop, start thread 1
        if (i == 1):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        # if second loop, start thread 1
        if (i == 2):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()

# start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []
parts = {}

# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)

# define file
file_name = "test.mp3"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    file_size = stream_size
    return file_size

start = 0
# get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start':int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        status = "Thread ID_" + str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" + str(total_size)
        print(status)

# starts 2 threads
def start_threads():
    for i in range(2):
        # if first loop, start thread 1
        if (i == 0):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        # if second loop, start thread 2
        if (i == 1):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()
    # Sort parts and you're done
    # result = ''
    # for i in range(2):
    #     result += parts[i*block_sz]

# start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
You have:
for i in range(2):
    if(i==1):
        ...
    if(i==2):
        ...
But range(2) iterates over [0,1] not [1,2].
Save some trouble and just remove those 3 lines. The code to start the two threads can just run serially.
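A minimal sketch of that simplification, using the same helper functions as the question (my rendering of the suggestion, not tested against the original script):

def start_threads():
    # No loop or index checks: just set up and start both threads, one after the other.
    first = calculate_no_of_bytes_for_thread1()
    print("Thread 1 started")
    t1 = threading.Thread(target=download_thread, args=(url, 1, first['start'], first['end']))
    t1.start()
    threads.append(t1)

    second = calculate_no_of_bytes_for_thread2()
    print("Thread 2 started")
    t2 = threading.Thread(target=download_thread, args=(url, 2, second['start'], second['end']))
    t2.start()
    threads.append(t2)

    # Join threads back (order doesn't matter, you just want them all)
    for t in threads:
        t.join()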

using python stream downloading file with server limit

I tried to download a file from a server using Python. Sometimes the file is very large and I would like to have some progress bar; one way to do this that I can come up with is to download it as a stream, so that I can print the progress. Currently I have tried the standard urlopen, urlretrieve, and the requests module (with stream on).
Obviously, urlopen cannot download a file as a stream; the requests module supports this. However, the server has a limit on the number of files I can download at one time (its limit is 1). So every time I tried to use requests, I only got the webpage telling me to wait. Is there any other way to do this?
I have very recently downloaded many types of media with this function:
import sys
import requests
import time

def download_resource(domain, url, file_name=None, download=True):
    cookies = {}
    s = requests.Session()
    s.config['keep_alive'] = True
    # add your own cookies here, I have a specific function I call
    # for my application but yours is different
    r = s.get(url, cookies=cookies, stream=True)
    if not r.ok:
        print "error in downloading"
        return -1
    file_size = int(r.headers['content-length'])
    if not file_name:
        try:
            temp = r.headers['content-disposition']
        except Exception as e:
            pass
            # failing download
            return -1
        else:
            if not temp:
                return -1
            else:
                file_name = temp.split("filename=")[-1]
                return_obj["filename"] = file_name
    #print "File size:", file_size
    #print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
    print "Downloading:", file_name
    if download:
        with open(file_name, "wb") as fh:
            count = 1
            chunk_size = 1048576
            start_time = time.time()
            try:
                for block in r.iter_content(chunk_size):
                    total_time = time.time() - start_time
                    percent = count*chunk_size/float(file_size) * 100.0
                    fraction = int(percent/5)
                    download_speed = 1.0 / total_time
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('='* fraction, percent, download_speed))
                    sys.stdout.flush()
                    if not block:
                        break
                    fh.write(block)
                    count += 1
                    start_time = time.time()
            except Exception as e:
                print e
            finally:
                # close up the stream
                r.close()
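For reference, a hypothetical call might look like this (the URL and file name are placeholders of my own; the domain argument is not actually used inside the function as posted):

download_resource("example.com", "http://example.com/files/video.mp4", file_name="video.mp4")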

trying to split the file download buffer to into separate threads

I am trying to download a file's buffer across 5 threads, but it seems like the result is getting garbled.
from numpy import arange
import requests
from threading import Thread
import urllib2

url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = r = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers['content-length']
splitBy = 5
splits = arange(splitBy + 1) * (float(sizeInBytes)/splitBy)
dataLst = []

def bufferSplit(url, idx, splits):
    req = urllib2.Request(url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
    print {'bytes=%d-%d' % (splits[idx], splits[idx+1])}
    dataLst.append(urllib2.urlopen(req).read())

for idx in range(splitBy):
    dlth = Thread(target=bufferSplit, args=(url, idx, splits))
    dlth.start()

print dataLst
with open('page.html', 'w') as fh:
    fh.write(''.join(dataLst))
Update:
So I worked on it and made a little progress; however, if I download a jpg it seems to be corrupted:
from numpy import arange
import os
import requests
import threading
import urllib2

# url = 'http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3'
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
# url = 'http://pymotw.com/2/urllib/index.html'

sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
splitBy = 5
dataLst = []

class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, url, fileName, splitBy=5):
        super(ThreadedFetch, self).__init__()
        self.__url = url
        self.__spl = splitBy
        self.__dataLst = []
        self.__fileName = fileName

    def run(self):
        if not sizeInBytes:
            print "Size cannot be determined."
            return
        splits = arange(self.__spl + 1) * (float(sizeInBytes)/self.__spl)
        for idx in range(self.__spl):
            req = urllib2.Request(self.__url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
            self.__dataLst.append(urllib2.urlopen(req).read())

    def getFileData(self):
        return ''.join(self.__dataLst)

fileName = url.split('/')[-1]

dl = ThreadedFetch(url, fileName)
dl.start()
dl.join()
content = dl.getFileData()
if content:
    with open(fileName, 'w') as fh:
        fh.write(content)
    print "Finished Writing file %s" % fileName
Below is how the image looks after being downloaded.
Here's another version of the project. Differences:
thread code is a single small function
each thread downloads a chunk, then stores it in a global threadsafe dictionary
threads are started, then join()ed -- they're all running at once
when all done, data is reassembled in correct order then written to disk
extra printing, to verify everything's correct
output file size is calculated, for an extra comparison
source
import os, requests
import threading
import urllib2
import time

URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum( (
            len(chunk) for chunk in dataDict.values()
        ) )
    )

    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
output
102331 bytes to download.
done: got 3 chunks, total 102331 bytes
--- 0.380599021912 seconds ---
Finished Writing file 607800main_kepler1200_1600-1200.jpg
file size 102331 bytes
Here is how I got it working; if anyone has any suggestions for possible improvement, you are most welcome.
import os
import requests
import threading
import urllib2
import time

url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

class SplitBufferThreads(threading.Thread):
    """ Splits the buffer to ny number of threads
    thereby, concurrently downloading through
    ny number of threads.
    """
    def __init__(self, url, byteRange):
        super(SplitBufferThreads, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.req = None

    def run(self):
        self.req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})

    def getFileData(self):
        return urllib2.urlopen(self.req).read()

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataLst = []
    for idx in range(splitBy):
        byteRange = buildRange(int(sizeInBytes), splitBy)[idx]
        bufTh = SplitBufferThreads(url, byteRange)
        bufTh.start()
        bufTh.join()
        dataLst.append(bufTh.getFileData())

    content = ''.join(dataLst)

    if dataLst:
        if os.path.exists(fileName):
            os.remove(fileName)
        print "--- %s seconds ---" % str(time.time() - start_time)
        with open(fileName, 'w') as fh:
            fh.write(content)
        print "Finished Writing file %s" % fileName

if __name__ == '__main__':
    main(url)
This is the first bare-bones code I have got working. I discovered that if I set the bufTh buffer thread's daemon flag to False, the process takes more time to finish.
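One possible improvement, offered as my own suggestion rather than part of the original post: as written, run() only builds the Request, the actual urlopen(...).read() happens in getFileData() on the main thread, and each thread is joined immediately after it starts, so the chunks are still fetched one at a time. A sketch of moving the fetch into run() and joining only after all threads have started, so the range downloads can overlap (same imports as above assumed):

class SplitBufferThreads(threading.Thread):
    def __init__(self, url, byteRange):
        super(SplitBufferThreads, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.data = ''

    def run(self):
        # fetch the chunk inside the thread so several ranges download at once
        req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})
        self.data = urllib2.urlopen(req).read()

# inside main():
ranges = buildRange(int(sizeInBytes), splitBy)
threadLst = [SplitBufferThreads(url, byteRange) for byteRange in ranges]
for bufTh in threadLst:
    bufTh.start()
for bufTh in threadLst:          # join only after every thread has been started
    bufTh.join()
dataLst = [bufTh.data for bufTh in threadLst]   # keeps the chunks in range order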

Progress bar with Python Dropbox Api

I'm working on making a progress bar for my Python Dropbox app using the Dropbox API. The problem is I can't figure out how to get the number of bytes written so far so that I can build it. Is there any way to do this using the Python Dropbox API? If not, could I measure the bytes being sent out from my system instead, possibly using the os module?
I'm trying to get it to work with get_chunked_uploader, but it would be great if I could get the bytes written for put_file() as well as file_copy() and file_move() also. My code so far is something like this:
if (file_size >= 4194304):
    big_file = open(path_to_file, 'rb')
    uploader = client.get_chunked_uploader(big_file)
    print "uploading " + path_to_file
    while uploader.offset < file_size:
        percent_complete = bytes_written / file_size * 100
        clearscreen()
        print "%.2f" % percent_complete + "%"
Thanks!
I recreated the ChunkedUploader:
# imports needed by the snippet below
import dropbox
from StringIO import StringIO

f = open(filetoupload, 'rb')
uploader = MMChunkedUploader(self.client, f, file_size, 1024*200)
uploader.upload_chunked()
uploader.finish(dropboxfilename)

class MMChunkedUploader(object):
    """Contains the logic around a chunked upload, which uploads a
    large file to Dropbox via the /chunked_upload endpoint.
    """
    def __init__(self, client, file_obj, length, chunk_size=4 * 1024 * 1024):
        self.client = client
        self.offset = 0
        self.upload_id = None
        self.last_block = None
        self.file_obj = file_obj
        self.target_length = length
        self.chunk_size = chunk_size
        self.clocknumber = 0
        dec = float(self.target_length)/chunk_size - self.target_length//chunk_size
        if dec > 0:
            self.totalblock = self.target_length/chunk_size + 1
        else:
            self.totalblock = self.target_length/chunk_size

    def upload_chunked(self, chunk_size=0):
        """Uploads data from this ChunkedUploader's file_obj in chunks, until
        an error occurs. Throws an exception when an error occurs, and can
        be called again to resume the upload.

        Parameters
            chunk_size
                The number of bytes to put in each chunk. (Default 4 MB.)
        """
        if chunk_size == 0:
            chunk_size = self.chunk_size
        self.clocknumber = 0
        while self.offset < self.target_length:
            self.clocknumber += 1
            print "Block n.", repr(self.clocknumber), " of ", repr(self.totalblock), " %", round((float(self.clocknumber) * 100) / self.totalblock, 0)
            next_chunk_size = min(chunk_size, self.target_length - self.offset)  # pick the smaller of chunk_size and the bytes left
            if self.last_block == None:
                self.last_block = self.file_obj.read(next_chunk_size)
                print "Reading file block"
            try:
                (self.offset, self.upload_id) = self.client.upload_chunk(
                    StringIO(self.last_block), next_chunk_size, self.offset, self.upload_id)
                self.last_block = None
            except dropbox.rest.ErrorResponse as e:
                # Handle the case where the server tells us our offset is wrong.
                must_reraise = True
                if e.status == 400:
                    reply = e.body
                    if "offset" in reply and reply['offset'] != 0 and reply['offset'] > self.offset:
                        self.last_block = None
                        self.offset = reply['offset']
                        must_reraise = False
                if must_reraise:
                    raise

    def finish(self, path, overwrite=False, parent_rev=None):
        path = "/commit_chunked_upload/%s%s" % (self.client.session.root, dropbox.client.format_path(path))
        params = dict(
            overwrite=bool(overwrite),
            upload_id=self.upload_id
        )
        if parent_rev is not None:
            params['parent_rev'] = parent_rev
        url, params, headers = self.client.request(path, params, content_server=True)
        return self.client.rest_client.POST(url, params, headers)
