So I'm trying to code a really simple Internet Download Manager spoof in Python 2.7.
It is supposed to query a file's HTTP headers, get the byte range, and spread the download among a number of threads (I hard-coded 2 for simplicity) according to that byte range, then join the file parts back together.
The problem is my console log tells me that only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []
# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)
# define file
file_name = "test.mp4"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    end = stream_size
    return end

start = 0
# get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start': 0, 'end': (int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start': int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    data = urllib2.urlopen(req)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        #percentage = (current_size * 100 / total_size)
        status = str(thread_id) + "_" + str(current_size) + "_" + str(total_size)
        print(status)

# starts 2 threads
def start_threads():
    for i in range(2):
        # if first loop, start thread 1
        if (i == 1):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        # if second loop, start thread 2
        if (i == 2):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()

# start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []
parts = {}
# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)
# define file
file_name = "test.mp3"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    file_size = stream_size
    return file_size

start = 0
# get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start': 0, 'end': (int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start': int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        status = "Thread ID_" + str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" + str(total_size)
        print(status)

# starts 2 threads
def start_threads():
    for i in range(2):
        # if first loop, start thread 1
        if (i == 0):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        # if second loop, start thread 2
        if (i == 1):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()
    # Sort parts and you're done
    # result = ''
    # for i in range(2):
    #     result += parts[i*block_sz]

# start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
You have:
for i in range(2):
    if (i == 1):
        ...
    if (i == 2):
        ...
But range(2) iterates over [0, 1], not [1, 2], so the i == 1 branch runs only once and the i == 2 branch never runs at all.
Save yourself some trouble and just remove those 3 lines. The code that starts the two threads can simply run serially.
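For example, a minimal sketch of that suggestion, reusing the names from the code above:

def start_threads():
    # start thread 1
    part1 = calculate_no_of_bytes_for_thread1()
    t1 = threading.Thread(target=download_thread, args=(url, 1, part1['start'], part1['end']))
    t1.start()
    threads.append(t1)
    # start thread 2
    part2 = calculate_no_of_bytes_for_thread2()
    t2 = threading.Thread(target=download_thread, args=(url, 2, part2['start'], part2['end']))
    t2.start()
    threads.append(t2)
    # both downloads now run concurrently; wait for them to finish
    for t in threads:
        t.join()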
Related
I'm building a parallel download library using the threading module.
When I use my library, it downloads the file without error, but the video file doesn't have the same content as if I downloaded it through the browser.
I use threading for parallel downloading and I think I have a problem with threading.Lock and file.seek, but I can't figure out how to fix the problem.
This is my code:
import requests
import threading
from tqdm import tqdm

DOWNLOAD_CHUNK_SIZE = 1 << 20  # 1 MiB

class DownloadPart:
    def __init__(self, url, byte_range) -> None:
        self.url = url
        self.byte_range = byte_range
        self.lock = threading.Lock()

    def download(self, file, pbar=None):
        response = requests.get(
            self.url,
            headers={"Range": "bytes={}-{}".format(*self.byte_range)},
            allow_redirects=True,
            stream=True,
        )
        written = 0
        for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
            if chunk:
                self.lock.acquire()
                file.seek(self.byte_range[0] + written)
                length = file.write(chunk)
                file.flush()
                written += length
                pbar.update(length)
                self.lock.release()

class Downloader:
    def __init__(self, url, parts=10):
        self.url = url
        self.parts = parts

    def _get_file_size(self) -> int:
        info = requests.head(self.url, allow_redirects=True)
        info.raise_for_status()
        size = info.headers.get("content-length", None)
        assert size
        return int(size)

    def download(self, filename):
        file_size = self._get_file_size()
        # file_size = 1024
        size_per_part = file_size // self.parts
        print(file_size, size_per_part)
        file = open(filename, "wb")
        pbar = tqdm(total=file_size)
        threads = []
        for index in range(self.parts):
            # fix last part have more bytes
            if index + 1 == self.parts:
                byte_range = (size_per_part * index, file_size - 1)
            else:
                byte_range = (size_per_part * index, size_per_part * (index + 1) - 1)
            thread = threading.Thread(
                target=DownloadPart(self.url, byte_range).download, args=(file,), kwargs={"pbar": pbar}
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        file.close()

URL = "https://s-delivery38.mxdcontent.net/v/8a5f59673042ed97c402be84ceeb20d9.mp4?s=TfiDzO2oBLrhub_GhToCiQ&e=1676489987&_t=1676476332"
d = Downloader(URL)
d.download("video.mp4")
How can I solve the problem with my library and get the same data in the file? Thank you for any help.
There were two problems with my code:
I found a solution to the first problem here: https://stackoverflow.com/a/25165183/14900791:
The Lock() function creates an entirely new lock - one that only the
thread calling the function can use. That's why it doesn't work,
because each thread is locking an entirely different lock.
Mixdrop (mxdcontent.net) only allows two videos from the same IP, so the code only worked for two parts; the others got status code 509 (I didn't check the status code, so I didn't get an error).
import requests
import threading
from tqdm import tqdm

DOWNLOAD_CHUNK_SIZE = 1 << 20  # 1 MiB

# global lock instance
lock = threading.Lock()

class DownloadPart:
    def __init__(self, url, byte_range) -> None:
        self.url = url
        self.byte_range = byte_range

    def download(self, file, pbar=None):
        response = requests.get(
            self.url,
            headers={"Range": "bytes={}-{}".format(*self.byte_range)},
            allow_redirects=True,
            stream=True,
        )
        written = 0
        for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
            if chunk:
                lock.acquire()
                file.seek(self.byte_range[0] + written)
                length = file.write(chunk)
                file.flush()
                written += length
                pbar.update(length)
                lock.release()

class Downloader:
    def __init__(self, url, parts=10):
        self.url = url
        self.parts = parts

    def _get_file_size(self) -> int:
        info = requests.head(self.url, allow_redirects=True)
        info.raise_for_status()
        size = info.headers.get("content-length", None)
        assert size
        return int(size)

    def download(self, filename):
        file_size = self._get_file_size()
        # file_size = 1024
        size_per_part = file_size // self.parts
        print(file_size, size_per_part)
        file = open(filename, "wb")
        pbar = tqdm(total=file_size)
        threads = []
        for index in range(self.parts):
            # fix last part have more bytes
            if index + 1 == self.parts:
                byte_range = (size_per_part * index, file_size - 1)
            else:
                byte_range = (size_per_part * index, size_per_part * (index + 1) - 1)
            thread = threading.Thread(
                target=DownloadPart(self.url, byte_range).download, args=(file,), kwargs={"pbar": pbar}
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        file.close()

URL = "https://s-delivery38.mxdcontent.net/v/8a5f59673042ed97c402be84ceeb20d9.mp4?s=TfiDzO2oBLrhub_GhToCiQ&e=1676489987&_t=1676476332"
d = Downloader(URL)
d.download("video.mp4")
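As an aside, and purely my sketch rather than part of the original answer: the global lock is only needed because every thread shares one file object. If each thread opens its own handle on a preallocated file, the seek/write pairs can no longer interleave and no lock is required:

import requests
import threading

def download_part(url, path, start, end):
    # each thread gets its own file handle, so no shared lock is needed;
    # "r+b" seeks into the preallocated file without truncating it
    response = requests.get(url, headers={"Range": "bytes={}-{}".format(start, end)}, stream=True)
    with open(path, "r+b") as f:
        f.seek(start)
        for chunk in response.iter_content(chunk_size=1 << 20):
            if chunk:
                f.write(chunk)

def download(url, path, parts=4):
    size = int(requests.head(url, allow_redirects=True).headers["content-length"])
    with open(path, "wb") as f:
        f.truncate(size)  # preallocate so every thread can seek within the file
    step = size // parts
    ranges = [(i * step, size - 1 if i + 1 == parts else (i + 1) * step - 1) for i in range(parts)]
    threads = [threading.Thread(target=download_part, args=(url, path, s, e)) for s, e in ranges]
    for t in threads:
        t.start()
    for t in threads:
        t.join()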
I have this code:
import csv
import requests

configurationsFile = "file.csv"
configurations = []

def loadConfigurations():
    with open(configurationsFile) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            url = row[0]
            line_count += 1
            configurations.append({"url": url})
        print(f'{line_count} urls loaded.')

loadConfigurations()

failedConfigs = []
session_requests = requests.session()

for config in configurations:
    try:
        "Do something with the url loaded from file.csv"
    except Exception as e:
        print(e)
        failedConfigs.append(config)

if len(failedConfigs) > 0:
    print("These errored out:")
    for theConfig in failedConfigs:
        print("ERROR: {}".format(theConfig['url']))
It reads URLs from a CSV file and then runs some code for each of the URLs listed in it.
The only "problem" is that if the CSV file contains a lot of URLs, it takes a long time to run through them all, so I'm looking for a way to process more than one URL at a time.
I'm not that good with Python, so I don't even know if it's possible.
But the question is: is there some way to tell the code to run, say, 5 URLs at once instead of just 1?
You can use the threading.Thread class. Here is an example:
from threading import Thread

def read(file, start, end):
    with open(file, 'r') as r:
        for i, v in enumerate(r):
            if start <= i < end:
                print(v)

file = "file.txt"

t1 = Thread(target=read, args=(file, 0, 100))
t2 = Thread(target=read, args=(file, 100, 200))
t3 = Thread(target=read, args=(file, 200, 300))
t4 = Thread(target=read, args=(file, 300, 400))
t5 = Thread(target=read, args=(file, 400, 500))

t1.start()
t2.start()
t3.start()
t4.start()
t5.start()

t1.join()
t2.join()
t3.join()
t4.join()
t5.join()
Or use a loop:
from threading import Thread

def read(file, start, end):
    with open(file, 'r') as r:
        for i, v in enumerate(r):
            if start <= i < end:
                print(v)

file = "file.txt"
threads = []
for i in range(5):
    threads.append(Thread(target=read, args=(file, i * 100, (i + 1) * 100)))
for t in threads:
    t.start()
for t in threads:
    t.join()
Basically, the read() function defined above reads the file from line start to line end. The work is split into 5 segments so that 5 threads can read the file simultaneously.
UPDATE UPON REQUEST
For your code, the
for config in configurations:
    try:
        "Do something with the url loaded from file.csv"
    except Exception as e:
        print(e)
        failedConfigs.append(config)
can be converted into a function that lets you specify from which index to which index of the configurations you want to process:
def process(start, end):
    for i in range(start, end):
        config = configurations[i]
        try:
            "Do something with the url loaded from file.csv"
        except Exception as e:
            print(e)
            failedConfigs.append(config)
You can then add:
threads = []
for i in range(5):
    threads.append(Thread(target=process, args=(i * 100, (i + 1) * 100)))
for t in threads:
    t.start()
for t in threads:
    t.join()
So you might end up with something like:
import csv
import requests
from threading import Thread

configurationsFile = "file.csv"
configurations = []

def loadConfigurations():
    with open(configurationsFile) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            url = row[0]
            line_count += 1
            configurations.append({"url": url})
        print(f'{line_count} urls loaded.')

loadConfigurations()

failedConfigs = []
session_requests = requests.session()

def process(start, end):
    for i in range(start, end):
        config = configurations[i]
        try:
            "Do something with the url loaded from file.csv"
        except Exception as e:
            print(e)
            failedConfigs.append(config)

threads = []
for i in range(5):
    threads.append(Thread(target=process, args=(i * 100, (i + 1) * 100)))
for t in threads:
    t.start()
for t in threads:
    t.join()

if len(failedConfigs) > 0:
    print("These errored out:")
    for theConfig in failedConfigs:
        print("ERROR: {}".format(theConfig['url']))
Inside a worker thread I am generating a DataFrame. Trying to put it into the queue passed to the worker thread is failing. In fact, trying to put any value into the queue is failing.
The part of the code that is failing inside the worker thread task1() is given below:
df = pd.DataFrame([[1,2,3,4],[3,4,5,6]])
qmdlvalues.put(df)
mdltiming = time.time() - start
qmdlparams.put(paramval)
qtiming.put(mdltiming)
Complete code
import threading
import queue
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import os
import time

def write_tsne_op(opdata, fname, header):
    with open(fname, 'w') as outfile:
        outfile.write(header)
        for data_slice in opdata:
            np.savetxt(outfile, data_slice, delimiter=",")

def task1(qmdlvalues, qmdlparams, qtiming, paramval):
    start = time.time()
    #tmpmdl1 = TSNE(perplexity=100,early_exaggeration=1, n_components=2,random_state=0,verbose=1)
    #qmdlvalues.put(tmpmdl1.fit_transform(dense_mx))
    df = pd.DataFrame([[1,2,3,4],[3,4,5,6]])
    qmdlvalues.put(df)
    mdltiming = time.time() - start
    qmdlparams.put(paramval)
    qtiming.put(mdltiming)
    print(df)
    print(str(mdltiming))
    print(paramval)

def task2(qmdlvalues, qmdlparams, qtiming, paramval):
    start = time.time()
    #tmpmdl2 = TSNE(perplexity=100,early_exaggeration=10, n_components=2,random_state=0,verbose=1)
    #qmdlvalues.put(tmpmdl2.fit_transform(dense_mx2))
    qmdlvalues.put(pd.DataFrame([[1,2,3,4],[3,4,5,6]]))
    qmdlparams.put(paramval)
    mdltiming = time.time() - start
    qtiming.put(mdltiming)

if __name__ == "__main__":
    dense_mx2 = dense_mx
    dense_mx3 = dense_mx
    qmdlvl = queue.Queue()
    qmdlch = queue.Queue()
    qtme = queue.Queue()
    mdlvalues = pd.DataFrame()

    t1 = threading.Thread(target=task1, args=(qmdlvl, qmdlch, qtme, "#perplex: 100 early exag: 1 timing:$_plex100_exag1.csv"), name='t1')
    t2 = threading.Thread(target=task2, args=(qmdlvl, qmdlch, qtme, "#perplex: 100 early exag: 10 timing:$_plex100_exag10.cv"), name='t2')

    # starting threads
    t1.start()
    t2.start()

    while True:
        if qmdlvl.empty():
            print("Queue closed. Exiting thread.")
            break
        try:
            item = qmdlvl.get(timeout=.5)
        except:
            continue
        print("Got item:", item)

    # wait until all threads finish
    t1.join()
    t2.join()
Below is the actual output I am getting from this code in main:
while True:
    if qmdlvl.empty():
        print("Queue closed. Exiting thread.")
        break
    try:
        item = qmdlvl.get(timeout=.5)
    except:
        continue
    print("Got item:", item)
ID of process running main program: 6456
Main thread name: MainThread
Queue closed. Exiting thread.
I want to be able to put the DataFrame into a queue inside the worker thread and access the same DataFrame in the main thread.
There were parameter mismatches in my earlier code; those have been corrected, and a full working version is presented below. The other change is that the threads are now join()ed before the queues are drained, so the main thread no longer sees an empty queue and exits before the workers have produced anything.
I stored the output of t-SNE directly into the queue and retrieved the same in the main thread. The next progression would be to convert this to a thread pool and sub-classing; a minimal sketch of that idea follows the code.
import threading
import queue
import numpy as np
from sklearn.manifold import TSNE
import os
import time

def write_tsne_op(opdata, fname, header):
    with open(fname, 'w') as outfile:
        outfile.write(header)
        for data_slice in opdata:
            np.savetxt(outfile, data_slice, delimiter=",")

def task1(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl1 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl1.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    print(str(mdltiming) + "time")
    qmdltime.put(mdltiming)

def task2(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl2 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl2.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    qmdltime.put(mdltiming)

def task3(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl3 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl3.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    qmdltime.put(mdltiming)

def task4(ip_matrix, qmdlvalues, qmdlparam, plex, exag, qmdltime, qmdlhrfn, hderfname):
    string = ""
    start = 0
    end = 0
    mdltiming = 0
    start = time.time()
    tmpmdl4 = TSNE(perplexity=plex, early_exaggeration=exag, n_components=2, random_state=0, verbose=1)
    qmdlvalues.put(tmpmdl4.fit_transform(ip_matrix))
    string = str(plex) + "$" + str(exag)
    qmdlparam.put(string)
    qmdlhrfn.put(hderfname)
    end = time.time()
    mdltiming = end - start
    qmdltime.put(mdltiming)

if __name__ == "__main__":
    # print ID of current process
    print("ID of process running main program: {}".format(os.getpid()))
    # print name of main thread
    print("Main thread name: {}".format(threading.main_thread().name))

    dense_mx2 = dense_mx
    dense_mx3 = dense_mx
    dense_mx4 = dense_mx

    qmdlvl = queue.Queue()
    qmdlch = queue.Queue()
    qmdltme = queue.Queue()
    qmdlhdrfname = queue.Queue()

    perplex = 200

    # creating threads
    exag = 10
    t1 = threading.Thread(target=task1, args=(dense_mx, qmdlvl, qmdlch, perplex, exag, qmdltme, qmdlhdrfname, "#perplex: 200 early exag: 10 timing:$_plex200_exag10.csv"), name='t1')
    exag = 30
    t2 = threading.Thread(target=task2, args=(dense_mx2, qmdlvl, qmdlch, perplex, exag, qmdltme, qmdlhdrfname, "#perplex: 200 early exag: 30 timing:$_plex200_exag30.cv"), name='t2')
    exag = 50
    t3 = threading.Thread(target=task3, args=(dense_mx3, qmdlvl, qmdlch, perplex, exag, qmdltme, qmdlhdrfname, "#perplex: 200 early exag: 50 timing:$_plex200_exag50.csv"), name='t3')
    exag = 100
    t4 = threading.Thread(target=task4, args=(dense_mx4, qmdlvl, qmdlch, perplex, exag, qmdltme, qmdlhdrfname, "#perplex: 200 early exag: 100 timing:$_plex200_exag100.cv"), name='t4')

    # starting threads
    t1.start()
    t2.start()
    t3.start()
    t4.start()

    # wait until all threads finish
    t1.join()
    t2.join()
    t3.join()
    t4.join()

    while True:
        if qmdlvl.empty():
            print("Queue closed. Exiting thread.")
            break
        try:
            item1 = qmdlvl.get(timeout=.5)
            item2 = qmdlch.get(timeout=.5)
            item3 = qmdltme.get(timeout=.5)
            header, fname = qmdlhdrfname.get(timeout=.5).split('$')
        except:
            continue
        write_tsne_op(item1, fname, header)
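A minimal sketch of that thread-pool progression, my extrapolation using the standard library rather than code from the original:

import time
from concurrent.futures import ThreadPoolExecutor
from sklearn.manifold import TSNE

def run_tsne(ip_matrix, plex, exag):
    # returns everything the queues carried, as a single result tuple
    start = time.time()
    embedding = TSNE(perplexity=plex, early_exaggeration=exag,
                     n_components=2, random_state=0, verbose=1).fit_transform(ip_matrix)
    return embedding, "{}${}".format(plex, exag), time.time() - start

# dense_mx is the input matrix from the surrounding code
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(run_tsne, dense_mx, 200, exag) for exag in (10, 30, 50, 100)]
    for future in futures:
        embedding, params, elapsed = future.result()  # raises if a task failed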
I am trying to read data from an input file and, for each line, perform a task in a while loop. The problem is that when I create the first process, its loop executes and never returns control to the for loop above it. Bottom line: there is no parallelism. What am I doing wrong?
Here is the relevant code:
import time
import multiprocessing
from multiprocessing import Process

def work_line(list1Line, jobId):
    while True:
        print list1Line
        tenant = list1Line[0]
        module = list1Line[1]
        endTime = int(time.time())
        startTime = endTime - startTimeDelta
        generate(jobId, startTime, endTime, tenantServiceAddress, tenant, module)
        print ("tenant {} will sleep for {} seconds").format(tenant, sleepBetweenLoops)
        time.sleep(sleepBetweenLoops)

def openFiles():
    file = open(CLOUD_INPUT_FILE, 'r')
    lines = file.readlines()
    file.close()
    linesLen = len(lines)
    processes = []
    for linesIndex in range(0, linesLen):
        jobId = GenerateRandomID()
        line = lines[linesIndex]
        list1Line = line.split()
        p = Process(target=work_line(list1Line, jobId))
        p.start()
        processes.append(p)
    print processes
    for p in processes:
        p.join()

if __name__ == '__main__':
    CLOUD_INPUT_FILE = r'C:\CF\input_file.txt'
    tenantServiceAddress = 'address.address'
    startTimeDelta = 300
    sleepBetweenLoops = 1800
    print multiprocessing.cpu_count()
    openFiles()
You are actually calling the function (so its return value is what gets passed as target) instead of handing the function itself to Process. Change it to:
p = Process(target=work_line, args=(list1Line,jobId))
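To see the difference, a minimal sketch of my own:

from multiprocessing import Process

def work(n):
    print("working on %d" % n)

if __name__ == '__main__':
    # wrong: work(1) runs here in the parent, and Process receives
    # its return value (None) as the target
    p_bad = Process(target=work(1))

    # right: the function object and its arguments are handed over,
    # and the child process makes the call
    p_good = Process(target=work, args=(1,))
    p_good.start()
    p_good.join()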
I am trying to download a file's buffer across 5 threads, but it seems like it's getting garbled.
from numpy import arange
import requests
from threading import Thread
import urllib2

url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = r = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers['content-length']
splitBy = 5
splits = arange(splitBy + 1) * (float(sizeInBytes)/splitBy)
dataLst = []

def bufferSplit(url, idx, splits):
    req = urllib2.Request(url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
    print {'bytes=%d-%d' % (splits[idx], splits[idx+1])}
    dataLst.append(urllib2.urlopen(req).read())

for idx in range(splitBy):
    dlth = Thread(target=bufferSplit, args=(url, idx, splits))
    dlth.start()

print dataLst

with open('page.html', 'w') as fh:
    fh.write(''.join(dataLst))
Update:
So I worked on it and made a little progress; however, if I download a JPG it seems to be corrupted:
from numpy import arange
import os
import requests
import threading
import urllib2

# url = 'http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3'
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
# url = 'http://pymotw.com/2/urllib/index.html'

sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
splitBy = 5
dataLst = []

class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, url, fileName, splitBy=5):
        super(ThreadedFetch, self).__init__()
        self.__url = url
        self.__spl = splitBy
        self.__dataLst = []
        self.__fileName = fileName

    def run(self):
        if not sizeInBytes:
            print "Size cannot be determined."
            return
        splits = arange(self.__spl + 1) * (float(sizeInBytes)/self.__spl)
        for idx in range(self.__spl):
            req = urllib2.Request(self.__url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
            self.__dataLst.append(urllib2.urlopen(req).read())

    def getFileData(self):
        return ''.join(self.__dataLst)

fileName = url.split('/')[-1]
dl = ThreadedFetch(url, fileName)
dl.start()
dl.join()
content = dl.getFileData()
if content:
    with open(fileName, 'w') as fh:
        fh.write(content)
    print "Finished Writing file %s" % fileName
Below is how the image looks after being downloaded.
Here's another version of the project. Differences:
thread code is a single small function
each thread downloads a chunk, then stores it in a global threadsafe dictionary
threads are started, then join()ed -- they're all running at once
when all done, data is reassembled in correct order then written to disk
extra printing, to verify everything's correct
output file size is calculated, for an extra comparison
source
import os, requests
import threading
import urllib2
import time

URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum( (
            len(chunk) for chunk in dataDict.values()
        ) )
    )

    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
output
102331 bytes to download.
done: got 3 chunks, total 102331 bytes
--- 0.380599021912 seconds ---
Finished Writing file 607800main_kepler1200_1600-1200.jpg
file size 102331 bytes
Here is how I got it working; if anyone has suggestions for possible improvements, you are most welcome.
import os
import requests
import threading
import urllib2
import time

url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

class SplitBufferThreads(threading.Thread):
    """ Splits the buffer to any number of threads
    thereby, concurrently downloading through
    any number of threads.
    """
    def __init__(self, url, byteRange):
        super(SplitBufferThreads, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.req = None

    def run(self):
        self.req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})

    def getFileData(self):
        return urllib2.urlopen(self.req).read()

def main(url=None, splitBy=3):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = url.split('/')[-1]
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataLst = []
    for idx in range(splitBy):
        byteRange = buildRange(int(sizeInBytes), splitBy)[idx]
        bufTh = SplitBufferThreads(url, byteRange)
        bufTh.start()
        bufTh.join()
        dataLst.append(bufTh.getFileData())

    content = ''.join(dataLst)

    if dataLst:
        if os.path.exists(fileName):
            os.remove(fileName)
        print "--- %s seconds ---" % str(time.time() - start_time)
        with open(fileName, 'w') as fh:
            fh.write(content)
        print "Finished Writing file %s" % fileName

if __name__ == '__main__':
    main(url)
This is the first bare-bones code I have got working. I discovered that if I set the bufTh buffer thread's daemon flag to False, the process takes more time to finish.
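One observation, mine rather than from the original post: run() only builds the Request object; the actual urlopen().read() happens in getFileData() on the main thread, and each thread is join()ed before the next is created, so the parts are in fact fetched one after another. A minimal variation that does the read inside run() and overlaps the threads:

class SplitBufferThread(threading.Thread):
    """ Downloads one byte range of the url in its own thread. """
    def __init__(self, url, byteRange):
        super(SplitBufferThread, self).__init__()
        self.__url = url
        self.__byteRange = byteRange
        self.data = ''

    def run(self):
        # the network read happens here, so it runs concurrently with the other threads
        req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})
        self.data = urllib2.urlopen(req).read()

threads = [SplitBufferThread(url, r) for r in buildRange(int(sizeInBytes), splitBy)]
for th in threads:
    th.start()
for th in threads:
    th.join()
content = ''.join(th.data for th in threads)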