Speeding up multithreaded downloads - python

I've written a Python script that will download files from a website. To speed it up, I've made the downloading of the files multithreaded. Obviously, this is faster than doing the downloads serially, but I've come across some effects that I cannot explain.
The first x files downloaded (x seems proportional to the number of threads created) are incredibly fast, with the output showing upwards of 40 files per second, but after that the rate drops off sharply.
Up to a point (near 200 threads), the maximum speed at which I can download files averages 10 files per second. If I increase the thread count to, say, 700, it still maxes out at 10 files per second. Increasing the thread count to a very large number (over 1,000) seems to limit the download speed based on CPU speed.
So, my questions are:
Why are the first files downloaded so much faster than the rest, and can I maintain that original speed?
Why does the thread count have such diminishing returns for download speeds?
Here is my script:
#!/usr/bin/python
import inspect
import math
from queue import Queue
from urllib.request import ProxyHandler, build_opener
from ast import literal_eval
from time import time, sleep
from datetime import timedelta
import random
from threading import Thread, activeCount
import os
proxies = Queue()
threads = Queue()
agents = []
total_files = 0
finished_files = 0
downloaded_files = 0
start_time = 0
class Config(object):
    DEBUG = False
    PROXIES_PATH = '/home/shane/bin/proxies.txt'
    AGENTS_PATH = '/home/shane/bin/user-agents.txt'
    DESTINATION_PATH = '/home/shane/images/%d.jpg'
    SOURCE_URL = 'https://example.org/%d.jpg'
    MAX_THREADS = 500
    TIMEOUT = 62
    RETRIES = 1
    RETRIES_TIME = 1

def get_files_per_second():
    return float(downloaded_files) / (time() - start_time)

def get_time_remaining():
    delta = timedelta(seconds=float(total_files - finished_files) / get_files_per_second())
    seconds = delta.total_seconds()
    days, remainder = divmod(seconds, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    days = str(int(days)).zfill(2)
    hours = str(int(hours)).zfill(2)
    minutes = str(int(minutes)).zfill(2)
    seconds = str(int(seconds)).zfill(2)
    return "%s:%s:%s:%s" % (days, hours, minutes, seconds)

def release_proxy(opener):
    if Config.DEBUG:
        print('Releasing proxy')
    for handler in opener.handlers:
        if type(handler) is ProxyHandler:
            proxies.put(handler)
            return
    raise Exception('No proxy found')

def get_new_proxy():
    if Config.DEBUG:
        print('Getting new proxy')
    if proxies.empty():
        raise Exception('No proxies')
    return proxies.get()

def get_new_agent():
    if len(agents) == 0:
        raise Exception('No user agents')
    return random.choice(agents)

def get_new_opener():
    opener = build_opener(get_new_proxy())
    opener.addheaders = [('User-Agent', get_new_agent())]
    return opener

def download(opener, source, destination, tries=0):
    global finished_files, downloaded_files
    if Config.DEBUG:
        print('Downloading %s to %s' % (source, destination))
    try:
        result = opener.open(source, timeout=Config.TIMEOUT).read()
        with open(destination, 'wb') as d:
            d.write(result)
        release_proxy(opener)
        finished_files += 1
        downloaded_files += 1
        to_print = '(%d/%d files) (%d proxies) (%f files/second, %s left) (%d threads) %s'
        print(to_print % (finished_files, total_files, proxies.qsize(), round(get_files_per_second(), 2), get_time_remaining(), activeCount(), source))
    except Exception as e:
        if Config.DEBUG:
            print(e)
        if tries < Config.RETRIES:
            sleep(Config.RETRIES_TIME)
            download(opener, source, destination, tries + 1)
        else:
            if proxies.qsize() < Config.MAX_THREADS * 2:
                release_proxy(opener)
            download(get_new_opener(), source, destination, 0)

class Downloader(Thread):
    def __init__(self, source, destination):
        Thread.__init__(self)
        self.source = source
        self.destination = destination

    def run(self):
        if Config.DEBUG:
            print('Running thread')
        download(get_new_opener(), self.source, self.destination)
        if threads.qsize() > 0:
            threads.get().start()

def populate_proxies():
    if Config.DEBUG:
        print('Populating proxies')
    with open(Config.PROXIES_PATH, 'r') as fh:
        for line in fh:
            line = line.replace('\n', '')
            if Config.DEBUG:
                print('Adding %s to proxies' % line)
            proxies.put(ProxyHandler(literal_eval(line)))

def populate_agents():
    if Config.DEBUG:
        print('Populating agents')
    with open(Config.AGENTS_PATH, 'r') as fh:
        for line in fh:
            line = line.replace('\n', '')
            if Config.DEBUG:
                print('Adding %s to agents' % line)
            agents.append(line)
def populate_threads():
    global total_files, finished_files
    if Config.DEBUG:
        print('Populating threads')
    for x in range(0, 100000):
        source = Config.SOURCE_URL % x
        destination = Config.DESTINATION_PATH % x
        total_files += 1
        # queue threads
        print('Queueing %s' % destination)
        threads.put(Downloader(source, destination))
def start_work():
    global start_time
    if threads.qsize() == 0:
        raise Exception('No work to be done')
    start_time = time()
    for x in range(0, min(threads.qsize(), Config.MAX_THREADS)):
        if Config.DEBUG:
            print('Starting thread %d' % x)
        threads.get().start()

populate_proxies()
populate_agents()
populate_threads()
start_work()

The number of threads you are using is very high. CPython does not actually run threads in parallel; because of the Global Interpreter Lock it just switches between them rapidly, which only looks like parallel execution.
If the task is CPU-bound, use multiprocessing; if it is I/O-bound (as downloading is), threads can help.
Keep the thread count low (roughly 10-70 on an ordinary quad-core PC with 8 GB of RAM), otherwise the switching overhead will slow your code down.
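For what it's worth, a bounded worker pool often behaves better than hand-rolled thread chaining. Below is a minimal sketch using concurrent.futures, assuming plain Python 3 downloads with no proxy or user-agent rotation; the URL template, paths, and the worker count of 30 are illustrative placeholders, not values from your setup:

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlopen

def fetch(url, destination):
    # One blocking download per task; the pool bounds how many run at once.
    with urlopen(url, timeout=60) as response, open(destination, 'wb') as out:
        out.write(response.read())
    return destination

urls = [('https://example.org/%d.jpg' % i, '/tmp/%d.jpg' % i) for i in range(100)]
with ThreadPoolExecutor(max_workers=30) as pool:   # 30 is an arbitrary, modest cap
    futures = [pool.submit(fetch, url, dest) for url, dest in urls]
    for future in as_completed(futures):
        print('finished %s' % future.result())

Timing the same job with pool sizes of, say, 20, 50, and 200 should make it clear whether the server, the proxies, or your bandwidth is the real bottleneck.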

Related

Multiprocessing Queue.get() hangs

I'm trying to implement basic multiprocessing and I've run into an issue. The python script is attached below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue(10)
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        item = append_queue.get()
        database.append(item)
        print("Appended to database in %.4f seconds" % database.append_time)
        append_queue.task_done()
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    append_queue.join()
    #append_queue_process.join()
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
The AnalyzeFrequency class analyzes the frequencies of words in a file, and get_results() returns a sorted list of those words and frequencies. The list is very large, perhaps 10,000 items.
This list is then passed to the add_to_append_queue method, which adds it to a queue. process_append_queue takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main(), so I am trying to use a separate process for this method. When I try to do this with the threading module, everything works perfectly fine, no errors. When I try to use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault; it was just a typo. Now I am using the Queue class from multiprocessing, but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue()
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        database.append(append_queue.get())
        print("Appended to database in %.4f seconds" % database.append_time)
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    #append_queue.join()
    #append_queue_process.join()
    print str(append_queue.qsize())
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
UPDATE 2
This is the database code:
class FrequencyStore:
    def __init__(self):
        self.sorter = Sorter()
        self.db = {}
        self.load_time = -1
        self.save_time = -1
        self.append_time = -1
        self.sort_time = -1

    def load_db(self):
        start_time = time.time()
        try:
            file = open("results.txt", 'r')
        except:
            raise IOError
        self.db = {}
        for line in file:
            word, count = line.strip("\n").split("=")
            self.db[word] = int(count)
        file.close()
        self.load_time = time.time() - start_time

    def save_db(self):
        start_time = time.time()
        _db = []
        for key in self.db:
            _db.append([key, self.db[key]])
        _db = self.sort(_db)
        try:
            file = open("results.txt", 'w')
        except:
            raise IOError
        file.truncate(0)
        for x in _db:
            file.write(x[0] + "=" + str(x[1]) + "\n")
        file.close()
        self.save_time = time.time() - start_time

    def create_sorted_db(self):
        _temp_db = []
        for key in self.db:
            _temp_db.append([key, self.db[key]])
        _temp_db = self.sort(_temp_db)
        _temp_db.reverse()
        return _temp_db

    def get_db(self):
        return self.db

    def sort(self, _list):
        start_time = time.time()
        _list = self.sorter.mergesort(_list)
        _list.reverse()
        self.sort_time = time.time() - start_time
        return _list

    def append(self, _list):
        start_time = time.time()
        for x in _list:
            if x[0] not in self.db:
                self.db[x[0]] = x[1]
            else:
                self.db[x[0]] += x[1]
        self.append_time = time.time() - start_time
Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't
have fork(), so each process gets its own Queue and they have nothing
to do with each other. The entire module is imported "from scratch" by
each process on Windows. You'll need to create the Queue in main(),
and pass it as an argument to the worker function.
Here's fleshing out what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting down things cleanly, and often as not will come back to bite you later:
def process_append_queue(append_queue):
    while True:
        x = append_queue.get()
        if x is None:
            break
        print("processed %d" % x)
    print("worker done")

def main():
    import multiprocessing as mp
    append_queue = mp.Queue(10)
    append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
    append_queue_process.start()
    for i in range(100):
        append_queue.put(i)
    append_queue.put(None)  # tell worker we're done
    append_queue_process.join()

if __name__=="__main__":
    main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.
queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue

Only 1 Thread started in for loop

So I'm trying to code a really simple Internet Download Manager spoof with Python 2.7.
It is supposed to query a file's HTTP headers, get the byte range, spread the download among a number of threads (I hard-coded 2 for simplicity) according to that range, and later join the file parts together again.
The problem is that my console log tells me only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []

# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)

# define file
file_name = "test.mp4"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    end = stream_size
    return end

start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start':int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_"+str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    data = urllib2.urlopen(req)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
    thread_id = id
    #percentage = (current_size * 100 / total_size)
    status = str(thread_id) + "_" + str(current_size) + "_" + str(total_size)
    print(status)

#starts 2 threads
def start_threads():
    for i in range(2):
        #if first loop, start thread 1
        if(i==1):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        #if second loop, start thread 1
        if(i==2):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()

#start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []
parts = {}

# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)

# define file
file_name = "test.mp3"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    file_size = stream_size
    return file_size

start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start':int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_"+str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
    thread_id = id
    status = "Thread ID_" + str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" + str(total_size)
    print(status)

#starts 2 threads
def start_threads():
    for i in range(2):
        #if first loop, start thread 1
        if(i==0):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        #if second loop, start thread 2
        if(i==1):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()
    # Sort parts and you're done
    # result = ''
    # for i in range(2):
    #     result += parts[i*block_sz]

#start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
You have:
for i in range(2):
    if(i==1):
        ...
    if(i==2):
        ...
But range(2) iterates over [0,1] not [1,2].
Save yourself some trouble and just drop the loop and the two if checks: the code that starts the two threads can simply run one after the other (see the sketch below).
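Purely as an illustration (not part of the original answer), a hedged sketch of start_threads without the loop and index checks, reusing the functions and globals already defined in the question's script, might look like:

def start_threads():
    # Compute both byte ranges once, then start each worker directly.
    ranges = [calculate_no_of_bytes_for_thread1(),
              calculate_no_of_bytes_for_thread2()]
    for thread_id, part in enumerate(ranges):
        t = threading.Thread(target=download_thread,
                             args=(url, thread_id, part['start'], part['end']))
        t.start()
        threads.append(t)
    # Join them all; order does not matter.
    for t in threads:
        t.join()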

Comparing the efficiency of gevent and threads

Recently I have been working on a gevent demo and I tried to compare the efficiency of gevent against threads. Generally speaking, the gevent code should be more efficient than the thread code. But when I use the time command to profile the program, I get an unexpected result (my command is time python FILENAME.py 50 1000; the last two arguments are the pool size or thread count, so I varied those two numbers in the table below). The results show that the threads are more efficient than the gevent code, so I want to know why this happens and what is wrong with my program? Thanks.
(timing results table: gevent vs. thread)
My code is below (the main idea is to use threads or gevent to send multiple HTTP requests):
******This is the thread version code******
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import requests
import threading
import time
import urllib2

finished = 0

def GetUrl(pagenum):
    url = 'http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn=' + \
        str(pagenum*20) + '&rn=20'
    return url

def setUrlSet():
    for i in xrange(requestnum):
        urlnum = i % 38
        urlset.append(GetUrl(urlnum))

def GetResponse(pagenum):
    try:
        r = requests.get(urlset[pagenum])
    except Exception, e:
        print e
        pass

def DigJobByPagenum(pagenum, requestnum):
    init_num = pagenum
    print '%d begin' % init_num
    while pagenum < requestnum:
        GetResponse(pagenum)
        pagenum += threadnum
    print '%d over' % init_num

def NormalThread(threadnum):
    startime = time.time()
    print "%s is running..." % threading.current_thread().name
    threads = []
    global finished, requestnum
    for i in xrange(threadnum):
        thread = threading.Thread(target=DigJobByPagenum, args=(i, requestnum))
        threads.append(thread)
    for t in threads:
        t.daemon = True
        t.start()
    for t in threads:
        t.join()
        finished += 1
    endtime = time.time()
    print "%s is stop.The total time is %0.2f" % \
        (threading.current_thread().name, (endtime - startime))

def GetAvageTime(array):
    alltime = 0.0
    for i in array:
        alltime += i
    avageTime = alltime/len(array)
    return avageTime

if __name__ == '__main__':
    threadnum = int(sys.argv[1])
    requestnum = int(sys.argv[2])
    print 'threadnum : %s,requestnum %s ' % (threadnum, requestnum)
    originStartTime = time.time()
    urlset = []
    setUrlSet()
    NormalThread(threadnum)
******This is the gevent version code******
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import gevent
from gevent import pool
import requests
import time

finished = 0

def GetUrl(pagenum):
    url = 'http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn=' + \
        str(pagenum*20) + '&rn=20'
    return url

def setUrlSet():
    for i in xrange(requestnum):
        urlnum = i % 38
        urlset.append(GetUrl(urlnum))

def GetResponse(url):
    startime = time.time()
    r = requests.get(url)
    print url
    endtime = time.time()
    spendtime = endtime - startime
    NormalSpendTime.append(spendtime)
    global finished
    finished += 1
    print finished

def GetAvageTime(array):
    alltime = 0.0
    for i in array:
        alltime += i
    avageTime = alltime/len(array)
    return avageTime

def RunAsyncJob():
    jobpool = pool.Pool(concurrent)
    for url in urlset:
        jobpool.spawn(GetResponse, url)
    jobpool.join()
    endtime = time.time()
    allSpendTime = endtime - originStartime
    print 'Total spend time is %0.3f, total request num is %s within %s \
seconds' % (allSpendTime, finished, timeoutNum)
    print 'Each request time is %0.3f' % (GetAvageTime(NormalSpendTime))

if __name__ == '__main__':
    concurrent = int(sys.argv[1])
    requestnum = int(sys.argv[2])
    timeoutNum = 100
    NormalSpendTime = []
    urlset = []
    urlActionList = []
    setUrlSet()
    originStartime = time.time()
    RunAsyncJob()
Try
gevent.monkey.patch_all(httplib=True)
It seems that by default gevent does not patch httplib (have a look at http://www.gevent.org/gevent.monkey.html: httplib=False), so you are actually making blocking requests and you lose all the advantages of the asynchronous framework. That said, I'm not sure whether requests uses httplib.
If that doesn't work, then have a look at this lib:
https://github.com/kennethreitz/grequests
Re: httplib=False
You are already using the requests library to make web calls. It has a gevent flavour called grequests:
https://github.com/kennethreitz/grequests
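For illustration only, a hedged sketch of the gevent version rewritten with grequests might look like the following; it reuses the urlset list and the concurrent value from the question's script, and the size argument bounds how many requests are in flight at once:

import grequests

def RunWithGrequests():
    # Build unsent requests lazily, then let grequests.map run them
    # with at most `concurrent` greenlets active at a time.
    reqs = (grequests.get(url) for url in urlset)
    responses = grequests.map(reqs, size=concurrent)
    ok = [r for r in responses if r is not None and r.status_code == 200]
    print 'Completed %d of %d requests' % (len(ok), len(urlset))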
Overall, I don't immediately see much reason to prefer one style of concurrency over the other if your pool is this small. Of course real threads are relatively heavy (they start with an 8 MB stack), but you have to weigh that against the size of your job.
My take: try both (done), verify you are doing both correctly (to do), and let the numbers do the talking.

Python threads exit immediately

I am new to Python.
I am trying out the HBase Thrift client using Thrift. I got some code off the net, which I modified to work with the latest version of Thrift, but when I run it, it just exits and no threads are started.
Here is the code.
import json, traceback, sys, datetime, time, logging, threading, random
import logging.handlers
import thrift
sys.path.append('gen-py')

from thrift.transport.TSocket import TSocket
from thrift.transport.TTransport import TBufferedTransport
from thrift.protocol import TBinaryProtocol
from hbase import THBaseService

gWritenItems = 0
gStartT = 0
gEndT = 0
recordsPerBatch = 300  #reports per client per day
columns = 3
#config
concurrent = 10
records = 60000 #6000000 #6 million
bytesPerRecord = 1024
mylock = threading.RLock()

class writeThread(threading.Thread):
    def __init__(self, threadname, RecordsThreadwillwrite):
        threading.Thread.__init__(self, name=threadname)
        bytesPerColumn = int(bytesPerRecord/columns) - 11 #suppose 3 columns
        self.columnvalue = "value_" + "x"*bytesPerColumn + "_endv"
        self.tbwBatch = int(RecordsThreadwillwrite / recordsPerBatch)
        self.transport = TBufferedTransport(TSocket('pnq-adongrevm1', 5151), 40960)
        self.transport.open()
        protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = THBaseService.Client(protocol)
        self.table = "example"

    def run(self):
        print "+%s start" % (self.getName())
        global gEndT
        global gWritenItems
        threadWritenItem = 0
        for loopidx in xrange(0, self.tbwBatch):
            self.write_hbase() #write
            threadWritenItem += recordsPerBatch
        mylock.acquire()
        gEndT = time.time()
        gWritenItems += threadWritenItem
        print "%s done, %s seconds past, %d reocrds saved" % (self.getName(), gEndT-gStartT, gWritenItems)
        mylock.release()
        self.transport.close()

    def write_hbase(self): #write 50 rowkyes, and 3 column families in each rowkey
        print self.getName(), "Start write"
        batchmutations = []
        for i in xrange(0, recordsPerBatch): # write to db, 300 items together
            mutations = []
            rowkey = "RK_%s_%s" % (random.random(), time.time())
            for ii in xrange(0, columns):
                mutations.append(THBaseService.TPut(row=rowkey, columnValues=[TColumnValue(family="f1", qualifier="%s"%ii, value=self.columnvalue)]))
            self.client.putMultiple(self.table, mutations)

itemsPerThread = int(records / concurrent)
for threadid in xrange(0, concurrent):
    gStartT = time.time()
    t = writeThread("Thread_%s" % threadid, itemsPerThread)
    t.start();

print "%d thread created, each thread will write %d records" % (concurrent, itemsPerThread)
All I get is the message "10 thread created, each thread will write 6000 records".
Yep, this is because you are not waiting for threads to finish their job, so the main thread just exits. Try this:
itemsPerThread = int(records / concurrent)
threads = []
for threadid in xrange(0, concurrent):
    gStartT = time.time()
    t = writeThread("Thread_%s" % threadid, itemsPerThread)
    t.start();
    threads.append(t)

# wait until all finish the job
for t in threads:
    t.join()
EDIT Ha, I don't think I'm right here, because you didn't mark your threads as daemons. It should work even without joining. But have a look at this code:
class CustomThread(threading.Thread):
    def run(self):
        print "test"

for x in xrange(0, 10):
    t = CustomThread()
    t.start()
It will always reach the print "test" line no matter what. So your code should always reach print "+%s start" % (self.getName()) no matter what. Are you sure it doesn't work? :)
If it doesn't, then there are only two possibilities:
1. There is a blocking operation and/or exception in your __init__ method, but then it would not reach the final print (a sketch of moving that work out of __init__ follows below);
2. The concurrent variable is 0 for some reason (which is not consistent with the final print).
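Not part of the original answer, but if the first possibility is the culprit, one hedged sketch is to defer the Thrift connection to run(), so a failure there surfaces inside the worker instead of preventing the thread from ever starting. Host, port, and module paths are copied from the question's code; the actual write loop is elided:

import threading
from thrift.transport.TSocket import TSocket
from thrift.transport.TTransport import TBufferedTransport
from thrift.protocol import TBinaryProtocol
from hbase import THBaseService

class writeThread(threading.Thread):
    def __init__(self, threadname, records_to_write):
        threading.Thread.__init__(self, name=threadname)
        self.records_to_write = records_to_write

    def run(self):
        print "+%s start" % self.getName()
        # Open the connection inside the thread, not in __init__.
        transport = TBufferedTransport(TSocket('pnq-adongrevm1', 5151), 40960)
        try:
            transport.open()                       # may raise; only this worker fails
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = THBaseService.Client(protocol)
            # ... do the batched puts here, as in the original write_hbase() ...
        finally:
            transport.close()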

Cancel slow download in python

I am downloading files over http and displaying the progress using urllib and the following code - which works fine:
import sys
from urllib import urlretrieve

def dlProgress(count, blockSize, totalSize):
    percent = int(count*blockSize*100/totalSize)
    sys.stdout.write("\r" + "progress" + "...%d%%" % percent)
    sys.stdout.flush()

urlretrieve('http://example.com/file.zip', '/tmp/localfile', reporthook=dlProgress)
Now I would also like to restart the download if it is going too slow (say less than 1MB in 15 seconds). How can I achieve this?
This should work.
It calculates the actual download rate and aborts if it is too low.
import sys
from urllib import urlretrieve
import time

url = "http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tgz" # 14.135.620 Byte
startTime = time.time()

class TooSlowException(Exception):
    pass

def convertBToMb(bytes):
    """converts Bytes to Megabytes"""
    bytes = float(bytes)
    megabytes = bytes / 1048576
    return megabytes

def dlProgress(count, blockSize, totalSize):
    global startTime
    alreadyLoaded = count*blockSize
    timePassed = time.time() - startTime
    transferRate = convertBToMb(alreadyLoaded) / timePassed # mbytes per second
    transferRate *= 60 # mbytes per minute
    percent = int(alreadyLoaded*100/totalSize)
    sys.stdout.write("\r" + "progress" + "...%d%%" % percent)
    sys.stdout.flush()
    if transferRate < 4 and timePassed > 2: # download will be slow at the beginning, hence wait 2 seconds
        print "\ndownload too slow! retrying..."
        time.sleep(1) # let's not hammer the server
        raise TooSlowException

def main():
    try:
        urlretrieve(url, '/tmp/localfile', reporthook=dlProgress)
    except TooSlowException:
        global startTime
        startTime = time.time()
        main()

if __name__ == "__main__":
    main()
Something like this:
import signal
import time

class Timeout(Exception):
    pass

def try_one(func, t=3):
    def timeout_handler(signum, frame):
        raise Timeout()

    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(t)  # trigger the alarm after t seconds
    try:
        t1 = time.clock()
        func()
        t2 = time.clock()
    except Timeout:
        print('{} timed out after {} seconds'.format(func.__name__, t))
        return None
    finally:
        signal.signal(signal.SIGALRM, old_handler)
        signal.alarm(0)
    return t2 - t1
Then call try_one with the function you want to time out and the timeout in seconds:
try_one(downloader,15)
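The downloader callable is assumed by the answer and never defined; as a hedged sketch, wrapping the question's urlretrieve call could look like the following. Note that this times out the whole call after 15 seconds, which is coarser than the question's 1 MB-per-15-seconds criterion:

from urllib import urlretrieve

def downloader():
    # dlProgress is the reporthook defined in the question.
    urlretrieve('http://example.com/file.zip', '/tmp/localfile', reporthook=dlProgress)

elapsed = try_one(downloader, 15)
if elapsed is None:
    elapsed = try_one(downloader, 15)   # one retry after a timeout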
OR, you can do this:
import socket
socket.setdefaulttimeout(15)
HolyMackerel! Use the tools!
import urllib2, sys, socket, time, os

def url_tester(url="http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tgz"):
    file_name = url.split('/')[-1]
    u = urllib2.urlopen(url, None, 1) # Note the timeout to urllib2...
    file_size = int(u.info().getheaders("Content-Length")[0])
    print ("\nDownloading: {} Bytes: {:,}".format(file_name, file_size))

    with open(file_name, 'wb') as f:
        file_size_dl = 0
        block_sz = 1024*4
        time_outs = 0
        while True:
            try:
                buffer = u.read(block_sz)
            except socket.timeout:
                if time_outs > 3: # file has not had activity in max seconds...
                    print "\n\n\nsorry -- try back later"
                    os.unlink(file_name)
                    raise
                else: # start counting time outs...
                    print "\nHmmm... little issue... I'll wait a couple of seconds"
                    time.sleep(3)
                    time_outs += 1
                    continue

            if not buffer: # end of the download
                sys.stdout.write('\rDone!' + ' '*len(status) + '\n\n')
                sys.stdout.flush()
                break

            file_size_dl += len(buffer)
            f.write(buffer)
            status = '{:20,} Bytes [{:.2%}] received'.format(file_size_dl,
                                                             file_size_dl * 1.0 / file_size)
            sys.stdout.write('\r' + status)
            sys.stdout.flush()

    return file_name
This prints a status as expected. If I unplug my ethernet cable, I get:
Downloading: Python-2.7.3.tgz Bytes: 14,135,620
827,392 Bytes [5.85%] received
sorry -- try back later
If I unplug the cable, then plug it back in in less than 12 seconds, I get:
Downloading: Python-2.7.3.tgz Bytes: 14,135,620
716,800 Bytes [5.07%] received
Hmmm... little issue... I'll wait a couple of seconds
Hmmm... little issue... I'll wait a couple of seconds
Done!
The file is successfully downloaded.
You can see that urllib2 supports both timeouts and reconnects. If you disconnect and stay disconnected for 3 * 4 seconds == 12 seconds, it will time out for good and raise a fatal exception. This could be dealt with as well.
