Using Linux and Python 2.7.6, I have a script that uploads lots of files at one time. I am using multi-threading with the Queue and threading modules.
I have an object that keeps track of the files that have been successfully uploaded and decrements a counter after each successful upload. I need to make this operation atomic/thread-safe. Since the Queue module is high level and has its own mutex at the lower level, can I impose my own lock/acquire in addition to it? I tried doing this and had no errors (at the bottom of the last code block, where file_quantity.deduct() is), but I am not sure if it is truly working as it should. Here is a shortened version for readability:
class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""

    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity

    def deduct(self):
        self.quantity -= 1


kill_received = False
lock = threading.Lock()


class CustomQueue(Queue.Queue):
    # Cannot use the stock .join() because it would block SIGINT handling
    # until all threads are done. To counter this, wait() is given a
    # timeout and kill_received is checked on each pass.
    def join(self):
        self.all_tasks_done.acquire()
        try:
            while not kill_received and self.unfinished_tasks:
                self.all_tasks_done.wait(10.0)
        finally:
            self.all_tasks_done.release()
def do_the_uploads(file_list, file_quantity,
                   retry_list, authenticate):
    """The uploading engine"""
    value = raw_input(
        "\nPlease enter how many concurrent "
        "uploads you want at one time (example: 200)> ")
    value = int(value)
    logger.info('{} concurrent uploads will be used.'.format(value))
    confirm = raw_input(
        "\nProceed to upload files? Enter [Y/y] for yes: ").upper()
    if confirm == "Y":
        kill_received = False
        sys.stdout.write("\x1b[2J\x1b[H")
        q = CustomQueue()

        def worker():
            global kill_received
            while not kill_received:
                item = q.get()
                upload_file(item, file_quantity, retry_list, authenticate, q)
                q.task_done()

        for i in range(value):
            t = Thread(target=worker)
            t.setDaemon(True)
            t.start()

        for item in file_list:
            q.put(item)

        q.join()
        print "Finished. Cleaning up processes...",
        # Allow the threads to clean up
        time.sleep(4)
        print "done."
def upload_file(file_obj, file_quantity, retry_list, authenticate, q):
    """Uploads a file, one file per thread (no batch style), so that if
    one upload fails no others are affected."""
    absolute_path_filename, filename, dir_name, token, url = file_obj
    url = url + dir_name + '/' + filename
    try:
        with open(absolute_path_filename) as f:
            r = requests.put(url, data=f, headers=header_collection, timeout=20)
    except requests.exceptions.ConnectionError as e:
        pass
    if src_md5 == r.headers['etag']:
        lock.acquire()
        file_quantity.deduct()
        lock.release()
Well, the code you posted doesn't define lock anywhere, so hard to say for sure. It would be more common to protect the code that actually needs protecting:
def deduct(self):
    with lock:
        self.quantity -= 1
Sanest is to allocate a lock in the structure that needs it, like so:
class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""

    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity
        self.lock = threading.Lock()

    def deduct(self):
        with self.lock:
            self.quantity -= 1
and use self.lock similarly for any other mutations of FileQuantity data members that may be invoked simultaneously by multiple threads.
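If you want to convince yourself that the locked deduct() behaves atomically, a minimal sanity-check sketch (my own, not from the post; it assumes the locked FileQuantity above) should always end at zero:
import threading

# Hypothetical sanity check, not part of the original script.
fq = FileQuantity(10000)  # the locked version defined above

def hammer():
    for _ in range(1000):
        fq.deduct()  # each call holds fq.lock while decrementing

threads = [threading.Thread(target=hammer) for _ in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()

print(fq.quantity)  # prints 0 every run if deduct() is properly serialized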
Although Python supports multiple threads of execution, the GIL causes only one of them to make forward progress at a time. However, after reading Effective Python, I implemented the pipeline example from that book. The code is below; the entire process is divided into 3 stages, namely download, resize and upload.
from threading import Thread
from queue import Queue
import time


# define operation functions for 3 stages: download, resize and upload
def download(item):
    print("downloading {}".format(item))
    time.sleep(2)
    return item


def resize(item):
    print("resizing {}".format(item))
    time.sleep(3)
    return item


def upload(item):
    print("uploading {}".format(item))
    time.sleep(5)
    return item


# subclass of Thread
class StoppableWorker(Thread):
    def __init__(self, func, in_queue, out_queue):
        super(StoppableWorker, self).__init__()
        self.func = func
        self.in_queue = in_queue
        self.out_queue = out_queue

    def run(self):
        for item in self.in_queue:
            result = self.func(item)
            self.out_queue.put(result)


# subclass of Queue
class ClosableQueue(Queue):
    SENTINEL = object()

    def close(self):
        self.put(self.SENTINEL)

    def __iter__(self):
        while True:
            item = self.get()
            try:
                if item is self.SENTINEL:
                    return
                yield item
            finally:
                self.task_done()


download_queue = ClosableQueue()
resize_queue = ClosableQueue()
upload_queue = ClosableQueue()
output_queue = ClosableQueue()
threads = [StoppableWorker(download, download_queue, resize_queue),
           StoppableWorker(resize, resize_queue, upload_queue),
           StoppableWorker(upload, upload_queue, output_queue)]
for t in threads:
    t.start()

st = time.time()
for i in range(10):
    download_queue.put(i)
download_queue.close()
download_queue.join()
resize_queue.close()
resize_queue.join()
upload_queue.close()
upload_queue.join()
print("It took {}".format(time.time() - st))
print(output_queue.qsize(), 'items finished')
According to the output, the time spent for N tasks is 10 + (N-1)*5 seconds, where 10 = 2 (download stage) + 3 (resize stage) + 5 (upload stage). It seems that these 3 threads can work at the same time, which contradicts the above description that only one thread can make forward progress at a time.
Threading in I/O tasks:
Threading is game-changing here because many scripts related to network/data I/O spend the majority of their time waiting for data from a remote source. While a thread is blocked waiting on I/O (or on time.sleep(), as in your example), it releases the GIL, so another thread can make progress; the GIL only serializes the execution of Python bytecode, not the waiting. Because the downloads might not be linked (i.e., scraping separate websites), the processor can download from different data sources in parallel and combine the results at the end. For CPU-intensive work, by contrast, there is little benefit to using the threading module.
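As a rough illustration (my own sketch, not from the book), three threads that mostly wait finish in about the time of the longest wait rather than the sum of the waits, which is exactly the overlap you measured:
import time
from threading import Thread

# Sketch: sleep() stands in for network/disk I/O and releases the GIL
# while waiting, so the waits overlap across threads.
def fake_io(seconds):
    time.sleep(seconds)  # the GIL is released for the duration of the wait

stage_times = [2, 3, 5]  # mirrors the download/resize/upload stage durations

start = time.time()
threads = [Thread(target=fake_io, args=(s,)) for s in stage_times]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Prints roughly 5 seconds (the longest wait), not 10 (the sum), because the
# threads spend almost all their time waiting rather than running bytecode.
print("threaded: {:.1f}s".format(time.time() - start))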
TL;DR: I want to collect the accumulated data from the globals of each worker when the pool is finished processing.
Description of what I think I'm missing
As I'm new to multiprocessing, I don't know all the features that exist. I am looking for a way to make a worker return the value it was initialized with (after manipulating that value a few million times). Then I hope I can collect and merge all these values at the end of the program, when all the 'jobs' are done.
import multiprocessing as mp
from collections import defaultdict, Counter
from customtools import load_regexes #, . . .
import gzip
import nltk

result_dict = None
regexes = None

def create_worker():
    global result_dict
    global regexes
    result_dict = defaultdict(Counter)  # I want to return this at the end
    # these are a bunch of huge regexes
    regexes = load_regexes()
These functions represent the way I load and process data. The data is a big gzip file of articles.
def load_data(semaphore):
    with gzip.open('some10Gbfile') as f:
        for line in f:
            semaphore.acquire()
            yield str(line, 'utf-8')

def worker_job(line):
    global regexes
    global result_dict
    hits = defaultdict(Counter)
    for sent in nltk.sent_tokenize(line[3:]):
        for rename, regex in regexes.items():
            for hit in regex.finditer(sent):
                hits[rename][hit.group(0)] += 1
    # and more and more... results = _filter(_extract(hits))
    # store some data in results_dict here . . .
    return filtered_hits
class ResultEater():
    def __init__(self):
        self.wordscounts = defaultdict(Counter)
        self.filtered = Counter()

    def eat_results(self, filtered_hits):
        for k, v in filtered_hits.items():
            for i, c in v.items():
                self.wordscounts[k][i] += c
This is the main program
if __name__ == '__main__':
    pool = mp.Pool(mp.cpu_count(), initializer=create_worker)
    semaphore = mp.Semaphore(50)
    loader = load_data(semaphore)
    results = ResultEater()
    for intermediate_result in pool.imap_unordered(worker_job, loader, chunksize=10):
        results.eat_results(intermediate_result)
        semaphore.release()
    # results.eat_workers(the_leftover_workers_or_something)
    results.print()
I don't fully understand why returning the data incrementally isn't sufficient, but it seems like you need some sort of finalization function to send back the data, similar to the initialization function you already have. Unfortunately, I don't think this sort of thing exists for mp.Pool, so it will require you to use a couple of mp.Process instances, sending input arguments and returning results with a couple of mp.Queue objects.
On a side note, your use of Semaphore is unnecessary, as the call to the "load_data" iterator always happens on the main process. I have moved that to a separate "producer" process, which puts inputs onto a queue that is already synchronized by default. This lets you have one process gathering inputs, several processes turning inputs into outputs, and leaves the main (parent) process to gather outputs. If the producer generating the inputs is I/O limited by file read speed (very likely), it could also be a thread rather than a process, but in this case the difference is probably minimal.
I have created an example of a custom "Pool" which allows you to return some data at the end of each worker's "life" using the aforementioned producer-consumer scheme. There are print statements to track what is going on in each process, but please also read the comments to follow what is happening and why:
import multiprocessing as mp
from time import sleep
from queue import Empty

class ExitFlag:
    def __init__(self, exit_value=None):
        self.exit_value = exit_value  # optionally pass value along with exit flag

def producer_func(input_q, n_workers):
    for i in range(100):  # 100 lines of some long file
        print(f"put {i}")
        input_q.put(i)  # put each line of the file to the work queue
    print('stopping consumers')
    for i in range(n_workers):
        input_q.put(ExitFlag())  # send shut down signal to each of the workers
    print('producer exiting')

def consumer_func(input_q, output_q, work_func):
    counter = 0
    while True:
        try:
            item = input_q.get(timeout=.1)  # never wait forever on a "get". It's a recipe for deadlock.
        except Empty:
            continue
        print(f"get {item}")
        if isinstance(item, ExitFlag):
            break
        else:
            counter += 1
            output_q.put(work_func(item))
    output_q.put(ExitFlag(exit_value=counter))
    print('consumer exiting')

def work_func(number):
    sleep(.1)  # some heavy nltk work...
    return number * 2

if __name__ == '__main__':
    input_q = mp.Queue(maxsize=10)  # only bother limiting size if you have memory usage constraints
    output_q = mp.Queue(maxsize=10)
    n_workers = mp.cpu_count()
    # generate the input from another process (this could just as easily be a
    # thread, as it seems it will be IO limited anyway)
    producer = mp.Process(target=producer_func, args=(input_q, n_workers))
    producer.start()
    consumers = [mp.Process(target=consumer_func, args=(input_q, output_q, work_func))
                 for _ in range(n_workers)]
    for c in consumers: c.start()
    total = 0
    stop_signals = 0
    exit_values = []
    while True:
        try:
            item = output_q.get(timeout=.1)
        except Empty:
            continue
        if isinstance(item, ExitFlag):
            stop_signals += 1
            if item.exit_value is not None:
                exit_values.append(item.exit_value)  # do something with the return at the end
            if stop_signals >= n_workers:  # stop waiting for more results once all consumers finish
                break
        else:
            total += item  # do something with the incremental return values
    print(total)
    print(exit_values)

    # cleanup
    producer.join()
    print("producer joined")
    for c in consumers: c.join()
    print("consumers joined")
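To connect this back to your original goal, each consumer could put its result_dict into the ExitFlag instead of a plain counter, and the parent could merge those at the end. A rough sketch of that merge step, under the assumption that every exit_value is a defaultdict(Counter) (the names here are mine, not from your code):
from collections import defaultdict, Counter

# Hypothetical merge step, assuming each consumer exits with
# output_q.put(ExitFlag(exit_value=result_dict)).
def merge_exit_values(exit_values):
    """Combine the per-worker defaultdict(Counter) objects into one."""
    merged = defaultdict(Counter)
    for worker_dict in exit_values:
        for regex_name, counts in worker_dict.items():
            merged[regex_name].update(counts)  # Counter.update adds counts
    return merged

# e.g. in the main process, after the while loop:
# combined = merge_exit_values(exit_values)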
I want to build a tool that scans a website for subdomains. I know how to do this, but my function is slow. I looked at the gobuster usage and saw that gobuster can use many concurrent threads; how can I implement this too?
I have searched Google many times, but I can't find anything about this. Can someone give me an example?
gobuster usage: -t Number of concurrent threads (default 10)
My current program:
def subdomaines(url, wordlist):
    checks(url, wordlist)  # just checking for valid args
    num_lines = get_line_count(wordlist)  # number of lines in a file
    count = 0
    for line in open(wordlist).readlines():
        resp = requests.get(url + line)  # resp
        if resp.status_code in (301, 200):
            print(f'Valid - {line}')
        print(f'{count} / {num_lines}')
        count += 1
Note: gobuster is a very fast tool for finding subdomains of websites.
If you're trying to use threading in Python you should start from the basics and learn what's available. Here's a simple example taken from https://pymotw.com/2/threading/:
import threading

def worker():
    """thread worker function"""
    print('Worker')
    return

threads = []
for i in range(5):
    t = threading.Thread(target=worker)
    threads.append(t)
    t.start()
To apply this to your task, a simple approach would be to spawn a thread for each request, something like the code below. Note: if your wordlist is long this might be very expensive. Look into the thread pool libraries in Python for better thread management that you don't need to control explicitly yourself (see the pool-based sketch after the counter example below).
import threading

def subdomains(url, wordlist):
    checks(url, wordlist)  # just checking for valid args
    num_lines = get_line_count(wordlist)  # number of lines in a file
    count = 0
    threads = []
    for line in open(wordlist).readlines():
        t = threading.Thread(target=checkUrl, args=(url, line))
        threads.append(t)
        t.start()
    for thread in threads:  # wait for all threads to complete
        thread.join()

def checkUrl(url, line):
    resp = requests.get(url + line)
    if resp.status_code in (301, 200):
        print(f'Valid - {line}')
To implement the counter you'll need to control shared access between threads to prevent race conditions (two threads accessing the variable at the same time resulting in... problems). A counter object with protected access is provided in the link above:
class Counter(object):
    def __init__(self, start=0):
        self.lock = threading.Lock()
        self.value = start

    def increment(self):
        # Waiting for lock
        self.lock.acquire()
        try:
            # Acquired lock
            self.value = self.value + 1
        finally:
            # Release lock, so other threads can count
            self.lock.release()

# usage:
# in subdomains()...
counter = Counter()
for ...
    t = threading.Thread(target=checkUrl, args=(url, line, counter))

# in checkUrl()...
c.increment()
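As mentioned above, a thread pool keeps the number of threads bounded even for a long wordlist. Here is a hedged sketch using concurrent.futures.ThreadPoolExecutor (my own example; it reuses the Counter class above and assumes the same hypothetical checks/get_line_count-style setup from the question):
from concurrent.futures import ThreadPoolExecutor
import requests

# Sketch only: bounds concurrency to max_workers instead of one thread per line.
def check_url(url, line, counter):
    resp = requests.get(url + line)
    if resp.status_code in (301, 200):
        print(f'Valid - {line}')
    counter.increment()  # the thread-safe Counter defined above

def subdomains_pooled(url, wordlist, max_workers=10):  # 10 mirrors gobuster's -t default
    counter = Counter()
    with open(wordlist) as f:
        lines = [line.strip() for line in f]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for line in lines:
            pool.submit(check_url, url, line, counter)
    # leaving the with-block waits for all submitted tasks to finish
    print(f'{counter.value} / {len(lines)} checked')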
Final note: I have not compiled or tested any of this code.
Python has a threading module.
The simplest way to use a Thread is to instantiate it with a target function and call start() to let it begin working.
import threading

def subdomains(url, wordlist):
    checks(url, wordlist)  # just checking for valid args
    num_lines = get_line_count(wordlist)  # number of lines in a file
    count = 0
    for line in open(wordlist).readlines():
        resp = requests.get(url + line)  # resp
        if resp.status_code in (301, 200):
            print(f'Valid - {line}')
        print(f'{count} / {num_lines}')
        count += 1

threads = []
for i in range(10):
    # the target needs its arguments passed via args=; note that as written
    # every thread scans the whole wordlist, so you would want to split the
    # list between threads to avoid duplicate requests
    t = threading.Thread(target=subdomains, args=(url, wordlist))
    threads.append(t)
    t.start()
I have a scenario where I get a list of jobs to be processed (e.g. a list of web pages to be crawled from the internet). Each job is independent, and the jobs can be processed in any order. Individual jobs may fail or succeed and may have to be handled accordingly (e.g. temporary data for a failed crawl task may have to be deleted and the page recrawled in the next round).
I am trying to implement it using thread-based processing in Python. To mimic the actual task, let's say I have a huge list of integer arrays and the individual job is to compute the Sum or Product of each array. What I am trying to do is use a JobsProcessor object to instantiate threads of JobWorker objects, which perform the actual processing by creating objects of other classes (Sum and Product here). A snippet of the code is shown below.
from queue import Queue, Empty
from threading import Thread
import time

class Product:
    def __init__(self, data):
        self.data = data

    def doOperation(self):
        try:
            product = self.data[0]
            for d in self.data[1:]:
                if d > 100000:
                    raise Exception("Forcefully throwing exception")
                product *= d
                time.sleep(1)
            return product
        except:
            return "product computation failed"

class Sum:
    def __init__(self, data):
        self.data = data

    def doOperation(self):
        try:
            sum = 0
            for d in self.data:
                sum += d
                time.sleep(1)
            return sum
        except:
            return "sum computation failed"

class JobWorker(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                jobitem = self.queue.get_nowait()
                if jobitem is None:
                    break
                jobdata, optype = jobitem
                if optype == 'sum':
                    opobj = Sum(jobdata)
                    jobresult = opobj.doOperation()
                elif optype == 'product':
                    opobj = Product(jobdata)
                    jobresult = opobj.doOperation()
                else:
                    print("Invalid op type")
                    jobresult = 'Failed'
                print(" job result", jobresult)
                self.queue.task_done()
            except Empty:
                break
            except:
                print("Some exception occured")
                # How to pass it up to the main jobs processor?

class JobsProcessor(object):
    def __init__(self, joblist):
        self.joblist = joblist
        self.job_queue = Queue()

    def process_resources(self):
        try:
            for job in self.joblist:
                self.job_queue.put(job)
            for i in range(2):
                jobthread = JobWorker(self.job_queue)
                jobthread.start()
            '''
            Write code here to monitor current status for all running jobs
            '''
            self.job_queue.join()
            '''I want to write code here to track progress status for all jobs
            Some jobs may have failed, not completed and based on that I may
            want to take further action such as retry or flag them'''
            print("Finished Jobs")
        except:
            pass

orgjobList = [([1, 5, 9, 4], 'sum'),
              ([5, 4, 5, 8], 'product'),
              ([100, 45, 678, 999], 'product'),
              ([3743, 34, 44324, 543], 'sum'),
              ([100001, 100002, 9876, 83989], 'product')]

mainprocessor = JobsProcessor(orgjobList)
mainprocessor.process_resources()
I want to add 2 functionalities to this process.
Consolidation: when all the job threads complete, I want to know the status of all the JobWorker objects (e.g. whether they completed successfully or with failure). A failure/exception may occur in the JobWorker object or even in the Sum or Product object. The failure/success status should be propagated back to JobsProcessor, where I want to perform other actions such as reprocess/delete/send_elsewhere based on the returned status.
Monitoring: I also want a Monitor functionality which can continuously check the status of currently running/completed jobs and perform the requisite actions, such as deleting immediately rather than waiting until the end for Consolidation.
Please advise how I can add the above functionalities, and whether only one of them would suffice for cases such as crawling pages. Any other suggestions are also welcome.
You can add both functionalities to your code in either of two ways:
Using global variables (simplest approach)
Using getProgress and getStatus methods in your class (elegant approach)
You can create two threads: one thread does the actual work and updates the progress variable, while the other monitors it (a sketch of such a monitor follows the code below).
For the second approach, you can set a few status variables in __init__, like the following.
def __init__(self):
    self.progress = 0
    self.success = True
    self.isDone = False
    self.error = "No Error Occurred"
Then you can include the logic in your code like the following:
def actualWork(self):
    self.isDone = False
    try:
        for i in range(1000):
            self.progress = i
            time.sleep(0.01)
        self.isDone = True
    except Exception as e:
        self.success = False
        self.error = str(e)

def getProgress(self):
    return self.progress

def getError(self):
    return self.error
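For the monitoring part of your question, here is a hedged sketch (my own, not from the answer above) of how JobsProcessor could run a monitor thread that polls workers exposing these attributes; the worker list and attribute names are assumptions layered on top of your JobWorker:
import time
from threading import Thread

# Sketch only: assumes each worker object exposes isDone, success, error
# and getProgress() as in the snippet above.
def monitor(workers, poll_interval=1.0):
    """Poll worker status until every worker reports isDone."""
    while not all(w.isDone for w in workers):
        for w in workers:
            print("progress:", w.getProgress(), "failed:", not w.success)
            # react immediately here, e.g. delete temp data for a failed job
        time.sleep(poll_interval)

# In JobsProcessor.process_resources(), roughly:
#   workers = [JobWorker(self.job_queue) for _ in range(2)]
#   for w in workers: w.start()
#   Thread(target=monitor, args=(workers,), daemon=True).start()
#   self.job_queue.join()
#   failed = [w for w in workers if not w.success]  # consolidation step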
I have a project that requires a bunch of large matrices, which are stored in ~200 MB files, to be cross-correlated (i.e. FFT * conj(FFT)) with each other. The number of files is such that I can't just load them all up and then do my processing. On the other hand, reading in each file as I need it is slower than I'd like.
What I have so far is something like:
result = 0
for i in xrange(N_files):
    f1 = file_reader(file_list[i])
    ############################################################################
    # here I want to have file_reader go start reading the next file I'll need #
    ############################################################################
    in_place_processing(f1)
    for j in xrange(i+1, N_files):
        f2 = file_reader(file_list[j])
        ##################################################################
        # here I want to have file_reader go start reading the next file #
        ##################################################################
        in_place_processing(f2)
        result += processing_function(f1, f2)
So basically, I just want to have two threads that will each read a file, give it to me when I ask for it (or as soon as it's done after I ask for it), and then go start reading the next file for when I ask for it. The object the file_reader returns is rather large and complicated, so I'm not sure if multiprocessing is the way to go here...
I've read about threading and queues but can't seem to figure out the part where I ask the thread to go read the file and can proceed with the program while it does. I don't want the threads to simply go about their business in the background -- am I missing a detail here, or is threading not the way to go?
Below is an example of using the multiprocessing module that will spawn off child processes to call your file_reader method and queue up their results. The queue should block when full, so you can control the number of read-aheads you'd like to perform with the QUEUE_SIZE constant.
This utilizes a standard producer/consumer model of multiprocess communication, with the child processes acting as producers and the main thread being the consumer. The join method call in the class destructor ensures the child process resources are cleaned up properly. There are some print statements interspersed for demonstration purposes.
Additionally, I added the ability for the QueuedFileReader class to offload work to a worker thread, or to run in the main thread rather than using a child process, for comparison. This is done by setting the mode parameter at class initialization to MODE_THREADS or MODE_SYNCHRONOUS, respectively.
import multiprocessing as mp
import Queue
import threading
import time

QUEUE_SIZE = 2  # buffer size of queue

## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]

def file_reader(filename):
    time.sleep(.1)
    result = (filename, 'processed')
    return result

def in_place_processing(f):
    time.sleep(.2)

def processing_function(f1, f2):
    print f1, f2
    return id(f1) & id(f2)

MODE_SYNCHRONOUS = 0  # file_reader called in main thread synchronously
MODE_THREADS = 1      # file_reader executed in worker thread
MODE_PROCESS = 2      # file_reader executed in child process

##################################################
## Class to encapsulate multiprocessing objects.
class QueuedFileReader():
    def __init__(self, idlist, mode=MODE_PROCESS):
        self.mode = mode
        self.idlist = idlist
        if mode == MODE_PROCESS:
            self.queue = mp.Queue(QUEUE_SIZE)
            self.process = mp.Process(target=QueuedFileReader.worker,
                                      args=(self.queue, idlist))
            self.process.start()
        elif mode == MODE_THREADS:
            self.queue = Queue.Queue(QUEUE_SIZE)
            self.thread = threading.Thread(target=QueuedFileReader.worker,
                                           args=(self.queue, idlist))
            self.thread.start()

    @staticmethod
    def worker(queue, idlist):
        for i in idlist:
            queue.put((i, file_reader(file_list[i])))
            print id(queue), 'queued', file_list[i]
        queue.put('done')

    def __iter__(self):
        if self.mode == MODE_SYNCHRONOUS:
            self.index = 0
        return self

    def next(self):
        if self.mode == MODE_SYNCHRONOUS:
            if self.index == len(self.idlist): raise StopIteration
            q = (self.idlist[self.index],
                 file_reader(file_list[self.idlist[self.index]]))
            self.index += 1
        else:
            q = self.queue.get()
            if q == 'done': raise StopIteration
        return q

    def __del__(self):
        if self.mode == MODE_PROCESS:
            self.process.join()
        elif self.mode == MODE_THREADS:
            self.thread.join()

#mode = MODE_PROCESS
mode = MODE_THREADS
#mode = MODE_SYNCHRONOUS

result = 0
for i, f1 in QueuedFileReader(range(N_files), mode):
    in_place_processing(f1)
    for j, f2 in QueuedFileReader(range(i+1, N_files), mode):
        in_place_processing(f2)
        result += processing_function(f1, f2)
If your intermediate values are too large to pass through the Queue, you can execute each iteration of the outer loop in its own process. A handy way to do that would be using the Pool class in multiprocessing as in the example below.
import multiprocessing as mp
import time

## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]

def file_reader(filename):
    time.sleep(.1)
    result = (filename, 'processed')
    return result

def in_place_processing(f):
    time.sleep(.2)

def processing_function(f1, f2):
    print f1, f2
    return id(f1) & id(f2)

def file_task(file_index):
    print file_index
    f1 = file_reader(file_list[file_index])
    in_place_processing(f1)
    task_result = 0
    for j in range(file_index+1, N_files):
        f2 = file_reader(file_list[j])
        in_place_processing(f2)
        task_result += processing_function(f1, f2)
    return task_result

pool = mp.Pool(processes=None)  # processes defaults to mp.cpu_count()
result = 0
for file_result in pool.map(file_task, range(N_files)):
    result += file_result
print 'result', result

#or simply
#result = sum(pool.map(file_task, range(N_files)))