Threads not updating a variable even though a lock is added - Python

I am trying to update a boolean once all emails have been checked and added to a queue, but the boolean is not updating no matter what I try.
My system uses threads to insert items into a queue (reading each email takes a long time because the email content has to be fetched first, so I'm using threads to make it faster).
The function below should run my function that gets the email content, and then another function that inserts it into the queue for this IMAP account.
# X threads to insert all emails after getting their contents
# we need a way to tell that it has inserted all items
def setItemsToRun(self, doneAddingEmails, imap, imap_server, currQueue, q):
    self.s_pint(f'[*] Running setItemsToRun for [{imap}]')
    threads = []
    # Make 5 threads to insert the data and read over all
    for i in range(5):
        t = Thread(daemon=False, target=self.insertEmailMsgesIntoQueue, args=[doneAddingEmails, imap, imap_server, currQueue, q])
        threads.append(t)
    for t in threads:
        self.s_pint(f'[*] Thread started for setItemsToRun')
        t.start()
    for t in threads:
        t.join()
    self.s_pint(f'[*] After the setItemsToRun threads loop')
    # We need to return true ONLY when ALL email items have been put inside
    while True:
        with theLock:
            #self.s_pint(f'[*] doneAddingEmails: {doneAddingEmails}')
            # Waiting for our threads to mark this as done when queue is empty and no more items to be added
            if doneAddingEmails == True:
                self.s_pint(f'[*] Found the true value!')
                break
    return True
This is the code for getting the email content from the queue:
# Get id from the email queue for this imap and get the real email content for it
def insertEmailMsgesIntoQueue(self, doneAddingEmails, imap, imap_server, currQueue, q):
    self.s_pint(f'[*] Started insertEmailMsgesIntoQueue [{imap}]')
    doneAddingEmails = True
    # While all emails have not been added, wait
    while True:
        with Lock():
            # Update the doneAddingEmails here for a test (IT'S NOT UPDATING THE VALUE!)
            self.s_pint(f'[*] Marked it as done')
            doneAddingEmails = True
            return
        if not q or len(q) == 0:
            self.s_pint(f'[*] Nothing left in email queue for {imap}')
            # Update this variable so our thread knows its been updated
            doneAddingEmails = True
            return
        try:
            emailId = q.pop()
        except Exception as e:
            self.s_pint(f'[*] No item found yet..')
            pass
        if emailId:
            try:
                result = self.getEmailFromID(imap_server, emailId, currQueue)
            except Exception as e:
                self.s_pint(f'[*] Failed to run getEmailFromID: {e}')
                continue
            self.s_pint(f'[*] Result for getEmailFromID for {imap}: {result}')
        else:
            continue
    return
And this is the function to insert into the specific queue:
# Gets the actual email content from the id
def getEmailFromID(self, imap_server, emailId, currQueue):
    #with email_grab_msg_lock:
    if emailId:
        #self.s_pint(f'[*] Email id inside getEmailFromID: {emailId}')
        status, msg = imap_server.uid('fetch', emailId, "(RFC822)")
        if msg:
            #self.s_pint(msg)
            self.s_pint(f'[*] Got message for email id: {emailId}')
            # Put item into the real email messages queue
            currQueue.put(msg)
            return True
        else:
            return False
    else:
        return False
I have tried adding a Lock() so that only one thread updates it at a time, but no luck.
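For what it's worth, assigning doneAddingEmails = True inside a thread only rebinds the function's local parameter, so the caller never sees the change, no matter which lock is held. A shared mutable object is needed for the flag to be visible across threads; below is a minimal, hypothetical sketch of that pattern using threading.Event (not the code above):

import threading
from threading import Thread

done_adding = threading.Event()   # shared, mutable flag

def worker(done_flag):
    # ... drain the email id queue here ...
    done_flag.set()               # now visible to every other thread

t = Thread(target=worker, args=(done_adding,))
t.start()
done_adding.wait()                # blocks until some worker calls set()
print('[*] Found the true value!')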

Related

Main thread hanging at queue join because q.get() on empty queue does not return

I have a request manager that builds a queue and starts x worker threads (x currently == 1).
Each thread loops, getting elements from the queue and appending the results to a shared list.
If the queue is exhausted, the queue.Empty exception is caught, the current job is marked as done and the thread should exit. This does work.
This block at the end of run(), however, seems to break things. The queue has an arbitrary length, and it might occur that the queue is longer than the number of results actually fetchable. In order to exit all threads early, a thread checks whether the result it got has len == 0. If so, the thread clears the queue of all items left, marks itself as done and exits.
if len(request_result) == 0:
    with self.q.mutex:
        self.q.queue.clear()
    self.q.task_done()
    return
My assumption was that every thread would then finish its current job and exit.
However, the execution of the main thread hangs at q.join() and I can't debug why. From the debugger it looks like the worker thread is not terminating, but that's just guessing.
I've read: Threading queue hangs in Python
but that does not solve the problem. I did set q.unfinished_tasks to 0 manually, but that is not thread safe and will crash the program when a thread calls task_done() after another thread has just set q.unfinished_tasks to 0.
class RequestManager:
    def __init__(self, config=None):
        self.config = config

    def request_all_heroes(self):
        q = queue.Queue()
        result_list = []
        # todo: get range max from highest hero ID.
        for skip in [x * 100 for x in range(1, 3)]:
            q.put_nowait(skip)
        for _ in range(int(self.config["meta"]["number_of_threads"])):
            RequestWorker(q=q,
                          config=self.config,
                          query_name='all_heroes',
                          shared_result_list=result_list).start()
        q.join()
        return [Hero(item) for sublist in result_list for item in sublist]


class RequestWorker(threading.Thread):
    def __init__(self,
                 q=None,
                 config=None,
                 query_name="",
                 shared_result_list=None, *args, **kwargs):
        self.q = q
        self.config = config
        self.query_file_path = self.config["files"][query_name]
        self.shared_result_list = shared_result_list
        super().__init__(*args, **kwargs)

    def run(self):
        keep_running = True
        while keep_running:
            try:
                skip_number = self.q.get()
            except queue.Empty:
                self.q.task_done()
                return
            sr = SpecificRequest(config=self.config, skip=skip_number, query_file_path=self.query_file_path)
            request_result = sr.do_specific_request()
            if len(request_result) == 0:
                with self.q.mutex:
                    self.q.queue.clear()
                self.q.task_done()
                return
            self.shared_result_list.append(request_result)
            self.q.task_done()
EDIT 1
if not self.q.empty():
    skip_number = self.q.get()
else:
    return
This works; unfortunately it is still wrong, because get is called only after checking whether the queue is empty. This will cause problems at some point, because one thread can check, see an element in the queue, and another thread can snatch that last element in the meantime. Unlikely, but possible.
This question is now about why self.q.get() does not return.
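For reference, the check-then-get race described in the edit is usually avoided with a non-blocking get that handles queue.Empty; the same loop also drains leftover items without bypassing the task_done() bookkeeping that join() relies on. A small sketch (the helper name is made up):

import queue

def drain_remaining(q):
    # Empty the queue thread-safely: get_nowait() raises queue.Empty
    # instead of blocking, and every removed item is still marked done.
    while True:
        try:
            q.get_nowait()
        except queue.Empty:
            return
        q.task_done()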

concurrent requests with queue and thread

I am trying to make concurrent API calls with Python.
I based my code on the solution (first answer) presented in this thread: What is the fastest way to send 100,000 HTTP requests in Python?
Currently, my code is broken.
I have a main function which creates the queue, populates it, initiates the threads, starts them, and joins the queue.
I also have a target function which should make the get requests to the API.
The difficulty I am experiencing right now is that the target function does not do the necessary work.
The target is called, but it acts as if the queue were empty.
The first print is executed ("inside scraper worker"), while the second ("inside scraper worker - queue NOT empty") is not.
def main_scraper(flights):
    print("main scraper was called, got: ")
    print(flights)
    data = []
    q = Queue()
    map(q.put, flights)
    for i in range(0, 5):
        t = Thread(target = scraper_worker, args = (q, data))
        t.daemon = True
        t.start()
    q.join()
    return data

def scraper_worker(q, data):
    print("inside scraper worker")
    while not q.empty():
        print("inside scraper worker, queue not empty")
        f = q.get()
        url = kiwi_url(f)
        response = requests.get(url)
        response_data = response.json()
        results = parseResults(response_data)
        q.task_done()
        print("task done. results:")
        print(results)
        #f._price = results[0]["price"]
        #f._url = results[0]["deep_link"]
        data.append(results)
    return data
I hope this is enough information for you to help me out.
Otherwise, I will rewrite the code so that it can be run by anyone.
I would guess that the flights are not being put on the queue. map(q.put, flights) is lazy and is never consumed, so it is as if it never happened. I would just iterate.
def main_scraper(flights):
    print("main scraper was called, got: ")
    print(flights)
    data = []
    q = Queue()
    for flight in flights:
        q.put(flight)
    for i in range(0, 5):
        t = Thread(target = scraper_worker, args = (q, data))
        t.daemon = True
        t.start()
    q.join()
    return data

Creating custom Futures objects in Python

I have a simple set of objects for managing a background process using the Actor model. In this case I'm concerned with only a single actor. However, it is important that the actor maintains a persistent state between receiving messages.
The objects work by appending messages to a queue in the main thread. Then the main thread can execute as it pleases. Every once in a while it checks to see if anything new is on the results queue. When this happens it knows the actor has completed the task.
I want to know if this can be implemented in a cleaner way using Future objects. My current implementation is as follows:
import multiprocessing
import time
import collections


class Client(object):
    """
    Object used in the main thread to communicate with background actors
    """
    def __init__(client):
        client.manager = None
        client.start()

    def __del__(client):
        if client.manager and client.manager.is_alive():
            client.get(StopIteration)

    def start(client):
        client.task_queue = multiprocessing.JoinableQueue()
        client.result_queue = multiprocessing.Queue()
        client.result_history = collections.deque(maxlen=1000)
        client.manager = Manager(client.task_queue, client.result_queue)
        client.manager.start()

    def post(client, payload):
        client.task_queue.put(payload)

    def get(client, payload):
        # Exhaust any existing results
        list(client.results())
        # Post the command
        client.post(payload)
        # Wait for a response
        result = client.wait_for_result()
        return result

    def wait_for_result(client):
        wait = 0
        while True:
            for result in client.results():
                return result
            time.sleep(wait)
            wait = max(1, wait + .01)

    def results(client):
        """ Look at results put on the result_queue """
        while not client.result_queue.empty():
            item = client.result_queue.get()
            client.result_history.append(item)
            yield item


class Manager(multiprocessing.Process):
    """
    Manager manages a single actor.
    A manager sends messages to an actor and appends a response when it is done.
    """
    def __init__(self, task_queue, result_queue):
        super(Manager, self).__init__()
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        """ main loop """
        terminate = False
        # Create Actor in separate process and send messages to it
        actor = Actor()
        while not terminate:
            message = self.task_queue.get()
            print('Sending message={} to actor'.format(message))
            try:
                if message is StopIteration:
                    content = 'shutdown'
                    terminate = True
                else:
                    content = actor.handle(message)
            except Exception as ex:
                print('Error handling message')
                status = 'error'
                content = repr(ex)
            else:
                status = 'success'
                print('Actor finished handling message={}'.format(message))
            # Send back result
            response = {
                'status': status,
                'content': content
            }
            self.task_queue.task_done()
            self.result_queue.put(response)
        print('Manager is shutting down')


class Actor(object):
    """
    An actor is given messages from its manager and performs actions in a
    single thread. Its state is private and threadsafe.
    """
    def __init__(actor):
        actor.state = {}

    def handle(actor, message):
        if not isinstance(message, dict):
            raise ValueError('Commands must be passed in a message dict')
        message = message.copy()
        action = message.pop('action', None)
        if action is None:
            raise ValueError('message must have an action item')
        if action == 'hello world':
            content = 'hello world'
            return content
        elif action == 'debug':
            return actor
        elif action == 'start':
            actor.state['a'] = 3
            return 'started'
        elif action == 'add':
            for i in range(10000000):
                actor.state['a'] += 1
            return 'added', actor.state['a']
        else:
            raise ValueError('Unknown action=%r' % (action,))


def test():
    print('Starting Test')
    client = Client()
    print('About to send messages')
    # Get sends a message and then blocks until the response is returned.
    print(client.get({'action': 'hello world'}))
    print(client.get({'action': 'start'}))
    print(client.get({'action': 'add'}))
    print('Test completed')


if __name__ == '__main__':
    test()
I would like to modify this code to use Future objects. Whenever the client is about to send a message, is it possible to create a Future object and then send that over the multiprocessing queue? Then the manager could execute the actor's function and modify the state of the Future object instead of appending a result to the result_queue.
This seems like it would offer a cleaner way to associate results with messages sent to the actor. It would also remove the need for the get and results methods I have in the first example.
Intuitively, I want it to look something like this:
from concurrent import futures
import multiprocessing


class Client(object):
    """
    Object used in the main thread to communicate with background actors
    """
    def __init__(client):
        client.manager = None
        client.start()

    def __del__(client):
        if client.manager and client.manager.is_alive():
            f = client.post(StopIteration)

    def start(client):
        client.task_queue = multiprocessing.JoinableQueue()
        client.manager = Manager(client.task_queue)
        client.manager.start()

    def post(client, payload):
        f = futures.Future()
        client.task_queue.put((f, payload))
        return f


class Manager(multiprocessing.Process):
    """
    Manager manages a single actor.
    """
    def __init__(self, task_queue):
        super(Manager, self).__init__()
        self.task_queue = task_queue

    def run(self):
        """ main loop """
        terminate = False
        # Create Actor in separate process and send messages to it
        actor = Actor()
        while not terminate:
            f, message = self.task_queue.get()
            f.set_running_or_notify_cancel()
            print('Sending message={} to actor'.format(message))
            try:
                if message is StopIteration:
                    content = 'shutdown'
                    terminate = True
                else:
                    content = actor.handle(message)
            except Exception as ex:
                print('Error handling message')
                status = 'error'
                content = repr(ex)
            else:
                status = 'success'
                print('Actor finished handling message={}'.format(message))
            # Send back result
            response = {
                'status': status,
                'content': content
            }
            self.task_queue.task_done()
            f.set_result(response)
        print('Manager is shutting down')


class Actor(object):
    """
    An actor is given messages from its manager and performs actions in a
    single thread. Its state is private and threadsafe.
    """
    def __init__(actor):
        actor.state = {}

    def handle(actor, message):
        if not isinstance(message, dict):
            raise ValueError('Commands must be passed in a message dict')
        message = message.copy()
        action = message.pop('action', None)
        if action is None:
            raise ValueError('message must have an action item')
        if action == 'hello world':
            content = 'hello world'
            return content
        elif action == 'debug':
            return actor
        elif action == 'start':
            actor.state['a'] = 3
            return 'started'
        elif action == 'add':
            for i in range(10000000):
                actor.state['a'] += 1
            return 'added', actor.state['a']
        else:
            raise ValueError('Unknown action=%r' % (action,))


def test():
    print('Starting Test')
    client = Client()
    print('About to send messages')
    f1 = client.post({'action': 'hello world'})
    print(f1.result())
    f2 = client.post({'action': 'start'})
    print(f2.result())
    f3 = client.post({'action': 'add'})
    print(f3.result())
    print('Test completed')


if __name__ == '__main__':
    test()
However, this obviously doesn't execute correctly. I believe I need some sort of process pool manager to create the futures for me (because I'm calling methods that are documented as meant to be called only by the pool manager), but I'm not quite sure how to go about doing that. I've used futures before to map singleton worker functions, but I've never managed an external process with state before.
Can someone help me out with this? Perhaps there is an even easier way to implement this with Futures?
So, I went ahead and just made a library to do this:
https://github.com/Erotemic/futures_actors
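For anyone looking for the general shape of that approach rather than the library itself: Future objects never need to cross the process boundary. The client can keep them in a dict keyed by a message id, send only (id, payload) to the worker process, and let a small listener thread resolve each Future when its tagged result comes back. A rough, self-contained sketch (all names are made up for illustration; this is not the library's API):

import itertools
import multiprocessing
import threading
from concurrent import futures

def _worker(task_q, result_q):
    # Hypothetical stand-in for the Manager/Actor pair: it just echoes
    # each payload back, tagged with the id it arrived with.
    while True:
        msg_id, payload = task_q.get()
        result_q.put((msg_id, {'status': 'success', 'content': payload}))

class FutureClient(object):
    def __init__(self):
        self._ids = itertools.count()
        self._pending = {}  # msg_id -> Future; lives only in the parent
        self.task_q = multiprocessing.Queue()
        self.result_q = multiprocessing.Queue()
        multiprocessing.Process(target=_worker,
                                args=(self.task_q, self.result_q),
                                daemon=True).start()
        threading.Thread(target=self._collect, daemon=True).start()

    def post(self, payload):
        f = futures.Future()
        msg_id = next(self._ids)
        self._pending[msg_id] = f          # register before sending
        self.task_q.put((msg_id, payload))
        return f

    def _collect(self):
        # Listener thread in the parent: match tagged results to Futures.
        while True:
            msg_id, response = self.result_q.get()
            self._pending.pop(msg_id).set_result(response)

if __name__ == '__main__':
    client = FutureClient()
    f = client.post({'action': 'hello world'})
    print(f.result())   # blocks until the listener resolves the Future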

Python multiprocessing: Parallel Pipeline implementation

I am trying to create a pipeline, but I have bad exit issues (zombies) and performance problems. I have created this generic class:
class Generator(Process):
    '''
    <function>: function to call. None value means that the current class will
        be used as a template for another class, with <function> being defined
        there
    <input_queues> : Queue or list of Queue objects , which refer to the input
        to <function>.
    <output_queues> : Queue or list of Queue objects , which are used to pass
        output
    <sema_to_acquire> : Condition or list of Condition objects, which are
        blocking generation while not notified
    <sema_to_release> : Condition or list of Condition objects, which will be
        notified after <function> is called
    '''
    def __init__(self, function=None, input_queues=None, output_queues=None,
                 sema_to_acquire=None, sema_to_release=None):
        Process.__init__(self)
        self.input_queues = input_queues
        self.output_queues = output_queues
        self.sema_to_acquire = sema_to_acquire
        self.sema_to_release = sema_to_release
        if function is not None:
            self.function = function

    def run(self):
        if self.sema_to_release is not None:
            try:
                self.sema_to_release.release()
            except AttributeError:
                [sema.release() for sema in self.sema_to_release]
        while True:
            if self.sema_to_acquire is not None:
                try:
                    self.sema_to_acquire.acquire()
                except AttributeError:
                    [sema.acquire() for sema in self.sema_to_acquire]
            if self.input_queues is not None:
                try:
                    data = self.input_queues.get()
                except AttributeError:
                    data = [queue.get() for queue in self.input_queues]
                isiterable = True
                try:
                    iter(data)
                    res = self.function(*tuple(data))
                except TypeError, te:
                    res = self.function(data)
            else:
                res = self.function()
            if self.output_queues is not None:
                try:
                    if self.output_queues.full():
                        self.output_queues.get(res)
                    self.output_queues.put(res)
                except AttributeError:
                    [queue.put(res) for queue in self.output_queues]
            if self.sema_to_release is not None:
                if self.sema_to_release is not None:
                    try:
                        self.sema_to_release.release()
                    except AttributeError:
                        [sema.release() for sema in self.sema_to_release]
to simulate a worker inside a pipeline. The Generator is meant to run an infinite while loop in which a function is executed using input from n queues and the result is written to m queues. Some semaphores need to be acquired by a process before one iteration happens, and when the iteration finishes some other semaphores are released. So, for processes that need to run in parallel and produce input for one another, I pass 'crossed' semaphores as arguments, in order to force them to perform single iterations together. For processes which do not need to run in parallel I do not use any conditions. An example (which I actually use, if anyone ignores the input functions) is the following:
import time
from multiprocess import Lock

print_lock = Lock()
_t_ = 0.5

def func0(data):
    time.sleep(_t_)
    print_lock.acquire()
    print 'func0 sends', data
    print_lock.release()
    return data

def func1(data):
    time.sleep(_t_)
    print_lock.acquire()
    print 'func1 receives and sends', data
    print_lock.release()
    return data

def func2(data):
    time.sleep(_t_)
    print_lock.acquire()
    print 'func2 receives and sends', data
    print_lock.release()
    return data

def func3(*data):
    print_lock.acquire()
    print 'func3 receives', data
    print_lock.release()

run_svm = Semaphore()
run_rf = Semaphore()
inp_rf = Queue()
inp_svm = Queue()
out_rf = Queue()
out_svm = Queue()
kin_stream = Queue()
res_mixed = Queue()

streamproc = Generator(func0,
                       input_queues=kin_stream,
                       output_queues=[inp_rf,
                                      inp_svm])
streamproc.daemon = True
streamproc.start()

svm_class = Generator(func1,
                      input_queues=inp_svm,
                      output_queues=out_svm,
                      sema_to_acquire=run_svm,
                      sema_to_release=run_rf)
svm_class.daemon = True
svm_class.start()

rf_class = Generator(func2,
                     input_queues=inp_rf,
                     output_queues=out_rf,
                     sema_to_acquire=run_rf,
                     sema_to_release=run_svm)
rf_class.daemon = True
rf_class.start()

mixed_class = Generator(func3,
                        input_queues=[out_rf, out_svm])
mixed_class.daemon = True
mixed_class.start()

count = 1
while True:
    kin_stream.put([count])
    count += 1
    time.sleep(1)

streamproc.join()
svm_class.join()
rf_class.join()
mixed_class.join()
This example gives:
func0 sends 1
func2 receives and sends 1
func1 receives and sends 1
func3 receives (1, 1)
func0 sends 2
func2 receives and sends 2
func1 receives and sends 2
func3 receives (2, 2)
func0 sends 3
func2 receives and sends 3
func1 receives and sends 3
func3 receives (3, 3)
...
All good. However, if I try to kill main, the other subprocesses are not guaranteed to terminate: the terminal might freeze, or the Python interpreter might remain running in the background (probably zombies), and I have no clue why this is happening, as I have set the corresponding daemon flags to True.
Does anyone have a better idea of implementing this type of pipeline or can suggest a solution to this evil problem? Thank you all.
EDIT
Fixed the testing. The zombies still exist, however.
I was able to overcome this problem by introducing a termination queue as an additional argument to the given class and setting up a signal handler for the SIGINT interrupt, in order to stop the pipeline execution. I do not know if this is the most elegant way to get it working, but it works. Also, the way the signal handler is set matters: it must be set before process.start() for some reason; if anyone knows why, please comment. Furthermore, the signal handler is inherited by the subprocesses, so I have to put the join inside a try: ... except AssertionError: pass pattern, otherwise it throws an error (again, if someone knows how to bypass this, please elaborate). Anyway, it works.
SOURCE CODE
class Generator(Process):
    '''
    <term_queue>: Queue to write termination events, must be same for all
        processes spawned
    <function>: function to call. None value means that the current class will
        be used as a template for another class, with <function> being defined
        there
    <input_queues> : Queue or list of Queue objects , which refer to the input
        to <function>.
    <output_queues> : Queue or list of Queue objects , which are used to pass
        output
    <sema_to_acquire> : Semaphore or list of Semaphore objects, which are
        blocking function execution
    <sema_to_release> : Semaphore or list of Semaphore objects, which will be
        released after <function> is called
    '''
    def __init__(self, term_queue,
                 function=None, input_queues=None, output_queues=None,
                 sema_to_acquire=None, sema_to_release=None):
        Process.__init__(self)
        self.term_queue = term_queue
        self.input_queues = input_queues
        self.output_queues = output_queues
        self.sema_to_acquire = sema_to_acquire
        self.sema_to_release = sema_to_release
        if function is not None:
            self.function = function

    def run(self):
        if self.sema_to_release is not None:
            try:
                self.sema_to_release.release()
            except AttributeError:
                deb = [sema.release() for sema in self.sema_to_release]
        while True:
            if not self.term_queue.empty():
                self.term_queue.put((self.name, 0))
                break
            try:
                if self.sema_to_acquire is not None:
                    try:
                        self.sema_to_acquire.acquire()
                    except AttributeError:
                        deb = [sema.acquire() for sema in self.sema_to_acquire]
                if self.input_queues is not None:
                    try:
                        data = self.input_queues.get()
                    except AttributeError:
                        data = tuple([queue.get()
                                      for queue in self.input_queues])
                    res = self.function(data)
                else:
                    res = self.function()
                if self.output_queues is not None:
                    try:
                        if self.output_queues.full():
                            self.output_queues.get(res)
                        self.output_queues.put(res)
                    except AttributeError:
                        deb = [queue.put(res) for queue in self.output_queues]
                if self.sema_to_release is not None:
                    if self.sema_to_release is not None:
                        try:
                            self.sema_to_release.release()
                        except AttributeError:
                            deb = [sema.release() for sema in self.sema_to_release]
            except Exception as exc:
                self.term_queue.put((self.name, exc))
                break


def signal_handler(sig, frame, term_queue, processes):
    '''
    <term_queue> is the queue to write termination of the __main__
    <processes> is a dictionary holding all running processes
    '''
    term_queue.put((__name__, 'SIGINT'))
    try:
        [processes[key].join() for key in processes]
    except AssertionError:
        pass
    sys.exit(0)


term_queue = Queue()
'''
initialize some Generators and add them to the <processes> dictionary
'''
signal.signal(signal.SIGINT, lambda sig, frame: signal_handler(sig, frame,
                                                               term_queue, processes))
[processes[key].start() for key in processes]
while True:
    if not term_queue.empty():
        [processes[key].join() for key in processes]
        break
and the example is changed accordingly (comment if you want me to add it)
I have had to work on this issue as well, and indeed, passing some communication pipe or queue to the processes seems to be the easiest way to tell them to terminate.
However, the termination code can take advantage of a finally: block in the main process; it will take care of any event, including signals.
If your processes are supposed to terminate at the same time as an object, you might also want to play with weakref.finalize, but it can be tricky.
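A minimal sketch of the finally: idea, assuming a term_queue and a processes dict set up roughly as in the answer above (both names are placeholders): the shutdown notice is posted in the finally: block, so it runs on normal exit, on exceptions, and when a SIGINT propagates as KeyboardInterrupt.

from multiprocessing import Queue

term_queue = Queue()
processes = {}   # name -> Generator instance, filled in elsewhere

try:
    for proc in processes.values():
        proc.start()
    # ... feed the pipeline here ...
finally:
    # Always reached: workers notice the event on term_queue and leave their loops
    term_queue.put(('__main__', 'terminate'))
    for proc in processes.values():
        proc.join()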

Wait for Object initiated to return list and then continue

I have a master Python script that creates two objects:
obj1 = xmlobj()
list1, list2, list3 = obj1.parsexml("xmlfile")
# parsexml returns three lists
obj2 = htmlobj()
str1 = obj2.createhtmltable(list1, list2, list3)
# creates an html table
But when I run the script, the master script does not wait for obj1.parsexml to return its values; it immediately goes on to the next statement and creates obj2, and thus createhtmltable is not even executed properly, as list1, list2 and list3 have no return values in them. On its own, parsexml works fine without any error.
How can I fix this? Thank you.
Here is an example:
import Queue      #python 3.x => import queue
import urllib2    #python 3.x => import urllib.request
import threading

urls = [
    "http://www.apple.com",
    "http://www.yahoo.com",
    "http://www.google.com",
    "http://www.espn.com",
]

def do_stuff():
    while True:
        url = task_q.get()
        if url == "NO_MORE_DATA":
            break
        data = urllib2.urlopen(url).read()[:200]
        results_q.put(data)

#Create the Queues:
task_q = Queue.Queue()
results_q = Queue.Queue()

num_worker_threads = 2
for i in range(num_worker_threads):
    t = threading.Thread(target=do_stuff)
    t.daemon = True  #daemon threads are killed when the main thread finishes
    t.start()        #All threads block because the task_q is empty.

#Add the tasks/urls to the task_q:
for url in urls:
    task_q.put(url)

#Like frantic dogs, all the threads are chomping on the task_q...

#Add a stop for each thread, so that after all the urls
#have been removed from the task_q, the threads will terminate:
for i in range(num_worker_threads):
    task_q.put("NO_MORE_DATA")

results_count = len(urls)  #Need to know how many results are expected in order to know when to stop reading from the results_q
while True:
    result = results_q.get()
    #process the result here
    print result
    print '-' * 10
    results_count -= 1
    if results_count == 0:  #then all results have been processed, so you should stop trying to read from the results_q
        break
Queue.join()
Blocks until all items in the queue have been gotten and processed. The count of unfinished tasks goes up whenever an item is added to the queue. The count goes down whenever a consumer thread calls task_done() to indicate that the item was retrieved and all work on it is complete. When the count of unfinished tasks drops to zero, join() unblocks.
https://docs.python.org/2.7/library/queue.html#module-Queue
Yeah, but who wants to wait until all the results have been added to the results queue before starting to process them? Why not start processing the first result as soon as it is available?
