Python multiprocessing queue with infinite loop

I create 3 processes. I want the function write1 to write the values 'A', 'B', 'C' to queue1, the function read1 to read values from queue1 and put them into queue2, and, at the same time, the function read2 to read values from queue2. But the values B and C can't be read from queue2 in time before the processes finish.
from multiprocessing import Process, Queue, Manager, Pool, Lock
import os, time, random

# write data to queue 1
def write1(q1, lock):
    lock.acquire()
    for value in ['A', 'B', 'C']:
        print('Put %s to queue111...%s' % (value, str(os.getpid())))
        q1.put(value)
        time.sleep(1)
    lock.release()

# read data from queue 1 and write it to queue 2
def read1(q1, q2, lock):
    lock.acquire()
    while True:
        time.sleep(1)
        value = q1.get()
        # if value is None: break
        print('Get %s from queue111.%s' % (value, str(os.getpid())))
        q2.put(value)
        print('Put %s to queue222...%s' % (value, str(os.getpid())))
    lock.release()

def read2(q2, lock):
    lock.acquire()
    while True:
        # if not q2.empty() or not q1.empty():
        time.sleep(2)
        value = q2.get(True)
        print('Get %s from queue222.%s' % (value, os.getpid()))
    lock.release()

if __name__ == '__main__':
    manager = Manager()
    # the parent process creates the queues and passes them to each child process
    q1 = manager.Queue()
    q2 = manager.Queue()
    lock1 = manager.Lock()
    lock2 = manager.Lock()
    lock3 = manager.Lock()
    start = time.time()
    p = Pool()
    # pw = p.apply_async(write1, args=(q1,lock1,))
    pw = Process(target=write1, args=(q1, lock1,))
    # time.sleep(0.5)
    # pr = p.apply_async(read1, args=(q1,q2,lock2,))
    # pr2 = p.apply_async(read2, args=(q2,lock3))
    pr = Process(target=read1, args=(q1, q2, lock2,))
    pr2 = Process(target=read2, args=(q2, lock3,))
    pw.start()
    pr.start()
    pr2.start()
    # p.close()
    # p.join()
    pw.join()
    pr.terminate()
    pr2.terminate()
    end = time.time()
    # print
    print('finished!!')
    print(end - start)
The output is:
Put A to queue111...77678
Put B to queue111...77678
Get A from queue111.77680
Put A to queue222...77680
Put C to queue111...77678
Get A from queue222.77681
Get B from queue111.77680
Put B to queue222...77680
Get C from queue111.77680
Get C from queue222...77680
finished!!
3.025238275527954

You can’t use terminate to control a system like this: it races with completing the actual work. Instead, make your loops not be infinite, probably by using a sentinel value in each Queue (as in one commented-out line).
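A minimal sketch of that sentinel approach, using the same three-stage pipeline as the question (the function bodies are simplified for illustration, not the asker's exact code):

from multiprocessing import Process, Manager
import os

SENTINEL = None  # marker that tells a reader loop to stop

def write1(q1):
    for value in ['A', 'B', 'C']:
        print('Put %s to queue1 (%s)' % (value, os.getpid()))
        q1.put(value)
    q1.put(SENTINEL)               # tell read1 there is nothing more to come

def read1(q1, q2):
    while True:
        value = q1.get()           # blocks until something arrives
        if value is SENTINEL:
            q2.put(SENTINEL)       # forward the sentinel so read2 can stop too
            break
        print('Move %s from queue1 to queue2 (%s)' % (value, os.getpid()))
        q2.put(value)

def read2(q2):
    while True:
        value = q2.get()
        if value is SENTINEL:
            break
        print('Get %s from queue2 (%s)' % (value, os.getpid()))

if __name__ == '__main__':
    manager = Manager()
    q1, q2 = manager.Queue(), manager.Queue()
    procs = [Process(target=write1, args=(q1,)),
             Process(target=read1, args=(q1, q2)),
             Process(target=read2, args=(q2,))]
    for p in procs:
        p.start()
    for p in procs:
        p.join()                   # every loop ends on its own, no terminate() needed
    print('finished!!')

Each stage forwards the sentinel downstream when it sees it, so joining the processes is enough and terminate() is never needed.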

Related

processing very large text files in parallel using multiprocessing and threading

I have found several other questions that touch on this topic, but none that are quite like my situation.
I have several very large text files (3+ gigabytes in size).
I would like to process them (say, 2 documents) in parallel using multiprocessing. As part of my processing (within a single process) I need to make an API call, and because of this I would like each process to have its own threads to run asynchronously.
I have come up with a simplified example (I have commented the code to try to explain what I think it should be doing):
import multiprocessing
from threading import Thread
import threading
from queue import Queue
import time

def process_huge_file(*, file_, batch_size=250, num_threads=4):
    # create APICaller instance for each process that has its own Queue
    api_call = APICaller()
    batch = []
    # create threads that will run asynchronously to make API calls
    # I expect these to immediately block since there is nothing in the Queue
    # (which is what api_call.run depends on to make a call)
    threads = []
    for i in range(num_threads):
        thread = Thread(target=api_call.run)
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    ####
    # start processing the file line by line
    for line in file_:
        # if we are at our batch size, add the batch to the api_call to let the threads do
        # their api calling
        if i % batch_size == 0:
            api_call.queue.put(batch)
        else:
            # add fake line to batch
            batch.append(fake_line)

class APICaller:
    def __init__(self):
        # thread safe queue to feed the threads which point at instances
        # of these APICaller objects
        self.queue = Queue()

    def run(self):
        print("waiting for something to do")
        self.queue.get()
        print("processing item in queue")
        time.sleep(0.1)
        print("finished processing item in queue")

if __name__ == "__main__":
    # fake docs
    fake_line = "this is a fake line of some text"
    # two fake docs with line length == 1000
    fake_docs = [[fake_line] * 1000 for i in range(2)]
    ####
    num_processes = 2
    procs = []
    for idx, doc in enumerate(fake_docs):
        proc = multiprocessing.Process(target=process_huge_file, kwargs=dict(file_=doc))
        proc.start()
        procs.append(proc)
    for proc in procs:
        proc.join()
As the code is now, "waiting for something to do" prints 8 times (which makes sense: 4 threads per process), and then it stops, or "deadlocks", which is not what I expect. I expect it to start sharing time with the threads as soon as I start putting items in the Queue, but the code does not appear to make it that far. I would ordinarily step through to find the hang-up, but I still don't have a solid understanding of how best to debug with threads (another topic for another day).
In the meantime, can someone help me figure out why my code is not doing what it should be doing?
I have made a few adjustments and additions, and the code appears to do what it is supposed to now. The main adjustments are: adding a CloseableQueue class (from Brett Slatkin's Effective Python, Item 55), and ensuring that I call close and join on the queue so that the threads properly exit. Full code with these changes below:
import multiprocessing
from threading import Thread
import threading
from queue import Queue
import time
from concurrency_utils import CloseableQueue

def sync_process_huge_file(*, file_, batch_size=250):
    batch = []
    for idx, line in enumerate(file_):
        # do processing on the text
        if idx % batch_size == 0:
            time.sleep(0.1)
            batch = []
            # api_call.queue.put(batch)
        else:
            computation = 0
            for i in range(100000):
                computation += i
            batch.append(line)

def process_huge_file(*, file_, batch_size=250, num_threads=4):
    api_call = APICaller()
    batch = []
    # api call threads
    threads = []
    for i in range(num_threads):
        thread = Thread(target=api_call.run)
        threads.append(thread)
        thread.start()
    for idx, line in enumerate(file_):
        # do processing on the text
        if idx % batch_size == 0:
            api_call.queue.put(batch)
        else:
            computation = 0
            for i in range(100000):
                computation += i
            batch.append(line)
    for _ in threads:
        api_call.queue.close()
    api_call.queue.join()
    for thread in threads:
        thread.join()

class APICaller:
    def __init__(self):
        self.queue = CloseableQueue()

    def run(self):
        for item in self.queue:
            print("waiting for something to do")
            pass
            print("processing item in queue")
            time.sleep(0.1)
            print("finished processing item in queue")
        print("exiting run")

if __name__ == "__main__":
    # fake docs
    fake_line = "this is a fake line of some text"
    # two fake docs with line length == 1000
    fake_docs = [[fake_line] * 10000 for i in range(2)]
    ####
    time_s = time.time()
    num_processes = 2
    procs = []
    for idx, doc in enumerate(fake_docs):
        proc = multiprocessing.Process(target=process_huge_file, kwargs=dict(file_=doc))
        proc.start()
        procs.append(proc)
    for proc in procs:
        proc.join()
    time_e = time.time()
    print(f"took {time_e-time_s} ")

class CloseableQueue(Queue):
    SENTINEL = object()

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def close(self):
        self.put(self.SENTINEL)

    def __iter__(self):
        while True:
            item = self.get()
            try:
                if item is self.SENTINEL:
                    return  # exit thread
                yield item
            finally:
                self.task_done()
As expected, this is a great speedup over running synchronously: 120 seconds vs. 50 seconds.
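Seen in isolation, the key idea of CloseableQueue is that close() just enqueues one sentinel, so it must be called once per consumer thread, and join() waits until every item has been marked done. A small standalone sketch of that pattern, reusing the CloseableQueue class shown above (the consume function and the numbers are made up for illustration):

from threading import Thread

def consume(q, out):
    for item in q:              # iteration stops when this thread pulls a SENTINEL
        out.append(item * 2)

q = CloseableQueue()
results = []
workers = [Thread(target=consume, args=(q, results)) for _ in range(3)]
for w in workers:
    w.start()
for i in range(10):
    q.put(i)
for _ in workers:
    q.close()                   # one sentinel per worker
q.join()                        # wait until every put item has been task_done()'d
for w in workers:
    w.join()
print(sorted(results))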

How to exchange data back and forth between two separate processes in Python

PROBLEM
There are two separate processes that run in parallel and I would like them to communicate back-and-forth.
EXPLANATION OF THE CODE
The code is in Python 2.7. In my stripped-to-minimum script, I use a queue for inter-process communication. Process p1 puts data in a queue. Process p2 gets the data from the queue and does something with it. Then process p2 puts the modified data back in the queue, and finally process p1 gets the modified data back from the queue. The modified data must return to process p1 because that process is really an eventlet server that sends/receives requests.
CODE
#!/usr/bin/python2.7 python2.7
# -*- coding: utf-8 -*-
# script for back-and-forth data exchange between processes

# common modules
import os
import sys
import time
from multiprocessing import Process
from multiprocessing import Queue
from datetime import datetime

someData = {}

class Load():
    def post(self):
        timestamp = str(datetime.now())
        someData = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        queue1.put(someData) # put into queue
        print "#20 process 1: put in queue1 =>", someData
        time.sleep(3)
        while True: # queue1 checking loop, comment out the loop if use time.sleep only
            if queue1.empty() == False:
                timestamp = str(datetime.now())
                res = queue1.get()
                res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
                print "#28 get from queue1 =>", res
                break
            else:
                print "#31 queue1 empty"
                time.sleep(1)
        # while True: # queue2 checking loop
        #     if queue2.empty() == False:
        #         timestamp = str(datetime.now())
        #         res = queue2.get()
        #         res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        #         print "#39 get from queue2 =>", res
        #         break
        #     else:
        #         print "#42 queue2 empty"
        #         time.sleep(1)

class Unload():
    def get(self):
        try:
            if queue1.empty() == False:
                data = queue1.get() # retrieve package from queue
                #queue1.close()
                #queue1.join_thread()
                timestamp = str(datetime.now())
                data = {"process":"p2","class":"Unload()","method":"get()","timestamp":timestamp}
                print "#54 process 2: get from queue1 =>", data
                self.doSomething(data) # call method
            else:
                print "#57 queue1 empty"
                pass
        except:
            print "#60 queue1 error"
            pass

    def doSomething(self, data):
        time.sleep(3)
        timestamp = str(datetime.now())
        someData = {"process":"p2","class":"Unload()","method":"doSomething()","timestamp":timestamp}
        self.someData = someData
        print "#68 process 2: do something =>", someData
        self.put()

    def put(self):
        time.sleep(3)
        timestamp = str(datetime.now())
        self.someData = {"process":"p2","class":"Unload()","method":"put()","timestamp":timestamp}
        print "#75 process 2: put back in queue1 =>", self.someData
        res = self.someData
        queue1.put(res)
        #print "#78 process 2: put back in queue2 =>", self.someData
        #res = self.someData
        #queue2.put(res)
        #queue2.close()
        #queue2.join_thread()

# main
if __name__ == '__main__':
    queue1 = Queue()
    #queue2 = Queue()
    global p1, p2
    p1 = Process(target=Load().post(), args=(queue1,)) # process p1
    #p1 = Process(target=Load().post(), args=(queue1,queue2,))
    p1.daemon = True
    p1.start()
    p2 = Process(target=Unload().get(), args=(queue1,)) # process p2
    #p2 = Process(target=Unload().get(), args=(queue1,queue2,))
    p2.start()
    p2.join()
QUESTION
I have checked other resources on this, but they all involve one-direction communication. Below is the list of resources.
use-get-nowait-in-python-without-raising-empty-exception
in-python-how-do-you-get-data-back-from-a-particular-process-using-multiprocess
how-to-use-multiprocessing-queue-with-lock
multiprocessing module supports locks
thread-that-i-can-pause-and-resume
exchange-data-between-two-python-processes
How do I get process1 to wait for and retrieve the modified data from process2? Should I consider another approach for the communication between processes, e.g. pipes or ZeroMQ?
ATTEMPT 1: using time.sleep() without the while loops in process 1
With only time.sleep, the data go all the way to being put back in the queue but never reach their final destination in process 1. So far so good, but the final step is missing. The results are below.
#20 process 1: put in queue1 => {'process': 'p1', 'timestamp': '2020-02-23 11:40:30.234466', 'class': 'Load()', 'method': 'post()'}
#54 process 2: get from queue1 => {'process': 'p2', 'timestamp': '2020-02-23 11:40:33.239113', 'class': 'Unload()', 'method': 'get()'}
#68 process 2: do something => {'process': 'p2', 'timestamp': '2020-02-23 11:40:36.242500', 'class': 'Unload()', 'method': 'doSomething()'}
#75 process 2: put back in queue1 => {'process': 'p2', 'timestamp': '2020-02-23 11:40:39.245856', 'class': 'Unload()', 'method': 'put()'}
ATTEMPT 2: using the while loop in process 1
With the while loop checking the queue, the data go into the queue but get caught right back by process 1; they never reach process 2. The results are below.
#20 process 1: put in queue1 => {'process': 'p1', 'timestamp': '2020-02-23 11:46:14.606356', 'class': 'Load()', 'method': 'post()'}
#28 get from queue1 => {'process': 'p1', 'timestamp': '2020-02-23 11:46:17.610202', 'class': 'Load()', 'method': 'post()'}
#57 queue1 empty
ATTEMPT 3: using two queues
Using two queues: queue1 from process1 to process2, queue2 from process2 to process1. The data go into queue1 but never return on queue2; they mysteriously vanish. The results are below.
#20 process 1: put in queue1 => {'process': 'p1', 'timestamp': '2020-02-23 11:53:39.745177', 'class': 'Load()', 'method': 'post()'}
#42 queue2 empty
----- UPDATE 20200224: attempts 4, 5 and 6 -----------------------------------------------------------------
ATTEMPT 4: using two queues with manager.Queue()
Using two queues created with manager.Queue(): queue1 from process1 to process2, queue2 from process2 to process1. The data go into queue1 but never return on queue2; again they mysteriously vanish. The code and results are below.
The code of attempt 4:
#!/usr/bin/python2.7 python2.7
# -*- coding: utf-8 -*-
# script for serialized interprocess data exchange

# common modules
import os
import sys
import time
import multiprocessing
from multiprocessing import Process
from multiprocessing import Queue
from multiprocessing import Manager
from datetime import datetime

someData = {}
manager = multiprocessing.Manager()
queue1 = manager.Queue()
queue2 = manager.Queue()

class Load():
    def post(self):
        timestamp = str(datetime.now())
        someData = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        queue1.put(someData) # put into queue
        print "#20 process 1: put in queue1 =>", someData
        time.sleep(3)
        # while True: # queue1 checking loop
        #     if queue1.empty() == False:
        #         timestamp = str(datetime.now())
        #         res = queue1.get()
        #         res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        #         print "#28 get from queue1 =>", res
        #         break
        #     else:
        #         print "#31 queue1 empty"
        #         time.sleep(1)
        while True: # queue2 checking loop
            if queue2.empty() == False:
                timestamp = str(datetime.now())
                res = queue2.get()
                res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
                print "#39 get from queue2 =>", res
                break
            else:
                print "#42 queue2 empty"
                time.sleep(1)

class Unload():
    def get(self):
        try:
            if queue1.empty() == False:
                data = queue1.get() # retrieve package from queue
                #queue1.close()
                #queue1.join_thread()
                timestamp = str(datetime.now())
                data = {"process":"p2","class":"Unload()","method":"get()","timestamp":timestamp}
                print "#54 process 2: get from queue1 =>", data
                self.doSomething(data) # call method
            else:
                print "#57 queue1 empty"
                pass
        except:
            print "#60 queue1 error"
            pass

    def doSomething(self, data):
        time.sleep(3)
        timestamp = str(datetime.now())
        someData = {"process":"p2","class":"Unload()","method":"doSomething()","timestamp":timestamp}
        self.someData = someData
        print "#68 process 2: do something =>", someData
        self.put()

    def put(self):
        time.sleep(3)
        timestamp = str(datetime.now())
        self.someData = {"process":"p2","class":"Unload()","method":"put()","timestamp":timestamp}
        res = self.someData
        #print "#75 process 2: put back in queue1 =>", self.someData
        #queue1.put(res)
        print "#78 process 2: put back in queue2 =>", self.someData
        queue2.put(res)
        #queue2.close()
        #queue2.join_thread()

# main
if __name__ == '__main__':
    manager = multiprocessing.Manager()
    queue1 = manager.Queue()
    queue2 = manager.Queue()
    global p1, p2
    #p1 = Process(target=Load().post(), args=(queue1,)) # process p1
    p1 = Process(target=Load().post(), args=(queue1,queue2,))
    p1.daemon = True
    p1.start()
    #p2 = Process(target=Unload().get(), args=(queue1,)) # process p2
    p2 = Process(target=Unload().get(), args=(queue1,queue2,))
    p2.start()
    p2.join()
The results of attempt 4:
#20 process 1: put in queue1 => {'process': 'p1', 'timestamp': '2020-02-24 13:06:17.687762', 'class': 'Load()', 'method': 'post()'}
#42 queue2 empty
ATTEMPT 5: using one queue with manager.Queue()
Using one queue created with manager.Queue(): queue1 from process1 to process2, and queue1 back from process2 to process1. The data go into queue1 but get caught right back by process 1; they never reach process 2. The code and results are below.
The code of attempt 5:
#!/usr/bin/python2.7 python2.7
# -*- coding: utf-8 -*-
# script for serialized interprocess data exchange

# common modules
import os
import sys
import time
import multiprocessing
from multiprocessing import Process
from multiprocessing import Queue
from multiprocessing import Manager
from datetime import datetime

someData = {}
manager = multiprocessing.Manager()
queue1 = manager.Queue()
#queue2 = manager.Queue()

class Load():
    def post(self):
        timestamp = str(datetime.now())
        someData = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        queue1.put(someData) # put into queue
        print "#25 process 1: put in queue1 =>", someData
        time.sleep(3)
        while True: # queue1 checking loop
            if queue1.empty() == False:
                timestamp = str(datetime.now())
                res = queue1.get()
                res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
                print "#33 get from queue1 =>", res
                break
            else:
                print "#36 queue1 empty"
                time.sleep(1)
        # while True: # queue2 checking loop
        #     if queue2.empty() == False:
        #         timestamp = str(datetime.now())
        #         res = queue2.get()
        #         res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        #         print "#44 get from queue2 =>", res
        #         break
        #     else:
        #         print "#47 queue2 empty"
        #         time.sleep(1)

class Unload():
    def get(self):
        try:
            if queue1.empty() == False:
                data = queue1.get() # retrieve package from queue
                #queue1.close()
                #queue1.join_thread()
                timestamp = str(datetime.now())
                data = {"process":"p2","class":"Unload()","method":"get()","timestamp":timestamp}
                print "#59 process 2: get from queue1 =>", data
                self.doSomething(data) # call method
            else:
                print "#62 queue1 empty"
                pass
        except:
            print "#65 queue1 error"
            pass

    def doSomething(self, data):
        time.sleep(3)
        timestamp = str(datetime.now())
        someData = {"process":"p2","class":"Unload()","method":"doSomething()","timestamp":timestamp}
        self.someData = someData
        print "#73 process 2: do something =>", someData
        self.put()

    def put(self):
        time.sleep(3)
        timestamp = str(datetime.now())
        self.someData = {"process":"p2","class":"Unload()","method":"put()","timestamp":timestamp}
        res = self.someData
        print "#81 process 2: put back in queue1 =>", self.someData
        queue1.put(res)
        #print "#83 process 2: put back in queue2 =>", self.someData
        #queue2.put(res)
        #queue2.close()
        #queue2.join_thread()

# main
if __name__ == '__main__':
    manager = multiprocessing.Manager()
    queue1 = manager.Queue()
    #queue2 = manager.Queue()
    global p1, p2
    p1 = Process(target=Load().post(), args=(queue1,)) # process p1
    #p1 = Process(target=Load().post(), args=(queue1,queue2,))
    p1.daemon = True
    p1.start()
    p2 = Process(target=Unload().get(), args=(queue1,)) # process p2
    #p2 = Process(target=Unload().get(), args=(queue1,queue2,))
    p2.start()
    p2.join()
The results of attempt 5:
#25 process 1: put in queue1 => {'process': 'p1', 'timestamp': '2020-02-24 14:08:13.975886', 'class': 'Load()', 'method': 'post()'}
#33 get from queue1 => {'process': 'p1', 'timestamp': '2020-02-24 14:08:16.980382', 'class': 'Load()', 'method': 'post()'}
#62 queue1 empty
ATTEMPT 6: using the queue timeouts
As suggested, I tried using the queue timeouts. The approach is again queue1 from process1 to process2 and queue2 from process2 to process1. The data go into queue1 but never return on queue2; again they mysteriously vanish. The code and results are below.
The code of attempt 6:
#!/usr/bin/python2.7 python2.7
# -*- coding: utf-8 -*-
# script for serialized interprocess data exchange

# common modules
import os
import sys
import time
import uuid
import Queue
#from Queue import Empty
import multiprocessing
from multiprocessing import Process
#from multiprocessing import Queue
from datetime import datetime

someData = {}

class Load():
    def post(self):
        timestamp = str(datetime.now())
        someData = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        queue1.put(someData) # put into queue
        print "#24 process 1: put in queue1 =>", someData
        time.sleep(3)
        # while True: # queue1 checking loop
        #     if queue1.empty() == False:
        #         timestamp = str(datetime.now())
        #         res = queue1.get()
        #         res = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
        #         print "#33 get from queue1 =>", res
        #         break
        #     else:
        #         print "#36 queue1 empty"
        #         time.sleep(1)
        while True: # queue2 checking loop
            try:
                someData = queue2.get(True,1)
                timestamp = str(datetime.now())
                someData = {"process":"p1","class":"Load()","method":"post()","timestamp":timestamp}
                print "#43 process 1: got from queue2 =>", someData
                break
            except Queue.Empty:
                print "#46 process1: queue2 empty"
                continue

class Unload():
    def get(self):
        while True: # queue2 checking loop
            try:
                someData = queue1.get(True,1)
                timestamp = str(datetime.now())
                someData = {"process":"p2","class":"Unload()","method":"get()","timestamp":timestamp}
                print "#56 process2: got from queue1 =>", someData
                break
            except Queue.Empty:
                print "#59 process2: queue1 empty"
                continue
        self.doSomething(someData) # call method

    def doSomething(self, data):
        time.sleep(3)
        timestamp = str(datetime.now())
        someData = {"process":"p2","class":"Unload()","method":"doSomething()","timestamp":timestamp}
        self.someData = someData
        print "#68 process2: do something =>", someData
        self.put(someData)

    def put(self,data):
        time.sleep(3)
        timestamp = str(datetime.now())
        self.someData = {"process":"p2","class":"Unload()","method":"put()","timestamp":timestamp}
        someData = self.someData
        #print "#81 process 2: put back in queue1 =>", self.someData
        #queue1.put(res)
        print "#78 process2: put back in queue2 =>", someData
        queue2.put(someData)

# main
if __name__ == '__main__':
    queue1 = multiprocessing.Queue()
    queue2 = multiprocessing.Queue()
    global p1, p2
    #p1 = Process(target=Load().post(), args=(queue1,)) # process p1
    p1 = Process(target=Load().post(), args=(queue1,queue2,))
    p1.daemon = True
    p1.start()
    #p2 = Process(target=Unload().get(), args=(queue1,)) # process p2
    p2 = Process(target=Unload().get(), args=(queue1,queue2,))
    p2.start()
    p2.join()
The results of attempt 6:
#24 process 1: put in queue1 => {'process': 'p1', 'timestamp': '2020-02-24 18:14:46.435661', 'class': 'Load()', 'method': 'post()'}
#46 process1: queue2 empty
NOTE: The suggested approach works when I use it without the classes. The code is below:
import uuid
import multiprocessing
from multiprocessing import Process
import Queue

def load(que_in, que_out):
    request = {"id": uuid.uuid4(), "workload": "do_stuff", }
    que_in.put(request)
    print("load: sent request {}: {}".format(request["id"], request["workload"]))
    while True:
        try:
            result = que_out.get(True, 1)
        except Queue.Empty:
            continue
        print("load: got result {}: {}".format(result["id"], result["result"]))

def unload(que_in, que_out):
    def processed(request):
        return {"id": request["id"], "result": request["workload"] + " processed", }
    while True:
        try:
            request = que_in.get(True, 1)
        except Queue.Empty:
            continue
        print("unload: got request {}: {}".format(request["id"], request["workload"]))
        result = processed(request)
        que_out.put(result)
        print("unload: sent result {}: {}".format(result["id"], result["result"]))

# main
if __name__ == '__main__':
    que_in = multiprocessing.Queue()
    que_out = multiprocessing.Queue()
    p1 = Process(target=load, args=(que_in, que_out)) # process p1
    p1.daemon = True
    p1.start()
    p2 = Process(target=unload, args=(que_in, que_out)) # process p2
    p2.start()
    p2.join()
----- UPDATE 20200225: attempt 7 ------------------------------------------------------------------------------
ATTEMPT 7: using one queue with queue timeouts in different classes (working)
In this attempt I use one queue shared between methods of different classes, with the corrected timeouts. The data goes from process1 to process2 and back from process2 to process1 through shared_queue, and this time the data travels correctly. The code and results are below.
The code of attempt 7:
import uuid
import multiprocessing
from multiprocessing import Process
import Queue

class Input():
    def load(self, shared_queue):
        request = {"id": uuid.uuid4(), "workload": "do_stuff", }
        shared_queue.put(request)
        print("load: sent request {}: {}".format(request["id"], request["workload"]))
        while True:
            try:
                result = shared_queue.get(True, 1)
            except Queue.Empty:
                continue
            print("load: got result {}: {}".format(result["id"], result["result"]))
            break

class Output():
    def unload(self, shared_queue):
        def processed(request):
            return {"id": request["id"], "result": request["workload"] + " processed", }
        while True:
            try:
                request = shared_queue.get(True, 1)
            except Queue.Empty:
                continue
            print("unload: got request {}: {}".format(request["id"], request["workload"]))
            result = processed(request)
            shared_queue.put(result)
            print("unload: sent result {}: {}".format(result["id"], result["result"]))

# main
if __name__ == '__main__':
    shared_queue = multiprocessing.Queue()
    up = Input()
    down = Output()
    p1 = Process(target=up.load, args=(shared_queue,)) # process p1
    p1.daemon = True
    p1.start()
    p2 = Process(target=down.unload, args=(shared_queue,)) # process p2
    p2.start()
    p1.join()
    p2.join()
The results of attempt 7:
load: sent request a461357a-b39a-43c4-89a8-a77486a5bf45: do_stuff
unload: got request a461357a-b39a-43c4-89a8-a77486a5bf45: do_stuff
unload: sent result a461357a-b39a-43c4-89a8-a77486a5bf45: do_stuff processed
load: got result a461357a-b39a-43c4-89a8-a77486a5bf45: do_stuff processed
I think you just missed using the queue timeouts:
try:
    result = que_out.get(True, 1)
except queue.Empty:
    continue
This simplified example may help you:
import uuid
from multiprocessing import Process
from multiprocessing import Queue
import queue

def load(que_in, que_out):
    request = {"id": uuid.uuid4(), "workload": "do_stuff", }
    que_in.put(request)
    print("load: sent request {}: {}".format(request["id"], request["workload"]))
    while True:
        try:
            result = que_out.get(True, 1)
        except queue.Empty:
            continue
        print("load: got result {}: {}".format(result["id"], result["result"]))

def unload(que_in, que_out):
    def processed(request):
        return {"id": request["id"], "result": request["workload"] + " processed", }
    while True:
        try:
            request = que_in.get(True, 1)
        except queue.Empty:
            continue
        print("unload: got request {}: {}".format(request["id"], request["workload"]))
        result = processed(request)
        que_out.put(result)
        print("unload: sent result {}: {}".format(result["id"], result["result"]))

# main
if __name__ == '__main__':
    que_in = Queue()
    que_out = Queue()
    p1 = Process(target=load, args=(que_in, que_out)) # process p1
    p1.daemon = True
    p1.start()
    p2 = Process(target=unload, args=(que_in, que_out)) # process p2
    p2.start()
    p2.join()
Output
load: sent request d9894e41-3e8a-4474-9563-1a99797bc722: do_stuff
unload: got request d9894e41-3e8a-4474-9563-1a99797bc722: do_stuff
unload: sent result d9894e41-3e8a-4474-9563-1a99797bc722: do_stuff processed
load: got result d9894e41-3e8a-4474-9563-1a99797bc722: do_stuff processed
SOLUTION: using one shared queue
I solved the problem after following the suggestions and making some adjustments to get the targeting of the different classes' methods right. The back-and-forth flow of the data between the two separate processes is now correct. An important note for me is to pay extra attention to the someData package exchanged between the two processes: it really has to be the same package that is tossed around. Hence the identifier entry "id": uuid.uuid4(), to check that the package is the same at every passage.
#!/usr/bin/python2.7 python2.7
# -*- coding: utf-8 -*-
# script for back and forth communication between two separate processes using a shared queue

# common modules
import os
import sys
import time
import uuid
import Queue
import multiprocessing
from multiprocessing import Process
from datetime import datetime

someData = {}

class Load():
    def post(self, sharedQueue):
        timestamp = str(datetime.now()) # for timing checking
        someData = {"timestamp":timestamp, "id": uuid.uuid4(), "workload": "do_stuff",}
        self.someData = someData
        sharedQueue.put(someData) # put into the shared queue
        print("#25 p1 load: sent someData {}: {}".format(someData["id"], someData["timestamp"], someData["workload"]))
        time.sleep(1) # for the time flow
        while True: # sharedQueue checking loop
            try:
                time.sleep(1) # for the time flow
                timestamp = str(datetime.now())
                someData = sharedQueue.get(True,1)
                someData["timestamp"] = timestamp
                print("#37 p1 load: got back someData {}: {}".format(someData["id"], someData["timestamp"], someData["workload"]))
                break
            except Queue.Empty:
                print("#37 p1: sharedQueue empty")
                continue
            break

class Unload():
    def get(self, sharedQueue):
        while True: # sharedQueue checking loop
            try:
                someData = sharedQueue.get(True,1)
                self.someData = someData
                timestamp = str(datetime.now())
                someData["timestamp"] = timestamp
                print("#50 p2 unload: got someData {}: {}".format(someData["id"], someData["timestamp"], someData["workload"]))
                break
            except Queue.Empty:
                print("#53 p2: sharedQueue empty")
                continue
        time.sleep(1) # for the time flow
        self.doSomething(someData) # pass the data to the method

    def doSomething(self, someData): # execute some code here
        timestamp = str(datetime.now())
        someData["timestamp"] = timestamp
        print("#62 p2 unload: doSomething {}: {}".format(someData["id"], someData["timestamp"], someData["workload"]))
        self.put(someData)
        time.sleep(1) # for the time flow

    def put(self,someData):
        timestamp = str(datetime.now())
        someData["timestamp"] = timestamp
        sharedQueue.put(someData)
        print("#71 p2 unload: put someData {}: {}".format(someData["id"], someData["timestamp"], someData["workload"]))
        time.sleep(1) # for the time flow

# main
if __name__ == '__main__':
    sharedQueue = multiprocessing.Queue()
    trx = Load()
    rcx = Unload()
    p1 = Process(target=trx.post, args=(sharedQueue,)) # process p1
    p1.daemon = True
    p1.start()
    p2 = Process(target=rcx.get, args=(sharedQueue,)) # process p2
    p2.start()
    p1.join()
    p2.join()
You have to use Manager-wrapped queue(s) to propagate changes across processes; otherwise each process has its own separate queue object and can't see the other one(s). The Manager creates a shared instance of the queue for all child processes.
So queue1 = Queue() becomes queue1 = manager.Queue(), with from multiprocessing import Manager at the top. If you want to keep your two-queue approach, you obviously have to wrap the second queue in the same way.
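A minimal sketch of that setup, with the queues created by the Manager and handed to two plain worker functions (ping and pong are illustrative names, not the code from the question):

from multiprocessing import Process, Manager

def ping(q1, q2):
    q1.put("request")                  # send to the other process
    print("ping got: " + q2.get())     # wait for the reply

def pong(q1, q2):
    msg = q1.get()
    q2.put(msg + " processed")

if __name__ == '__main__':
    manager = Manager()
    queue1 = manager.Queue()           # both queues come from the Manager,
    queue2 = manager.Queue()           # so every child sees the same shared objects
    p1 = Process(target=ping, args=(queue1, queue2))
    p2 = Process(target=pong, args=(queue1, queue2))
    p1.start()
    p2.start()
    p1.join()
    p2.join()

Note that the target here is the function object itself and the queues travel through args=, which is also how the working attempts above pass them.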
Relevant resources:
Multiple queues from one multiprocessing Manager
Python documentation

Real-time ability of Python multiprocessing (Queue and Pipe)

I am a little bit confused while testing the multiprocessing module.
Let's simulate a digital timer. The code would look like:
start = datetime.now()
while True:
    now = datetime.now()
    delta = now - start
    s = delta.seconds + delta.microseconds/1E6
    print s
    time.sleep(1)
Which returns correctly:
8e-06
1.001072
2.00221
3.003353
4.004416
...
Now I want to read the clock from my virtual external digital clock device using a pipe:
def ask_timer(conn):
    start = datetime.now()
    while True:
        now = datetime.now()
        delta = now - start
        s = delta.seconds + delta.microseconds/1E6
        conn.send(s)

parent_conn, child_conn = Pipe()
p = Process(target=ask_timer, args=(child_conn,))
p.start()
while True:
    print parent_conn.recv()
    time.sleep(1)
It returns:
2.9e-05
6.7e-05
7.7e-05
8.3e-05
8.9e-05
9.4e-05
0.0001
...
Here the timer doesn't seem to run continuously in the background. The implementation with "Queue" looks like:
def ask_timer(q):
    while True:
        now = datetime.now()
        delta = now - start
        s = delta.seconds + delta.microseconds/1E6
        q.put(s)
        #conn.close()

q = Queue()
p = Process(target=ask_timer, args=(q,))
p.start()
while True:
    print q.get()
    time.sleep(1)
which behaves the same as the pipe. Is this just a misconception of mine about multiprocessing in Python? How can I get a value in real time from a running parallel process?
Everything is working correctly. The child process executes the ask_timer() function completely independently from your main process. You don't have any time.sleep() in this function, so it just prints or puts deltas into the queue in an infinite loop, with an interval of something like 10 ms.
Once a second your main process asks the child process for data and gets it. That data is one of those small intervals.
The problem is that you're putting much more data into the pipe/queue than you're taking out of it, so when you ask, you get old data. To test that, you can print the queue size in the loop (this won't work on OS X):
def ask_timer(q):
    start = datetime.now()
    while True:
        now = datetime.now()
        delta = now - start
        s = delta.seconds + delta.microseconds / 1E6
        q.put(s)

q = Queue()
p = Process(target=ask_timer, args=(q,))
p.start()
while True:
    print q.get()
    print q.qsize()
    time.sleep(1)
The queue size will grow really fast.
Apparently you can use shared memory to read the current value from the child process.
from multiprocessing import Process, Value
from datetime import datetime
import time
from ctypes import c_double

def ask_timer(v):
    start = datetime.now()
    while True:
        now = datetime.now()
        delta = now - start
        s = delta.seconds + delta.microseconds / 1E6
        v.value = s

val = Value(c_double, 0.0)
p = Process(target=ask_timer, args=(val,))
p.start()
while True:
    print(val.value)
    time.sleep(1)
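If you would rather keep the Queue than switch to shared memory, another option (my own suggestion, not part of the answer above) is to drain the queue on every read and keep only the newest value:

import Queue  # standard-library module that defines the Empty exception (Python 2)

def latest(q):
    value = q.get()                  # block until at least one reading exists
    while True:
        try:
            value = q.get_nowait()   # keep replacing it while newer items remain
        except Queue.Empty:
            return value

Calling print latest(q) once a second then reports the most recent delta instead of the oldest buffered one, at the cost of throwing the intermediate readings away.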

Process stops working while the queue is not empty

I'm trying to write a script in Python to convert URLs into their corresponding IPs. Since the URL file is huge (nearly 10 GB), I'm trying to use the multiprocessing library.
I create one process to write output to a file and a set of processes to convert the URLs.
Here is my code:
import multiprocessing as mp
import socket
import time

num_processes = mp.cpu_count()
sentinel = None

def url2ip(inqueue, output):
    v_url = inqueue.get()
    print 'v_url '+v_url
    try:
        v_ip = socket.gethostbyname(v_url)
        output_string = v_url+'|||'+v_ip+'\n'
    except:
        output_string = v_url+'|||-1'+'\n'
    print 'output_string '+output_string
    output.put(output_string)
    print output.full()

def handle_output(output):
    f_ip = open("outputfile", "a")
    while True:
        output_v = output.get()
        if output_v:
            print 'output_v '+output_v
            f_ip.write(output_v)
        else:
            break
    f_ip.close()

if __name__ == '__main__':
    output = mp.Queue()
    inqueue = mp.Queue()
    jobs = []
    proc = mp.Process(target=handle_output, args=(output, ))
    proc.start()
    print 'run in %d processes' % num_processes
    for i in range(num_processes):
        p = mp.Process(target=url2ip, args=(inqueue, output))
        jobs.append(p)
        p.start()
    for line in open('inputfile','r'):
        print 'ori '+line.strip()
        inqueue.put(line.strip())
    for i in range(num_processes):
        # send the sentinel to tell the workers to end
        inqueue.put(sentinel)
    for p in jobs:
        p.join()
    output.put(None)
    proc.join()
However, it did not work. It did produce several outputs (4 out of 10 URLs in the test file), but then it just suddenly stopped while the queues were not empty (I did check queue.empty()).
Could anyone suggest what's wrong? Thanks.
Your workers exit after processing a single URL each; they need to loop internally until they get the sentinel. However, you should probably just look at multiprocessing.Pool instead, as that does the bookkeeping for you.
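A minimal sketch of that worker loop, reusing the inqueue/output/sentinel names from the question (this is a suggested fix, not the original code):

import socket

def url2ip(inqueue, output):
    while True:
        v_url = inqueue.get()
        if v_url is None:                      # sentinel received: this worker is done
            break
        try:
            v_ip = socket.gethostbyname(v_url)
            output.put(v_url + '|||' + v_ip + '\n')
        except socket.error:
            output.put(v_url + '|||-1' + '\n')

With multiprocessing.Pool the same job shrinks to roughly pool.imap_unordered(resolve_one, url_iterable), where resolve_one takes one URL and returns one output line, and the pool handles the worker loops for you.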

Threads not stopping in Python

The purpose of my program is to download files with threads. I define a unit, and use len/unit threads, where len is the length of the file to be downloaded.
Using my program, the file can be downloaded, but the threads do not stop. I can't find the reason why.
This is my code:
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime

class MyThread(threading.Thread):
    def __init__(self,func,args,name=''):
        threading.Thread.__init__(self);
        self.func = func;
        self.args = args;
        self.name = name;
    def run(self):
        apply(self.func,self.args);

url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF';
request = urllib2.Request(url);
response = urllib2.urlopen(request);
meta = response.info();
response.close();
unit = 1000000;
flen = int(meta.getheaders('Content-Length')[0]);
print flen;
if flen%unit == 0:
    bs = flen/unit;
else :
    bs = flen/unit+1;
blocks = range(bs);
cnt = {};
for i in blocks:
    cnt[i]=i;

def getStr(i):
    try:
        print 'Thread %d start.'%(i,);
        fout = open('a.zip','wb');
        fout.seek(i*unit,0);
        if (i+1)*unit > flen:
            request.add_header('Range','bytes=%d-%d'%(i*unit,flen-1));
        else :
            request.add_header('Range','bytes=%d-%d'%(i*unit,(i+1)*unit-1));
        #opener = urllib2.build_opener();
        #buf = opener.open(request).read();
        resp = urllib2.urlopen(request);
        buf = resp.read();
        fout.write(buf);
    except BaseException:
        print 'Error';
    finally :
        #opener.close();
        fout.flush();
        fout.close();
        del cnt[i];
        # filelen = os.path.getsize('a.zip');
        print 'Thread %d ended.'%(i),
        print cnt;
        # print 'progress : %4.2f'%(filelen*100.0/flen,),'%';

def main():
    print 'download at:',ctime();
    threads = [];
    for i in blocks:
        t = MyThread(getStr,(blocks[i],),getStr.__name__);
        threads.append(t);
    for i in blocks:
        threads[i].start();
    for i in blocks:
        # print 'this is the %d thread;'%(i,);
        threads[i].join();
    #print 'size:',os.path.getsize('a.zip');
    print 'download done at:',ctime();

if __name__=='__main__':
    main();
Could someone please help me understand why the threads aren't stopping?
I can't really address your code example because it is quite messy and hard to follow, but a potential reason the threads never end is that a request stalls out and never finishes. urllib2 allows you to specify a timeout for how long you will allow a request to take.
What I would recommend for your own code is that you split the work up into a queue, start a fixed number of threads (instead of a variable number), and let the worker threads pick up work until it is done. Make the HTTP requests have a timeout. If the timeout expires, try again or put the work back into the queue.
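For the timeout part, a minimal sketch (the function name and retry policy are placeholders, not from the question):

import urllib2

def fetch(url, timeout_s=10, retries=3):
    for _ in range(retries):
        try:
            return urllib2.urlopen(url, timeout=timeout_s).read()
        except Exception:
            pass   # stalled or failed request: retry instead of hanging forever
    return None    # give up after the retries; the caller can requeue the work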
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue

def worker(queue, results, lock):
    local_results = []
    while True:
        val = queue.get()
        if val is None:
            break
        # pretend to do work
        time.sleep(.1)
        local_results.append(val)
    with lock:
        results.extend(local_results)
    print threading.current_thread().name, "Done!"

num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []
for i in xrange(100):
    queue.put(i)
for _ in xrange(num_workers):
    # Use None as a sentinel to signal the threads to end
    queue.put(None)
    t = threading.Thread(target=worker, args=(queue,results,lock))
    t.start()
    threads.append(t)
for t in threads:
    t.join()
print sorted(results)
print "All done"
