Starting multiple thread processes to process a queue - python

Using below code I start to thread processes, write_process writes to a queue and read_process reads from a queue :
import time
from multiprocessing import Process, Queue, Pool
class QueueFun():
def writing_queue(self, work_tasks):
while True:
print("Writing to queue")
work_tasks.put(1)
time.sleep(1)
def read_queue(self, work_tasks):
while True:
print('Reading from queue')
work_tasks.get()
time.sleep(2)
if __name__ == '__main__':
q = QueueFun()
work_tasks = Queue()
write_process = Process(target=q.writing_queue,
args=(work_tasks,))
write_process.start()
read_process = Process(target=q.read_queue,
args=(work_tasks,))
read_process.start()
write_process.join()
read_process.join()
Running above code prints:
Writing to queue
Reading from queue
Writing to queue
Reading from queue
Writing to queue
Writing to queue
Reading from queue
Writing to queue
How to start N processes to read from the queue?
I tried starting 3 processes using below code but just 1 process is started, this is because the .join() prevents the second process from starting?:
for i in range(0 , 3):
read_process = Process(target=q.read_queue,
args=(work_tasks,))
print('Starting read_process' , i)
read_process.start()
read_process.join()
I also considered using a Pool as described in https://docs.python.org/2/library/multiprocessing.html but this seems just relevant for transforming an existing collection :
print pool.map(f, range(10))
How to start n threads where each thread processes a shared queue?

You can just put it to list, and join it outside of creation loop:
if __name__ == '__main__':
q = QueueFun()
work_tasks = Queue()
write_process = Process(target=q.writing_queue,
args=(work_tasks,))
write_process.start()
processes = []
for i in range(0, 5):
processes.append(Process(target=q.read_queue,
args=(work_tasks,)))
for p in processes:
p.start()
write_process.join()
for p in processes:
p.join()

Related

processing very large text files in parallel using multiprocessing and threading

I have found several other questions that touch on this topic but none that are quite like my situation.
I have several very large text files (3+ gigabytes in size).
I would like to process them (say 2 documents) in parallel using multiprocessing. As part of my processing (within a single process) I need to make an API call and because of this would like to have each process have it's own threads to run asynchronously.
I have came up with a simplified example ( I have commented the code to try to explain what I think it should be doing):
import multiprocessing
from threading import Thread
import threading
from queue import Queue
import time
def process_huge_file(*, file_, batch_size=250, num_threads=4):
# create APICaller instance for each process that has it's own Queue
api_call = APICaller()
batch = []
# create threads that will run asynchronously to make API calls
# I expect these to immediately block since there is nothing in the Queue (which is was
# the api_call.run depends on to make a call
threads = []
for i in range(num_threads):
thread = Thread(target=api_call.run)
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
####
# start processing the file line by line
for line in file_:
# if we are at our batch size, add the batch to the api_call to to let the threads do
# their api calling
if i % batch_size == 0:
api_call.queue.put(batch)
else:
# add fake line to batch
batch.append(fake_line)
class APICaller:
def __init__(self):
# thread safe queue to feed the threads which point at instances
of these APICaller objects
self.queue = Queue()
def run(self):
print("waiting for something to do")
self.queue.get()
print("processing item in queue")
time.sleep(0.1)
print("finished processing item in queue")
if __name__ == "__main__":
# fake docs
fake_line = "this is a fake line of some text"
# two fake docs with line length == 1000
fake_docs = [[fake_line] * 1000 for i in range(2)]
####
num_processes = 2
procs = []
for idx, doc in enumerate(fake_docs):
proc = multiprocessing.Process(target=process_huge_file, kwargs=dict(file_=doc))
proc.start()
procs.append(proc)
for proc in procs:
proc.join()
As the code is now, "waiting for something to do" prints 8 times (makes sense 4 threads per process) and then it stops or "deadlocks" which is not what I expect - I expect it to start sharing time with the threads as soon as I start putting items in the Queue but the code does not appear to make it this far. I ordinarily would step through to find a hang up but I still don't have a solid understanding of how to best debug using Threads (another topic for another day).
In the meantime, can someone help me figure out why my code is not doing what it should be doing?
I have made a few adjustments and additions and the code appears to do what it is supposed to now. The main adjustments are: adding a CloseableQueue class (from Brett Slatkins Effective Python Item 55), and ensuring that I call close and join on the queue so that the threads properly exit. Full code with these changes below:
import multiprocessing
from threading import Thread
import threading
from queue import Queue
import time
from concurrency_utils import CloseableQueue
def sync_process_huge_file(*, file_, batch_size=250):
batch = []
for idx, line in enumerate(file_):
# do processing on the text
if idx % batch_size == 0:
time.sleep(0.1)
batch = []
# api_call.queue.put(batch)
else:
computation = 0
for i in range(100000):
computation += i
batch.append(line)
def process_huge_file(*, file_, batch_size=250, num_threads=4):
api_call = APICaller()
batch = []
# api call threads
threads = []
for i in range(num_threads):
thread = Thread(target=api_call.run)
threads.append(thread)
thread.start()
for idx, line in enumerate(file_):
# do processing on the text
if idx % batch_size == 0:
api_call.queue.put(batch)
else:
computation = 0
for i in range(100000):
computation += i
batch.append(line)
for _ in threads:
api_call.queue.close()
api_call.queue.join()
for thread in threads:
thread.join()
class APICaller:
def __init__(self):
self.queue = CloseableQueue()
def run(self):
for item in self.queue:
print("waiting for something to do")
pass
print("processing item in queue")
time.sleep(0.1)
print("finished processing item in queue")
print("exiting run")
if __name__ == "__main__":
# fake docs
fake_line = "this is a fake line of some text"
# two fake docs with line length == 1000
fake_docs = [[fake_line] * 10000 for i in range(2)]
####
time_s = time.time()
num_processes = 2
procs = []
for idx, doc in enumerate(fake_docs):
proc = multiprocessing.Process(target=process_huge_file, kwargs=dict(file_=doc))
proc.start()
procs.append(proc)
for proc in procs:
proc.join()
time_e = time.time()
print(f"took {time_e-time_s} ")
class CloseableQueue(Queue):
SENTINEL = object()
def __init__(self, **kwargs):
super().__init__(**kwargs)
def close(self):
self.put(self.SENTINEL)
def __iter__(self):
while True:
item = self.get()
try:
if item is self.SENTINEL:
return # exit thread
yield item
finally:
self.task_done()
As expected this is a great speedup from running synchronously - 120 seconds vs 50 seconds.

How to process and save the continuous data coming from sensor

Example:
I have installed a sensor in the car, that is sending the data continuously, Now, I have to process(fusion) the continuous data coming from the sensor but at the same while process will be finishing its execution, data will also be coming so, how to store the data that is coming while process is taking time for execution for future?.
sample code:
buffer1=[]
buffer2=[]
def process_function(buffer):
//processing
while true:
//data receiving continously
buffer1.append(data)
if len(buffer1)>0: process(buffer1)
buffer2.append(data)
(while the process_function will take buffer1 to process, at the same time, the continuous data should be stored in buffer2 so that after finishing the process_function with buffer1 can process with buffer2 and repeat.)
You could use a multiprocessing Queue and two processes. One for the producer and one for the consumer:
from multiprocessing import Process, Queue
def collection_sensor_values(mp_queue):
fake_value = 0
while True:
mp_queue.put(f"SENSOR_DATA_{fake_value}")
fake_value += 1
time.sleep(2)
def process_function(mp_queue):
while True:
sensor_reading = mp_queue.get(block=True)
print(f"Received sensor reading: {sensor_reading}")
q = Queue()
sensor_collector_process = Process(target=collection_sensor_values, args=(q,))
readings_process = Process(target=process_function, args=(q,))
all_procs = [sensor_collector_process, readings_process]
for p in all_procs:
p.start()
for p in all_procs:
# run until either process stops
if p.is_alive():
p.join()
for p in all_procs:
if p.is_alive():
p.terminate()

subprocess does not return control to main process after finishing

I have a Python application where I use processes for computing classification. For communication processes use Queues. Everything works fine except that after all sub-processes are done the main process does not get control back. So, as I understand, the sub-processes did not terminated. But, why?
#!/usr/bin/python
from wraper import *
from multiprocessing import Process, Lock,Queue
def start_threads(data,counter,threads_num,reporter):
threads = []
d_lock = Lock()
c_lock = Lock()
r_lock = Lock()
dq = Queue()
rq = Queue()
cq = Queue()
dq.put(data)
rq.put(reporter)
cq.put(counter)
for i in range(threads_num):
t = Process(target=mule, args=(dq,cq,rq,d_lock,c_lock,r_lock))
threads.append(t)
for t in threads:
t.start()
for t in threads:
t.join()
return rq.get()
def mule(dq,cq,rq,d_lock,c_lock,r_lock):
c_lock.acquire()
counter = cq.get()
can_continue = counter.next_ok()
idx = counter.get_features_indeces()
cq.put(counter)
c_lock.release()
while can_continue:
d_lock.acquire()
data = dq.get()
labels, features = data.get_features(idx)
dq.put(data)
d_lock.release()
accuracy = test_classifier(labels, features)
r_lock.acquire()
reporter = rq.get()
reporter.add_result(accuracy[0],idx)
rq.put(reporter)
r_lock.release()
c_lock.acquire()
counter = cq.get()
can_continue = counter.next_ok()
idx = counter.get_features_indeces()
cq.put(counter)
c_lock.release()
print('done' )
It writes for each process that it did it's job and that's it...

process stop working while queue is not empty

I try to write a script in python to convert url into its corresponding ip. Since the url file is huge (nearly 10GB), so I'm trying to use multiprocessing lib.
I create one process to write output to file and a set of processes to convert url.
Here is my code:
import multiprocessing as mp
import socket
import time
num_processes = mp.cpu_count()
sentinel = None
def url2ip(inqueue, output):
v_url = inqueue.get()
print 'v_url '+v_url
try:
v_ip = socket.gethostbyname(v_url)
output_string = v_url+'|||'+v_ip+'\n'
except:
output_string = v_url+'|||-1'+'\n'
print 'output_string '+output_string
output.put(output_string)
print output.full()
def handle_output(output):
f_ip = open("outputfile", "a")
while True:
output_v = output.get()
if output_v:
print 'output_v '+output_v
f_ip.write(output_v)
else:
break
f_ip.close()
if __name__ == '__main__':
output = mp.Queue()
inqueue = mp.Queue()
jobs = []
proc = mp.Process(target=handle_output, args=(output, ))
proc.start()
print 'run in %d processes' % num_processes
for i in range(num_processes):
p = mp.Process(target=url2ip, args=(inqueue, output))
jobs.append(p)
p.start()
for line in open('inputfile','r'):
print 'ori '+line.strip()
inqueue.put(line.strip())
for i in range(num_processes):
# Send the sentinal to tell Simulation to end
inqueue.put(sentinel)
for p in jobs:
p.join()
output.put(None)
proc.join()
However, it did not work. It did produce several outputs (4 out of 10 urls in the test file) but it just suddenly stops while queues are not empty (I did check queue.empty())
Could anyone suggest what's wrong?Thanks
You're workers exit after processing a single url each, they need to loop internally until they get the sentinel. However, you should probably just look at multiprocessing.pool instead, as that does the bookkeeping for you.

python threading issue

I want to have the result of my threads in a list.
I have the following sample code:
def parallelizer_task(processor,input,callback):
output = processor(input)
if callback:
callback(output)
return output
class ThreadsParallelizer(Parallelizer):
def parallelize(self,processors,input=None,callback=None):
threads = []
for processor in processors:
t = threading.Thread(target=parallelizer_task,args=(processor,input,callback))
threads.append(t)
t.start()
return threads
parallelizer = ThreadsParallelizer
But I have the output of threads list as ;
* <Thread(Thread-1, started 4418719744)>
* <Thread(Thread-2, started 4425617408)>
* <Thread(Thread-3, started 4429950976)>
Is there a way to have the threads result in the list?
Yes, for that you can use for example join. It will force the main thread to wait until child threads finish the work. You can then store the data in threading.Thread objects, something like this:
def parallelizer_task(processor,input,callback):
output = processor(input)
if callback:
callback(output)
# Attach result to current thread
thread = threading.currentThread()
thread.result = output
class ThreadsParallelizer(Parallelizer):
def parallelize(self,processors,input=None,callback=None):
threads = []
for processor in processors:
t = threading.Thread(...)
threads.append(t)
t.start()
# wait for threads to finish
for th in threads:
th.join()
# do something with results
results = [th.result for th in threads]
return results

Categories

Resources