From subprocess.Popen to multiprocessing - python

I have a function that invokes a process using subprocess.Popen in the following way:
def func():
    ...
    process = subprocess.Popen(substr, shell=True, stdout=subprocess.PIPE)
    timeout = {"value": False}
    timer = Timer(timeout_sec, kill_proc, [process, timeout])
    timer.start()
    for line in process.stdout:
        lines.append(line)
    timer.cancel()
    if timeout["value"] == True:
        return 0
    ...
I call this function from another function in a loop (e.g. over range(1, 100)). How can I make multiple calls to the function with multiprocessing, so that several processes run in parallel each time?
The processes don't depend on each other; the only constraint is that each process works on exactly one index (e.g. no two processes will ever work on index 1).
Thanks for your help

Just add the index to your Popen call and create a worker pool with as many workers as you have CPU cores available.
import multiprocessing

def func(index):
    ....
    process = subprocess.Popen(substr + " --index {}".format(index), shell=True, stdout=subprocess.PIPE)
    ....

if __name__ == '__main__':
    p = multiprocessing.Pool(multiprocessing.cpu_count())
    p.map(func, range(1, 100))
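For reference, here is a minimal, self-contained sketch of how the question's timeout logic could sit inside such a pool. The command string, the timeout value, and the body of kill_proc are assumptions for illustration, not the poster's actual code:

import subprocess
import multiprocessing
from threading import Timer

TIMEOUT_SEC = 60  # assumed timeout; the real value is not shown in the question

def kill_proc(process, timeout):
    # mark that the timeout fired, then kill the child process
    timeout["value"] = True
    process.kill()

def func(index):
    # each call works on exactly one index
    cmd = "echo output-for-index {}".format(index)  # placeholder command
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    timeout = {"value": False}
    timer = Timer(TIMEOUT_SEC, kill_proc, [process, timeout])
    timer.start()
    lines = []
    for line in process.stdout:
        lines.append(line)
    timer.cancel()
    if timeout["value"]:
        return 0
    return len(lines)

if __name__ == '__main__':
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        print(pool.map(func, range(1, 100)))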

Related

Starting multiple thread processes to process a queue

Using the code below I start two processes: write_process writes to a queue and read_process reads from a queue:
import time
from multiprocessing import Process, Queue, Pool

class QueueFun():
    def writing_queue(self, work_tasks):
        while True:
            print("Writing to queue")
            work_tasks.put(1)
            time.sleep(1)

    def read_queue(self, work_tasks):
        while True:
            print('Reading from queue')
            work_tasks.get()
            time.sleep(2)

if __name__ == '__main__':
    q = QueueFun()
    work_tasks = Queue()
    write_process = Process(target=q.writing_queue, args=(work_tasks,))
    write_process.start()
    read_process = Process(target=q.read_queue, args=(work_tasks,))
    read_process.start()
    write_process.join()
    read_process.join()
Running the above code prints:
Writing to queue
Reading from queue
Writing to queue
Reading from queue
Writing to queue
Writing to queue
Reading from queue
Writing to queue
How to start N processes to read from the queue?
I tried starting 3 processes using the code below, but just 1 process is started. Is this because the .join() prevents the second process from starting?
for i in range(0, 3):
    read_process = Process(target=q.read_queue, args=(work_tasks,))
    print('Starting read_process', i)
    read_process.start()
    read_process.join()
I also considered using a Pool as described in https://docs.python.org/2/library/multiprocessing.html, but that seems relevant only for transforming an existing collection:
print pool.map(f, range(10))
How to start n threads where each thread processes a shared queue?
You can just put the processes in a list and join them outside of the creation loop:
if __name__ == '__main__':
    q = QueueFun()
    work_tasks = Queue()
    write_process = Process(target=q.writing_queue, args=(work_tasks,))
    write_process.start()

    processes = []
    for i in range(0, 5):
        processes.append(Process(target=q.read_queue, args=(work_tasks,)))
    for p in processes:
        p.start()

    write_process.join()
    for p in processes:
        p.join()
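As written, both the writer and the readers loop forever, so the joins never return. A common refinement (not part of the original answer) is to have each reader exit when it receives a sentinel value; a minimal sketch, assuming None as the sentinel and an inlined producer:

from multiprocessing import Process, Queue

SENTINEL = None  # marker meaning "no more work"

def read_queue(work_tasks):
    while True:
        item = work_tasks.get()
        if item is SENTINEL:      # stop this reader when the sentinel arrives
            break
        print('Read from queue:', item)

if __name__ == '__main__':
    work_tasks = Queue()
    readers = [Process(target=read_queue, args=(work_tasks,)) for _ in range(3)]
    for p in readers:
        p.start()
    for i in range(10):           # producer side, inlined for brevity
        work_tasks.put(i)
    for _ in readers:             # one sentinel per reader
        work_tasks.put(SENTINEL)
    for p in readers:
        p.join()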

How to pass Popen objects to concurrent.futures.ProcessPoolExecutor

As you can see below, I have two Popen objects running in parallel. On completion of each process I want to perform some postprocessing on the data received from that process, and I want those postprocessing tasks to run in parallel as well. However, I get stuck the moment I call executor.map: CPU utilisation touches 100% for a while, then drops, but no result is ever produced.
The program hangs indefinitely and keeps running. It doesn't even print Inside letsee.
import concurrent.futures
from subprocess import Popen, PIPE

subProcess1 = Popen(cmd1, stdout=PIPE, stderr=PIPE, shell=True)
subProcess2 = Popen(cmd2, stdout=PIPE, stderr=PIPE, shell=True)

def letsSee(subProcessId):
    print("Inside letsee")
    while True:
        stdoutVar = subProcessId.stdout.readline()
        if stdoutVar == b'' and subProcessId.poll() is not None:
            break
        if stdoutVar:
            print(subProcessId, type(stdoutVar), stdoutVar)
    rc = subProcessId.poll()
    return "Yes"

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor() as executor:
        print("here")
        xList = [subProcess1, subProcess2]
        print(xList)  # output is [<subprocess.Popen object at 0x01365290>, <subprocess.Popen object at 0x01365292>]
        results = executor.map(letsSee, xList)
Is this acceptable to you? Popen objects cannot be pickled, so they cannot be handed to ProcessPoolExecutor workers; passing the command strings instead and creating the Popen inside each worker avoids that problem:
from concurrent.futures import ProcessPoolExecutor
from subprocess import Popen, PIPE

def letsSee(command):
    print("Inside letsee")
    subProcessId = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
    while True:
        stdoutVar = subProcessId.stdout.readline()
        if stdoutVar == b'' and subProcessId.poll() is not None:
            break
        if stdoutVar:
            print(subProcessId, type(stdoutVar), stdoutVar)
    rc = subProcessId.poll()
    return "Yes"

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:
        print("here")
        xList = [cmd1, cmd2]
        print(xList)
        results = executor.map(letsSee, xList)
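The likely reason the original version never gets going: arguments passed to executor.map have to be pickled to reach the worker processes, and Popen objects (with their open pipe handles) are not picklable. A quick throwaway sketch to see this, using a harmless echo command:

import pickle
from subprocess import Popen, PIPE

p = Popen("echo hello", stdout=PIPE, shell=True)
try:
    pickle.dumps(p)   # roughly what executor.map must do to ship the argument
except TypeError as exc:
    print("Popen objects cannot be pickled:", exc)
p.communicate()       # clean up the throwaway child process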

multi threaded processes in python using queue to write to file checking if work had been done

from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp

def func(a):
    pthData = "C:/temp/temp.txt"
    with open(pthData, 'r') as file:
        done = file.read().splitlines()
    if a in done:
        return 'done'
    q.put(a)
    return a

def listener(q):
    pthData = "C:/temp/temp.txt"
    m = q.get()
    with open(pthData, 'a') as the_file:
        the_file.write(m + '\n')
        #the_file.write(str(m) + '\n')

a = ['a', 'b', 'c', 'd', 'a', 'b']

# Make the Pool of workers
pool = ThreadPool(4)

# must use Manager queue here, or will not work
manager = mp.Manager()
q = manager.Queue()

# put listener to work first
watcher = pool.apply_async(listener, (q,))

pool.starmap(func, a, q)
## TypeError: '<=' not supported between instances of 'AutoProxy[Queue]' and 'int'
pool.starmap(func, a)
## Runs but only writes 'a' to temp file
pool.starmap(func, (a, q))
## func() takes 1 positional argument but 6 were given
pool.apply_async(func, (a, q))
## freezes on pool.join

# Close the pool and wait for the work to finish
pool.close()
pool.join()
Why does the apply_async call freeze on pool.join()? I tried putting it inside an if __name__ == '__main__' block but it had the same result.
How do I properly call func passing 1 argument (a) and the queue (q)?
How do I properly call func passing 1 argument (a) and the queue (q)?
This at least does not freeze:
Ensure temp.txt exists before execution.
Add a q parameter to func.
def func(a, q):
    print(f'func({a})')
    ...
Use apply_async in a list comprehension.
import queue

if __name__ == '__main__':
    # Make the Pool of workers
    with ThreadPool(4) as pool:
        q = queue.Queue()
        # put listener to work first
        watcher = pool.apply_async(listener, (q,))
        results = [pool.apply_async(func, (item, q)) for item in a]
        # just check stuff
        for result in results:
            result.wait()
            print(result, result.successful(), result.get())
        pool.close()
        pool.join()
You will need to work out some other problems, like the listener running once and then stopping.
There are many other ways to do this; I used apply_async because it was one of the options in your question.
I like using concurrent.futures myself.
You may also benefit from reading through search results for variations of python threading producer consumer site:stackoverflow.com
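Since the answer mentions preferring concurrent.futures, here is a hedged sketch of the same idea with ThreadPoolExecutor. The names and the temp-file path simply mirror the question, and the file is still assumed to exist beforehand:

import queue
from concurrent.futures import ThreadPoolExecutor

PTH_DATA = "C:/temp/temp.txt"   # path from the question; assumed to already exist

def func(item, q):
    with open(PTH_DATA, 'r') as f:
        done = f.read().splitlines()
    if item in done:
        return 'done'
    q.put(item)
    return item

def listener(q):
    # append queued items to the file until a None sentinel arrives
    with open(PTH_DATA, 'a') as f:
        while True:
            m = q.get()
            if m is None:
                break
            f.write(m + '\n')

if __name__ == '__main__':
    a = ['a', 'b', 'c', 'd', 'a', 'b']
    q = queue.Queue()
    with ThreadPoolExecutor(max_workers=4) as executor:
        watcher = executor.submit(listener, q)
        results = [executor.submit(func, item, q) for item in a]
        for r in results:
            print(r.result())
        q.put(None)          # tell the listener to stop
        watcher.result()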

How to thread multiple subprocess instances in Python 2.7?

I have three commands that would otherwise be easily chained together on the command-line like so:
$ echo foo | firstCommand - | secondCommand - | thirdCommand - > finalOutput
In other words, the firstCommand processes foo from standard input and pipes the result to secondCommand, which in turn processes that input and pipes its output to thirdCommand, which does processing and redirects its output to the file finalOutput.
I have been trying to recapitulate this in a Python script, using threading. I'd like to use Python in order to manipulate the output from firstCommand before passing it to secondCommand, and again between secondCommand and thirdCommand.
Here's an excerpt of code that does not seem to work:
first_process = subprocess.Popen(['firstCommand', '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
second_process = subprocess.Popen(['secondCommand', '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
third_process = subprocess.Popen(['thirdCommand', '-'], stdin=subprocess.PIPE, stdout=sys.stdout)

first_thread = threading.Thread(target=consumeOutputFromStdin, args=(sys.stdin, first_process.stdin))
second_thread = threading.Thread(target=consumeOutputFromFirstCommand, args=(first_process.stdout, second_process.stdin))
third_thread = threading.Thread(target=consumeOutputFromSecondCommand, args=(second_process.stdout, third_process.stdin))

first_thread.start()
second_thread.start()
third_thread.start()

first_thread.join()
second_thread.join()
third_thread.join()

first_process.communicate()
second_process.communicate()
third_process.communicate()

# read 1K chunks from standard input
def consumeOutputFromStdin(from_stream, to_stream):
    chunk = from_stream.read(1024)
    while chunk:
        to_stream.write(chunk)
        to_stream.flush()
        chunk = from_stream.read(1024)

def consumeOutputFromFirstCommand(from_stream, to_stream):
    while True:
        unprocessed_line = from_stream.readline()
        if not unprocessed_line:
            break
        processed_line = some_python_function_that_processes_line(unprocessed_line)
        to_stream.write(processed_line)
        to_stream.flush()

def consumeOutputFromSecondCommand(from_stream, to_stream):
    while True:
        unprocessed_line = from_stream.readline()
        if not unprocessed_line:
            break
        processed_line = a_different_python_function_that_processes_line(unprocessed_line)
        to_stream.write(processed_line)
        to_stream.flush()
When I run this, the script hangs:
$ echo foo | ./myConversionScript.py
** hangs here... **
If I hit Ctrl-C to terminate the script, the code is stuck on the line third_thread.join():
C-c C-c
Traceback (most recent call last):
File "./myConversionScript.py", line 786, in <module>
sys.exit(main(*sys.argv))
File "./myConversionScript.py", line 556, in main
third_thread.join()
File "/home/foo/proj/tools/lib/python2.7/threading.py", line 949, in join
self.__block.wait()
File "/home/foo/proj/tools/lib/python2.7/threading.py", line 339, in wait
waiter.acquire()
KeyboardInterrupt
If I don't use a third_process and third_thread, and instead only pass data from the output of the first thread to the input of the second thread, there is no hang.
Something about the third thread seems to cause things to break, but I don't know why.
I thought the point of communicate() is that it will handle I/O for the three processes, so I'm not sure why there is an I/O hang.
How do I get three or more commands/processes working together, where one thread consumes the output of another thread/process?
UPDATE
Okay, I made some changes that seem to help, based on some comments here and on other sites. The processes are made to wait() for completion, and within the thread methods, I close() the pipes once the thread has processed all the data that it can. My concern is that memory usage will be very high for large datasets, but at least things are working:
first_process = subprocess.Popen(['firstCommand', '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
second_process = subprocess.Popen(['secondCommand', '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
third_process = subprocess.Popen(['thirdCommand', '-'], stdin=subprocess.PIPE, stdout=sys.stdout)

first_thread = threading.Thread(target=consumeOutputFromStdin, args=(sys.stdin, first_process.stdin))
second_thread = threading.Thread(target=consumeOutputFromFirstCommand, args=(first_process.stdout, second_process.stdin))
third_thread = threading.Thread(target=consumeOutputFromSecondCommand, args=(second_process.stdout, third_process.stdin))

first_thread.start()
second_thread.start()
third_thread.start()

first_thread.join()
second_thread.join()
third_thread.join()

first_process.wait()
second_process.wait()
third_process.wait()

# read 1K chunks from standard input
def consumeOutputFromStdin(from_stream, to_stream):
    chunk = from_stream.read(1024)
    while chunk:
        to_stream.write(chunk)
        to_stream.flush()
        chunk = from_stream.read(1024)

def consumeOutputFromFirstCommand(from_stream, to_stream):
    while True:
        unprocessed_line = from_stream.readline()
        if not unprocessed_line:
            from_stream.close()
            to_stream.close()
            break
        processed_line = some_python_function_that_processes_line(unprocessed_line)
        to_stream.write(processed_line)
        to_stream.flush()

def consumeOutputFromSecondCommand(from_stream, to_stream):
    while True:
        unprocessed_line = from_stream.readline()
        if not unprocessed_line:
            from_stream.close()
            to_stream.close()
            break
        processed_line = a_different_python_function_that_processes_line(unprocessed_line)
        to_stream.write(processed_line)
        to_stream.flush()
To emulate:
echo foo |
firstCommand - | somePythonRoutine - |
secondCommand - | anotherPythonRoutine - |
thirdCommand - > finalOutput
your current approach with threads works:
from subprocess import Popen, PIPE

first = Popen(["firstCommand", "-"], stdin=PIPE, stdout=PIPE, bufsize=1)
second = Popen(["secondCommand", "-"], stdin=PIPE, stdout=PIPE, bufsize=1)
bind(first.stdout, second.stdin, somePythonRoutine)
with open("finalOutput", "wb") as file:
    third = Popen(["thirdCommand", "-"], stdin=PIPE, stdout=file, bufsize=1)
bind(second.stdout, third.stdin, anotherPythonRoutine)

# provide input for the pipeline
first.stdin.write(b"foo")
first.stdin.close()

# wait for it to complete
pipestatus = [p.wait() for p in [first, second, third]]
where each bind() starts a new thread:
from threading import Thread

def bind(input_pipe, output_pipe, line_filter):
    def f():
        try:
            for line in iter(input_pipe.readline, b''):
                line = line_filter(line)
                if line:
                    output_pipe.write(line)  # no flush unless newline present
        finally:
            try:
                output_pipe.close()
            finally:
                input_pipe.close()
    t = Thread(target=f)
    t.daemon = True  # die if the program exits
    t.start()
and somePythonRoutine, anotherPythonRoutine accept a single line and return it (possibly modified).
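For illustration only, line filters matching that description might look like the following; these are hypothetical examples, not part of the original answer, and they receive and return bytes:

def somePythonRoutine(line):
    # e.g. uppercase each line coming out of firstCommand
    return line.upper()

def anotherPythonRoutine(line):
    # e.g. substitute one token before thirdCommand sees the line
    return line.replace(b"FOO", b"BAR")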
The point of communicate() is that it returns the output of the process. This collides with your pipe setup.
You should only call it once on the third process; all the other ones are connected via pipes and know how to communicate with each other - no other / manual intervention is necessary.

Process stops working while queue is not empty

I'm trying to write a Python script to convert URLs into their corresponding IPs. Since the URL file is huge (nearly 10 GB), I'm trying to use the multiprocessing library.
I create one process to write output to a file and a set of processes to convert the URLs.
Here is my code:
import multiprocessing as mp
import socket
import time

num_processes = mp.cpu_count()
sentinel = None

def url2ip(inqueue, output):
    v_url = inqueue.get()
    print 'v_url '+v_url
    try:
        v_ip = socket.gethostbyname(v_url)
        output_string = v_url+'|||'+v_ip+'\n'
    except:
        output_string = v_url+'|||-1'+'\n'
    print 'output_string '+output_string
    output.put(output_string)
    print output.full()

def handle_output(output):
    f_ip = open("outputfile", "a")
    while True:
        output_v = output.get()
        if output_v:
            print 'output_v '+output_v
            f_ip.write(output_v)
        else:
            break
    f_ip.close()

if __name__ == '__main__':
    output = mp.Queue()
    inqueue = mp.Queue()
    jobs = []
    proc = mp.Process(target=handle_output, args=(output, ))
    proc.start()
    print 'run in %d processes' % num_processes
    for i in range(num_processes):
        p = mp.Process(target=url2ip, args=(inqueue, output))
        jobs.append(p)
        p.start()
    for line in open('inputfile', 'r'):
        print 'ori '+line.strip()
        inqueue.put(line.strip())
    for i in range(num_processes):
        # Send the sentinel to tell Simulation to end
        inqueue.put(sentinel)
    for p in jobs:
        p.join()
    output.put(None)
    proc.join()
However, it did not work. It produced several outputs (4 out of the 10 URLs in the test file) but then just suddenly stopped while the queues were not empty (I did check queue.empty()).
Could anyone suggest what's wrong? Thanks
Your workers exit after processing a single URL each; they need to loop internally until they get the sentinel. However, you should probably just look at multiprocessing.Pool instead, as that does the bookkeeping for you.
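A minimal sketch of the looping worker that answer describes, reusing the question's names (the sentinel stays None); this is an illustration, not the poster's code:

import socket

sentinel = None  # same sentinel value the main process puts on the queue

def url2ip(inqueue, output):
    # keep pulling URLs until the sentinel arrives, then exit
    while True:
        v_url = inqueue.get()
        if v_url is sentinel:
            break
        try:
            v_ip = socket.gethostbyname(v_url)
            output.put(v_url + '|||' + v_ip + '\n')
        except socket.error:
            output.put(v_url + '|||-1' + '\n')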
