using multiprocessing.Pipe with subprocess for pipelining - python

I am trying to port a bash pipeline to Python and use multiple processes to speed it up. So I have a couple of subprocesses spawned, with multiprocessing.Pipe connecting them (sample code below).
However, I always get a hang somewhere towards the end of the run, with the output file missing its last chunk. It looks like a buffering deadlock or a missing flush, but I couldn't figure it out. I am sure I am missing something obvious, so please show me :) Any help or comment would be appreciated. Thank you
#!/usr/bin/env python
from __future__ import print_function
import os
import sys
import multiprocessing as MP
import subprocess as SP

def usage():
    Usage = "Need /path/to/infile"
    log2err(Usage)
    sys.exit(1)

def log2err(*s):
    print(s, file=sys.stderr)

def pipeWrite(proc, outpipe):
    # output only
    global killToken
    while 1:
        s = proc.stdout.read(bufferSize)
        if len(s) == 0:
            break
        #log2err('Write %s: %s' %(os.getpid(), len(s)))
        outpipe.send_bytes(s)
    log2err("Write PID %s: sending kill" % os.getpid())
    outpipe.send_bytes(killToken)
    outpipe.close()

def pipeRead(proc, inpipe):
    # input only
    global killToken
    while 1:
        s = inpipe.recv_bytes(bufferSize)
        #log2err('Read %s: %s' %(os.getpid(), len(s)))
        if s == killToken:
            log2err("Read PID %s: received kill" % os.getpid())
            break
        proc.stdin.write(s)
        proc.stdin.flush()
    # final cleanup
    proc.stdin.flush()
    proc.stdin.close()
    inpipe.close()

def testRead(infile, outpipe):
    stmt = "cat %s" % infile
    stmt += " | tee >( wc -l 1>&2)"
    proc = SP.Popen(stmt, shell=True, stdout=SP.PIPE, executable='/bin/bash', bufsize=bufferSize)
    pipeWrite(proc, outpipe)
    proc.stdout.close()
    outpipe.close()
    log2err('testRead is DONE')

def testGzip(inpipe, outpipe):
    stmt = "gzip -c - | tee >(wc -l 1>&2)"
    proc = SP.Popen(stmt, shell=True, stdout=SP.PIPE, stdin=SP.PIPE, executable='/bin/bash', bufsize=bufferSize)
    PR = MP.Process(target=pipeRead, args=(proc, inpipe))
    PW = MP.Process(target=pipeWrite, args=(proc, outpipe))
    PW.start()
    PR.start()
    PR.join()
    proc.stdin.flush()
    proc.stdin.close()
    proc.stdout.flush()
    proc.stdout.close()
    log2err("testGzip PID:%s with Read:%s and Write:%s" %(os.getpid(), PR.pid, PW.pid))
    PW.join()
    log2err('testGzip is DONE')

def testOutput(infile, inpipe):
    stmt = "tee %s.gz | sha512sum - > %s.gz.sha512" % (infile, infile)
    proc = SP.Popen(stmt, shell=True, stdin=SP.PIPE, executable='/bin/bash', bufsize=bufferSize)
    pipeRead(proc, inpipe)
    inpipe.close()
    log2err('outputFinal is DONE')

if __name__ == "__main__":
    try:
        infile = sys.argv[1]
        if infile in ('-h', '--help'):
            usage()
    except IndexError:
        usage()

    bufferSize = 256*256
    killToken = "I am done with all of these processes"

    # the curl task stream and the curl checksum and output stream
    ingestRecv, ingestSend = MP.Pipe(False)
    ingestProc = MP.Process(target=testRead, args=(infile, ingestSend))
    # encrypt stream
    workRecv, workSend = MP.Pipe(False)
    workProc = MP.Process(target=testGzip, args=(ingestRecv, workSend))
    # output to file and sha checksum stream
    outputProc = MP.Process(target=testOutput, args=(infile, workRecv))

    ingestProc.start()
    log2err("ingestProc PID:%s" % ingestProc.pid)
    workProc.start()
    log2err("workProc PID:%s" % workProc.pid)
    outputProc.start()
    log2err("outputProc PID:%s" % outputProc.pid)

    ingestProc.join()
    log2err("ingestProc: joined")
    workProc.join()
    log2err("workProc: joined")
    outputProc.join()
    log2err("outputProc: joined")

Related

How to print variables after using subprocess.call?

In my program, I use subprocess.call to run Apache Drill automatically.
After that, I make some queries and I would like to print the result.
Before I automated starting Apache Drill, I was doing it manually and could print the results, but now I cannot.
My last try was to write to a file, but the behavior is the same: nothing is written.
My code is below.
import subprocess
from pydrill.client import PyDrill
import sys

writer = open('resultado.txt', 'w')
cmdmsg = subprocess.check_output("cd C:\\Users\\Tito\\Downloads\\apache-drill-1.14.0\\bin & sqlline -u \"jdbc:drill:zk=local\"", shell = True)
writer.write("teste de msg: " + str(cmdmsg))

drill = PyDrill(host='localhost', port=8047)
if drill.is_active:
    result = drill.query('''SELECT * FROM cp.`employee.json` LIMIT 3''')
    result2 = drill.query('''SELECT * FROM dfs.`/Users/Tito/Desktop/banco_gal.csv` LIMIT 5''')
    for tuple in result2:
        writer.write(tuple)
writer.close
I could solve this problem. Three things are important for this topic:
a) we should kill the Java virtual machine and the shell after Apache Drill is turned on.
b) the Windows buffer is very short, so the result was not printed.
c) the Popen method is better than call for this task.
import os
import re
import subprocess
import traceback
from os import path
from pydrill.client import PyDrill

DRILL_HOST = 'localhost'
DRILL_PORT = 8047
JAVA_HOME = ''
JAVA_HOME = JAVA_HOME or ("JAVA_HOME" in os.environ and os.environ["JAVA_HOME"])
JPS_EXE_PATH = path.join(JAVA_HOME, 'bin', 'jps.exe')
KILL_DRILL_JVM = True

def getJvmPID(className):
    pid = None
    print('Running JPS cmd: %s' % JPS_EXE_PATH)
    jpsOutput = subprocess.check_output(JPS_EXE_PATH)
    jpsOutput = str(jpsOutput)
    se = re.search(r'([0-9]*\s*)' + className, jpsOutput)
    if se:
        pid = se.group(1)
    return pid

def killProcessByPID(pid):
    killCmd = ['taskkill', '/f', '/pid', str(pid)]
    print('Running taskkill cmd: %s' % killCmd)
    killCmdOuput = subprocess.check_output(killCmd, stderr=subprocess.STDOUT)
    print(str(killCmdOuput))

def killJvm(className):
    pid = getJvmPID(className)
    killProcessByPID(pid)

drillBinDir = 'C:/Users/Tito/Downloads/apache-drill-1.14.0/bin'
sqlinePath = path.join(drillBinDir, 'sqlline.bat')
drillCmdList = [sqlinePath , '-u', '"jdbc:drill:zk=local"']
drillCmdStr = " ".join(drillCmdList)
drillConAttempts = 2

while drillConAttempts > 0:
    drillConAttempts -= 1
    print("Connecting to drill on %s:%d..." % (DRILL_HOST, DRILL_PORT))
    try:
        drill = PyDrill(host=DRILL_HOST, port=DRILL_PORT)
    except:
        print("Exception when creating object")
        traceback.print_exc()
    print("Checking Drill conection...")
    try:
        if drill.is_active():
            print("Connected.")
            break
        elif drillConAttempts > 0:
            print("Could not connect to Drill. Trying to start Drill...")
            print("Running cmd '%s > %s'" % (drillCmdStr, os.devnull) )
            devNull = open(os.devnull,"w")
            cmdProc = subprocess.Popen(drillCmdStr, cwd=drillBinDir, stdout=devNull, stderr=subprocess.STDOUT, shell=True)
            print("Started CMD process with PID %d" %(cmdProc.pid))
    except:
        print("Exception when checking connection")
        traceback.print_exc()

if drill.is_active():
    result = drill.query('''SELECT * FROM cp.`employee.json` LIMIT 3''')
    for resultTuple in result:
        print(resultTuple)

if KILL_DRILL_JVM:
    print('Killing Drill process...')
    killJvm('SqlLine')

Display Popen.communicate() in real time [duplicate]

I have a python subprocess that I'm trying to read output and error streams from. Currently I have it working, but I'm only able to read from stderr after I've finished reading from stdout. Here's what it looks like:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout_iterator = iter(process.stdout.readline, b"")
stderr_iterator = iter(process.stderr.readline, b"")
for line in stdout_iterator:
    # Do stuff with line
    print line

for line in stderr_iterator:
    # Do stuff with line
    print line
As you can see, the stderr for loop can't start until the stdout loop completes. How can I modify this to be able to read from both in the correct order the lines come in?
To clarify: I still need to be able to tell whether a line came from stdout or stderr because they will be treated differently in my code.
The code in your question may deadlock if the child process produces enough output on stderr (~100KB on my Linux machine).
There is a communicate() method that lets you read from both stdout and stderr separately:
from subprocess import Popen, PIPE
process = Popen(command, stdout=PIPE, stderr=PIPE)
output, err = process.communicate()
If you need to read the streams while the child process is still running then the portable solution is to use threads (not tested):
from subprocess import Popen, PIPE
from threading import Thread
from Queue import Queue # Python 2
def reader(pipe, queue):
    try:
        with pipe:
            for line in iter(pipe.readline, b''):
                queue.put((pipe, line))
    finally:
        queue.put(None)

process = Popen(command, stdout=PIPE, stderr=PIPE, bufsize=1)
q = Queue()
Thread(target=reader, args=[process.stdout, q]).start()
Thread(target=reader, args=[process.stderr, q]).start()
for _ in range(2):
    for source, line in iter(q.get, None):
        print "%s: %s" % (source, line),
See:
Python: read streaming input from subprocess.communicate()
Non-blocking read on a subprocess.PIPE in python
Python subprocess get children's output to file and terminal?
Here's a solution based on selectors, but one that preserves order, and streams variable-length characters (even single chars).
The trick is to use read1(), instead of read().
import selectors
import subprocess
import sys
p = subprocess.Popen(
    ["python", "random_out.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)

sel = selectors.DefaultSelector()
sel.register(p.stdout, selectors.EVENT_READ)
sel.register(p.stderr, selectors.EVENT_READ)

while True:
    for key, _ in sel.select():
        data = key.fileobj.read1().decode()
        if not data:
            exit()
        if key.fileobj is p.stdout:
            print(data, end="")
        else:
            print(data, end="", file=sys.stderr)
If you want a test program, use this.
import sys
from time import sleep
for i in range(10):
    print(f" x{i} ", file=sys.stderr, end="")
    sleep(0.1)
    print(f" y{i} ", end="")
    sleep(0.1)
The order in which a process writes data to different pipes is lost once the data has been written.
There is no way to tell whether stdout was written before stderr.
You can try to read data from multiple file descriptors in a non-blocking way
as soon as data is available, but this only minimizes the probability that the order is incorrect.
This program should demonstrate this:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import select
import subprocess

testapps={
    'slow': '''
import os
import time
os.write(1, 'aaa')
time.sleep(0.01)
os.write(2, 'bbb')
time.sleep(0.01)
os.write(1, 'ccc')
''',
    'fast': '''
import os
os.write(1, 'aaa')
os.write(2, 'bbb')
os.write(1, 'ccc')
''',
    'fast2': '''
import os
os.write(1, 'aaa')
os.write(2, 'bbbbbbbbbbbbbbb')
os.write(1, 'ccc')
'''
}

def readfds(fds, maxread):
    while True:
        fdsin, _, _ = select.select(fds,[],[])
        for fd in fdsin:
            s = os.read(fd, maxread)
            if len(s) == 0:
                fds.remove(fd)
                continue
            yield fd, s
        if fds == []:
            break

def readfromapp(app, rounds=10, maxread=1024):
    f=open('testapp.py', 'w')
    f.write(testapps[app])
    f.close()

    results={}
    for i in range(0, rounds):
        p = subprocess.Popen(['python', 'testapp.py'], stdout=subprocess.PIPE
                                                     , stderr=subprocess.PIPE)
        data=''
        for (fd, s) in readfds([p.stdout.fileno(), p.stderr.fileno()], maxread):
            data = data + s
        results[data] = results[data] + 1 if data in results else 1

    print 'running %i rounds %s with maxread=%i' % (rounds, app, maxread)
    results = sorted(results.items(), key=lambda (k,v): k, reverse=False)
    for data, count in results:
        print '%03i x %s' % (count, data)

print
print "=> if output is produced slowly this should work as wished"
print "   and should return: aaabbbccc"
readfromapp('slow', rounds=100, maxread=1024)

print
print "=> now mostly aaacccbbb is returned, not as it should be"
readfromapp('fast', rounds=100, maxread=1024)

print
print "=> you could try to read data one by one, and return"
print "   e.g. a whole line only when LF is read"
print "   (b's should be finished before c's)"
readfromapp('fast', rounds=100, maxread=1)

print
print "=> but even this won't work ..."
readfromapp('fast2', rounds=100, maxread=1)
and outputs something like this:
=> if output is produced slowly this should work as wished
   and should return: aaabbbccc
running 100 rounds slow with maxread=1024
100 x aaabbbccc

=> now mostly aaacccbbb is returned, not as it should be
running 100 rounds fast with maxread=1024
006 x aaabbbccc
094 x aaacccbbb

=> you could try to read data one by one, and return
   e.g. a whole line only when LF is read
   (b's should be finished before c's)
running 100 rounds fast with maxread=1
003 x aaabbbccc
003 x aababcbcc
094 x abababccc

=> but even this won't work ...
running 100 rounds fast2 with maxread=1
003 x aaabbbbbbbbbbbbbbbccc
001 x aaacbcbcbbbbbbbbbbbbb
008 x aababcbcbcbbbbbbbbbbb
088 x abababcbcbcbbbbbbbbbb
This works for Python 3 (3.6):
import selectors
import subprocess
import sys

p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE, universal_newlines=True)

# Read both stdout and stderr simultaneously
sel = selectors.DefaultSelector()
sel.register(p.stdout, selectors.EVENT_READ)
sel.register(p.stderr, selectors.EVENT_READ)
ok = True
while ok:
    for key, val1 in sel.select():
        line = key.fileobj.readline()
        if not line:
            ok = False
            break
        if key.fileobj is p.stdout:
            print(f"STDOUT: {line}", end="")
        else:
            print(f"STDERR: {line}", end="", file=sys.stderr)
From https://docs.python.org/3/library/subprocess.html#using-the-subprocess-module:
If you wish to capture and combine both streams into one, use
stdout=PIPE and stderr=STDOUT instead of capture_output.
So the easiest solution would be:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout_iterator = iter(process.stdout.readline, b"")

for line in stdout_iterator:
    # Do stuff with line
    print line
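For reference, a rough Python 3 equivalent of the same merged-stream idea (the command variable is assumed to be defined, as in the snippets above):

import subprocess

# Merge stderr into stdout so a single loop sees both streams.
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

for line in iter(process.stdout.readline, b""):
    # Do stuff with line (bytes); decode if you need text
    print(line.decode(), end="")

process.stdout.close()
process.wait()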
I know this question is very old, but this answer may help others who stumble upon this page in researching a solution for a similar situation, so I'm posting it anyway.
I've built a simple python snippet that will merge any number of pipes into a single one. Of course, as stated above, the order cannot be guaranteed, but this is as close as I think you can get in Python.
It spawns a thread for each of the pipes, reads them line by line and puts them into a Queue (which is FIFO). The main thread loops through the queue, yielding each line.
import threading, queue

def merge_pipes(**named_pipes):
    r'''
    Merges multiple pipes from subprocess.Popen (maybe other sources as well).
    The keyword argument keys will be used in the output to identify the source
    of the line.

    Example:
    p = subprocess.Popen(['some', 'call'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    outputs = {'out': log.info, 'err': log.warn}
    for name, line in merge_pipes(out=p.stdout, err=p.stderr):
        outputs[name](line)

    This will output stdout to the info logger, and stderr to the warning logger
    '''

    # Constants. Could also be placed outside of the method. I just put them here
    # so the method is fully self-contained
    PIPE_OPENED=1
    PIPE_OUTPUT=2
    PIPE_CLOSED=3

    # Create a queue where the pipes will be read into
    output = queue.Queue()

    # This method is the run body for the threads that are instantiated below
    # This could be easily rewritten to be outside of the merge_pipes method,
    # but to make it fully self-contained I put it here
    def pipe_reader(name, pipe):
        r"""
        reads a single pipe into the queue
        """
        output.put( ( PIPE_OPENED, name, ) )
        try:
            for line in iter(pipe.readline,''):
                output.put( ( PIPE_OUTPUT, name, line.rstrip(), ) )
        finally:
            output.put( ( PIPE_CLOSED, name, ) )

    # Start a reader for each pipe
    for name, pipe in named_pipes.items():
        t=threading.Thread(target=pipe_reader, args=(name, pipe, ))
        t.daemon = True
        t.start()

    # Use a counter to determine how many pipes are left open.
    # If all are closed, we can return
    pipe_count = 0

    # Read the queue in order, blocking if there's no data
    for data in iter(output.get,''):
        code=data[0]
        if code == PIPE_OPENED:
            pipe_count += 1
        elif code == PIPE_CLOSED:
            pipe_count -= 1
        elif code == PIPE_OUTPUT:
            yield data[1:]
        if pipe_count == 0:
            return
This works for me (on windows):
https://github.com/waszil/subpiper
from subpiper import subpiper

def my_stdout_callback(line: str):
    print(f'STDOUT: {line}')

def my_stderr_callback(line: str):
    print(f'STDERR: {line}')

my_additional_path_list = [r'c:\important_location']

retcode = subpiper(cmd='echo magic',
                   stdout_callback=my_stdout_callback,
                   stderr_callback=my_stderr_callback,
                   add_path_list=my_additional_path_list)

Redirecting stdout with subprocess in python is very slow

When I use subprocess in Python to redirect stdout, I get very slow throughput. Am I doing it wrong?
Basically, I pipe the standard output of an external program into a queue. Then, in another function, I print it to the console.
Here is sample code that uses hexdump to generate random output:
from subprocess import Popen, PIPE
from queue import Queue
import sys
from threading import Thread, Event
import threading
class Buffer(Queue):
    def __init__(self, *args, **kwargs):
        Queue.__init__(self, *args, **kwargs)

    def write(self, line):
        self.put_nowait(line)
        self.join()

    def read(self):
        element = self.get_nowait()
        self.task_done()
        return element

def write_output(buffer, stopped):
    hexdump = Popen(['hexdump', '-C', '/dev/urandom'], stdout=PIPE)
    while hexdump.returncode is None:
        for line in hexdump.stdout.readlines(8192):
            buffer.write(line)
        if stopped.is_set():
            hexdump.terminate()
            hexdump.wait()
            print('process terminated.')
            break

def read_output(buffer, stopped):
    while not stopped.is_set():
        while not buffer.empty():
            output = buffer.read()
            print('********* output: {}'.format(output))
            sys.stdout.flush()
    print('stopped')
    sys.stdout.flush()

buffer = Buffer()
stopped = Event()

generate_random_output = Thread(target=write_output, args=(buffer, stopped))
generate_random_output.name = 'generate_random_output'
generate_random_output.start()

process_output = Thread(target=read_output, args=(buffer, stopped))
process_output.name = 'process_output'
process_output.start()

try:
    while True:
        continue
except KeyboardInterrupt:
    stopped.set()
    generate_random_output.join()
    process_output.join()
    print('finished generating')
    print('finished processing')
I would appreciate any help.
Instead of redirecting your output to a Queue, process it directly:
def write_output(buffer, stopped):
    hexdump = Popen(['hexdump', '-C', '/dev/urandom'], stdout=PIPE)
    while hexdump.poll() is None:
        while not stopped.is_set():
            for line in iter(hexdump.stdout.readline, b''):
                print('********* output: %s' % line.decode(), end='')
                sys.stdout.flush()
        hexdump.terminate()
        hexdump.wait()
        print('process terminated.')
        break
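A likely reason the original version is so slow, for what it's worth: Buffer.write() calls self.join() after every put_nowait(), which blocks the producer until the reader has called task_done() for every queued line, so the two threads run in lockstep. A minimal sketch of the same Buffer without that handshake (assuming the rest of the question's code stays as is):

from queue import Queue

class Buffer(Queue):
    # Queue-backed buffer that does not block the producer on every line.
    def write(self, line):
        # Just enqueue; don't wait for the consumer to catch up.
        self.put_nowait(line)

    def read(self):
        # Raises queue.Empty if nothing is available.
        return self.get_nowait()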

Catch & Ignore Signal

I'm trying to write a program that will run a program on a remote machine using ssh and pass it SIGINT signals without killing my ssh connection. If I was running on a terminal, then this would be easy with something like
ssh -t -t host "command to run"
I tried this with the following script:
#!/bin/bash
ssh -t -t host "program $#"
and when I run that from the terminal it works fine, but when Proof General runs this script and sends it SIGINT, it just ends up killing the program (I guess it can't allocate a terminal?).
I've got the following mess of code:
CTRL_C = "<CTRL-C>"
def endpoint(proc, handler):
signal.signal(2, signal.SIG_IGN)
inp = ""
out = ""
err = ""
inp_from_fd = sys.stdin.fileno()
inp_to_fd = proc.stdin.fileno()
out_from_fd = proc.stdout.fileno()
out_to_fd = sys.stdout.fileno()
err_from_fd = proc.stderr.fileno()
err_to_fd = sys.stderr.fileno()
while True:
try:
ins,outs,_ = select.select([inp_from_fd, out_from_fd, err_from_fd],
[inp_to_fd, out_to_fd, err_to_fd],
[])
for fd in ins:
if fd == inp_from_fd:
k = os.read(fd, 1024)
if k == "":
os.close(proc.stdin.fileno())
return
inp = inp + k
elif fd == out_from_fd:
k = os.read(fd, 1024)
out = out + k
elif fd == err_from_fd:
k = os.read(fd, 1024)
err = err + k
else:
assert False
for fd in outs:
if fd == inp_to_fd:
while CTRL_C in inp:
proc.send_signal(2)
p = inp.find(CTRL_C)
inp = inp[0:p] + inp[p+len(CTRL_C):]
k = os.write(fd, inp)
inp = inp[k:]
elif fd == out_to_fd:
k = os.write(fd, out)
out = out[k:]
elif fd == err_to_fd:
k = os.write(fd, err)
err = err[k:]
else:
assert False
except select.error:
pass
except KeyboardInterrupt:
handler()
except IOError, e:
pass
def usage(args):
print "must specify --client or --server"
if __name__ == '__main__':
if len(sys.argv) == 1:
usage(sys.argv)
elif sys.argv[1] == '--server':
proc = subprocess.Popen(sys.argv[2:],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def INT():
proc.stdin.write(CTRL_C)
proc.stdin.flush()
endpoint(proc, INT)
elif '--client' in sys.argv:
proc = subprocess.Popen(sys.argv[2:],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
import time
time.sleep(1)
def INT():
pass
endpoint(proc, INT)
else:
usage(sys.argv)
which I'm invoking using something like:
remote.py --client ssh -t -t host "remote.py --server <program-to-run> <args>"
Is there something that I'm doing wrong here to handle the signal? I've tried putting a print in the signal 2 handler and it does print, but it is also killing ssh (I'm getting "Killed by signal 2." printed on the console). Is Python forwarding the signal to its children? Is there a way to get around this? Is there an easier way to do this?
Thanks for any pointers.
Use os.setpgrp() (see the setpgrp man page); otherwise the signal is propagated to the children.
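A minimal sketch of how that can be wired into the Popen call (the command below is only a placeholder):

import os
import signal
import subprocess

# Put the child in its own process group so a SIGINT delivered to this script
# (e.g. Ctrl-C sent to the foreground process group) is not automatically
# delivered to the child as well.
proc = subprocess.Popen(
    ['ssh', '-t', '-t', 'host', 'program'],   # placeholder command
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    preexec_fn=os.setpgrp,                    # POSIX only; start_new_session=True is similar
)

# The parent can still forward SIGINT explicitly when it wants to:
# proc.send_signal(signal.SIGINT)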

python subprocess with timeout and large output (>64K)

I want to execute a process, limit the execution time with a timeout in seconds, and grab the output produced by the process. And I want to do this on Windows, Linux and FreeBSD.
I have tried implementing this in three different ways:
cmd - Without a timeout, and subprocess.PIPE for output capture.
BEHAVIOUR: Operates as expected, but does not support a timeout, and I need a timeout...
cmd_to - With a timeout, and subprocess.PIPE for output capture.
BEHAVIOUR: Blocks subprocess execution when output >= 2^16 bytes.
cmd_totf - With a timeout, and tempfile.NamedTemporaryFile for output capture.
BEHAVIOUR: Operates as expected but uses temporary files on disk.
These are available below for closer inspection.
As can be seen in the output below, the timeout code blocks the execution of the subprocess when using subprocess.PIPE and the output from the subprocess is >= 2^16 bytes.
The subprocess documentation states that this is expected when calling process.wait() and using subprocess.PIPE; however, no warnings are given when using process.poll(), so what is going wrong here?
I have a solution in cmd_totf which uses the tempfile module, but the tradeoff is that it writes the output to disk, something I would REALLY like to avoid.
So my questions are:
What am I doing wrong in cmd_to?
Is there a way to do what I want without using tempfiles, i.e. while keeping the output in memory?
Script to generate a bunch of output ('exp_gen.py'):
#!/usr/bin/env python
import sys
output = "b"*int(sys.argv[1])
print output
Three different implementations (cmd, cmd_to, cmd_totf) of wrappers around subprocess.Popen:
#!/usr/bin/env python
import subprocess, time, tempfile

bufsize = -1

def cmd(cmdline, timeout=60):
    """
    Execute cmdline.
    Uses subprocessing and subprocess.PIPE.
    """
    p = subprocess.Popen(
        cmdline,
        bufsize = bufsize,
        shell = False,
        stdin = subprocess.PIPE,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE
    )

    out, err = p.communicate()
    returncode = p.returncode

    return (returncode, err, out)

def cmd_to(cmdline, timeout=60):
    """
    Execute cmdline, limit execution time to 'timeout' seconds.
    Uses subprocessing and subprocess.PIPE.
    """
    p = subprocess.Popen(
        cmdline,
        bufsize = bufsize,
        shell = False,
        stdin = subprocess.PIPE,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE
    )

    t_begin = time.time()           # Monitor execution time
    seconds_passed = 0

    while p.poll() is None and seconds_passed < timeout:
        seconds_passed = time.time() - t_begin
        time.sleep(0.1)

    #if seconds_passed > timeout:
    #
    #    try:
    #        p.stdout.close()   # If they are not closed the fds will hang around until
    #        p.stderr.close()   # os.fdlimit is exceeded and cause a nasty exception
    #        p.terminate()      # Important to close the fds prior to terminating the process!
    #                           # NOTE: Are there any other "non-freed" resources?
    #    except:
    #        pass
    #
    #    raise TimeoutInterrupt

    out, err = p.communicate()
    returncode = p.returncode

    return (returncode, err, out)

def cmd_totf(cmdline, timeout=60):
    """
    Execute cmdline, limit execution time to 'timeout' seconds.
    Uses subprocessing and tempfile instead of subprocessing.PIPE.
    """
    output = tempfile.NamedTemporaryFile(delete=False)
    error = tempfile.NamedTemporaryFile(delete=False)
    p = subprocess.Popen(
        cmdline,
        bufsize = 0,
        shell = False,
        stdin = None,
        stdout = output,
        stderr = error
    )

    t_begin = time.time()           # Monitor execution time
    seconds_passed = 0

    while p.poll() is None and seconds_passed < timeout:
        seconds_passed = time.time() - t_begin
        time.sleep(0.1)

    #if seconds_passed > timeout:
    #
    #    try:
    #        p.stdout.close()   # If they are not closed the fds will hang around until
    #        p.stderr.close()   # os.fdlimit is exceeded and cause a nasty exception
    #        p.terminate()      # Important to close the fds prior to terminating the process!
    #                           # NOTE: Are there any other "non-freed" resources?
    #    except:
    #        pass
    #
    #    raise TimeoutInterrupt

    p.wait()
    returncode = p.returncode

    fd = open(output.name)
    out = fd.read()
    fd.close()

    fd = open(error.name)
    err = fd.read()
    fd.close()

    error.close()
    output.close()

    return (returncode, err, out)

if __name__ == "__main__":

    implementations = [cmd, cmd_to, cmd_totf]
    bytes = ['65535', '65536', str(1024*1024)]
    timeouts = [5]

    for timeout in timeouts:
        for size in bytes:
            for i in implementations:
                t_begin = time.time()
                seconds_passed = 0

                rc, err, output = i(['exp_gen.py', size], timeout)

                seconds_passed = time.time() - t_begin

                filler = ' '*(8-len(i.func_name))
                print "[%s%s: timeout=%d, iosize=%s, seconds=%f]" % (repr(i.func_name), filler, timeout, size, seconds_passed)
Output from execution:
['cmd' : timeout=5, iosize=65535, seconds=0.016447]
['cmd_to' : timeout=5, iosize=65535, seconds=0.103022]
['cmd_totf': timeout=5, iosize=65535, seconds=0.107176]
['cmd' : timeout=5, iosize=65536, seconds=0.028105]
['cmd_to' : timeout=5, iosize=65536, seconds=5.116658]
['cmd_totf': timeout=5, iosize=65536, seconds=0.104905]
['cmd' : timeout=5, iosize=1048576, seconds=0.025964]
['cmd_to' : timeout=5, iosize=1048576, seconds=5.128062]
['cmd_totf': timeout=5, iosize=1048576, seconds=0.103183]
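(Note: on Python 3.3 and later the standard library covers this case directly: Popen.communicate() accepts a timeout argument and keeps the output in memory, so no temp files are needed. A minimal sketch along the lines of the wrappers above, with a placeholder command in the comment:)

import subprocess

def run_with_timeout(cmdline, timeout=60):
    # Run cmdline, capture stdout/stderr in memory, enforce a timeout.
    p = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        out, err = p.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        p.kill()                     # make sure the child is gone
        out, err = p.communicate()   # drain whatever was already produced
        raise
    return (p.returncode, err, out)

# e.g. rc, err, out = run_with_timeout(['python', 'exp_gen.py', '1048576'], timeout=5)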
Despite all the warnings in the subprocess documentation, directly reading from process.stdout and process.stderr has provided a better solution.
By better I mean that I can read output from a process that exceeds 2^16 bytes without having to temporarily store the output on disk.
The code follows:
import fcntl
import os
import subprocess
import time

def nonBlockRead(output):
    fd = output.fileno()
    fl = fcntl.fcntl(fd, fcntl.F_GETFL)
    fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
    try:
        return output.read()
    except:
        return ''

def cmd(cmdline, timeout=60):
    """
    Execute cmdline, limit execution time to 'timeout' seconds.
    Uses the subprocess module and subprocess.PIPE.
    Raises TimeoutInterrupt
    """
    p = subprocess.Popen(
        cmdline,
        bufsize = bufsize,  # default value of 0 (unbuffered) is best
        shell = False,      # not really needed; it's disabled by default
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE
    )

    t_begin = time.time()  # Monitor execution time
    seconds_passed = 0

    stdout = ''
    stderr = ''

    while p.poll() is None and seconds_passed < timeout:  # Monitor process
        time.sleep(0.1)  # Wait a little
        seconds_passed = time.time() - t_begin

        # p.std* blocks on read(), which messes up the timeout timer.
        # To fix this, we use a nonblocking read()
        # Note: Not sure if this is Windows compatible
        stdout += nonBlockRead(p.stdout)
        stderr += nonBlockRead(p.stderr)

    if seconds_passed >= timeout:
        try:
            p.stdout.close()  # If they are not closed the fds will hang around until
            p.stderr.close()  # os.fdlimit is exceeded and cause a nasty exception
            p.terminate()     # Important to close the fds prior to terminating the process!
                              # NOTE: Are there any other "non-freed" resources?
        except:
            pass

        raise TimeoutInterrupt

    returncode = p.returncode

    return (returncode, stdout, stderr)
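A minimal usage sketch for the above, assuming bufsize and a TimeoutInterrupt exception are defined elsewhere in the module (they are referenced but not shown here):

bufsize = 0  # unbuffered; referenced as a module-level name by cmd()

class TimeoutInterrupt(Exception):
    # Raised when the child exceeds the allowed runtime.
    pass

try:
    returncode, stdout, stderr = cmd(['python', 'exp_gen.py', '1048576'], timeout=5)
    print('rc=%s, %d bytes of stdout' % (returncode, len(stdout)))
except TimeoutInterrupt:
    print('child timed out')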
Disclaimer: This answer is not tested on Windows or FreeBSD, but the modules used should work on those systems. I believe this should be a working answer to your question - it works for me.
Here's code I just hacked up to solve the problem on Linux. It is a combination of several Stack Overflow threads and my own research in the Python 3 documentation.
Main characteristics of this code:
Uses processes, not threads, for blocking I/O because they can be terminated more reliably via p.terminate()
Implements a retriggerable timeout watchdog that restarts counting whenever some output happens
Implements a long-term timeout watchdog to limit overall runtime
Can feed in stdin (although I only need to feed in one-time short strings)
Can capture stdout/stderr in the usual Popen way (only stdout is coded here, and stderr is redirected to stdout, but they can easily be separated)
It's almost real-time because it only checks every 0.2 seconds for output. But you could decrease this or remove the waiting interval easily
Lots of debugging printouts are still enabled to see what's happening and when.
The only code dependency is enum as implemented here, but the code could easily be changed to work without. It's only used to distinguish the two timeouts - use separate exceptions if you like.
Here's the code - as usual - feedback is highly appreciated:
(Edit 29-Jun-2012 - the code is now actually working)
# Python module runcmd
# Implements a class to launch shell commands which
# are killed after a timeout. Timeouts can be reset
# after each line of output
#
# Use inside other script with:
#
# import runcmd
# (return_code, out) = runcmd.RunCmd(['ls', '-l', '/etc'],
#                                    timeout_runtime,
#                                    timeout_no_output,
#                                    stdin_string).go()
#

import multiprocessing
import queue
import subprocess
import time
import enum

def timestamp():
    return time.strftime('%Y%m%d-%H%M%S')

class ErrorRunCmd(Exception): pass
class ErrorRunCmdTimeOut(ErrorRunCmd): pass

class Enqueue_output(multiprocessing.Process):
    def __init__(self, out, queue):
        multiprocessing.Process.__init__(self)
        self.out = out
        self.queue = queue
        self.daemon = True

    def run(self):
        try:
            for line in iter(self.out.readline, b''):
                #print('worker read:', line)
                self.queue.put(line)
        except ValueError: pass  # Readline of closed file
        self.out.close()

class Enqueue_input(multiprocessing.Process):
    def __init__(self, inp, iterable):
        multiprocessing.Process.__init__(self)
        self.inp = inp
        self.iterable = iterable
        self.daemon = True

    def run(self):
        #print("writing stdin")
        for line in self.iterable:
            self.inp.write(bytes(line,'utf-8'))
        self.inp.close()
        #print("writing stdin DONE")

class RunCmd():
    """RunCmd - class to launch shell commands

    Captures and returns stdout. Kills child after a given
    amount (timeout_runtime) wallclock seconds. Can also
    kill after timeout_retriggerable wallclock seconds.
    This second timer is reset whenever the child does some
    output

    (return_code, out) = RunCmd(['ls', '-l', '/etc'],
                                timeout_runtime,
                                timeout_no_output,
                                stdin_string).go()
    """
    Timeout = enum.Enum('No','Retriggerable','Runtime')

    def __init__(self, cmd, timeout_runtime, timeout_retriggerable, stdin=None):
        self.dbg = False
        self.cmd = cmd
        self.timeout_retriggerable = timeout_retriggerable
        self.timeout_runtime = timeout_runtime
        self.timeout_hit = self.Timeout.No
        self.stdout = '--Cmd did not yield any output--'
        self.stdin = stdin

    def read_queue(self, q):
        time_last_output = None
        try:
            bstr = q.get(False)  # non-blocking
            if self.dbg: print('{} chars read'.format(len(bstr)))
            time_last_output = time.time()
            self.stdout += bstr
        except queue.Empty:
            #print('queue empty')
            pass
        return time_last_output

    def go(self):
        if self.stdin:
            pstdin = subprocess.PIPE
        else:
            pstdin = None
        p = subprocess.Popen(self.cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=pstdin)

        pin = None
        if (pstdin):
            pin = Enqueue_input(p.stdin, [self.stdin + '\n'])
            pin.start()

        q = multiprocessing.Queue()
        pout = Enqueue_output(p.stdout, q)
        pout.start()

        try:
            if self.dbg: print('Beginning subprocess with timeout {}/{} s on {}'.format(self.timeout_retriggerable, self.timeout_runtime, time.asctime()))
            time_begin = time.time()
            time_last_output = time_begin
            seconds_passed = 0
            self.stdout = b''
            once = True  # ensure loop's executed at least once
                         # some child cmds may exit very fast, but still produce output
            while once or p.poll() is None or not q.empty():
                once = False
                if self.dbg: print('a) {} of {}/{} secs passed and overall {} chars read'.format(seconds_passed, self.timeout_retriggerable, self.timeout_runtime, len(self.stdout)))
                tlo = self.read_queue(q)
                if tlo:
                    time_last_output = tlo
                now = time.time()
                if now - time_last_output >= self.timeout_retriggerable:
                    self.timeout_hit = self.Timeout.Retriggerable
                    raise ErrorRunCmdTimeOut(self)
                if now - time_begin >= self.timeout_runtime:
                    self.timeout_hit = self.Timeout.Runtime
                    raise ErrorRunCmdTimeOut(self)
                if q.empty():
                    time.sleep(0.1)
            # Final try to get "last-millisecond" output
            self.read_queue(q)
        finally:
            self._close(p, [pout, pin])
        return (self.returncode, self.stdout)

    def _close(self, p, procs):
        if self.dbg:
            if self.timeout_hit != self.Timeout.No:
                print('{} A TIMEOUT occured: {}'.format(timestamp(), self.timeout_hit))
            else:
                print('{} No timeout occured'.format(timestamp()))
        for process in [proc for proc in procs if proc]:
            try:
                process.terminate()
            except:
                print('{} Process termination raised trouble'.format(timestamp()))
                raise
        try:
            p.stdin.close()
        except: pass
        if self.dbg: print('{} _closed stdin'.format(timestamp()))
        try:
            p.stdout.close()  # If they are not closed the fds will hang around until
        except: pass
        if self.dbg: print('{} _closed stdout'.format(timestamp()))
        #p.stderr.close()  # os.fdlimit is exceeded and cause a nasty exception
        try:
            p.terminate()  # Important to close the fds prior to terminating the process!
                           # NOTE: Are there any other "non-freed" resources?
        except: pass
        if self.dbg: print('{} _closed Popen'.format(timestamp()))
        try:
            self.stdout = self.stdout.decode('utf-8')
        except: pass
        self.returncode = p.returncode
        if self.dbg: print('{} _closed all'.format(timestamp()))
Use with:
import runcmd

cmd = ['ls', '-l', '/etc']

worker = runcmd.RunCmd(cmd,
                       40,  # limit runtime [wallclock seconds]
                       2,   # limit runtime after last output [wallclk secs]
                       ''   # stdin input string
                       )
(return_code, out) = worker.go()

if worker.timeout_hit != worker.Timeout.No:
    print('A TIMEOUT occured: {}'.format(worker.timeout_hit))
else:
    print('No timeout occured')

print("Running '{:s}' returned {:d} and {:d} chars of output".format(cmd, return_code, len(out)))
print('Output:')
print(out)
command - the first argument - should be a list of a command and its arguments. It is used for the Popen(shell=False) call, and the timeouts are in seconds. There's currently no code to disable the timeouts; set timeout_no_output to timeout_runtime to effectively disable the retriggerable timeout_no_output.
stdin_string can be any string which is to be sent to the command's standard input. Set it to None if your command does not need any input. If a string is provided, a final '\n' is appended.
