Does Popen().stdout.close() return a value? - python

I have in the code I maintain:
# Build the 7z "add" command line, start it and keep only its stdout pipe.
app7z = dirs['mopy'].join('7z.exe').s  # path to 7z.exe
command = '"%s" a "%s" -y -r "%s\\*"' % (app7z, dstFile.temp.s, srcDir.s)
ins = Popen(command, stdout=PIPE, startupinfo=startupinfo).stdout
#--Error checking and progress feedback
regMatch = re.compile('Compressing\s+(.+)').match
regErrMatch = re.compile('Error: (.*)').match
errorLine = []
for line in ins:
    maCompressing = regMatch(line)
    # Once the first error line is seen, every later line is collected too.
    if errorLine or regErrMatch(line):
        errorLine.append(line)
    if maCompressing:
        pass  # update progress
result = ins.close()  # THIS
if result:
    dstFile.temp.remove()
    failure = _("%s: Compression failed:\n%s") % (dstFile.s,
                                                  "\n".join(errorLine))
    raise StateError(failure)
(full code)
Does ins.close() return a non-None value on failure? My IDE (PyCharm 3.4.2/4.5.2) warns me that it does not, but not consistently.
I am on Windows, if this makes a difference - Python 2.7.8

What do you think close can return?
You probably want to use wait() on the Popen object to get the exit code:
# Run 7z from an argument list (no manual quoting) and keep the Popen object,
# because the exit status comes from process.wait(), not from closing stdout.
app7z = dirs['mopy'].join('7z.exe').s  # path to 7z.exe
command = [app7z, 'a', dstFile.temp.s, "-y", "-r", os.path.join(srcDir.s, '*')]
process = Popen(command, stdout=PIPE, startupinfo=startupinfo)
out = process.stdout
# Raw strings keep \s a regex escape instead of a (deprecated) string escape.
regMatch = re.compile(r'Compressing\s+(.+)').match
regErrMatch = re.compile(r'Error: (.*)').match
errorLine = []
for line in out:
    maCompressing = regMatch(line)
    # Once the first error line is seen, every later line is collected too.
    if errorLine or regErrMatch(line):
        errorLine.append(line)
    if maCompressing:
        pass  # update progress
# stdout has been read to the end, so wait() returns the exit status promptly.
result = process.wait()
if result:
    dstFile.temp.remove()
    raise StateError(_("%s: Compression failed:\n%s") % (dstFile.s,
                     "\n".join(errorLine)))

Related

strange output when using flags in python

I'm currently writing a script in python that takes a number of flags. This is my first attempt at such a program, and I am getting an output from the bash script that I don't quite understand. For example when I run the script in the bash shell:
$ python my_script.py -f <input_file.txt> -k <test_string> -c <user_input>
I get this output before my script's output:
usage: rm [-f | -i] [-dPRrvW] file ...
unlink file
I can't seem to get rid of this, which is frustrating for the prettiness of the output. Any help would be great!
The code I'm using:
import sys, getopt, re, subprocess, collections, itertools
def find_kmers( arguments=sys.argv[1:] ):
    """Write the k-mer hits on both strands of one chromosome to answer.gff3.

    ``arguments`` is a getopt-style list with -f <fasta file>, -k <k-mer> and
    -c <chromosome name>.  Returns a message string for the caller to print.
    """
    opts, args = getopt.getopt(arguments,'f:k:c:')
    opt_dic = dict(opts)
    for required in ['-f','-c','-k']:
        if required not in opt_dic:
            return "incorrect arguments, please format as: python_script.py -f <filename> -k <kmer> -c <chromosome_name>"

    def rev_comp(sequence):
        # Complement every base of the reversed sequence.
        complement = {'A':'T','T':'A','C':'G','G':'C'}
        return ''.join(complement[base] for base in reversed(sequence))

    kmer = opt_dic['-k']
    # Collect the FASTA header lines through a temporary file made by grep.
    subprocess.call(['bash','-c',"grep '>' S288C_R64.fasta > grep.tmp"])
    chromosomes = [header[1:].strip() for header in open('grep.tmp')]
    # Here 'rm' alone is the -c command string; 'grep.tmp' is a separate argument.
    subprocess.call(['bash','-c','rm','grep.tmp'])
    found = any(opt_dic['-c']==name for name in chromosomes)

    def get_sequence(file):
        # Concatenate the lines up to (not including) the next '>' header.
        pieces = []
        for line in file:
            if line.startswith('>'): break
            pieces.append(line.strip())
        return ''.join(pieces).upper()

    ofile = open(opt_dic['-f'])
    if not found:
        return 'chromosome not found in %s. \n chromosomes in file are:%s'%(opt_dic['-f'],', '.join(str(_) for _ in chromosomes))
    for line in ofile:
        if line.startswith('>') and line[1:].strip() == opt_dic['-c']:
            sequence = get_sequence(ofile)
            break
    kmer_matches1 = re.finditer('(?=%s)'%opt_dic['-k'],sequence)
    kmer_matches2 = re.finditer('(?=%s)'%opt_dic['-k'],rev_comp(sequence))

    def print_statement(start,strand):
        return '%s\thw1_script\tkmer=%s\t%s\t%s\t.\t%s\t.\tID=S288C;Name=S288C\n'%(opt_dic['-c'],opt_dic['-k'],start,start+len(opt_dic['-k'])-1,strand)

    pos_strand = collections.deque()
    neg_strand = collections.deque()
    for fwd_hit,rev_hit in itertools.izip(kmer_matches1,kmer_matches2):
        pos_strand.append(fwd_hit.start()+1)
        neg_strand.append(rev_hit.start()+1)
    wfile = open('answer.gff3','w')
    # Merge the two position queues, smallest start first.
    while pos_strand and neg_strand:
        if pos_strand[0]<neg_strand[0]:
            wfile.write(print_statement(pos_strand.popleft(),'+'))
        else:
            wfile.write(print_statement(neg_strand.popleft(),'-'))
    while pos_strand:
        wfile.write(print_statement(pos_strand.popleft(),'+'))
    while neg_strand:
        wfile.write(print_statement(neg_strand.popleft(),'-'))
    wfile.close()
    gc_total = sequence.count("G") + sequence.count("C")
    return 'percent-GC = %s'%str(gc_total/float(len(sequence)))


if __name__ == '__main__':
    print(find_kmers())
Invoking bash one-liners requires that the bash commands be a single string. Change:
subprocess.call(['bash','-c','rm','grep.tmp'])
to:
subprocess.call(['bash', '-c', 'rm grep.tmp'])
Or, more reasonably, don't use subprocesses for this, just do:
os.unlink('grep.tmp') # Or os.remove; same thing, different names
which is much faster and less error prone.
In fact, all of your subprocess usage could be replaced with real Python code, and it would improve it substantially (and much of the Python code simplifies too):
def find_kmers( arguments=sys.argv[1:] ):
    """Locate a k-mer on both strands of one chromosome and write answer.gff3.

    ``arguments`` is a getopt-style list holding -f <fasta file>, -k <k-mer>
    and -c <chromosome name>.  Chromosome names are taken from the '>' lines
    of S288C_R64.fasta; the sequence itself is read from the -f file.  One
    GFF3 line per hit is written to answer.gff3, ordered by start position.

    Returns a message string: a usage hint when an option is missing, a
    "chromosome not found" notice, or the G+C fraction of the sequence.
    """
    required_opts = ['-f','-c','-k']
    opts, args = getopt.getopt(arguments,'f:k:c:')
    opt_dic = dict(opts)
    for opt in required_opts:
        if opt not in opt_dic:
            return "incorrect arguments, please format as: python_script.py -f <filename> -k <kmer> -c <chromosome_name>"
    fasta_path, chromosome, kmer = opt_dic['-f'], opt_dic['-c'], opt_dic['-k']

    def rev_comp(sequence):
        reversed_dic = {'A':'T','T':'A','C':'G','G':'C'}
        return ''.join(reversed_dic[_] for _ in sequence[::-1])

    def get_sequence(file):
        # Collect lines up to the next '>' header and join them once, rather
        # than growing a string with += for every line.
        parts = []
        for line in file:
            if line.startswith('>'): break
            parts.append(line.strip())
        return ''.join(parts).upper()

    def print_statement(start,strand):
        return '%s\thw1_script\tkmer=%s\t%s\t%s\t.\t%s\t.\tID=S288C;Name=S288C\n'%(chromosome,kmer,start,start+len(kmer)-1,strand)

    # Replaces grep with temp file with trivial Python equivalent
    with open('S288C_R64.fasta') as f:
        chromosomes = [line[1:].strip() for line in f if '>' in line]
    not_found = 'chromosome not found in %s. \n chromosomes in file are:%s'%(fasta_path,', '.join(chromosomes))
    # No need for any loop when just checking for exact value
    if chromosome not in chromosomes:
        return not_found

    sequence = None
    with open(fasta_path) as ofile:
        for line in ofile:
            if line.startswith('>') and line[1:].strip() == chromosome:
                sequence = get_sequence(ofile)
                break
    # The name list comes from a fixed file, so the -f file may still lack
    # the chromosome; report that instead of failing on an unbound name.
    if sequence is None:
        return not_found

    # Collect each strand on its own: pairing the two iterators with izip
    # stops at the shorter one and silently drops the remaining hits.
    kmer_re = re.compile('(?=%s)'%kmer)
    pos_strand = collections.deque(m.start()+1 for m in kmer_re.finditer(sequence))
    neg_strand = collections.deque(m.start()+1 for m in kmer_re.finditer(rev_comp(sequence)))
    # NOTE(review): minus-strand starts are offsets into the reverse
    # complement, not forward-strand coordinates -- confirm this is intended.

    with open('answer.gff3','w') as wfile:
        # Merge the two sorted queues; on a tie the '-' hit is written first.
        while pos_strand and neg_strand:
            if pos_strand[0]<neg_strand[0]:
                wfile.write(print_statement(pos_strand.popleft(),'+'))
            else:
                wfile.write(print_statement(neg_strand.popleft(),'-'))
        for start in pos_strand:
            wfile.write(print_statement(start,'+'))
        for start in neg_strand:
            wfile.write(print_statement(start,'-'))

    gc_count = sequence.count("G") + sequence.count("C")
    # An empty sequence has no G/C content; avoid dividing by zero.
    gc_fraction = gc_count/float(len(sequence)) if sequence else 0.0
    return 'percent-GC = %s'%str(gc_fraction)

How to programmatically count the number of files in an archive using python

In the program I maintain it is done as in:
# count the files in the archive
# Count the files in the archive by counting the lines of the 7z listing.
command = u'"%s" l -slt "%s"' % (u'path/to/7z.exe', srcFile)
proc = Popen(command, stdout=PIPE, stdin=PIPE, startupinfo=startupinfo)
ins, err = proc.communicate()  # whole listing is held in memory
ins = StringIO.StringIO(ins)
length = sum(1 for line in ins)
ins.close()
Is it really the only way? I can't seem to find any other command, but it seems a bit odd that I can't just ask for the number of files.
What about error checking? Would it be enough to modify this to:
proc = Popen(command, stdout=PIPE, stdin=PIPE,
startupinfo=startupinfo)
out = proc.stdout
# ... count
returncode = proc.wait()
if returncode:
raise Exception(u'Failed reading number of files from ' + srcFile)
or should I actually parse the output of Popen ?
EDIT: interested in 7z, rar, zip archives (that are supported by 7z.exe) - but 7z and zip would be enough for starters
To count the number of archive members in a zip archive in Python:
#!/usr/bin/env python
# Print how many members the zip archive named on the command line holds.
import sys
from contextlib import closing
from zipfile import ZipFile

archive_path = sys.argv[1]
# closing() makes sure archive.close() is called when the block is left.
with closing(ZipFile(archive_path)) as archive:
    members = archive.infolist()
count = len(members)
print(count)
It may use zlib, bz2, lzma modules if available, to decompress the archive.
To count the number of regular files in a tar archive:
#!/usr/bin/env python
# Print how many members of the tar archive named on the command line are
# regular files, i.e. members for which isreg() is true.
import sys
import tarfile

count = 0
with tarfile.open(sys.argv[1]) as archive:
    for member in archive:
        if member.isreg():
            count += 1
print(count)
It may support gzip, bz2 and lzma compression depending on version of Python.
You could find a 3rd-party module that would provide a similar functionality for 7z archives.
To get the number of files in an archive using 7z utility:
import os
import re
import subprocess


def count_files_7z(archive):
    """Return the number of files that ``7z l`` reports for ``archive``.

    7z is run with LC_ALL=C and the count is read from the summary line
    ("<n> files, <m> folders") that ends its output.

    Raises subprocess.CalledProcessError when 7z exits with a non-zero
    status and ValueError when the output does not end with that summary.
    """
    s = subprocess.check_output(["7z", "l", archive],
                                env=dict(os.environ, LC_ALL="C"))
    # \s*$ tolerates a trailing "\r\n"; a bare $ only allows a final "\n".
    summary = re.search(br'(\d+)\s+files,\s+\d+\s+folders\s*$', s)
    if summary is None:
        raise ValueError("no file-count summary in 7z output for %r" % (archive,))
    return int(summary.group(1))
Here's a version that may use less memory if there are many files in the archive:
import os
import re
from subprocess import Popen, PIPE, CalledProcessError


def count_files_7z(archive):
    """Return the number of files that ``7z l`` reports for ``archive``.

    The listing is read line by line, so only the current line is kept in
    memory.  The count is taken from the last line of the output, which is
    expected to be the "<n> files, <m> folders" summary.

    Raises CalledProcessError when 7z prints an "Error:" line or exits with
    a non-zero status, and ValueError when the last line is not the summary.
    """
    command = ["7z", "l", archive]
    p = Popen(command, stdout=PIPE, bufsize=1, env=dict(os.environ, LC_ALL="C"))
    line = b''  # stays empty if 7z prints nothing at all
    with p.stdout:
        for line in p.stdout:
            if line.startswith(b'Error:'):  # found error
                # Keep the rest of the output as the error text.
                error = line + b"".join(p.stdout)
                raise CalledProcessError(p.wait(), command, error)
    returncode = p.wait()
    # An explicit raise survives "python -O", which strips assert statements.
    if returncode != 0:
        raise CalledProcessError(returncode, command, line)
    summary = re.search(br'(\d+)\s+files,\s+\d+\s+folders', line)
    if summary is None:
        raise ValueError("no file-count summary in 7z output for %r" % (archive,))
    return int(summary.group(1))
Example:
import sys

# Print the file count; if 7z failed, relay its captured output to stderr
# and exit with 7z's own status.
try:
    file_count = count_files_7z(sys.argv[1])
except CalledProcessError as e:
    # Use the underlying byte stream where one exists (e.output is bytes).
    err_stream = getattr(sys.stderr, 'buffer', sys.stderr)
    err_stream.write(e.output)
    sys.exit(e.returncode)
else:
    print(file_count)
To count the number of lines in the output of a generic subprocess:
from functools import partial
from subprocess import Popen, PIPE, CalledProcessError

# Count the newlines in the output of `command`, reading fixed-size chunks
# so the whole output never has to be held in memory.
p = Popen(command, stdout=PIPE, bufsize=-1)
count = 0
with p.stdout:
    read_chunk = partial(p.stdout.read, 1 << 15)
    # iter(callable, sentinel) keeps calling read_chunk until it returns b''.
    for chunk in iter(read_chunk, b''):
        count += chunk.count(b'\n')
returncode = p.wait()
if returncode != 0:
    raise CalledProcessError(returncode, command)
print(count)
It supports unlimited output.
Could you explain why bufsize=-1 (as opposed to bufsize=1 in your previous answer: stackoverflow.com/a/30984882/281545)?
bufsize=-1 means use the default I/O buffer size instead of bufsize=0 (unbuffered) on Python 2. It is a performance boost on Python 2. It is the default on recent Python 3 versions. On some earlier Python 3 versions you might get a short read (lose data) if bufsize is not changed to bufsize=-1.
This answer reads in chunks and therefore the stream is fully buffered for efficiency. The solution you've linked is line-oriented. bufsize=1 means "line buffered". There is minimal difference from bufsize=-1 otherwise.
and also what the read_chunk = partial(p.stdout.read, 1 << 15) buys us ?
It is equivalent to read_chunk = lambda: p.stdout.read(1<<15) but provides more introspection in general. It is used to implement wc -l in Python efficiently.
Since I already have 7z.exe bundled with the app and I surely want to avoid a third party lib, while I do need to parse rar and 7z archives I think I will go with:
regErrMatch = re.compile(u'Error:', re.U).match # needs more testing
# 7z list command output is of the form:
#    Date      Time    Attr         Size   Compressed  Name
# ------------------- ----- ------------ ------------  ------------------------
# 2015-06-29 21:14:04 ....A       <size>               <filename>
# where ....A is the attribute value for normal files, ....D for directories
reFileMatch = re.compile(r'(\d|:|-|\s)*\.\.\.\.A', re.U).match


def countFilesInArchive(srcArch, listFilePath=None):
    """Count all regular files in srcArch (or only the subset in
    listFilePath).

    Runs the 7z list command and counts the output lines whose attribute
    column is ....A.  Raises StateError when 7z exits with a non-zero status
    or prints a line starting with "Error:"."""
    # https://stackoverflow.com/q/31124670/281545
    command = u'"%s" l -scsUTF-8 -sccUTF-8 "%s"' % ('compiled/7z.exe', srcArch)
    # 7z marks a list file on its command line with a leading '@'.
    if listFilePath: command += u' @"%s"' % listFilePath
    proc = Popen(command, stdout=PIPE, startupinfo=startupinfo, bufsize=-1)
    length, errorLine = 0, []
    with proc.stdout as out:
        for line in iter(out.readline, b''):
            line = line.decode('utf8')
            # After the first error line every following line is kept too.
            if errorLine or regErrMatch(line):
                errorLine.append(line)
            elif reFileMatch(line):
                length += 1
    returncode = proc.wait()
    if returncode or errorLine:
        details = u'\n'.join([x.strip() for x in errorLine if x.strip()])
        # Interpolate the archive name; the original concatenated it after a
        # literal, never-filled '%s'.
        raise StateError(u'%s: Listing failed\n' % srcArch +
                         u'7z.exe return value: ' + str(returncode) +
                         u'\n' + details)
    return length
Error checking as in Python Popen - wait vs communicate vs CalledProcessError by @JFSebastien
My final(ish) version, based on the accepted answer - unicode may not be needed; I kept it for now as I use it everywhere. I also kept the regex (which I may expand; I have seen things like re.compile(u'^(Error:.+|.+ Data Error?|Sub items Errors:.+)',re.U)). Will have to look into check_output and CalledProcessError.
def countFilesInArchive(srcArch, listFilePath=None):
    """Count all regular files in srcArch (or only the subset in
    listFilePath).

    The count is read from the summary line that ends the 7z listing.
    Raises StateError when 7z exits with a non-zero status, prints a line
    matched by regErrMatch, prints nothing, or ends without a summary line."""
    command = [exe7z, u'l', u'-scsUTF-8', u'-sccUTF-8', srcArch]
    # 7z marks a list file on its command line with a leading '@'.
    if listFilePath: command += [u'@%s' % listFilePath]
    proc = Popen(command, stdout=PIPE, stdin=PIPE, # stdin needed if listFilePath
                 startupinfo=startupinfo, bufsize=1)
    errorLine = line = u''
    with proc.stdout as out:
        for line in iter(out.readline, b''): # consider io.TextIOWrapper
            line = line.decode('utf8')
            if regErrMatch(line):
                # Decode the rest explicitly: joining the raw byte lines into
                # a unicode string relies on an implicit ASCII decode.
                errorLine = line + out.read().decode('utf8')
                break
    returncode = proc.wait()
    msg = u'%s: Listing failed\n' % srcArch.s
    if returncode or errorLine:
        # consider using CalledProcessError
        raise StateError(msg + u'7z.exe return value: ' + str(returncode) +
                         u'\n' + errorLine)
    if not line: # should not happen
        raise StateError(msg + u'Empty output')
    # number of files is reported in the last line - example:
    #                                3534900       325332  75 files, 29 folders
    summary = re.search(r'(\d+)\s+files,\s+\d+\s+folders', line)
    if summary is None:
        raise StateError(msg + u'Unexpected last line: ' + line)
    return int(summary.group(1))
Will edit this with my findings.

Python subprocess: capture output of ffmpeg and run regular expression against it

I have the following code
import subprocess
import re
from itertools import *

command = ['ffprobe', '-i', '/media/some_file.mp4']
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
text = proc.stderr.read()
retcode = proc.wait()
text = text.decode('utf-8')
# match() only succeeds when the pattern fits at the start of the string.
p = re.compile("Duration(.*)")
num = 0 #for debugging
for line in text.splitlines():
    print(str(num) + line) #for debugging
    m = p.match(str(line))
    if m is not None:
        print(m.group(1))
When I look at the output there is a line that says "Duration" on it, however it is not captured, print(m.group(1)) is never reached. If I change the text variable to a hardcoded string of "Duration blahblah" I get " blahblah", which is what I expect. It seems like the regex doesn't recognize the text coming back from stderr. How can I get the text into a format that the regex will recognize and match on?
I have come up with the following solution, should it help anyone else attempting to capture duration from ffmpeg using python
import subprocess
import re

command = ['ffprobe', '-i', '/media/some_file.mp4']
# The text to search is read from ffprobe's stderr.
proc = subprocess.Popen(command, stderr=subprocess.PIPE)
# communicate() drains the pipe and waits for the process to exit.
text = proc.communicate()[1]
retcode = proc.returncode
text = text.decode('utf-8')
# Raw string so \s and \. reach the regex engine unchanged; with DOTALL the
# leading .* lets match() skip everything that precedes "Duration:".
duration_re = re.compile(r".*Duration:\s([0-9:\.]*),", re.MULTILINE|re.DOTALL)
m = duration_re.match(text)
if m is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit('Duration not found in ffprobe output (exit status %s)'
                     % retcode)
print(m.group(1))
p = re.compile(r".*?Duration(.*)")
Try this. match starts from the beginning of the string, while there might be something before Duration.

Initializing...Command 'sox' returned non-zero exit status 2

I have checked similar error message questions but have not found anything that quite fits my situation. I am trying to time align a .wav file with a .lab file using HTK, Prosodylab-aligner, and SoX.
Here is my input (using Prosodylab-aligner):
./align.py /path/to/files
All that comes up is this line of code:
Command 'sox' returned non-zero exit status 2
I looked up what this exit status means, and apparently it means there is a command or keyword missing.
I believe that the problem is in the align.py file but I am not sure where exactly.
Here is the area of the file that references SoX.
def _check_aud(self, wav_list, train=False):
    """
    Check audio files, mixing down to mono and downsampling if
    necessary. Writes copy_scp and the training or testing SCP files

    wav_list -- paths of the WAV files to check
    train    -- when true write self.train_scp, otherwise self.test_scp

    Raises CalledProcessError if a sox conversion exits with a non-zero
    status; whatever sox printed on stderr is attached as the exception's
    ``output`` so the reason for the failure is not lost.
    """
    check_path = self.train_scp if train else self.test_scp
    # with-blocks close both SCP files even when a conversion fails.
    with open(self.copy_scp, 'a') as copy_scp:
        with open(check_path, 'w') as check_scp:
            for wav in wav_list:
                if self.has_sox:
                    head = os.path.splitext(os.path.split(wav)[1])[0]
                else:
                    # NOTE(review): this branch keeps the directory part of
                    # wav in head, unlike the sox branch -- confirm intended.
                    head = os.path.splitext(wav)[0]
                mfc = os.path.join(self.aud_dir, head + '.mfc')
                # Read the header and close the file right away so the
                # handle is released on every path, including errors.
                w = wave.open(wav, 'r')
                try:
                    needs_fix = (w.getframerate() != self.sr or
                                 w.getnchannels() != 1)
                finally:
                    w.close()
                if needs_fix:
                    if self.has_sox:
                        new_wav = os.path.join(self.aud_dir, head + '.wav')
                        cmd = ['sox', '-G', wav, '-b', '16',
                               new_wav, 'remix', '-',
                               'rate', str(self.sr),
                               'dither', '-s']
                        sox = Popen(cmd, stderr=PIPE)
                        # communicate() drains stderr while waiting for sox.
                        sox_err = sox.communicate()[1]
                        if sox.returncode != 0:
                            raise CalledProcessError(sox.returncode, 'sox',
                                                     sox_err)
                        wav = new_wav
                    else:
                        # Report the file name, not the wave reader object.
                        error('File {0} needs resampled but Sox not found ',
                              wav)
                copy_scp.write('"{0}" "{1}"\n'.format(wav, mfc))
                check_scp.write('"{0}"\n'.format(mfc))
Factor out the sox command line in the pids.append(Popen(...)) call into a variable like cmd, and print that before running it.
That should give you a command line that you can reproduce the problem with, possibly see a more descriptive error message, and maybe narrow the problem down by tweaking the arguments.
# ...
new_wav = os.path.join(self.aud_dir, head + '.wav')
cmd = ['sox', '-G', wav, '-b', '16',
new_wav, 'remix', '-', 'rate',
str(self.sr), 'dither', '-s']
print "About to execute command:\n%s" % ' '.join(cmd)
pids.append(Popen(cmd, stderr=PIPE))
wav = new_wav
# ...

Python. Second step of subprocess.Popen truncates results of first

In the snippet of my python script below, I think that temp2 doesn't wait for temp to finish running. The output can be large, but is just text. This truncates the result ('out') from temp; it stops mid-line. 'out' from temp works fine until temp2 is added. I tried adding time.wait() as well as subprocess.Popen.wait(temp). These both allow temp to run to completion so that 'out' is not truncated, but they disrupt the chaining process so that there is no 'out2'. Any ideas?
# Start the first command with its stdout connected to a pipe.
temp = subprocess.Popen(call, stdout=subprocess.PIPE)
#time.wait(1)
#subprocess.Popen.wait(temp)
# The second command is given that same pipe (temp.stdout) as its stdin.
temp2 = subprocess.Popen(call2, stdin=temp.stdout, stdout=subprocess.PIPE)
# NOTE(review): communicate() reads temp.stdout in this process while temp2
# is reading the same pipe, which would explain `out` stopping mid-line --
# confirm.
out, err = temp.communicate()
out2, err2 = temp2.communicate()
According to the Python Docs communicate() can accept a stream to be sent as input. If you change stdin of temp2 to subprocess.PIPE and put out in communicate(), the data is properly piped.
#!/usr/bin/env python
import subprocess
import time
call = ["echo", "hello\nworld"]
call2 = ["grep", "w"]
temp = subprocess.Popen(call, stdout=subprocess.PIPE)
temp2 = subprocess.Popen(call2, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, err = temp.communicate()
out2, err2 = temp2.communicate(out)
print("Out: {0!r}, Err: {1!r}".format(out, err))
# Out: b'hello\nworld\n', Err: None
print("Out2: {0!r}, Err2: {1!r}".format(out2, err2))
# Out2: b'world\n', Err2: None
Following "Replacing shell pipeline" section from the docs:
# Producer: its stdout is a pipe that is handed to the consumer as stdin.
temp = subprocess.Popen(call, stdout=subprocess.PIPE)
temp2 = subprocess.Popen(call2, stdin=temp.stdout, stdout=subprocess.PIPE)
# Close this process's copy of the pipe so that only temp2 reads from it
# (per the subprocess docs, this lets temp receive SIGPIPE if temp2 exits).
temp.stdout.close()
# Only the output of the last stage is collected.
out2 = temp2.communicate()[0]

Categories

Resources