python unicode implementation (using external programs: cygnative plink ssh rsync) - python

I have a backup application in Python that needs to work on Windows. It needs UTF-8 compatibility (to be able to back up directories whose names contain non-ASCII characters such as Italian accents). The problem is that it uses external programs (plink, cygwin, ssh and rsync) and I can't get them working. The prototype is 32 lines long; please take a look:
# -*- coding: utf-8 -*-
import subprocess
def safestr(obj, encoding='utf-8'):
    r"""Convert any given object to an encoded byte string.

    Unicode text is encoded with *encoding* (UTF-8 by default), byte
    strings are returned unchanged, and every other object is passed
    through ``str()``.

        >>> safestr('hello')
        'hello'
        >>> safestr(u'\u1234')
        '\xe1\x88\xb4'
        >>> safestr(2)
        '2'
    """
    if isinstance(obj, unicode):
        # Honour the caller's codec instead of hard-coding UTF-8.
        return obj.encode(encoding)
    elif isinstance(obj, str):
        # Already a byte string: return the string itself, not its bound
        # ``encode`` method.
        return obj
    else:
        return str(obj)
def execute(command):
    """Run *command* through the shell and print what it produced.

    The captured stdout, the captured stderr and the exit status are each
    printed on their own line; nothing is returned.
    """
    child = subprocess.Popen(
        command,
        shell=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    captured_out, captured_err = child.communicate()
    exit_status = child.poll()
    report = (
        ("OUT: ", repr(captured_out)),
        ("ERRS: ", repr(captured_err)),
        ("RET: ", str(exit_status)),
    )
    for label, value in report:
        print(label + value)
# The whole rsync invocation as one shell command line, encoded to UTF-8
# bytes before it is handed to execute().
command = (
    u'rsync --stats -az --numeric-ids --delete --blocking-io'
    u' --modify-window=2 --no-group --chmod=u=rwX,g=,o='
    u' -e \'cygnative plink -ssh -2 -batch -pw test \''
    u' "/cygdrive/c/κόσμε"'
    # rsync addresses a remote as user@host:path.
    u' vaidab@192.168.1.86:/volatile/backup/vaidab/2010-03-03.15_41_56/'
    u' --link-dest=../2010-03-03.15_00_57'
).encode('utf-8')
execute(command)
Still doesn't work with nosklo's version, check the result:
python prototype_unicode_new.py
'rsync.exe --stats -az --numeric-ids --delete --blocking-io --modify-window=2 --
no-group --chmod=u=rwX,g=,o= -e "cygnative plink -ssh -2 -batch -pw test" /cygdr
ive/c/\xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5 vaidab@192.168.1.86:/volatile/bac
kup/vaidab/2010-03-03.15_41_56/'
OUT: '\nNumber of files: 0\nNumber of files transferred: 0\nTotal file size: 0 b
ytes\nTotal transferred file size: 0 bytes\nLiteral data: 0 bytes\nMatched data:
0 bytes\nFile list size: 9\nFile list generation time: 0.001 seconds\nFile list
transfer time: 0.000 seconds\nTotal bytes sent: 22\nTotal bytes received: 12\n\
nsent 22 bytes received 12 bytes 68.00 bytes/sec\ntotal size is 0 speedup is
0.00\n'
ERRS: 'rsync: link_stat "/cygdrive/c/\xc3\x8e\xc2\xba\xc3\x8f\xc5\x92\xc3\x8f\xc
6\x92\xc3\x8e\xc2\xbc\xc3\x8e\xc2\xb5" failed: No such file or directory (2)\nrs
ync error: some files/attrs were not transferred (see previous errors) (code 23)
at /home/lapo/packaging/rsync-3.0.6-1/src/rsync-3.0.6/main.c(1039) [sender=3.0.
6]\n'
RET: 23

Don't use shell=True. EVER. It needlessly invokes a shell to call your program.
Pass the parameters as a list instead of a string.
This example should work, provided the parameters are right and the rsync.exe is in current folder (or PATH):
# -*- coding: utf-8 -*-
import subprocess
def execute(command):
    """Run *command* directly (no shell) and report on the result.

    Prints the captured stdout, the captured stderr and the exit status,
    then returns the captured stdout.
    """
    child = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    captured_out, captured_err = child.communicate()
    exit_status = child.poll()
    report = (
        "OUT: " + repr(captured_out),
        "ERRS: " + repr(captured_err),
        "RET: " + str(exit_status),
    )
    for entry in report:
        print(entry)
    return captured_out
# Argument list handed to rsync.exe without a shell in between; the
# non-ASCII source path is encoded to UTF-8 explicitly.
command = [
    'rsync.exe', '--stats', '-az', '--numeric-ids', '--delete',
    '--blocking-io', '--modify-window=2', '--no-group',
    '--chmod=u=rwX,g=,o=', '-e',
    'cygnative plink -ssh -2 -batch -pw test',
    u'/cygdrive/c/κόσμε'.encode('utf-8'),
    # rsync addresses a remote as user@host:path.
    'vaidab@192.168.1.86:/volatile/backup/vaidab/2010-03-03.15_41_56/',
    '--link-dest=../2010-03-03.15_00_57',
]
execute(command)

A piece of code that passeth all understanding:
if isinstance(obj, unicode):
return obj.encode("utf-8")
elif isinstance(obj, str):
return obj.encode
# the above is returning a METHOD ***************************
else:
return str(obj)
What's the point of doctests if you don't run them?

Related

subprocess.Popen() sends awk and grep lines differently than expected

On a CentOS 7.2 I have a file called cpuload, which contains the latest CPU load data in the following format:
last 30 sec:
average load: 0
cpu0 total load: 0
cpu1 total load: 0
cpu2 total load: 0
cpu3 total load: 1
cpu4 total load: 0
cpu5 total load: 0
cpu6 total load: 0
cpu7 total load: 0
last sec:
average load: 1
cpu0 total load: 5
cpu1 total load: 1
cpu2 total load: 1
cpu3 total load: 3
cpu4 total load: 2
cpu5 total load: 1
cpu6 total load: 0
cpu7 total load: 0
I want to get the number after the "average load:" of the "last sec" bit.
Two cli commands give me that information when I run them as shell commands on the terminal:
grep 'average load:' cpuload | sed -n 's/.*load: //p' | tail -n1
and
awk 'NR > 2 && /average load:/ {print $3}' cpuload
But when I run them in subprocess.Popen() with shell=True I only get stderr:
for:
import subprocess
cmd = ["grep", "'average load:'", "cpuload", "|", "sed", "-n", "'s/.*load: //p'", "|", "tail", "-n1"]
test = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
test.stderr.read()
I get:
b"Usage: grep [OPTION]... PATTERN [FILE]...\nTry 'grep --help' for more information.\n"
and for:
import subprocess
cmd = cmd = ["awk", "'NR > 2 && /average load:/ {print $3}'", "cpuload"]
test = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
test.stderr.read()
I also get:
b"awk: cmd. line:1: 'NR > 2 && /average load:/ {print $3}'\nawk: cmd. line:1: ^ invalid char ''' in expression\n"
Even though I avoided using a |
or if shell=True I get:
b"Usage: awk [POSIX or GNU style options] -f progfile [--] file ...\nUsage: awk [POSIX or GNU style options] [--] 'program' file ...\nPOSIX options:\t\tGNU long options: (standard)\n\t-f progfile\t\t--file=progfile\n\t-F fs\t\t\t--field-separator=fs\n\t-v var=val\t\t--assign=var=val\nShort options:\t\tGNU long options: (extensions)\n\t-b\t\t\t--characters-as-bytes\n\t-c\t\t\t--traditional\n\t-C\t\t\t--copyright\n\t-d[file]\t\t--dump-variables[=file]\n\t-e 'program-text'\t--source='program-text'\n\t-E file\t\t\t--exec=file\n\t-g\t\t\t--gen-pot\n\t-h\t\t\t--help\n\t-L [fatal]\t\t--lint[=fatal]\n\t-n\t\t\t--non-decimal-data\n\t-N\t\t\t--use-lc-numeric\n\t-O\t\t\t--optimize\n\t-p[file]\t\t--profile[=file]\n\t-P\t\t\t--posix\n\t-r\t\t\t--re-interval\n\t-S\t\t\t--sandbox\n\t-t\t\t\t--lint-old\n\t-V\t\t\t--version\n\nTo report bugs, see node `Bugs' in `gawk.info', which is\nsection `Reporting Problems and Bugs' in the printed version.\n\ngawk is a pattern scanning and processing language.\nBy default it reads standard input and writes standard output.\n\nExamples:\n\tgawk '{ sum += $1 }; END { print sum }' file\n\tgawk -F: '{ print $1 }' /etc/passwd\n"
What am I doing wrong?
I have a file called cpuload, which contains the latest CPU load data ...I want to get the number after the "average load:" of the "last sec" bit
Why not just use simple Python code to get the value you are looking for?
# Print the value (text after the last ':') of the line that follows the
# first 'last sec:' header in the cpuload file.
with open('cpuload') as source:
    rows = [raw.strip() for raw in source.readlines()]

if 'last sec:' in rows:
    after_header = rows[rows.index('last sec:') + 1:]
    # Nothing is printed when the header is the very last line.
    if after_header:
        print(after_header[0].split(':')[-1].strip())
output
1
First case with grep, sed, tail... and pipes.
You need to pass shell=True to Popen and give the command as a single string, because the pipeline has to be interpreted by a shell. The quotes around the parameters stay, exactly as on the command line:
import subprocess

# The pipeline needs a shell, hence the single command string and shell=True.
cmd = "grep 'average load:' cpuload | sed -n 's/.*load: //p' | tail -n1"
test = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE, shell=True)
# communicate() reads stdout and stderr to EOF and then waits for the child.
# Polling after each readline() can stop early and drop output that is still
# in the pipe, and a stderr pipe that is never read can stall the child once
# it fills up.
raw_out, raw_err = test.communicate()
output = raw_out.decode("utf-8")
print("output=<%s>" % (output))
Second case, without pipe:
Here you don't need shell=True: pass the command to Popen as a list of arguments rather than a single string. Because no shell parses the arguments, we don't put quotes around them:
import subprocess

# No shell is involved: every list element reaches awk verbatim, so the awk
# program carries no extra quotes.
cmd = ["/usr/bin/awk", "NR > 2 && /^average load:/ {print $3}", "cpuload"]
test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# communicate() reads stdout and stderr to EOF and then waits for the child,
# so no output is lost and an unread stderr pipe cannot stall the process.
raw_out, raw_err = test.communicate()
output = raw_out.decode("utf-8")
print("output=<%s>" % (output))
The issue was with passing the awk parameter to subprocess with ' around them, as detailed here
Did not accept Ed Morton's comment as it did not specify what should have been done.

How to port python2 code to python3?

I found the following code, which prints the shellcode (machine code) from objdump's output, but I can't run it in Python 3. How can I port it to Python 3?
import subprocess
import sys
from subprocess import Popen
# Print, for each disassembly line of the given binary, the first token after
# the first ':' as a quoted string of \x-prefixed character pairs.
num_of_args = len(sys.argv)
# The binary to disassemble is the first command-line argument.
binary_file = sys.argv[1]
# Shell pipeline that the loop below reproduces by hand:
#| awk -F'[:]' '{print $2}' | awk -F'[ ]' '{print $1}' | tr -d '[[:space:]]'
proc = subprocess.Popen(['arm-linux-gnueabi-objdump','-d',binary_file], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
while True:
line = proc.stdout.readline()
if line != '':
# Field 1 is the text between the first ':' and the next one (if any).
array = line.rstrip().split(':')
if len(array) > 1:
if array[1]:
# Keep only the first space-separated token of that field, stripped.
array2 = array[1].split(' ')
array2 = array2[0].lstrip().rstrip()
if array2:
# Pair up the characters two at a time and prefix each pair with \x.
sc_part = '"'
sc_part += '\\x'
sc_part += '\\x'.join(a+b for a,b in zip(array2[::2], array2[1::2]))
sc_part += '"'
print sc_part
else:
# readline() returned an empty string: stop reading.
break
Use the 2to3 tool that comes bundled with the python distribution. An example of its usage is this:
foo.py
def foo:
for i in xrange(5):
print i,
foo()
In the command line, you'll type:
$ 2to3 -w foo.py
foo.py (post command):
def foo():
for i in range(5):
print(i, end=' ')
foo()
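Running 2to3 on your code, it appears only the print needs changing: print(sc_part). Note, however, that on Python 3 proc.stdout.readline() returns bytes, so the comparison with '' and the split(':') / split(' ') calls also need bytes literals (b'', b':', b' '), or the pipe must be opened in text mode with universal_newlines=True.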
print(sc_part) may well be the only thing you need to change.
It would be helpful if you pasted the stacktrace.
Anyways, try replacing print sc_part with print(sc_part)
Furthermore, you can read all of the changes in Python3 here.

How to programmatically count the number of files in an archive using python

In the program I maintain it is done as in:
# count the files in the archive
length = 0
command = ur'"%s" l -slt "%s"' % (u'path/to/7z.exe', srcFile)
ins, err = Popen(command, stdout=PIPE, stdin=PIPE,
startupinfo=startupinfo).communicate()
ins = StringIO.StringIO(ins)
for line in ins: length += 1
ins.close()
Is it really the only way ? I can't seem to find any other command but it seems a bit odd that I can't just ask for the number of files
What about error checking ? Would it be enough to modify this to:
proc = Popen(command, stdout=PIPE, stdin=PIPE,
startupinfo=startupinfo)
out = proc.stdout
# ... count
returncode = proc.wait()
if returncode:
raise Exception(u'Failed reading number of files from ' + srcFile)
or should I actually parse the output of Popen ?
EDIT: interested in 7z, rar, zip archives (that are supported by 7z.exe) - but 7z and zip would be enough for starters
To count the number of archive members in a zip archive in Python:
#!/usr/bin/env python
import sys
from contextlib import closing
from zipfile import ZipFile
# Open the zip archive named on the command line, count its member entries
# and make sure the archive is closed again before printing.
archive = ZipFile(sys.argv[1])
try:
    count = len(archive.infolist())
finally:
    archive.close()
print(count)
It may use zlib, bz2, lzma modules if available, to decompress the archive.
To count the number of regular files in a tar archive:
#!/usr/bin/env python
import sys
import tarfile
# Count only the members of the tar archive (argv[1]) for which isreg()
# is true.
with tarfile.open(sys.argv[1]) as archive:
    count = 0
    for member in archive:
        if member.isreg():
            count += 1
print(count)
It may support gzip, bz2 and lzma compression depending on version of Python.
You could find a 3rd-party module that would provide a similar functionality for 7z archives.
To get the number of files in an archive using 7z utility:
import os
import re
import subprocess


def count_files_7z(archive):
    """Return the number of files that ``7z l`` reports for *archive*.

    The listing is produced with ``LC_ALL=C`` and the count is read from
    the summary at the end of the output ("<n> files, <m> folders").

    Raises:
        subprocess.CalledProcessError: if 7z exits with a non-zero status.
        ValueError: if the summary is not found at the end of the output.
    """
    listing = subprocess.check_output(
        ["7z", "l", archive], env=dict(os.environ, LC_ALL="C"))
    # \s*$ accepts a trailing "\r\n" as well as a bare "\n".
    summary = re.search(br'(\d+)\s+files,\s+\d+\s+folders\s*$', listing)
    if summary is None:
        raise ValueError("no file count in 7z output for %r" % (archive,))
    return int(summary.group(1))
Here's version that may use less memory if there are many files in the archive:
import os
import re
from subprocess import Popen, PIPE, CalledProcessError
def count_files_7z(archive):
    """Return the file count from ``7z l`` while streaming its output.

    Lines are consumed one at a time instead of collecting the whole
    listing; the count is parsed from the last line read.

    Raises:
        CalledProcessError: if a line starts with ``Error:`` (``output``
            then holds that line and everything after it) or if 7z exits
            with a non-zero status (``output`` holds the last line read).
        ValueError: if the last line does not contain the summary.
    """
    command = ["7z", "l", archive]
    p = Popen(command, stdout=PIPE, bufsize=1, env=dict(os.environ, LC_ALL="C"))
    line = b''  # stays empty when 7z prints nothing at all
    with p.stdout:
        for line in p.stdout:
            if line.startswith(b'Error:'):  # found error
                error = line + b"".join(p.stdout)
                raise CalledProcessError(p.wait(), command, error)
    returncode = p.wait()
    if returncode != 0:
        # An explicit raise survives ``python -O``, unlike an assert.
        raise CalledProcessError(returncode, command, line)
    summary = re.search(br'(\d+)\s+files,\s+\d+\s+folders', line)
    if summary is None:
        raise ValueError("no file count in 7z output for %r" % (archive,))
    return int(summary.group(1))
Example:
import sys
try:
print(count_files_7z(sys.argv[1]))
except CalledProcessError as e:
getattr(sys.stderr, 'buffer', sys.stderr).write(e.output)
sys.exit(e.returncode)
To count the number of lines in the output of a generic subprocess:
from functools import partial
from subprocess import Popen, PIPE, CalledProcessError
p = Popen(command, stdout=PIPE, bufsize=-1)
with p.stdout:
read_chunk = partial(p.stdout.read, 1 << 15)
count = sum(chunk.count(b'\n') for chunk in iter(read_chunk, b''))
if p.wait() != 0:
raise CalledProcessError(p.returncode, command)
print(count)
It supports unlimited output.
Could you explain why bufsize=-1 (as opposed to bufsize=1 in your previous answer: stackoverflow.com/a/30984882/281545)?
bufsize=-1 means use the default I/O buffer size instead of bufsize=0 (unbuffered) on Python 2. It is a performance boost on Python 2. It is default on the recent Python 3 versions. You might get a short read (lose data) if on some earlier Python 3 versions where bufsize is not changed to bufsize=-1.
This answer reads in chunks and therefore the stream is fully buffered for efficiency. The solution you've linked is line-oriented. bufsize=1 means "line buffered". There is minimal difference from bufsize=-1 otherwise.
and also what the read_chunk = partial(p.stdout.read, 1 << 15) buys us ?
It is equivalent to read_chunk = lambda: p.stdout.read(1<<15) but provides more introspection in general. It is used to implement wc -l in Python efficiently.
Since I already have 7z.exe bundled with the app and I surely want to avoid a third party lib, while I do need to parse rar and 7z archives I think I will go with:
regErrMatch = re.compile(u'Error:', re.U).match # needs more testing
r"""7z list command output is of the form:
Date Time Attr Size Compressed Name
------------------- ----- ------------ ------------ ------------------------
2015-06-29 21:14:04 ....A <size> <filename>
where ....A is the attribute value for normal files, ....D for directories
"""
reFileMatch = re.compile(ur'(\d|:|-|\s)*\.\.\.\.A', re.U).match
def countFilesInArchive(srcArch, listFilePath=None):
    """Count all regular files in srcArch (or only the subset in
    listFilePath).

    Runs ``7z l`` on the archive and counts the output lines accepted by
    reFileMatch. Raises StateError when 7z returns a non-zero status or
    when a line matching regErrMatch shows up; from that line on, all
    output is collected into the error message.
    """
    # https://stackoverflow.com/q/31124670/281545
    command = u'"%s" l -scsUTF-8 -sccUTF-8 "%s"' % ('compiled/7z.exe', srcArch)
    if listFilePath:
        # 7z takes a list file as @listfile.
        command += u' @"%s"' % listFilePath
    proc = Popen(command, stdout=PIPE, startupinfo=startupinfo, bufsize=-1)
    length, errorLine = 0, []
    with proc.stdout as out:
        for line in iter(out.readline, b''):
            line = line.decode('utf8')
            if errorLine or regErrMatch(line):
                errorLine.append(line)
            elif reFileMatch(line):
                length += 1
    returncode = proc.wait()
    if returncode or errorLine:
        details = u'\n'.join([x.strip() for x in errorLine if x.strip()])
        # Interpolate the archive name into the %s placeholder rather than
        # appending it after a literal "%s".
        raise StateError(u'%s: Listing failed\n' % srcArch +
                         u'7z.exe return value: ' + str(returncode) +
                         u'\n' + details)
    return length
Error checking as in Python Popen - wait vs communicate vs CalledProcessError by @JFSebastien
My final(ish) based on accepted answer - unicode may not be needed, kept it for now as I use it everywhere. Also kept regex (which I may expand, I have seen things like re.compile(u'^(Error:.+|.+ Data Error?|Sub items Errors:.+)',re.U). Will have to look into check_output and CalledProcessError.
def countFilesInArchive(srcArch, listFilePath=None):
    """Count all regular files in srcArch (or only the subset in
    listFilePath).

    The count is parsed from the summary on the last line of the
    ``7z l`` output. Raises StateError if 7z returns a non-zero status,
    if a line matching regErrMatch is seen, or if there is no output.
    """
    command = [exe7z, u'l', u'-scsUTF-8', u'-sccUTF-8', srcArch]
    if listFilePath:
        # 7z takes a list file as @listfile.
        command += [u'@%s' % listFilePath]
    proc = Popen(command, stdout=PIPE, stdin=PIPE, # stdin needed if listFilePath
                 startupinfo=startupinfo, bufsize=1)
    errorLine = line = u''
    with proc.stdout as out:
        for line in iter(out.readline, b''): # consider io.TextIOWrapper
            line = line.decode('utf8')
            if regErrMatch(line):
                # Decode the rest of the output explicitly; joining the raw
                # byte lines into a unicode string would fall back to the
                # ASCII codec and fail on non-ASCII names.
                errorLine = line + b''.join(out).decode('utf8')
                break
    returncode = proc.wait()
    msg = u'%s: Listing failed\n' % srcArch.s
    if returncode or errorLine:
        msg += u'7z.exe return value: ' + str(returncode) + u'\n' + errorLine
    elif not line: # should not happen
        msg += u'Empty output'
    else:
        msg = u''
    if msg:
        raise StateError(msg) # consider using CalledProcessError
    # number of files is reported in the last line - example:
    # 3534900 325332 75 files, 29 folders
    return int(re.search(r'(\d+)\s+files,\s+\d+\s+folders', line).group(1))
Will edit this with my findings.

Invoking perl script with variable input and file output as arguments from python

I have a perl script that can be executed from the console as follows:
perl perlscript.pl -i input.txt -o output.txt --append
I want to execute this script from my python code. I figured out that subprocess.Popen can be used to connect to perl and I can pass my arguments with it. But, I also want to pass a variable (made by splitting up a text file) in place of input.txt.
I have tried the following but it doesn't seem to work and gives an obvious TypeError in line 8:
import re, shlex, subprocess, StringIO
f=open('fulltext.txt','rb')
text= f.read()
l = re.split('\n\n',str(text))
intxt = StringIO.StringIO()
for i in range(len(l)):
intxt.write(l[i])
command_line='perl cnv_ltrfinder2gff.pl -i '+intxt+' -o output.gff --append'
args=shlex.split(command_line)
p = subprocess.Popen(args)
Is there any other work around for this?
EDIT: Here is a sample of the file fulltext.txt. Entries are separated by a line.
Predict protein Domains 0.021 second
>Sequence: seq1 Len:13143 [1] seq1 Len:13143 Location : 9 - 13124 Len: 13116 Strand:+ Score : 6 [LTR region similarity:0.959] Status : 11110110000 5'-LTR : 9 - 501 Len: 493 3'-LTR : 12633 - 13124 Len: 492 5'-TG : TG , TG 3'-CA : CA , CA TSR : NOT FOUND Sharpness: 1,1 Strand + : PBS : [14/20] 524 - 543 (LysTTT) PPT : [12/15] 12553 - 12567
Predict protein Domains 0.019 second
>Sequence: seq5 Len:11539 [1] seq5 Len:11539 Location : 7 - 11535 Len: 11529 Strand:+ Score : 6 [LTR region similarity:0.984] Status : 11110110000 5'-LTR : 7 - 506 Len: 500 3'-LTR : 11036 - 11535 Len: 500 5'-TG : TG , TG 3'-CA : CA , CA TSR : NOT FOUND Sharpness: 1,1 Strand + : PBS : [15/22] 515 - 536 (LysTTT) PPT : [11/15] 11020 - 11034
I want to separate them and pass each entry block to the perl script. All the files are in the same directory.
you might be interested in the os module
and string formatting
Edit
I think I understand what you want now. Correct me if I am wrong, but I think:
You want to split your fulltext.txt into blocks.
Every block contains a seq(number)
You want to run your perl script once for every block with as input file your seq(number)
if this is what you want, you could use the following code.
import os

in_file = 'fulltext.txt'
# Collect the sequence name (second space-separated field) from every line
# that starts with ">".
with open(in_file, 'r') as handle:
    seq = [line.rstrip().split(" ")[1]
           for line in handle if line.startswith(">")]
for x in seq:
    # One perl run per sequence name, with <name>.txt as the input file.
    # The interpreter is named once; a second "perl" would be taken as the
    # script to run.
    command = "perl cnv_ltrfinder2gff.pl -i %s.txt -o output.txt --append" % x
    os.system(command)
The docs for --infile option:
Path of the input file. If an input file is not provided, the program
will expect input from STDIN.
You could omit --infile and pass input via a pipe (stdin) instead:
#!/usr/bin/env python
from subprocess import Popen, PIPE
with open('fulltext.txt') as file: # read input data
blocks = file.read().split('\n\n')
# run a separate perl process for each block
args = 'perl cnv_ltrfinder2gff.pl -o output.gff --append'.split()
for block in blocks:
p = Popen(args, stdin=PIPE, universal_newlines=True)
p.communicate(block)
if p.returncode != 0:
print('non-zero exit status: %s on block: %r' % (p.returncode, block))
You can run several perl scripts concurrently:
from multiprocessing.dummy import Pool # use threads
def run(job):
    """Convert one block with its own perl process.

    *job* is an ``(index, block)`` pair as produced by ``enumerate``. The
    block is sent to the script's stdin and the script is given
    ``out<index>.gff`` as its ``-o`` argument.

    Returns:
        A ``(exit status, output filename)`` tuple.
    """
    # Unpack in the body: tuple parameters in the signature are a syntax
    # error on Python 3 (PEP 3113).
    i, block = job
    filename = 'out%03d.gff' % i
    args = ['perl', 'cnv_ltrfinder2gff.pl', '-o', filename]
    p = Popen(args, stdin=PIPE, universal_newlines=True, close_fds=True)
    p.communicate(block)
    return p.returncode, filename
exit_statuses, filenames = zip(*Pool().map(run, enumerate(blocks, start=1)))
It runs several (equal to the number of CPUs on your system) child processes in parallel. You could specify a different number of worker threads (pass to Pool()).

Pass python script output to another programs stdin

I have an application that takes input, either from the terminal directly or I can use a pipe to pass the output of another program into the stdin of this one. What I am trying to do is use python to generate the output so it's formatted correctly and pass that to the stdin of this program all from the same script. Here is the code:
#!/usr/bin/python
import os
import subprocess
import plistlib
import sys
def appScan():
os.system("system_profiler -xml SPApplicationsDataType > apps.xml")
appList = plistlib.readPlist("apps.xml")
sys.stdout.write( "Mac_App_List\n"
"Delimiters=\"^\"\n"
"string50 string50\n"
"Name^Version\n")
appDict = appList[0]['_items']
for x in appDict:
if 'version' in x:
print x['_name'] + "^" + x['version'] + "^"
else:
print x['_name'] + "^" + "no version found" + "^"
proc = subprocess.Popen(["/opt/altiris/notification/inventory/lib/helpers/aex- sendcustominv","-t","-"], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
proc.communicate(input=appScan())
For some reason this subprocess I am calling doesn't like what is coming into stdin. However if I remove the subprocess items and just have the script print to stdout and then call the script from the terminal (python appScan.py | aex-sendcustominv), aex-sendcustominv is able to accept the input just fine. Is there any way to take a functions output in python and send it to the stdin of an subprocess?
The problem is that appScan() only prints to stdout; appScan() returns None, so proc.communicate(input=appScan()) is equivalent to proc.communicate(input=None). You need appScan to return a string.
Try this (not tested):
def appScan():
    """Build the application inventory and return it as one string.

    Dumps the installed applications with system_profiler into apps.xml,
    parses that plist and returns the header lines followed by one
    ``name^version^`` line per application ("no version found" when the
    entry has no 'version' key).
    """
    os.system("system_profiler -xml SPApplicationsDataType > apps.xml")
    appList = plistlib.readPlist("apps.xml")
    # Full header, including the leading "Mac_App_List" line.
    pieces = ['Mac_App_List\n'
              'Delimiters="^"\n'
              'string50 string50\n'
              'Name^Version\n']
    appDict = appList[0]['_items']
    for x in appDict:
        version = x['version'] if 'version' in x else "no version found"
        # End every record with a newline so each application stays on its
        # own line, as it would when printed.
        pieces.append(x['_name'] + "^" + version + "^\n")
    return ''.join(pieces)
# Feed the generated inventory to the helper through its stdin.
proc = subprocess.Popen(["/opt/altiris/notification/inventory/lib/helpers/aex-sendcustominv","-t","-"], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
proc.communicate(input=appScan())

Categories

Resources