Given a Python script with print() statements, I'd like to be able to run through the script and insert a comment after each statement that shows the output from each. To demonstrate, take this script named example.py:
a, b = 1, 2
print('a + b:', a + b)
c, d = 3, 4
print('c + d:', c + d)
The desired output would be:
a, b = 1, 2
print('a + b:', a + b)
# a + b: 3
c, d = 3, 4
print('c + d:', c + d)
# c + d: 7
Here's my attempt, which works for simple examples like the one above:
import sys
from io import StringIO
def intercept_stdout(func):
    """Wrap *func* so that calling it returns the text it wrote to stdout.

    While *func* runs, ``sys.stdout`` is replaced by an in-memory buffer.
    The value returned by *func* itself is discarded; the wrapper returns
    the captured text instead.
    """
    def wrapper(*args, **kwargs):
        """Call the wrapped function and return everything it printed."""
        original_stdout = sys.stdout
        capture_stdout = StringIO()
        sys.stdout = capture_stdout
        try:
            func(*args, **kwargs)
        finally:
            # restore unconditionally: if func raises, the process must not
            # be left printing into a buffer nobody will ever read
            sys.stdout = original_stdout
        return capture_stdout.getvalue()
    return wrapper
@intercept_stdout
def exec_target(name):
    """Execute the script at path *name*.

    Because of the ``intercept_stdout`` decorator, calling this returns the
    text the script printed (the caller splits that text into lines).

    The script runs in its own fresh namespace, so functions it defines can
    see each other and it cannot overwrite names of this module.
    """
    with open(name, 'r') as f:
        source = f.read()
    # NOTE: this runs arbitrary code - only point it at scripts you trust.
    # Compiling with the real file name gives useful tracebacks.
    exec(compile(source, name, 'exec'), {'__name__': '__main__'})
def read_target(name):
    """Read the script at path *name* and return its lines as a list.

    Every returned line ends with a newline: if the last line of the file
    has none, one is appended so that a comment inserted after it starts on
    its own line. An empty file yields an empty list.
    """
    with open(name) as f:
        source = f.readlines()
    # guard on `source` first: an empty file has no last line to inspect
    if source and not source[-1].endswith('\n'):
        source[-1] += '\n'
    return source
def annotate_source(target):
    """Return the source of *target* with a comment under each print line.

    Only lines that begin with ``print(`` in column 0 are recognised, and
    the n-th such line is paired with the n-th line of the script's output.
    Multi-line output or prints elsewhere on a line therefore end up
    misaligned.
    """
    source_lines = read_target(target)
    # decide up front which source lines get an annotation
    wants_comment = [len(text) > 6 and text.startswith('print(')
                     for text in source_lines]
    # run the script once and keep its output line by line
    captured = exec_target(target).split('\n')
    pieces = []
    cursor = 0  # index of the next unused output line
    for text, flagged in zip(source_lines, wants_comment):
        pieces.append(text)
        if flagged:
            pieces.append('# ' + captured[cursor] + '\n')
            cursor += 1
    return ''.join(pieces)
if __name__ == '__main__':
target_script = 'example.py'
with open('annotated_example.py', 'w') as f:
f.write(annotate_source(target_script))
However, it fails for scripts with print() statements that span multiple lines, as well as for print() statements that aren't at the start of a line. In a best-case scenario, it would even work for print() statements inside a function. Take the following example:
print('''print to multiple lines, first line
second line
third line''')
print('print from partial line, first part') if True else 0
1 if False else print('print from partial line, second part')
print('print from compound statement, first part'); pass
pass; print('print from compound statement, second part')
def foo():
print('bar')
foo()
Ideally, the output would look like this:
print('''print to multiple lines, first line
second line
third line''')
# print to multiple lines, first line
# second line
# third line
print('print from partial line, first part') if True else 0
# print from partial line, first part
1 if False else print('print from partial line, second part')
# print from partial line, second part
print('print from compound statement, first part'); pass
# print from compound statement, first part
pass; print('print from compound statement, second part')
# print from compound statement, second part
def foo():
print('bar')
foo()
# bar
But the script above mangles it like so:
print('''print to multiple lines, first line
# print to multiple lines, first line
second line
third line''')
print('print from partial line, first part') if True else 0
# second line
1 if False else print('print from partial line, second part')
print('print from compound statement, first part'); pass
# third line
pass; print('print from compound statement, second part')
def foo():
print('bar')
foo()
What approach would make this process more robust?
Have you considered using the inspect module? If you are willing to say that you always want the annotations next to the top most call, and the file you are annotating is simple enough, you can get reasonable results. The following is my attempt, which overrides the built in print function and looks at a stack trace to determine where print was called:
import inspect
import sys
from io import StringIO
# filename -> {line number -> [captured output of each print reached from it]}
file_changes = {}


def anno_print(old_print, *args, **kwargs):
    """Stand-in for ``print`` that records output instead of showing it.

    The output is filed under the file name and line number of the
    second-outermost stack frame. When the target is run by a module-level
    ``exec`` this is the top-level statement of the target that (directly
    or through nested calls) led to the print.

    old_print -- the real print function; it does the actual formatting
    args, kwargs -- forwarded to *old_print* unchanged
    """
    frame_info = inspect.getouterframes(inspect.currentframe())[-2]
    filename, line_number = frame_info[1], frame_info[2]
    # drop the frame reference promptly to avoid reference cycles
    del frame_info
    outputs = file_changes.setdefault(filename, {}).setdefault(line_number, [])
    orig_stdout = sys.stdout
    capture_stdout = StringIO()
    sys.stdout = capture_stdout
    try:
        old_print(*args, **kwargs)
    finally:
        # restore even if formatting one of the arguments raises
        sys.stdout = orig_stdout
    outputs.append(capture_stdout.getvalue())
def make_annotated_file(old_source, new_source):
    """Write *old_source* to *new_source* with recorded output as comments.

    After each source line, every output recorded for that line number in
    ``file_changes[old_source]`` is written as ``#``-prefixed comment lines.
    A file for which nothing was recorded is copied without annotations.
    """
    changes = file_changes.get(old_source, {})
    with open(old_source) as old_source_F:
        content = old_source_F.readlines()
    with open(new_source, 'w') as new_source_F:
        for line_num, line in enumerate(content, start=1):
            new_source_F.write(line)
            # the last line may lack a newline; comments must start fresh
            if not line.endswith('\n'):
                new_source_F.write('\n')
            for output in changes.get(line_num, ()):
                # drop only a real trailing newline, so output produced
                # with print(..., end='') keeps its last character
                if output.endswith('\n'):
                    output = output[:-1]
                new_source_F.write('#' + output.replace('\n', '\n#') + '\n')
if __name__=='__main__':
    # `__builtins__` is a CPython implementation detail (a module in
    # __main__, a dict elsewhere); the `builtins` module is the supported way
    import builtins

    target_source = "foo.py"
    old_print = builtins.print
    builtins.print = lambda *args, **kwargs: anno_print(old_print, *args, **kwargs)
    try:
        with open(target_source) as f:
            code = compile(f.read(), target_source, 'exec')
        # run from module level so the target's top-level frame is the
        # second-outermost one, which is what anno_print looks at
        exec(code)
    finally:
        # put the real print back even if the target script raises
        builtins.print = old_print
    make_annotated_file(target_source, "foo_annotated.py")
If I run it on the following file "foo.py":
def foo():
print("a")
print("b")
def cool():
foo()
print("c")
def doesnt_print():
a = 2 + 3
print(1+2)
foo()
doesnt_print()
cool()
The output is "foo_annotated.py":
def foo():
print("a")
print("b")
def cool():
foo()
print("c")
def doesnt_print():
a = 2 + 3
print(1+2)
#3
foo()
#a
#b
doesnt_print()
cool()
#a
#b
#c
You can make it a lot easier by using an existing python parser to extract top level statements from your code. The ast module in the standard library for example. However, ast loses some information like comments.
Libraries built with source code transformations (which you are doing) in mind might be more suited here. redbaron is a nice example.
To carry globals to the next exec(), you have to use the second parameter (documentation):
environment = {}
for statement in statements:
exec(statement, environment)
Thanks to feedback from @Lennart, I've almost got it working... It iterates through the source line by line, clumping lines into longer and longer blocks for as long as the current block raises a SyntaxError when fed to exec(). Here it is in case it's of use to anyone else:
import sys
from io import StringIO
def intercept_stdout(func):
    """Wrap *func* so that calling it returns the text it wrote to stdout.

    While *func* runs, ``sys.stdout`` is replaced by an in-memory buffer.
    The value returned by *func* itself is discarded; the wrapper returns
    the captured text instead.
    """
    def wrapper(*args, **kwargs):
        """Call the wrapped function and return everything it printed."""
        original_stdout = sys.stdout
        capture_stdout = StringIO()
        sys.stdout = capture_stdout
        try:
            func(*args, **kwargs)
        finally:
            # restore unconditionally: if func raises, the process must not
            # be left printing into a buffer nobody will ever read
            sys.stdout = original_stdout
        return capture_stdout.getvalue()
    return wrapper
@intercept_stdout
def exec_line(source, block_globals):
    """Execute one block of source code in the namespace *block_globals*.

    Because of the ``intercept_stdout`` decorator, calling this returns the
    text the block printed. Passing the same dict for every block lets
    later blocks see names defined by earlier ones.
    """
    # NOTE: this runs arbitrary code - only feed it source you trust
    exec(source, block_globals)
def read_target(name):
    """Read the script at path *name* and return its lines as a list.

    Every returned line ends with a newline: if the last line of the file
    has none, one is appended so that a comment inserted after it starts on
    its own line. An empty file yields an empty list.
    """
    with open(name) as f:
        source = f.readlines()
    # guard on `source` first: an empty file has no last line to inspect
    if source and not source[-1].endswith('\n'):
        source[-1] += '\n'
    return source
def get_blocks(target, block_globals):
    """Split *target* (a list of source lines) into executable blocks.

    Starting from a single line, a block is grown one line at a time for as
    long as executing it raises SyntaxError. Each block is executed in
    *block_globals* as a side effect; its output is captured and dropped.

    Returns a list of ``(start, end)`` line-index pairs, ``end`` exclusive.

    Known limitation: a block is closed as soon as it executes, so a ``def``
    whose body spans several lines is cut after its first body line and the
    rest of the body then fails to execute on its own.

    Raises SyntaxError when a block still does not compile after the last
    line of *target* has been added.
    """
    outputs = []
    lines = 1  # length of the block currently being built

    @intercept_stdout
    def eval_blocks(start_index, end_index, full_source, block_globals):
        """Grow the block starting at *start_index* until it executes."""
        nonlocal lines
        while True:
            try:
                exec(''.join(full_source[start_index:end_index]), block_globals)
                return
            except SyntaxError:
                # nothing left to append: re-raise instead of retrying the
                # same slice forever
                if end_index >= len(full_source):
                    raise
                lines += 1
                end_index = start_index + lines

    for i, s in enumerate(target):
        # skip the lines already swallowed by the previous block
        if lines > 1:
            lines -= 1
            continue
        eval_blocks(i, i + 1, target, block_globals)
        outputs.append((i, i + lines))
    return outputs
def annotate_source(target, block_globals=None):
    """Return the source of *target* with each block's output as comments.

    target -- path of the script to annotate
    block_globals -- namespace the blocks are executed in; a fresh dict is
        created per call when omitted (a shared ``{}`` default would leak
        state from one call into the next)
    """
    if block_globals is None:
        block_globals = {}
    target_source = read_target(target)
    # get each block's start and end indices
    outputs = get_blocks(target_source, block_globals)
    annotated_source = []
    for start, end in outputs:
        code = ''.join(target_source[start:end])
        annotated_source.append(code)
        printed_lines = exec_line(code, block_globals).split('\n')
        # output ending in a newline leaves one empty trailing element
        if printed_lines and printed_lines[-1] == '':
            printed_lines.pop()
        annotated_source.extend('# ' + line + '\n' for line in printed_lines)
    return ''.join(annotated_source)
def main():
### script to format goes here
target_script = 'example.py'
### name of formatted script goes here
new_script = 'annotated_example.py'
new_code = annotate_source(target_script)
with open(new_script, 'w') as f:
f.write(new_code)
if __name__ == '__main__':
main()
It works for each of the two examples above. However, when trying to execute the following:
def foo():
print('bar')
print('baz')
foo()
Instead of giving me the desired output:
def foo():
print('bar')
print('baz')
foo()
# bar
# baz
It fails with a very long traceback:
Traceback (most recent call last):
File "ex.py", line 55, in eval_blocks
exec(''.join(full_source[start_index:end_index]), block_globals)
File "<string>", line 1
print('baz')
^
IndentationError: unexpected indent
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "ex.py", line 55, in eval_blocks
exec(''.join(full_source[start_index:end_index]), block_globals)
File "<string>", line 1
print('baz')
^
IndentationError: unexpected indent
During handling of the above exception, another exception occurred:
...
Traceback (most recent call last):
File "ex.py", line 55, in eval_blocks
exec(''.join(full_source[start_index:end_index]), block_globals)
File "<string>", line 1
print('baz')
^
IndentationError: unexpected indent
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "ex.py", line 102, in <module>
main()
File "ex.py", line 97, in main
new_code = annotate_source(target_script)
File "ex.py", line 74, in annotate_source
outputs = get_blocks(target_source, block_globals)
File "ex.py", line 65, in get_blocks
outputs.append((eval_blocks(i, i+1, target, block_globals), i, lines))
File "ex.py", line 16, in wrapper
func(*args, **kwargs)
File "ex.py", line 59, in eval_blocks
full_source, block_globals)
File "ex.py", line 16, in wrapper
func(*args, **kwargs)
...
File "ex.py", line 16, in wrapper
func(*args, **kwargs)
File "ex.py", line 55, in eval_blocks
exec(''.join(full_source[start_index:end_index]), block_globals)
RecursionError: maximum recursion depth exceeded while calling a Python object
Looks like this happens due to def foo(): print('bar') being valid code and so print('baz') isn't being included in the function, causing it to fail with an IndentationError. Any ideas as to how to avoid this issue? I suspect it may require diving into ast as suggested above but would love further input or a usage example.
It looks like except SyntaxError isn't a sufficient check for a full function, as it will finish the block at the first line which doesn't create a syntax error. What you want is to make sure the whole function is encompassed in the same block. To accomplish this:
check if the current block is a function. Check if the first line starts with def.
check if the next line in full_source begins with a higher or equal number of spaces as the second line of the function (the one which defines the indent). This will mean that the eval_blocks will check if the next line of the code has higher or equal spacing, and is therefore inside the function.
The code for get_blocks might look something like this:
# function for finding num of spaces at beginning (could be in global spectrum)
def get_front_whitespace(string):
    """Return the indentation width of *string* in columns.

    Only leading spaces and tabs are counted. A tab advances to the next
    multiple of 8 columns, the way Python itself measures indentation, so
    two spaces followed by a tab count as 8 columns rather than 10.
    """
    stripped = string.lstrip(' \t')
    leading = string[:len(string) - len(stripped)]
    return len(leading.expandtabs(8))
...
def get_blocks(target, block_globals):
    """Split *target* (a list of source lines) into executable blocks.

    A block is grown one line at a time while executing it raises
    SyntaxError. A block that starts with ``def`` is additionally grown
    while the following line is blank or indented at least as far as the
    first body line, so that the whole function lands in one block.
    Each block is executed in *block_globals* as a side effect; its output
    is captured and dropped.

    Returns a list of ``(start, end)`` line-index pairs, ``end`` exclusive.
    Raises SyntaxError when a block still does not compile after the last
    line of *target* has been added.
    """
    outputs = []
    lines = 1  # length of the block currently being built

    @intercept_stdout
    def eval_blocks(start_index, end_index, full_source, block_globals):
        """Grow the block starting at *start_index* until it is complete."""
        nonlocal lines
        total = len(full_source)
        # compare the first word, so names like `default` do not count
        block_is_func = full_source[start_index].split(None, 1)[:1] == ['def']
        while True:
            try:
                exec(''.join(full_source[start_index:end_index]), block_globals)
            except SyntaxError:
                # nothing left to append: re-raise instead of looping forever
                if end_index >= total:
                    raise
            else:
                # only a def that already has a body line, and is followed
                # by another line, can still be incomplete
                if (not block_is_func or end_index - start_index < 2
                        or end_index >= total):
                    return
                func_indent = get_front_whitespace(full_source[start_index + 1])
                # end_index is exclusive, so this is the first line after
                # the block
                next_line = full_source[end_index]
                if next_line.strip() and get_front_whitespace(next_line) < func_indent:
                    return
            lines += 1
            end_index = start_index + lines

    for i, s in enumerate(target):
        # skip the lines already swallowed by the previous block
        if lines > 1:
            lines -= 1
            continue
        eval_blocks(i, i + 1, target, block_globals)
        outputs.append((i, i + lines))
    return outputs
This might raise an IndexError if the last line of the function is also the last line of the file, because of the forward indexing in next_index_spaces = get_front_whitespace( full_source[end_index + 1] )
This could also be used for selection statements and loops, which may have the same problem: just check for if for and while at the beginning of the start_index line as well as for def. This would cause the comment to be after the indented region, but as printed output inside indented regions are dependent on the variables which are used to call them, I think having the output outside the indent would be necessary in any case.
Try https://github.com/eevleevs/hashequal/
I made this as an attempt to replace Mathcad. Does not act on print statements, but on #= comments, e.g.:
a = 1 + 1 #=
becomes
a = 1 + 1 #= 2
Related
I'm trying to run the below python script (vcf2treemix.py) with the command
<./vcf2treemix.py -vcf allsamples14_filtered_1_autosomes38_bisnps.vcf.gz -pop allsamples14.clust.pop>
I got this error with both python 2 and 3
######### error ###
Traceback (most recent call last):
File "./vcf2treemix.py", line 99, in <module>
main()
File "./vcf2treemix.py", line 95, in main
pop_obj = get_pops(pop_file)
File "./vcf2treemix.py", line 34, in get_pops
pops[fields[0]] = fields[1].split()
IndexError: list index out of range
######### vcf2treemix.py ###
#!/usr/bin/python
# vcf2treemix.py
# Converts a vcf file into TreeMix input
import argparse
from collections import OrderedDict
parser = argparse.ArgumentParser(description="Parsing statistical output of"
" VCFtools")
parser.add_argument("-vcf", dest="vcf_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14_filtered_1_autosomes38_bisnps_main.vcf.gz",
required=True)
parser.add_argument("-pop", dest="pop_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14.clust.pop",
required=True)
arg = parser.parse_args()
def get_pops(pop_file):
    """
    Returns a dictionary with pop identifier as key and taxa as a list of
    strings. In the pop file, each population should be on one line, starting
    with the pop name, a colon and the corresponding taxa separated by
    whitespace. Blank lines are ignored.
    E.g.:
    pop1: taxon1 taxon2 taxon3

    Raises ValueError, naming the offending line, when a non-blank line
    contains no colon.
    """
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line_no, line in enumerate(fh, start=1):
            line = line.strip()
            # a trailing empty line is common and carries no information
            if not line:
                continue
            fields = line.split(":")
            if len(fields) < 2:
                raise ValueError(
                    "{}, line {}: expected 'pop: taxon1 taxon2 ...', "
                    "got {!r}".format(pop_file, line_no, line))
            pops[fields[0].strip()] = fields[1].split()
    return pops
def vcf2treemix(vcf_file, pop_obj):
    """
    Converts a vcf file into treemix format.

    vcf_file -- path of a plain or gzip-compressed (``.gz``) VCF file
    pop_obj -- mapping of population name to list of taxon names

    The output is written next to the input, with the ``.vcf`` / ``.vcf.gz``
    suffix replaced by ``.tmix``. Loci whose ALT column is longer than one
    character are skipped. Raises ValueError if a data line appears before
    the ``#CHROM`` header line.
    """
    import gzip

    # str.strip(".vcf") would strip *characters*, not the suffix
    is_gz = vcf_file.endswith(".gz")
    base = vcf_file[:-len(".gz")] if is_gz else vcf_file
    if base.endswith(".vcf"):
        base = base[:-len(".vcf")]
    output_name = base + ".tmix"

    opener = gzip.open if is_gz else open
    taxa_pos = None
    with opener(vcf_file, "rt") as vcf_fh, open(output_name, "w") as output_fh:
        # Write header for tmix file
        output_fh.write("{}\n".format(" ".join(pop_obj.keys())))
        for line in vcf_fh:
            # Skip header
            if line.startswith("##"):
                continue
            # Get taxon positions
            if line.startswith("#CHROM"):
                taxa_pos = line.strip().split()
                continue
            # Ignore empty lines
            if line.strip() == "":
                continue
            if taxa_pos is None:
                raise ValueError(
                    "{}: data line found before the #CHROM header".format(vcf_file))
            fields = line.strip().split()
            # Ignore loci with more than two alleles
            if len(fields[4]) > 1:
                continue
            # Get allele counts for each population
            temp_pop = OrderedDict((x, [0, 0]) for x in pop_obj.keys())
            for pop, taxa in pop_obj.items():
                for taxon in taxa:
                    # Keep only the first sub-field (the genotype), so that
                    # digits in later sub-fields such as depth aren't counted
                    gen = fields[taxa_pos.index(taxon)].split(":")[0]
                    # Skip if gen is missing data
                    if gen in ("./.", ".|."):
                        continue
                    temp_pop[pop][0] += gen.count("0")
                    temp_pop[pop][1] += gen.count("1")
            # Write current locus to file
            output_fh.write("{}\n".format(" ".join(
                [str(x[0]) + "," + str(x[1]) for x in temp_pop.values()])))
def main():
# Args
vcf_file = arg.vcf_file
pop_file = arg.pop_file
pop_obj = get_pops(pop_file)
vcf2treemix(vcf_file, pop_obj)
main()
I have zero experience with python and I just run the script to manipulate genetic data.
Any help will be highly appreciable.
Thanks
Ali
I tried python 2 and 3 and I expect the script to work straightforward. I think there is no problem with the input data.
I have a function:
def get_translations(string):
    """Parse comma-separated word pairs into (english, māori) tuples.

    The first line of *string* is a header naming the column order, either
    ``english,māori`` or ``māori,english``; every following line is one
    pair. Pairs are always returned english-first. With any other header a
    message is printed and an empty list is returned.
    """
    lines = string.splitlines()
    # look at the header line only, not at the whole text
    header = lines[0].strip() if lines else ''
    if header == 'english,māori':
        order = 1
    elif header == 'māori,english':
        order = -1
    else:
        print("Header language not recognized!")
        return []
    return [tuple(seperate_words.split(",")[::order])
            for seperate_words in lines[1:]]
I am trying to call this function in a new function get_translations_from_file so I can perform what the first function is meant to do. This is because I am trying to open a file and use that file as the string for the first function. This is what I have tried to call the first function but I have had no success:
def get_translations_from_file(filename):
"""stuff"""
file = open(filename)
string = file.read()
get_translations(string)
The test is:
filename = 'places.txt'
for terms in get_translations_from_file(filename):
print(terms)
The contents of the file is:
english,māori
New Zealand,Aotearoa
North Island,Te Ika-a-Māui
South Island,Te Waipounamu
Wellington,Te Whanganui a Tara
Christchurch,Ōtautahi
Hamilton,Kirikiriroa
Auckland,Tāmaki Makaurau
You are calling get_translations, but ignoring the return value. Since get_translations_from_file has no explicit return statement, it implicitly returns None. To make a long story short, you need to return the value from get_translations:
def get_translations_from_file(filename):
    """Read *filename* and return the translations parsed from its text."""
    # the with-block closes the file even if reading fails
    with open(filename) as file:
        string = file.read()
    return get_translations(string)  # Here!
I've made a Python program with an interface that receives the name of a file and some numerical data. When I create methods to manipulate the filename, the directory, and so on, it raises an error.
I believe the error comes with object orientation. How can I solve this?
I've divided the program in two parts: one to solve my problem (no object orientation) and another to receive user data.
Error:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Python27\lib\lib-tk\Tkinter.py", line 1541, in __call__
return self.func(*args)
File "teste.py", line 60, in verificaSenha
if (Procura_nome(nome_arq) == 1):
NameError: global name 'Procura_nome' is not defined
The complete code: https://pastebin.com/Br6JAcuR
Problematic method:
def Procura_nome(nome_arq):
dir = Percorre_dir_entrada()
arquivo = dir + dir[2] + nome_arq + + ".shp"
os.path.isfile(nome_arq)
try:
with open(arquivo, 'r') as f:
return 1
except IOError:
return 0
Every instance method of a Python class must take self as its first parameter; this argument refers to the instance of your class. Likewise, when you use the class's methods and attributes from inside the class, you must access them through self.
You probably need to add self to all your method class in your file.
You also need to remove one '+' on the 3rd line.
def Procura_nome(self, nome_arq):
    """Return 1 if ``<input dir>/<nome_arq>.shp`` can be opened, else 0.

    The input directory is whatever ``self.Percorre_dir_entrada()`` returns.
    """
    pasta = self.Percorre_dir_entrada()
    # NOTE(review): presumably the lookup returns 0 when no directory is
    # found - confirm against Percorre_dir_entrada
    if not pasta:
        return 0
    # join with the platform separator instead of borrowing the third
    # character of the path, which is only a separator for "C:\..." paths
    arquivo = os.path.join(pasta, nome_arq + ".shp")
    try:
        with open(arquivo, 'r'):
            return 1
    except IOError:
        return 0
Your Percorre_dir_entrada and Percorre_dir_saida function are doing exactly the same thing on different files, you should think about doing a generic version who take the file name as param like so :
def Percorre_dir(self, file_name):
    """Return the absolute path of the first matching entry, or 0.

    Entries of the current working directory are examined in the order
    ``os.listdir`` reports them; an entry matches when *file_name* occurs
    anywhere in its absolute path.
    """
    for entry in os.listdir("."):
        filename = os.path.abspath(entry)
        # NOTE(review): the match is against the whole absolute path, so a
        # parent directory containing file_name makes every entry match
        if file_name in filename:
            return filename
    # callers test for 0 to detect "not found"
    return 0
then call it :
Percorre_dir("Saida")
Percorre_dir("Entrada")
I'm having some trouble optimizing this part of code.
It works, but seems unnecessary slow.
The function searches after a searchString in a file starting on line line_nr and returns the line number for first hit.
import linecache
def searchStr(fileName, searchString, line_nr=1, linesInFile=None):
    """Return the number of the first line containing *searchString*.

    fileName -- path of the file to search
    searchString -- text to look for
    line_nr -- 1-based line number to start searching from
    linesInFile -- last line number to examine; None means search to the
        end of the file

    Returns None when there is no match. The search stops at the end of the
    file, so an over-estimated *linesInFile* costs nothing.
    """
    # line numbers are 1-based; linecache returns '' for anything below 1
    line_nr = max(line_nr, 1)
    while linesInFile is None or line_nr <= linesInFile:
        line = linecache.getline(fileName, line_nr)
        # every real line ends with a newline, so '' means past the end
        if not line:
            return None
        if searchString in line:
            return line_nr
        line_nr += 1
    return None
I've tried something along these lines, but never managed to implement the "start on a certain line number"-input.
Edit: The usecase. I'm post processing analysis files containing text and numbers that are split into different sections with headers. The headers on line_nr are used to break out chunks of the data for further processing.
Example of call:
startOnLine = searchStr(fileName, 'Header 1', 1, 10000000):
endOnLine = searchStr(fileName, 'Header 2', startOnLine, 10000000):
Why don't you start with simplest possible implementation ?
def search_file(filename, target, start_at=0):
    """Return the 0-based index of the first line containing *target*.

    Lines with an index below *start_at* are skipped. The file is read
    lazily, one line at a time. Returns None when nothing matches.
    """
    with open(filename) as infile:
        for line_no, line in enumerate(infile):
            if line_no >= start_at and target in line:
                return line_no
    return None
I guess your file is like:
Header1 data11 data12 data13..
name1 value1 value2 value3...
...
...
Header2 data21 data22 data23..
nameN valueN1 valueN2 valueN3..
...
Does the 'Header' string have a constant format (e.g. do all headers start with '#' or something similar)? If so, you can read the file line by line, check whether each line matches this format (e.g. if line[0]=='#'), and handle the two kinds of lines separately (definition lines and data lines in your example).
Record class:
class Record:
def __init__(self):
self.data={}
self.header={}
def set_header(self, line):
...
def add_data(self, line):
...
iterate part:
def parse(p_file):
    """Yield one Record per header line found in *p_file*.

    A line starting with ``#`` opens a new record and is passed to its
    ``set_header``; every following line is passed to ``add_data`` until
    the next header. Lines before the first header are skipped, and
    nothing is yielded when there is no header at all.
    """
    record = None
    for line in p_file:
        if line.startswith("#"):
            # hand out the finished record before starting the next one
            if record is not None:
                yield record
            record = Record()
            record.set_header(line)
        elif record is not None:
            record.add_data(line)
    if record is not None:
        yield record
main func:
data_file = open(...)
for rec in parse(data_file):
...
I have a very very large text file (much larger than can fit in memory). What I would like to do is use something similar to:
for record in myFile:
process_record();
with the added trick that my records are separated by blank lines (with all kinds of stuff in between). For example...
data1
data2,data3,moredata
anotherrecord,otherstuff
yippee
kaiyay
mom
aThird,record:here
How would one iterate through the file in python where each loop iteration accesses a single record from the file?
You can do it with a generator function:
def records(textfile):
    """Yield the blank-line-separated records of *textfile*, one at a time.

    textfile -- any iterable of lines, e.g. an open file; it is consumed
        lazily, so only one record is held in memory

    Each record is the concatenation of its lines, newlines included.
    Lines holding only whitespace count as separators, and runs of
    separators never produce empty records.
    """
    record_lines = []
    for line in textfile:
        if line.strip():
            record_lines.append(line)
        elif record_lines:
            yield ''.join(record_lines)
            record_lines = []
    # the last record is usually not followed by a blank line
    if record_lines:
        yield ''.join(record_lines)
for record in records(the_file):
process(record)
You could create an iterator that joins the lines until you find a blank line.
class MyIter:
    """Iterate over the blank-line-separated records of a line source.

    Each record is returned as one string: its lines, stripped of
    surrounding whitespace and joined with commas.
    """

    def __init__(self, infile):
        # iter() makes plain lists resume where the previous record ended;
        # for an open file it returns the file itself
        self.infile = iter(infile)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record; raise StopIteration when none is left."""
        lines = []
        for line in self.infile:
            line = line.strip()
            if line:
                lines.append(line)
            elif lines:
                # a blank line ends a record only once it has content, so
                # runs of blank lines do not end the iteration early
                break
        if not lines:
            raise StopIteration
        return ",".join(lines)

    # Python 2 spells the iterator method `next`
    next = __next__
and try it with
for line in MyIter(infile):
print line