Python unified diff with line numbers from both "files" - python

I'm trying to figure out a way to create unified diffs with line numbers only showing N lines of context. I have been unable to do this with difflib.unified_diff. I need to show changes in both files.
The closest I can come is using diff on the command line like so:
/usr/bin/diff
--unchanged-line-format=' %.2dn %L'
--old-line-format="-%.2dn %L"
--new-line-format="+%.2dn %L"
file1.py
file2.py
BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context with a custom line format (eg. -U2 is not compatible with --line-format "conflicting output style options").
Below is an example of what I'd like to accomplish (the same output as the above diff, but only showing 1 line of context surrounding changes):
+01: def renamed_function()
-01: def original_function():
02:
+03: """ Neat stuff here """
04:
21:
+22: # Here's a new comment
23:
85: # Output the value of foo()
+86: print "Foo is %s"%(foo())
-86: print foo()
87:

I was able to figure out something very close to what I wanted to do. It's slower than regular diff, though. Here's the entire code, from my project GitGate.
def unified_diff(to_file_path, from_file_path, context=1):
""" Returns a list of differences between two files based
on some context. This is probably over-complicated. """
pat_diff = re.compile(r'## (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) ##')
from_lines = []
if os.path.exists(from_file_path):
from_fh = open(from_file_path,'r')
from_lines = from_fh.readlines()
from_fh.close()
to_lines = []
if os.path.exists(to_file_path):
to_fh = open(to_file_path,'r')
to_lines = to_fh.readlines()
to_fh.close()
diff_lines = []
lines = difflib.unified_diff(to_lines, from_lines, n=context)
for line in lines:
if line.startswith('--') or line.startswith('++'):
continue
m = pat_diff.match(line)
if m:
left = m.group(1)
right = m.group(2)
lstart = left.split(',')[0][1:]
rstart = right.split(',')[0][1:]
diff_lines.append("## %s %s ##\n"%(left, right))
to_lnum = int(lstart)
from_lnum = int(rstart)
continue
code = line[0]
lnum = from_lnum
if code == '-':
lnum = to_lnum
diff_lines.append("%s%.4d: %s"%(code, lnum, line[1:]))
if code == '-':
to_lnum += 1
elif code == '+':
from_lnum += 1
else:
to_lnum += 1
from_lnum += 1
return diff_lines

Related

matplotlib xlim TypeError: '>' not supported between instances of 'int' and 'list'

this is the original repo i'm trying to run in my computer: https://github.com/kreamkorokke/cs244-final-project
import os
import matplotlib.pyplot as plt
import argparse
from attacker import check_attack_type
IMG_DIR = "./plots"
def read_lines(f, d):
lines = f.readlines()[:-1]
for line in lines:
typ, time, num = line.split(',')
if typ == 'seq':
d['seq']['time'].append(float(time))
d['seq']['num'].append(float(num))
elif typ == 'ack':
d['ack']['time'].append(float(time))
d['ack']['num'].append(float(num))
else:
raise "Unknown type read while parsing log file: %s" % typ
def main():
parser = argparse.ArgumentParser(description="Plot script for plotting sequence numbers.")
parser.add_argument('--save', dest='save_imgs', action='store_true',
help="Set this to true to save images under specified output directory.")
parser.add_argument('--attack', dest='attack',
nargs='?', const="", type=check_attack_type,
help="Attack name (used in plot names).")
parser.add_argument('--output', dest='output_dir', default=IMG_DIR,
help="Directory to store plots.")
args = parser.parse_args()
save_imgs = args.save_imgs
output_dir = args.output_dir
attack_name = args.attack
if save_imgs and attack_name not in ['div', 'dup', 'opt'] :
print("Attack name needed for saving plot figures.")
return
normal_log = {'seq':{'time':[], 'num':[]}, 'ack':{'time':[], 'num':[]}}
attack_log = {'seq':{'time':[], 'num':[]}, 'ack':{'time':[], 'num':[]}}
normal_f = open('log.txt', 'r')
attack_f = open('%s_attack_log.txt' % attack_name, 'r')
read_lines(normal_f, normal_log)
read_lines(attack_f, attack_log)
if attack_name == 'div':
attack_desc = 'ACK Division'
elif attack_name == 'dup':
attack_desc = 'DupACK Spoofing'
elif attack_name == 'opt':
attack_desc = 'Optimistic ACKing'
else:
raise 'Unknown attack type: %s' % attack_name
norm_seq_time, norm_seq_num = normal_log['seq']['time'], normal_log['seq']['num']
norm_ack_time, norm_ack_num = normal_log['ack']['time'], normal_log['ack']['num']
atck_seq_time, atck_seq_num = attack_log['seq']['time'], attack_log['seq']['num']
atck_ack_time, atck_ack_num = attack_log['ack']['time'], attack_log['ack']['num']
plt.plot(norm_seq_time, norm_seq_num, 'b^', label='Regular TCP Data Segments')
plt.plot(norm_ack_time, norm_ack_num, 'bx', label='Regular TCP ACKs')
plt.plot(atck_seq_time, atck_seq_num, 'rs', label='%s Attack Data Segments' % attack_desc)
plt.plot(atck_ack_time, atck_ack_num, 'r+', label='%s Attack ACKs' % attack_desc)
plt.legend(loc='upper left')
x = max(max(norm_seq_time, norm_ack_time),max(atck_seq_time, atck_ack_time))
y = max(max(norm_seq_num, norm_ack_num),max(atck_seq_num, atck_ack_num))
plt.xlim(0, x)
plt.ylim(0,y)
plt.xlabel('Time (s)')
plt.ylabel('Sequence Number (Bytes)')
if save_imgs:
# Save images to figure/
if not os.path.exists(output_dir):
os.makedirs(output_dir)
plt.savefig(output_dir + "/" + attack_name)
else:
plt.show()
normal_f.close()
attack_f.close()
if __name__ == "__main__":
main()
after running this i get this error
Traceback (most recent call last):
File "plot.py", line 85, in <module>
main()
File "plot.py", line 66, in main
plt.xlim(0, a)
File "/usr/lib/python3/dist-packages/matplotlib/pyplot.py", line 1427, in xlim
ret = ax.set_xlim(*args, **kwargs)
File "/usr/lib/python3/dist-packages/matplotlib/axes/_base.py", line 3267, in set_xlim
reverse = left > right
TypeError: '>' not supported between instances of 'int' and 'list'
Done! Please check ./plots for all generated plots.
how can i solve this problem? or better yet if there is another way of running this project? i installed matplotlib via pip3 install matplotlib command (same with scapy) and my main python version is python2 right now but i run the project with python3, could the issue be about this? what am i missing? or is it about mininet itself?
The problem is in this line
x = max(max(norm_seq_time, norm_ack_time),max(atck_seq_time, atck_ack_time))
IIUC, you wanna assign to x the maximum value among all those four lists. However, when you pass two lists to the max function, such as max(norm_seq_time, norm_ack_time), it will return the list it considers the greater one, and not the highest value considering both lists.
Instead, you can do something like:
x = max(norm_seq_time + norm_ack_time + atck_seq_time + atck_ack_time)
This will concatenate the four lists into a single one. Then, the function will return the highest value among all of them. You might wanna do that to the calculation of y as well.
If this is not what you wanted, or if you have any further issues, please let us know.
with the help of a friend we solved this problem by changing a part in code into this:
max_norm_seq_time = max(norm_seq_time) if len(norm_seq_time) > 0 else 0
max_norm_ack_time = max(norm_ack_time) if len(norm_ack_time) > 0 else 0
max_atck_seq_time = max(atck_seq_time) if len(atck_seq_time) > 0 else 0
max_atck_ack_time = max(atck_ack_time) if len(atck_ack_time) > 0 else 0
x = max((max_norm_seq_time, max_norm_ack_time,\
max_atck_seq_time, max_atck_ack_time))
plt.xlim([0,x])
max_norm_seq_num = max(norm_seq_num) if len(norm_seq_num) > 0 else 0
max_norm_ack_num = max(norm_ack_num) if len(norm_ack_num) > 0 else 0
max_atck_seq_num = max(atck_seq_num) if len(atck_seq_num) > 0 else 0
max_atck_ack_num = max(atck_ack_num) if len(atck_ack_num) > 0 else 0
plt.ylim([0, max((max_norm_seq_num, max_norm_ack_num,\
max_atck_seq_num, max_atck_ack_num))])
```
writing here just in case anyone else needs it.

Cyclic relations between nodes A>B>C>A

This is how it looks like before and after, for the problem am trying to solve using Python. I have been trying for weeks. And am failing so miserable to tell Python to do the following:
STEP1: If you find on this document: "LinkedTo=" * (Example value: Node_3)*
STEP2: Then Stop
STEP3: Go to the previous NodePosX= and copy the value * (Example value: 10)*
STEP4: Go to the previous NotePosY= and copy the value * (Example value: 100)*
STEP5: Then find the next "Node_3" on the document
STEP6: And replace inside the NodePosX=30 and NodePosY=300 for the copied values 10 and 100
STEP7: Then look for the next "LinkedTo=" * (Example value: Node_5)* and repeat the STEP2 to STEP5
This is how it looks like Before running the Python script:
Begin
Name="Node_1"
NodePosX=10
NodePosY=100
LinkedTo=Node_3
LinkedTo=Node_5
End Object
Begin
Name="Node_2"
NodePosX=20
NodePosY=200
End Object
Begin
Name="Node_3"
NodePosX=30
NodePosY=300
End Object
Begin
Name="Node_4"
NodePosX=40
NodePosY=400
End Object
Begin
Name="Node_5"
NodePosX=50
NodePosY=500
End Object
This is how it should look like AFTER running the Python script:
Begin
Name="Node_1"
NodePosX=10
NodePosY=100
LinkedTo=Node_3
LinkedTo=Node_5
End Object
Begin
Name="Node_2"
NodePosX=20
NodePosY=200
End Object
Begin
Name="Node_3"
NodePosX=10
NodePosY=100
End Object
Begin
Name="Node_4"
NodePosX=40
NodePosY=400
End Object
Begin
Name="Node_5"
NodePosX=10
NodePosY=100
End Object
Do you think am asking to much from Python to do?
Any better suggestions for the title to this problem?
I hired a developer and this is the code they wrote
'''
By: Alex Reichenbach
'''
import re
begin_regex = re.compile("Begin")
name_regex = "(?<=Name=\"Node_).*(?=\")"
posX_regex = "(?<=NodePosX=).*"
posY_regex = "(?<=NodePosY=).*"
linkedTo_regex = "(?<=LinkedTo=).*"
end_regex = re.compile("End Object")
## Reading the contents of the file
text = open("1-Example-Original.txt", "r").read()
class Node:
def __init__(self):
self.name = ""
self.nodePosX = 0
self.nodePosY = 0
self.linked_to = []
def __str__(self):
linked = ""
for l in self.linked_to:
linked += "\nLinkedTo="+l
return """Begin
Name="%s"
NodePosX=%s
NodePosY=%s%s
End Object
"""%(self.name, self.nodePosX, self.nodePosY, linked)
## Read the text into the node objects
nodes = []
current_node = None
for line in text.split('\n'): ## Iterate through each line
if begin_regex.match(line): ## Begin
current_node = Node()
nodes.append(current_node)
elif re.findall(name_regex, line): ## Name
name = re.findall(name_regex, line)[0]
current_node.name = name
elif re.findall(posX_regex, line): ## PosX
posX = re.findall(posX_regex, line)[0]
current_node.nodePosX = posX
elif re.findall(posY_regex, line): ## PosY
posY = re.findall(posY_regex, line)[0]
current_node.nodePosY = posY
elif re.findall(linkedTo_regex, line): ## LinkedTo
name = re.findall(linkedTo_regex, line)[0]
current_node.linked_to.append(name)
## Copy the linked_to attributes
for i in range(len(nodes)):
for j in range(i, len(nodes)):
node1 = nodes[i]
node2 = nodes[j]
if node2.name in node1.linked_to:
node2.nodePosX = node1.nodePosX
node2.nodePosY = node1.nodePosY
## Print it all out
s = ""
for node in nodes:
s += str(node)
print(s)
## Write to File?
open("_edited.txt", "w").write(s)

Doing operations on a large data set

I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is the PSL records are large (10-30 Mb text documents). I wrote a program that works on short records and on the long records given enough time but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds. Mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys
class PSLreader :
'''
Class to provide reading of a file containing psl alignments
formatted sequences:
object instantiation:
myPSLreader = PSLreader(<file name>):
object attributes:
fname: the initial file name
methods:
readPSL() : reads psl file, yielding those alignments that are within the first or last
1000 nt
readPSLpairs() : yields psl pairs that support a circular hypothesis
Author: David Bernick
Date: May 12, 2013
'''
def __init__ (self, fname=''):
'''contructor: saves attribute fname '''
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
'''
using filename given in init, returns each filtered psl records
that contain alignments that are within the terminal 1000nt of
the target. Incomplete psl records are discarded.
If filename was not provided, stdin is used.
This method selects for alignments that could may be part of a
circle.
Illumina pairs aligned to the top strand would have read1(+) and read2(-).
For the bottoms trand, read1(-) and read2(+).
For potential circularity,
these are the conditions that can support circularity:
read1(+) near the 3' terminus
read1(-) near the 5' terminus
read2(-) near the 5' terminus
read2(+) near the 3' terminus
so...
any read(+) near the 3', or
any read(-) near the 5'
'''
nearEnd = 1000 # this constant determines "near the end"
with self.doOpen() as fileH:
for line in fileH:
pslList = line.split()
if len(pslList) < 17:
continue
tSize = int(pslList[14])
tStart = int(pslList[15])
strand = str(pslList[8])
if strand.startswith('+') and (tSize - tStart > nearEnd):
continue
elif strand.startswith('-') and (tStart > nearEnd):
continue
yield line
def readPSLpairs (self):
read1 = []
read2 = []
for psl in self.readPSL():
parsed_psl = psl.split()
strand = parsed_psl[9][-1]
if strand == '1':
read1.append(parsed_psl)
elif strand == '2':
read2.append(parsed_psl)
output = {}
for psl1 in read1:
name1 = psl1[9][:-1]
contig1 = psl1[13]
for psl2 in read2:
name2 = psl2[9][:-1]
contig2 = psl2[13]
if name1 == name2 and contig1 == contig2:
try:
output[contig1] += 1
break
except:
output[contig1] = 1
break
print(output)
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
for leftItem in a[1]:
for rightItem in a[2]:
if leftItem[1] is rightItem[1]:
print (a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]
for name, instrument, num in thisStream:
if name != thisGroup:
doSomethingPairwise(thisGroupList)
thisGroup = name
thisGroupList = [ [], [], [] ]
thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
I hope help you, since, the question needs a best input example file
#is better create PSLRecord class
class PSLRecord:
def __init__(self, line):
pslList = line.split()
properties = ("matches", "misMatches", "repMatches", "nCount",
"qNumInsert", "qBaseInsert", "tNumInsert",
"tBaseInsert", "strand", "qName", "qSize", "qStart",
"qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
"blockSizes", "qStarts", "tStarts")
self.__dict__.update(dict(zip(properties, pslList)))
class PSLreader :
def __init__ (self, fname=''):
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
with self.doOpen() as fileH:
for line in fileH:
pslrc = PSLRecord(line)
yield pslrc
#return a dictionary with all psl records group by qName and tName
def readPSLpairs (self):
dictpsl = {}
for pslrc in self.readPSL():
#OP requirement, remove '1' or '2' char, in pslrc.qName[:-1]
key = (pslrc.qName[:-1], pslrc.tName)
if not key in dictpsl:
dictpsl[key] = []
dictpsl[key].append(pslrc)
return dictpsl
#Function filter .... is better out and self-contained
def f_filter(pslrec, nearEnd = 1000):
if (pslrec.strand.startswith('+') and
(int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
return False
if (pslrec.strand.startswith('-') and
(int(pslrec.tStart) > nearEnd)):
return False
return True
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
#read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()
from itertools import product
#product from itertools
#(1) x (2,3) = (1,2),(1,3)
output = {}
for key, v in dictpsl.items():
name, contig = key
#i get filters aligns in principal strand
strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '1']
#i get filters aligns in secondary strand
strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '2']
for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
#This For has fewer comparisons, since I was grouped before
if not contig in output:
output[contig] = 1
output[contig] += 1
Note: 10-30 Mb isn't large file, if you ask me

load parameters from a file in Python

I am writing a Python class to model a process and I want to initialized the parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword?
def load(self,filename="input.dat"):
FILE = open(filename)
s = FILE.readline().split()
if len(s) is 3:
self.z0 = [float(s[1]),float(s[2])] # initial state
s = FILE.readline().split()
if len(s) is 2:
self.k = float(s[1]) # kappa
s = FILE.readline().split()
if len(s) is 2:
self.g = float(s[1])
s = FILE.readline().split()
if len(s) is 2:
self.D = float(s[1]) # Delta
s = FILE.readline().split()
if len(s) is 2:
self.T = float(s[1]) # end time
Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.
If you are open to some other kind of file where you can keep your parameters, I would suggest you to use a YAML file.
The Python library is PyYAML. This is how you can easily use it with Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is you can read the parameter values as lists or maps.
You would love it!
Try the following:
def load(self, filename="input.dat"):
d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
FILE = open(filename)
for line in FILE:
name, value = line.split(":")
value = value.strip()
if " " in value:
value = map(float, value.split())
else:
value = float(value)
setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}
As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read-in. I've tried to make the code as data-driven as possible, so relatively flexible.
# maps label to attribute name and types
label_attr_map = {
"Z0:": ["z0", float, float],
"k:": [ "k", float],
"g:": [ "g", float],
"Delta:": [ "D", float],
"t_end:": [ "T", float]
}
class Params(object):
def __init__(self, input_file_name):
with open(input_file_name, 'r') as input_file:
for line in input_file:
row = line.split()
label = row[0]
data = row[1:] # rest of row is data list
attr = label_attr_map[label][0]
datatypes = label_attr_map[label][1:]
values = [(datatypes[i](data[i])) for i in range(len(data))]
self.__dict__[attr] = values if len(values) > 1 else values[0]
params = Params('input.dat')
print 'params.z0:', params.z0
print 'params.k:', params.k
print 'params.g:', params.g
print 'params.D:', params.D
print 'params.T:', params.T
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0
Perhaps this might give you what you need:
def load(self,filename='input.dat'):
with open(filename) as fh:
for line in fh:
s = line.split()
if len(s) == 2:
setattr(self,s[1],s[2])
elif len(s) == 3:
setattr(self,s[1],s[2:])
I also didn't include any error checking, but setattr is very handy.
Something like this:
def load(self,filename="input.dat"):
# maps names to number of fields they need
# only necessary for variables with more than 1 field
argmap = dict(Z0=2)
# maps config file names to their attribute names on the object
# if name is the same both places, no need
namemap = dict(Z0="z0", Delta="D", t_end="T")
with open(filename) as FILE:
for line in FILE:
s = line.split()
var = s[0].rstrip(":")
try:
val = [float(x) for x in s[1:]]
except ValueError:
continue
if len(val) == varmap.get(var, 1):
if len(val) == 1:
val = val[0]
setattr(self, namemap.get(var, var), val)
Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
def __init__(self, path='infile.dat'):
with open(path, 'r') as fo:
for line in fo.readlines():
if len(line) < 2: continue
parts = [s.strip(' :\n') for s in line.split(' ', 1)]
numbers = [float(s) for s in parts[1].split()]
# This is optional... do you want single values to be stored in lists?
if len(numbers) == 1: numbers = numbers[0]
self.__dict__[parts[0]] = numbers
# print parts -- debug
obj = Data('infile.dat')
print obj.g
print obj.Delta
print obj.Z0
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.
Here's another one
def splitstrip(s):
return s.split(':')[1].strip()
with open('input.dat','r') as f:
a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)

Python: File formatting

I have a for loop which references a dictionary and prints out the value associated with the key. Code is below:
for i in data:
if i in dict:
print dict[i],
How would i format the output so a new line is created every 60 characters? and with the character count along the side for example:
0001
MRQLLLISDLDNTWVGDQQALEHLQEYLGDRRGNFYLAYATGRSYHSARELQKQVGLMEP
0061
DYWLTAVGSEIYHPEGLDQHWADYLSEHWQRDILQAIADGFEALKPQSPLEQNPWKISYH
0121 LDPQACPTVIDQLTEMLKETGIPVQVIFSSGKDVDLLPQRSNKGNATQYLQQHLAMEPSQ
It's a finicky formatting problem, but I think the following code:
import sys
class EveryN(object):
def __init__(self, n, outs):
self.n = n # chars/line
self.outs = outs # output stream
self.numo = 1 # next tag to write
self.tll = 0 # tot chars on this line
def write(self, s):
while True:
if self.tll == 0: # start of line: emit tag
self.outs.write('%4.4d ' % self.numo)
self.numo += self.n
# wite up to N chars/line, no more
numw = min(len(s), self.n - self.tll)
self.outs.write(s[:numw])
self.tll += numw
if self.tll >= self.n:
self.tll = 0
self.outs.write('\n')
s = s[numw:]
if not s: break
if __name__ == '__main__':
sys.stdout = EveryN(60, sys.stdout)
for i, a in enumerate('abcdefgh'):
print a*(5+ i*5),
shows how to do it -- the output when running for demonstration purposes as the main script (five a's, ten b's, etc, with spaces in-between) is:
0001 aaaaa bbbbbbbbbb ccccccccccccccc dddddddddddddddddddd eeeeee
0061 eeeeeeeeeeeeeeeeeee ffffffffffffffffffffffffffffff ggggggggg
0121 gggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
0181 hhhhhhh
# test data
data = range(10)
the_dict = dict((i, str(i)*200) for i in range( 10 ))
# your loops as a generator
lines = ( the_dict[i] for i in data if i in the_dict )
def format( line ):
def splitter():
k = 0
while True:
r = line[k:k+60] # take a 60 char block
if r: # if there are any chars left
yield "%04d %s" % (k+1, r) # format them
else:
break
k += 60
return '\n'.join(splitter()) # join all the numbered blocks
for line in lines:
print format(line)
I haven't tested it on actual data, but I believe the code below would do the job. It first builds up the whole string, then outputs it a 60-character line at a time. It uses the three-argument version of range() to count by 60.
s = ''.join(dict[i] for i in data if i in dict)
for i in range(0, len(s), 60):
print '%04d %s' % (i+1, s[i:i+60])
It seems like you're looking for textwrap
The textwrap module provides two convenience functions, wrap() and
fill(), as well as TextWrapper, the class that does all the work, and
a utility function dedent(). If you’re just wrapping or filling one or
two text strings, the convenience functions should be good enough;
otherwise, you should use an instance of TextWrapper for efficiency.

Categories

Resources