Replacing characters in python

Replacing characters in python - python

I wrote a compressor based on the Huffman algorithm to compress text:
# compressor Huffman quijote
from collections import Counter
import math
import pickle
import re
quijote = open("quijote.txt", encoding="utf8")
num_lletres = 0
llistaQ = []
for linia in quijote:
for lletra in linia:
llistaQ.append(lletra)
num_lletres = num_lletres+1
c = Counter(llistaQ)
c_ordenat = c.most_common()
c_final=c_ordenat[::-1]
for i,j in enumerate(c_final):
c_final[i]=list(c_final[i])
diccionari=[]
for i,j in c_final:
diccionari.append([i,None])
while len(c_final) > 1:
petit1=c_final[0]
petit2=c_final[1]
c_final.append([petit1[0]+petit2[0],petit1[1]+petit2[1]])
for i in petit1[0]:
for pos,x in enumerate(diccionari):
if x[0]==i:
val_antic=diccionari[pos][1]
diccionari[pos].pop(1)
if val_antic==None:
diccionari[pos].insert(1,"1")
else:
diccionari[pos].insert(1,"1"+val_antic)
for i in petit2[0]:
for pos,x in enumerate(diccionari):
if x[0]==i:
val_antic=diccionari[pos][1]
diccionari[pos].pop(1)
if val_antic==None:
diccionari[pos].insert(1,"0")
else:
diccionari[pos].insert(1,"0"+val_antic)
del c_final[0]
del c_final[0]
c_final.sort(key = lambda f: f[1])
s=0
entropia=0
llarg=0
llistat=c_ordenat[::-1]
for i,j in llistat:
freq=float(j)/float(num_lletres)
s=freq*math.log(freq,2.0)
entropia=entropia+s
for pos,z in enumerate(diccionari):
if z[0]==i:
llarg=llarg+(len(z[1])*freq)
entropia=-(entropia)
with open("taula_Huffman.txt", "wb") as taula_final:
pickle.dump(diccionari, taula_final)
cadena=''
with open("quijote.txt", encoding="utf8") as entrada, open('Huffman sortida', 'wb') as sortida:
for line in entrada:
for x, y in diccionari:
line = line.replace(x, y)
cadena=cadena+line
cadena=str(1)+cadena
bits=re.findall('........',cadena)
for i in bits:
sortida.write(bytes([int(i,2)]))
ll=len(cadena)
sob=ll%8
a=len(cadena)/8
inta=(int(a))
nc=''
for n in range(1,sob+1):
nc=nc+cadena[(inta*8)+n-1]
penultim=nc+str(1)*(8-sob)
ultim=str('{0:08b}'.format(sob))
sortida.write(bytes([int(penultim,2)]))
sortida.write(bytes([int(ultim,2)]))
print("entropia=",entropia)
print("expected length=",llarg)
quijote.close()
And the decompressor:
#Descompressor Huffman
import binascii
import pickle
with open("Huffman sortida", "rb") as entrada, open('quijote descomprimit.txt','w',encoding='utf-8') as sortida, open("taula_Huffman.txt", "rb") as diccionari:
byte = entrada.read()
hexadecimal = binascii.hexlify(byte).decode()
binary=bin(int(hexadecimal, 16))[2:].zfill(8)
cadena=binary[1:]
ultim=cadena[(len(cadena)-8):(len(cadena))]
cadena=cadena[:(len(cadena)-8)]
penultim=cadena[(len(cadena)-8):(len(cadena))]
cadena=cadena[:(len(cadena)-8)]
ultim=int(ultim,2)
afegit=penultim[:ultim]
cadena=cadena+afegit
dic = pickle.load(diccionari)
dicc={}
for i in dic:
dicc[i[0]] = i[1]
dicci = {v: k for k, v in dicc.items()}
temporal=''
text=''
for i in cadena:
temporal+=i
if temporal in dicci:
text+=dicci[temporal]
temporal=''
sortida.write(text)
The problem is that the decompressed file is fine except for the numbers, I mean, where on the original file were letters in the final file there are the same letters, but where on the original file were numbers in the final file apears a series of 1111 Csicauicau,11dunrdunr dunrdunrdunrdunr 111 1111dunrdunr dunrdunrdunrdunr or whatever.
So I found that the part of the compressor that failed was the replace instruction (line 74-75):
for line in entrada:
for x, y in diccionari:
line = line.replace(x, y)
cadena=cadena+line
I replaced that four lines by the following:
for line in entrada:
for ch in line:
for x, y in diccionari:
if ch==x:
cadena=cadena+y
break
The problem is that this makes the program so slow (~40 seconds to run on my PC). The first option was about 9 seconds on my PC.
Is there a way to do this for inside a for inside a for faster? Or, is there a solution to my .replace(x,y) which fails with the numbers?

Related

How do I read a multi-line list from a file in Python?

I have a file which has lists spanned across multiple lines - with the length of the lists as constant. However, in each line the number of elements can varied.
How do I read this file in Python to read each list as a whole?
Edit: Would prefer a non-regex solution.
The file which looks something like this (just for illustration):
[ -6.70031086e-02 5.93684241e-02 1.11689426e-01 1.16174825e-01
-3.74981388e-02 4.05267589e-02 2.02941950e-02 1.65661901e-01
9.88883078e-02 -1.86108038e-01 -2.09761858e-01 2.08867267e-02
-7.34964982e-02 -1.38626635e-01 1.33853648e-02 -1.11527992e-02
7.19301552e-02 5.71861453e-02 -8.56672525e-02 8.01878721e-02
-2.27990234e-03 8.93531218e-02 -7.99949542e-02 -3.89122330e-02
3.07365637e-02 -1.14912149e-02 -1.25382066e-01 1.61550958e-02
-9.03828740e-02 -8.40659663e-02 2.35458408e-02 6.62269741e-02
-6.83306251e-03 3.86000201e-02 -2.85124127e-02 -1.22550033e-01
6.14493713e-02 5.42194061e-02 -9.98141840e-02 3.87526527e-02
-1.77935660e-02 6.59185136e-03 -7.56490007e-02 -8.04342143e-03
4.22548652e-02 -4.90937680e-02 7.31833130e-02 4.60098870e-02
-3.38455513e-02 7.72312284e-02 1.69506043e-01 8.54071528e-02
-5.15969582e-02 -8.66574422e-02 2.78513003e-02 -8.26551542e-02
5.72918989e-02 -8.63238499e-02 -1.09750973e-02 -1.04178898e-01
4.04170994e-03 7.16830865e-02 1.16529778e-01 1.65875465e-01
1.82720050e-02 1.71985731e-01 -2.09263922e-03 -3.31376195e-02
1.26107544e-01 1.47209521e-02 -1.41869476e-02 5.07163629e-02
1.49011686e-01 9.49593708e-02 4.67912182e-02 -8.64533633e-02
4.12282310e-02 8.19735080e-02 1.49312839e-02 2.14010417e-01
1.43005610e-01 -6.68876693e-02 1.25497788e-01 -8.12855735e-02
1.89039335e-02 -7.57512003e-02 4.25233506e-02 -6.90079033e-02
8.08808357e-02 -3.47024412e-03 2.63141114e-02 1.61882326e-01
1.25483396e-02 1.45484000e-01 3.12147997e-02 5.61049813e-03
-1.52215753e-02 -9.00566354e-02 7.78550655e-02 2.32269196e-03
6.35183901e-02 -1.34039536e-01 1.12368152e-01 -5.65479957e-02
-1.40751451e-01 -3.24242609e-03 -2.60595884e-02 -3.79961394e-02
9.53520015e-02 1.18161231e-01 -6.31203428e-02 6.54687434e-02
-8.70579779e-02 1.64551754e-02 -4.66874018e-02 -2.02252846e-02
1.81142420e-01 -4.29894254e-02 8.62734243e-02 -1.96067482e-01
-5.18136062e-02 -1.02697751e-02 -8.20104256e-02 -7.04407394e-02
-1.37479603e-01 1.51444465e-01 1.46553725e-01 6.87731877e-02]
[ 0.13552369 -0.05061625 0.13381879 -0.09299553 -0.10647763 -0.02260791
0.00843107 0.01909993 0.0252617 -0.09204189 0.11444099 0.16380875
-0.26470438 0.04185624 0.08701419 -0.00960395 0.03196884 0.05695887
0.03903539 0.0330128 0.0088141 0.03981387 -0.2256397 0.1373885
-0.00823926 -0.23756374 0.14071368 0.15679301 0.05020505 0.00083234
0.14197688 -0.17108534 -0.03471961 -0.09328505 0.04228394 0.07565336
-0.06243521 -0.09347741 -0.00821514 -0.06649745 0.05205032 -0.00554045
-0.00386953 0.05514322 -0.0234912 -0.11922046 0.14259741 -0.04250529
0.02933454 0.09837652 -0.04943179 -0.01795183 0.11347186 -0.0262726
0.14694421 0.00120262 0.02876565 0.06762701 -0.06783341 -0.0130248
0.0304249 0.04527348 0.15238339 0.01605285 0.02574495 0.03512112
-0.05733667 -0.09585288 0.05414675 0.14885603 -0.02176115 -0.11798949
0.10624658 0.04126133 0.0355645 -0.0176413 0.01316 -0.0731855
0.06095812 -0.03693416 0.05717857 -0.06640249 0.02760602 -0.11397229
-0.08891453 -0.05422837 -0.00309273 -0.08528782 0.04416328 0.10460843
0.08477673 -0.03460682 0.26425052 0.027636 -0.01395808 -0.04762371
-0.11365297 -0.09291256 0.02920797 0.1462263 -0.1354932 -0.00904074
0.16209167 -0.0351855 0.0287815 0.082674 0.03369482 -0.04522609
0.01189264 -0.03094579 -0.1829372 -0.0331573 0.03074961 -0.01479802
-0.06882931 -0.02879945 0.04064524 0.1048708 0.11631119 -0.13730904
-0.01107442 0.07329052 0.013919 0.02282012 0.14160685 -0.08278389
0.04416744 0.17811519 0.06306098 -0.15048456 -0.08337893 0.06718753
0.02712255 0.0626005 0.05940831 0.08399926 0.22958109 -0.06148282
-0.05348093 -0.05489948 0.18494032 -0.01777483 0.03008986 0.03045709
-0.09592026 0.17701676 -0.21119906 -0.01997624 0.15930974 -0.03315869 ]

import re
p=re.compile(r'\[.*\]', re.S)
num=re.compile(r'\S+')
f=open("lst", "r")
s=f.read()
f.close()
l=p.findall(s)
lst=[]
for i in l:
tmp=[]
num_list=num.findall(i)
del num_list[0]
for n in num_list:
if n!=']':
tmp.append(n)
lst.append(tmp)
print lst
lst is a list of lists read from your file.

You don't need a regex, just strip and split, mapping to float if you want floats:
def sections():
with open("in.txt") as f:
tmp = []
for line in f:
data = list(map(float, line.strip(" []\n").split()))
if line.rstrip().endswith("]"):
yield tmp
tmp = []
tmp.append(data)
from pprint import pprint as pp
pp(list(sections()))
Output:
[[[-0.0670031086, 0.0593684241, 0.111689426, 0.116174825],
[-0.0374981388, 0.0405267589, 0.020294195, 0.165661901],
[0.0988883078, -0.186108038, -0.209761858, 0.0208867267],
[-0.0734964982, -0.138626635, 0.0133853648, -0.0111527992],
[0.0719301552, 0.0571861453, -0.0856672525, 0.0801878721],
[-0.00227990234, 0.0893531218, -0.0799949542, -0.038912233],
[0.0307365637, -0.0114912149, -0.125382066, 0.0161550958],
[-0.090382874, -0.0840659663, 0.0235458408, 0.0662269741],
[-0.00683306251, 0.0386000201, -0.0285124127, -0.122550033],
[0.0614493713, 0.0542194061, -0.099814184, 0.0387526527],
[-0.017793566, 0.00659185136, -0.0756490007, -0.00804342143],
[0.0422548652, -0.049093768, 0.073183313, 0.046009887],
[-0.0338455513, 0.0772312284, 0.169506043, 0.0854071528],
[-0.0515969582, -0.0866574422, 0.0278513003, -0.0826551542],
[0.0572918989, -0.0863238499, -0.0109750973, -0.104178898],
[0.00404170994, 0.0716830865, 0.116529778, 0.165875465],
[0.018272005, 0.171985731, -0.00209263922, -0.0331376195],
[0.126107544, 0.0147209521, -0.0141869476, 0.0507163629],
[0.149011686, 0.0949593708, 0.0467912182, -0.0864533633],
[0.041228231, 0.081973508, 0.0149312839, 0.214010417],
[0.14300561, -0.0668876693, 0.125497788, -0.0812855735],
[0.0189039335, -0.0757512003, 0.0425233506, -0.0690079033],
[0.0808808357, -0.00347024412, 0.0263141114, 0.161882326],
[0.0125483396, 0.145484, 0.0312147997, 0.00561049813],
[-0.0152215753, -0.0900566354, 0.0778550655, 0.00232269196],
[0.0635183901, -0.134039536, 0.112368152, -0.0565479957],
[-0.140751451, -0.00324242609, -0.0260595884, -0.0379961394],
[0.0953520015, 0.118161231, -0.0631203428, 0.0654687434],
[-0.0870579779, 0.0164551754, -0.0466874018, -0.0202252846],
[0.18114242, -0.0429894254, 0.0862734243, -0.196067482],
[-0.0518136062, -0.0102697751, -0.0820104256, -0.0704407394]],
[[-0.137479603, 0.151444465, 0.146553725, 0.0687731877],
[0.13552369,
-0.05061625,
0.13381879,
-0.09299553,
-0.10647763,
-0.02260791],
[0.00843107, 0.01909993, 0.0252617, -0.09204189, 0.11444099, 0.16380875],
[-0.26470438, 0.04185624, 0.08701419, -0.00960395, 0.03196884, 0.05695887],
[0.03903539, 0.0330128, 0.0088141, 0.03981387, -0.2256397, 0.1373885],
[-0.00823926, -0.23756374, 0.14071368, 0.15679301, 0.05020505, 0.00083234],
[0.14197688, -0.17108534, -0.03471961, -0.09328505, 0.04228394, 0.07565336],
[-0.06243521,
-0.09347741,
-0.00821514,
-0.06649745,
0.05205032,
-0.00554045],
[-0.00386953, 0.05514322, -0.0234912, -0.11922046, 0.14259741, -0.04250529],
[0.02933454, 0.09837652, -0.04943179, -0.01795183, 0.11347186, -0.0262726],
[0.14694421, 0.00120262, 0.02876565, 0.06762701, -0.06783341, -0.0130248],
[0.0304249, 0.04527348, 0.15238339, 0.01605285, 0.02574495, 0.03512112],
[-0.05733667,
-0.09585288,
0.05414675,
0.14885603,
-0.02176115,
-0.11798949],
[0.10624658, 0.04126133, 0.0355645, -0.0176413, 0.01316, -0.0731855],
[0.06095812, -0.03693416, 0.05717857, -0.06640249, 0.02760602, -0.11397229],
[-0.08891453,
-0.05422837,
-0.00309273,
-0.08528782,
0.04416328,
0.10460843],
[0.08477673, -0.03460682, 0.26425052, 0.027636, -0.01395808, -0.04762371],
[-0.11365297, -0.09291256, 0.02920797, 0.1462263, -0.1354932, -0.00904074],
[0.16209167, -0.0351855, 0.0287815, 0.082674, 0.03369482, -0.04522609],
[0.01189264, -0.03094579, -0.1829372, -0.0331573, 0.03074961, -0.01479802],
[-0.06882931, -0.02879945, 0.04064524, 0.1048708, 0.11631119, -0.13730904],
[-0.01107442, 0.07329052, 0.013919, 0.02282012, 0.14160685, -0.08278389],
[0.04416744, 0.17811519, 0.06306098, -0.15048456, -0.08337893, 0.06718753],
[0.02712255, 0.0626005, 0.05940831, 0.08399926, 0.22958109, -0.06148282],
[-0.05348093, -0.05489948, 0.18494032, -0.01777483, 0.03008986, 0.03045709]]]
If you are storing the arrays, you might consider using numpy.save or pickle etc.. Storing in the current format is probably not the best idea.

Here is a another solution:
file = open('database.txt', 'r')
text = file.read()
file.close()
## long version
lists = text.split(']')
lists = lists[:-1] # remove last element which is empty (because of split)
lists = [i.strip() for i in lists] # remove possible spaces and tabs
lists = [i.strip('[') for i in lists] # remove '[' that is left on beginning of every element
lists = [i.split() for i in lists] # split every element to get list
lists = [[float(j) for j in i] for i in lists] # convert lists of strings to lists of numbers
print(lists) # result is list of lists
## short version
lists = [[float(j) for j in i.strip().strip('[').split()] for i in text.split(']')[:-1]]
print(lists)

f=open('sample.txt','r')
y=[]
for a in f:
b=a.split()
for c in b:
if c[0]=='[':
d=c[1:]
elif c[-1]==']':
d=c[:-1]
else:
d=c
y.append(d)
f.close()
print y

load parameters from a file in Python

I am writing a Python class to model a process and I want to initialized the parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword?
def load(self,filename="input.dat"):
FILE = open(filename)
s = FILE.readline().split()
if len(s) is 3:
self.z0 = [float(s[1]),float(s[2])] # initial state
s = FILE.readline().split()
if len(s) is 2:
self.k = float(s[1]) # kappa
s = FILE.readline().split()
if len(s) is 2:
self.g = float(s[1])
s = FILE.readline().split()
if len(s) is 2:
self.D = float(s[1]) # Delta
s = FILE.readline().split()
if len(s) is 2:
self.T = float(s[1]) # end time

Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.

If you are open to some other kind of file where you can keep your parameters, I would suggest you to use a YAML file.
The Python library is PyYAML. This is how you can easily use it with Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is you can read the parameter values as lists or maps.
You would love it!

Try the following:
def load(self, filename="input.dat"):
d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
FILE = open(filename)
for line in FILE:
name, value = line.split(":")
value = value.strip()
if " " in value:
value = map(float, value.split())
else:
value = float(value)
setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}

As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read-in. I've tried to make the code as data-driven as possible, so relatively flexible.
# maps label to attribute name and types
label_attr_map = {
"Z0:": ["z0", float, float],
"k:": [ "k", float],
"g:": [ "g", float],
"Delta:": [ "D", float],
"t_end:": [ "T", float]
}
class Params(object):
def __init__(self, input_file_name):
with open(input_file_name, 'r') as input_file:
for line in input_file:
row = line.split()
label = row[0]
data = row[1:] # rest of row is data list
attr = label_attr_map[label][0]
datatypes = label_attr_map[label][1:]
values = [(datatypes[i](data[i])) for i in range(len(data))]
self.__dict__[attr] = values if len(values) > 1 else values[0]
params = Params('input.dat')
print 'params.z0:', params.z0
print 'params.k:', params.k
print 'params.g:', params.g
print 'params.D:', params.D
print 'params.T:', params.T
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0

Perhaps this might give you what you need:
def load(self,filename='input.dat'):
with open(filename) as fh:
for line in fh:
s = line.split()
if len(s) == 2:
setattr(self,s[1],s[2])
elif len(s) == 3:
setattr(self,s[1],s[2:])
I also didn't include any error checking, but setattr is very handy.

Something like this:
def load(self,filename="input.dat"):
# maps names to number of fields they need
# only necessary for variables with more than 1 field
argmap = dict(Z0=2)
# maps config file names to their attribute names on the object
# if name is the same both places, no need
namemap = dict(Z0="z0", Delta="D", t_end="T")
with open(filename) as FILE:
for line in FILE:
s = line.split()
var = s[0].rstrip(":")
try:
val = [float(x) for x in s[1:]]
except ValueError:
continue
if len(val) == varmap.get(var, 1):
if len(val) == 1:
val = val[0]
setattr(self, namemap.get(var, var), val)

Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
def __init__(self, path='infile.dat'):
with open(path, 'r') as fo:
for line in fo.readlines():
if len(line) < 2: continue
parts = [s.strip(' :\n') for s in line.split(' ', 1)]
numbers = [float(s) for s in parts[1].split()]
# This is optional... do you want single values to be stored in lists?
if len(numbers) == 1: numbers = numbers[0]
self.__dict__[parts[0]] = numbers
# print parts -- debug
obj = Data('infile.dat')
print obj.g
print obj.Delta
print obj.Z0
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.

Here's another one
def splitstrip(s):
return s.split(':')[1].strip()
with open('input.dat','r') as f:
a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)

changing the contents of a file applying different conditions

I am trying to do some changes in the contents of an input file. The input file I have looks like the following:
18800000 20400000 pau
20400000 21300000 aa
21300000 22500000 p
22500000 23200000 l
23200000 24000000 ay
24000000 25000000 k
25000000 26500000 pau
This file is a transcription of an audio file. The first number denotes the start time and the next one denotes the end time. Then the alphabets denote the sound.
The change I have to make is, there are a few sounds which is made of two different sounds ie there are some diphthongs too. So these diphthongs have to be split into the two sounds. In the example above the diphthong is 'ay'. It is made of 'ao' and 'ih'.
What happens here is, the duration of 'ay' which is 24000000 - 232000000 = 8 is distributed into these two sounds. The result will be,
23200000 24000000 ay
changes to
23200000 236000000 ao
23600000 240000000 ih
I have attempted to write a pseudo code which looks rubbish.
def test(transcriptionFile) :
with open("transcriptions.txt", "r+") as tFile :
for line in tFile :
if 3rd_item = ay
duration = (2nd_item[1] - 1st_item[2]) / 2
delete the line
tFile.write(1st_item, 1st_item + d, ao)
tfile.write(1st_item + d, 1st_item, ih) # next line
if__name__ == "__main__" :
test("transcriptions.txt")
Thank you.
With the suggestions I was given I changed the code to the following. It is still not correct.
def test(transcriptionFile) :
with open("transcriptions.txt", "r") as tFile :
inp = tFile.readlines()
outp = []
for ln in inp :
start, end, sound = ln.strip()
if sound == ay :
duration = (end - start) / 2
ln.delete
start = start
end = start + duration
sound = ao
outp.append(ln)
start = start + duration # next line
end = start
sound = ih
outp.append(ln)
with open("transcriptions.txt", "w") as tFile:
tFile.writelines(outp)
__name__ == "__main__"
test("transcriptions.txt")

Editing a text file in-place is pretty hard. Your best options are:
Write the program as a Unix filter, i.e. produce the new file on sys.stdout and put it in place with external tools
Read in the whole file, then construct the new file in memory and write it out.
A program following the second line of thought would look like:
# read transcriptions.txt into a list of lines
with open("transcriptions.txt", "r") as tFile:
inp = tFile.readlines()
# do processing and build a new list of lines
outp = []
for ln in inp:
if not to_be_deleted(ln):
outp.append(transform(ln))
# now overwrite transcriptions.txt
with open("transcriptions.txt", "w") as tFile:
tFile.writelines(outp)
It would be even better if you'd write the processing bit as a list comprehension:
outp = [transform(ln) for ln in inp
if not to_be_deleted(ln)]

The following script should do what you want:
import sys
def main(src, dest):
with open(dest, 'w') as output:
with open(src) as source:
for line in source:
try:
start, end, sound = line.split()
except ValueError:
continue
if sound == 'ay':
start = int(start)
end = int(end)
offset = (end - start) // 2
output.write('%s %s ao\n' % (start, start + offset))
output.write('%s %s ih\n' % (start + offset, end))
else:
output.write(line)
if __name__ == "__main__":
main(*sys.argv[1:])
Output:
18800000 20400000 pau
20400000 21300000 aa
21300000 22500000 p
22500000 23200000 l
23200000 23600000 ao
23600000 24000000 ih
24000000 25000000 k
25000000 26500000 pau

How to merge only the unique lines from file_a to file_b?

This question has been asked here in one form or another but not quite the thing I'm looking for. So, this is the situation I shall be having: I already have one file, named file_a and I'm creating another file - file_b. file_a is always bigger than file_b in size. There will be a number of duplicate lines in file_b (hence, in file_a as well) but both the files will have some unique lines. What I want to do is: to copy/merge only the unique lines from file_a to file_b and then sort the line order, so that the file_b becomes the most up-to-date one with all the unique entries. Either of the original files shouldn't be more than 10MB in size. What's the most efficient (and fastest) way I can do that?
I was thinking something like that, which does the merging alright.
#!/usr/bin/env python
import os, time, sys
# Convert Date/time to epoch
def toEpoch(dt):
dt_ptrn = '%d/%m/%y %H:%M:%S'
return int(time.mktime(time.strptime(dt, dt_ptrn)))
# input files
o_file = "file_a"
c_file = "file_b"
n_file = [o_file,c_file]
m_file = "merged.file"
for x in range(len(n_file)):
P = open(n_file[x],"r")
output = P.readlines()
P.close()
# Sort the output, order by 2nd last field
#sp_lines = [ line.split('\t') for line in output ]
#sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )
F = open(m_file,'w')
#for line in sp_lines:
for line in output:
if "group_" in line:
F.write(line)
F.close()
But, it's:
not with only the unique lines
not sorted (by next to last field)
and introduces the 3rd file i.e. m_file
Just a side note (long story short): I can't use sorted() here as I'm using v2.3, unfortunately. The input files look like this:
On 23/03/11 00:40:03
JobID Group.User Ctime Wtime Status QDate CDate
===================================================================================
430792 group_atlas.pltatl16 0 32 4 02/03/11 21:52:38 02/03/11 22:02:15
430793 group_atlas.atlas084 30 472 4 02/03/11 21:57:43 02/03/11 22:09:35
430794 group_atlas.atlas084 12 181 4 02/03/11 22:02:37 02/03/11 22:05:42
430796 group_atlas.atlas084 8 185 4 02/03/11 22:02:38 02/03/11 22:05:46
I tried to use cmp() to sort by the 2nd last field but, I think, it doesn't work just because of the first 3 lines of the input files.
Can anyone please help? Cheers!!!
Update 1:
For the future reference, as suggested by Jakob, here is the complete script. It worked just fine.
#!/usr/bin/env python
import os, time, sys
from sets import Set as set
def toEpoch(dt):
dt_ptrn = '%d/%m/%y %H:%M:%S'
return int(time.mktime(time.strptime(dt, dt_ptrn)))
def yield_lines(fileobj):
#I want to discard the headers
for i in xrange(3):
fileobj.readline()
#
for line in fileobj:
yield line
def app(path1, path2):
file1 = set(yield_lines(open(path1)))
file2 = set(yield_lines(open(path2)))
return file1.union(file2)
# Input files
o_file = "testScript/03"
c_file = "03.bak"
m_file = "finished.file"
print time.strftime('%H:%M:%S', time.localtime())
# Sorting the output, order by 2nd last field
sp_lines = [ line.split('\t') for line in app(o_file, c_file) ]
sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )
F = open(m_file,'w')
print "No. of lines: ",len(sp_lines)
for line in sp_lines:
MF = '\t'.join(line)
F.write(MF)
F.close()
It took about 2m:47s to finish for 145244 lines.
[testac1#serv07 ~]$ ./uniq-merge.py
17:19:21
No. of lines: 145244
17:22:08
thanks!!
Update 2:
Hi eyquem, this is the Error message I get when I run your script(s).
From the first script:
[testac1#serv07 ~]$ ./uniq-merge_2.py
File "./uniq-merge_2.py", line 44
fm.writelines( '\n'.join(v)+'\n' for k,v in output )
^
SyntaxError: invalid syntax
From the second script:
[testac1#serv07 ~]$ ./uniq-merge_3.py
File "./uniq-merge_3.py", line 24
output = sett(line.rstrip() for line in fa)
^
SyntaxError: invalid syntax
Cheers!!
Update 3:
The previous one wasn't sorting the list at all. Thanks to eyquem to pointing that out. Well, it does now. This is a further modification to Jakob's version - I converted the set:app(path1, path2) to a list:myList() and then applied the sort( lambda ... ) to the myList to sort the merged file by the nest to last field. This is the final script.
#!/usr/bin/env python
import os, time, sys
from sets import Set as set
def toEpoch(dt):
# Convert date/time to epoch
dt_ptrn = '%d/%m/%y %H:%M:%S'
return int(time.mktime(time.strptime(dt, dt_ptrn)))
def yield_lines(fileobj):
# Discard the headers (1st 3 lines)
for i in xrange(3):
fileobj.readline()
for line in fileobj:
yield line
def app(path1, path2):
# Remove duplicate lines
file1 = set(yield_lines(open(path1)))
file2 = set(yield_lines(open(path2)))
return file1.union(file2)
print time.strftime('%H:%M:%S', time.localtime())
# I/O files
o_file = "testScript/03"
c_file = "03.bak"
m_file = "finished.file"
# Convert set into to list
myList = list(app(o_file, c_file))
# Sort the list by the date
sp_lines = [ line.split('\t') for line in myList ]
sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )
F = open(m_file,'w')
print "No. of lines: ",len(sp_lines)
# Finally write to the outFile
for line in sp_lines:
MF = '\t'.join(line)
F.write(MF)
F.close()
There is no speed boost at all, it took 2m:50s to process the same 145244 lines. Is anyone see any scope of improvement, please let me know. Thanks to Jakob and eyquem for their time. Cheers!!
Update 4:
Just for future reference, this is a modified version of eyguem, which works much better and faster then the previous ones.
#!/usr/bin/env python
import os, sys, re
from sets import Set as sett
from time import mktime, strptime, strftime
def sorting_merge(o_file, c_file, m_file ):
# RegEx for Date/time filed
pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d')
def kl(lines,pat = pat):
# match only the next to last field
line = lines.split('\t')
line = line[-2]
return mktime(strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
output = sett()
head = []
# Separate the header & remove the duplicates
def rmHead(f_n):
f_n.readline()
for line1 in f_n:
if pat.search(line1): break
else: head.append(line1) # line of the header
for line in f_n:
output.add(line.rstrip())
output.add(line1.rstrip())
f_n.close()
fa = open(o_file, 'r')
rmHead(fa)
fb = open(c_file, 'r')
rmHead(fb)
# Sorting date-wise
output = [ (kl(line),line.rstrip()) for line in output if line.rstrip() ]
output.sort()
fm = open(m_file,'w')
# Write to the file & add the header
fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head[0]+head[1])))
for t,line in output:
fm.write(line + '\n')
fm.close()
c_f = "03_a"
o_f = "03_b"
sorting_merge(o_f, c_f, 'outfile.txt')
This version is much faster - 6.99 sec. for 145244 lines compare to the 2m:47s - then the previous one using lambda a, b: cmp(). Thanks to eyquem for all his support. Cheers!!

EDIT 2
My previous codes have problems with output = sett(line.rstrip() for line in fa) and output.sort(key=kl)
Moreover, they have some complications.
So I examined the choice of reading the files directly with a set() function taken by Jakob Bowyer in his code.
Congratulations Jakob ! (and Michal Chruszcz by the way) : set() is unbeatable, it's faster than a reading one line at a time.
Then , I abandonned my idea to read the files line after line.
.
But I kept my idea to avoid a sorting with the help of cmp() function because, as it is described in the doc:
s.sort([cmpfunc=None])
The sort() method takes an optional
argument specifying a comparison
function of two arguments (list items)
(...) Note that this slows the sorting
process down considerably
http://docs.python.org/release/2.3/lib/typesseq-mutable.html
Then, I managed to obtain a list of tuples (t,line) in which the t is
time.mktime(time.strptime(( 1st date-and-hour in line ,'%d/%m/%y %H:%M:%S'))
by the instruction
output = [ (kl(line),line.rstrip()) for line in output]
.
I tested 2 codes. The following one in which 1st date-and-hour in line is computed thanks to a regex:
def kl(line,pat = pat):
return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
output = [ (kl(line),line.rstrip()) for line in output if line.rstrip()]
output.sort()
And a second code in which kl() is:
def kl(line,pat = pat):
return time.mktime(time.strptime(line.split('\t')[-2],'%d/%m/%y %H:%M:%S'))
.
The results are
Times of execution:
0.03598 seconds for the first code with regex
0.03580 seconds for the second code with split('\t')
that is to say the same
This algorithm is faster than a code using a function cmp() :
a code in which the set of lines output isn't transformed in a list of tuples by
output = [ (kl(line),line.rstrip()) for line in output]
but is only transformed in a list of the lines (without duplicates, then) and sorted with a function mycmp() (see the doc):
def mycmp(a,b):
return cmp(time.mktime(time.strptime(a.split('\t')[-2],'%d/%m/%y %H:%M:%S')),
time.mktime(time.strptime(b.split('\t')[-2],'%d/%m/%y %H:%M:%S')))
output = [ line.rstrip() for line in output] # not list(output) , to avoid the problem of newline of the last line of each file
output.sort(mycmp)
for line in output:
fm.write(line+'\n')
has an execution time of
0.11574 seconds
.
The code:
#!/usr/bin/env python
import os, time, sys, re
from sets import Set as sett
def sorting_merge(o_file , c_file, m_file ):
pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
'(?=[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
def kl(line,pat = pat):
return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
output = sett()
head = []
fa = open(o_file)
fa.readline() # first line is skipped
while True:
line1 = fa.readline()
mat1 = pat.search(line1)
if not mat1: head.append(line1) # line1 is here a line of the header
else: break # the loop ends on the first line1 not being a line of the heading
output = sett( fa )
fa.close()
fb = open(c_file)
while True:
line1 = fb.readline()
if pat.search(line1): break
output = output.union(sett( fb ))
fb.close()
output = [ (kl(line),line.rstrip()) for line in output]
output.sort()
fm = open(m_file,'w')
fm.write(time.strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line + '\n')
fm.close()
te = time.clock()
sorting_merge('ytre.txt','tataye.txt','merged.file.txt')
print time.clock()-te
This time, I hope it will run correctly, and that the only thing to do is to wait the times of execution on real files much bigger than the ones on which I tested the codes
.
EDIT 3
pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
'(?=[ \t]+'
'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
'|'
'[ \t]+aborted/deleted)')
.
EDIT 4
#!/usr/bin/env python
import os, time, sys, re
from sets import Set
def sorting_merge(o_file , c_file, m_file ):
pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
'(?=[ \t]+'
'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
'|'
'[ \t]+aborted/deleted)')
def kl(line,pat = pat):
return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
head = []
output = Set()
fa = open(o_file)
fa.readline() # first line is skipped
for line1 in fa:
if pat.search(line1): break # first line after the heading
else: head.append(line1) # line of the header
for line in fa:
output.add(line.rstrip())
output.add(line1.rstrip())
fa.close()
fb = open(c_file)
for line1 in fb:
if pat.search(line1): break
for line in fb:
output.add(line.rstrip())
output.add(line1.rstrip())
fb.close()
if '' in output: output.remove('')
output = [ (kl(line),line) for line in output]
output.sort()
fm = open(m_file,'w')
fm.write(time.strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line+'\n')
fm.close()
te = time.clock()
sorting_merge('A.txt','B.txt','C.txt')
print time.clock()-te

Maybe something along these lines?
from sets import Set as set
def yield_lines(fileobj):
#I want to discard the headers
for i in xrange(3):
fileobj.readline()
for line in fileobj:
yield line
def app(path1, path2):
file1 = set(yield_lines(open(path1)))
file2 = set(yield_lines(open(path2)))
return file1.union(file2)
EDIT: Forgot about with :$

I wrote this new code, with the ease of using a set. It is faster that my previous code. And, it seems, than your code
#!/usr/bin/env python
import os, time, sys, re
from sets import Set as sett
def sorting_merge(o_file , c_file, m_file ):
# Convert Date/time to epoch
def toEpoch(dt):
dt_ptrn = '%d/%m/%y %H:%M:%S'
return int(time.mktime(time.strptime(dt, dt_ptrn)))
pat = re.compile('([0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)'
'[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d')
fa = open(o_file)
head = []
fa.readline()
while True:
line1 = fa.readline()
mat1 = pat.search(line1)
if not mat1:
head.append(('',line1.rstrip()))
else:
break
output = sett((toEpoch(pat.search(line).group(1)) , line.rstrip())
for line in fa)
output.add((toEpoch(mat1.group(1)) , line1.rstrip()))
fa.close()
fb = open(c_file)
while True:
line1 = fb.readline()
mat1 = pat.search(line1)
if mat1: break
for line in fb:
output.add((toEpoch(pat.search(line).group(1)) , line.rstrip()))
output.add((toEpoch(mat1.group(1)) , line1.rstrip()))
fb.close()
output = list(output)
output.sort()
output[0:0] = head
output[0:0] = [('',time.strftime('On %d/%m/%y %H:%M:%S'))]
fm = open(m_file,'w')
fm.writelines( line+'\n' for t,line in output)
fm.close()
te = time.clock()
sorting_merge('ytr.txt','tatay.txt','merged.file.txt')
print time.clock()-te
Note that this code put a heading in the merged file
.
EDIT
Aaaaaah... I got it... :-))
Execution's time divided by 3 !
#!/usr/bin/env python
import os, time, sys, re
from sets import Set as sett
def sorting_merge(o_file , c_file, m_file ):
pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
'(?=[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
def kl(line,pat = pat):
return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
fa = open(o_file)
head = []
fa.readline()
while True:
line1 = fa.readline()
mat1 = pat.search(line1)
if not mat1:
head.append(line1.rstrip())
else:
break
output = sett(line.rstrip() for line in fa)
output.add(line1.rstrip())
fa.close()
fb = open(c_file)
while True:
line1 = fb.readline()
mat1 = pat.search(line1)
if mat1: break
for line in fb:
output.add(line.rstrip())
output.add(line1.rstrip())
fb.close()
output = list(output)
output.sort(key=kl)
output[0:0] = [time.strftime('On %d/%m/%y %H:%M:%S')] + head
fm = open(m_file,'w')
fm.writelines( line+'\n' for line in output)
fm.close()
te = time.clock()
sorting_merge('ytre.txt','tataye.txt','merged.file.txt')
print time.clock()-te

Last codes, I hope.
Because I found a killer code.
First , I created two files "xxA.txt" and "yyB.txt" of 30 lines having 30000 lines as
430559 group_atlas.atlas084 12 181 4 04/03/10 01:38:02 02/03/11 22:05:42
430502 group_atlas.atlas084 12 181 4 23/01/10 21:45:05 02/03/11 22:05:42
430544 group_atlas.atlas084 12 181 4 17/06/11 12:58:10 02/03/11 22:05:42
430566 group_atlas.atlas084 12 181 4 25/03/10 23:55:22 02/03/11 22:05:42
with the following code:
create AB.py
from random import choice
n = tuple( str(x) for x in xrange(500,600))
days = ('01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16',
'17','18','19','20','21','22','23','24','25','26','27','28')
# not '29','30,'31' to avoid problems with strptime() on last days of february
months = days[0:12]
hours = days[0:23]
ms = ['00','01','02','03','04','05','06','07','09'] + [str(x) for x in xrange(10,60)]
repeat = 30000
with open('xxA.txt','w') as f:
# 430794 group_atlas.atlas084 12 181 4 02/03/11 22:02:37 02/03/11 22:05:42
ch = ('On 23/03/11 00:40:03\n'
'JobID Group.User Ctime Wtime Status QDate CDate\n'
'===================================================================================\n')
f.write(ch)
for i in xrange(repeat):
line = '430%s group_atlas.atlas084 12 181 4 \t%s/%s/%s %s:%s:%s\t02/03/11 22:05:42\n' %\
(choice(n),
choice(days),choice(months),choice(('10','11')),
choice(hours),choice(ms),choice(ms))
f.write(line)
with open('yyB.txt','w') as f:
# 430794 group_atlas.atlas084 12 181 4 02/03/11 22:02:37 02/03/11 22:05:42
ch = ('On 25/03/11 13:45:24\n'
'JobID Group.User Ctime Wtime Status QDate CDate\n'
'===================================================================================\n')
f.write(ch)
for i in xrange(repeat):
line = '430%s group_atlas.atlas084 12 181 4 \t%s/%s/%s %s:%s:%s\t02/03/11 22:05:42\n' %\
(choice(n),
choice(days),choice(months),choice(('10','11')),
choice(hours),choice(ms),choice(ms))
f.write(line)
with open('xxA.txt') as g:
print 'readlines of xxA.txt :',len(g.readlines())
g.seek(0,0)
print 'set of xxA.txt :',len(set(g))
with open('yyB.txt') as g:
print 'readlines of yyB.txt :',len(g.readlines())
g.seek(0,0)
print 'set of yyB.txt :',len(set(g))
Then I ran these 3 programs:
"merging regex.py"
#!/usr/bin/env python
from time import clock,mktime,strptime,strftime
from sets import Set
import re
infunc = []
def sorting_merge(o_file, c_file, m_file ):
infunc.append(clock()) #infunc[0]
pat = re.compile('([0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
output = Set()
def rmHead(filename, a_set):
f_n = open(filename, 'r')
f_n.readline()
head = []
for line in f_n:
head.append(line) # line of the header
if line.strip('= \r\n')=='': break
for line in f_n:
a_set.add(line.rstrip())
f_n.close()
return head
infunc.append(clock()) #infunc[1]
head = rmHead(o_file, output)
infunc.append(clock()) #infunc[2]
head = rmHead(c_file, output)
infunc.append(clock()) #infunc[3]
if '' in output: output.remove('')
infunc.append(clock()) #infunc[4]
output = [ (mktime(strptime(pat.search(line).group(),'%d/%m/%y %H:%M:%S')),line)
for line in output ]
infunc.append(clock()) #infunc[5]
output.sort()
infunc.append(clock()) #infunc[6]
fm = open(m_file,'w')
fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line + '\n')
fm.close()
infunc.append(clock()) #infunc[7]
c_f = "xxA.txt"
o_f = "yyB.txt"
t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergedr.txt')
t2 = clock()
print 'merging regex'
print 'total time of execution :',t2-t1
print ' launching :',infunc[1] - t1
print ' preparation :',infunc[1] - infunc[0]
print ' reading of 1st file :',infunc[2] - infunc[1]
print ' reading of 2nd file :',infunc[3] - infunc[2]
print ' output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print ' sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]
"merging split.py"
#!/usr/bin/env python
from time import clock,mktime,strptime,strftime
from sets import Set
infunc = []
def sorting_merge(o_file, c_file, m_file ):
infunc.append(clock()) #infunc[0]
output = Set()
def rmHead(filename, a_set):
f_n = open(filename, 'r')
f_n.readline()
head = []
for line in f_n:
head.append(line) # line of the header
if line.strip('= \r\n')=='': break
for line in f_n:
a_set.add(line.rstrip())
f_n.close()
return head
infunc.append(clock()) #infunc[1]
head = rmHead(o_file, output)
infunc.append(clock()) #infunc[2]
head = rmHead(c_file, output)
infunc.append(clock()) #infunc[3]
if '' in output: output.remove('')
infunc.append(clock()) #infunc[4]
output = [ (mktime(strptime(line.split('\t')[-2],'%d/%m/%y %H:%M:%S')),line)
for line in output ]
infunc.append(clock()) #infunc[5]
output.sort()
infunc.append(clock()) #infunc[6]
fm = open(m_file,'w')
fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line + '\n')
fm.close()
infunc.append(clock()) #infunc[7]
c_f = "xxA.txt"
o_f = "yyB.txt"
t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergeds.txt')
t2 = clock()
print 'merging split'
print 'total time of execution :',t2-t1
print ' launching :',infunc[1] - t1
print ' preparation :',infunc[1] - infunc[0]
print ' reading of 1st file :',infunc[2] - infunc[1]
print ' reading of 2nd file :',infunc[3] - infunc[2]
print ' output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print ' sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]
"merging killer"
#!/usr/bin/env python
from time import clock,strftime
from sets import Set
import re
infunc = []
def sorting_merge(o_file, c_file, m_file ):
infunc.append(clock()) #infunc[0]
patk = re.compile('([0123]\d)/([01]\d)/(\d{2}) ([012]\d:[0-6]\d:[0-6]\d)')
output = Set()
def rmHead(filename, a_set):
f_n = open(filename, 'r')
f_n.readline()
head = []
for line in f_n:
head.append(line) # line of the header
if line.strip('= \r\n')=='': break
for line in f_n:
a_set.add(line.rstrip())
f_n.close()
return head
infunc.append(clock()) #infunc[1]
head = rmHead(o_file, output)
infunc.append(clock()) #infunc[2]
head = rmHead(c_file, output)
infunc.append(clock()) #infunc[3]
if '' in output: output.remove('')
infunc.append(clock()) #infunc[4]
output = [ (patk.search(line).group(3,2,1,4),line)for line in output ]
infunc.append(clock()) #infunc[5]
output.sort()
infunc.append(clock()) #infunc[6]
fm = open(m_file,'w')
fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line + '\n')
fm.close()
infunc.append(clock()) #infunc[7]
c_f = "xxA.txt"
o_f = "yyB.txt"
t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergedk.txt')
t2 = clock()
print 'merging killer'
print 'total time of execution :',t2-t1
print ' launching :',infunc[1] - t1
print ' preparation :',infunc[1] - infunc[0]
print ' reading of 1st file :',infunc[2] - infunc[1]
print ' reading of 2nd file :',infunc[3] - infunc[2]
print ' output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print ' sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]
results
merging regex
total time of execution : 14.2816595405
launching : 0.00169211450059
preparation : 0.00168093989599
reading of 1st file : 0.163582242995
reading of 2nd file : 0.141301478261
output.remove('') : 2.37460347614e-05
creation of output : 13.4460212122
sorting of output : 0.216363532237
writing of merging file : 0.232923737514
closing of the function : 0.0797514767938
merging split
total time of execution : 13.7824474898
launching : 4.10666718815e-05
preparation : 2.70984161395e-05
reading of 1st file : 0.154349784679
reading of 2nd file : 0.136050810927
output.remove('') : 2.06730184981e-05
creation of output : 12.9691854691
sorting of output : 0.218704332534
writing of merging file : 0.225259076223
closing of the function : 0.0788362766776
merging killer
total time of execution : 2.14315311024
launching : 0.00206199391263
preparation : 0.00205026057781
reading of 1st file : 0.158711791582
reading of 2nd file : 0.138976601775
output.remove('') : 2.37460347614e-05
creation of output : 0.621466415424
sorting of output : 0.823161602941
writing of merging file : 0.227701565422
closing of the function : 0.171049393149
During killer program, sorting output takes 4 times longer , but time of creation of output as a list is divided by 21 !
Then globaly, the execution's time is reduced at least by 85 %.

Python: File formatting

I have a for loop which references a dictionary and prints out the value associated with the key. Code is below:
for i in data:
if i in dict:
print dict[i],
How would i format the output so a new line is created every 60 characters? and with the character count along the side for example:
0001
MRQLLLISDLDNTWVGDQQALEHLQEYLGDRRGNFYLAYATGRSYHSARELQKQVGLMEP
0061
DYWLTAVGSEIYHPEGLDQHWADYLSEHWQRDILQAIADGFEALKPQSPLEQNPWKISYH
0121 LDPQACPTVIDQLTEMLKETGIPVQVIFSSGKDVDLLPQRSNKGNATQYLQQHLAMEPSQ

It's a finicky formatting problem, but I think the following code:
import sys
class EveryN(object):
def __init__(self, n, outs):
self.n = n # chars/line
self.outs = outs # output stream
self.numo = 1 # next tag to write
self.tll = 0 # tot chars on this line
def write(self, s):
while True:
if self.tll == 0: # start of line: emit tag
self.outs.write('%4.4d ' % self.numo)
self.numo += self.n
# wite up to N chars/line, no more
numw = min(len(s), self.n - self.tll)
self.outs.write(s[:numw])
self.tll += numw
if self.tll >= self.n:
self.tll = 0
self.outs.write('\n')
s = s[numw:]
if not s: break
if __name__ == '__main__':
sys.stdout = EveryN(60, sys.stdout)
for i, a in enumerate('abcdefgh'):
print a*(5+ i*5),
shows how to do it -- the output when running for demonstration purposes as the main script (five a's, ten b's, etc, with spaces in-between) is:
0001 aaaaa bbbbbbbbbb ccccccccccccccc dddddddddddddddddddd eeeeee
0061 eeeeeeeeeeeeeeeeeee ffffffffffffffffffffffffffffff ggggggggg
0121 gggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
0181 hhhhhhh

# test data
data = range(10)
the_dict = dict((i, str(i)*200) for i in range( 10 ))
# your loops as a generator
lines = ( the_dict[i] for i in data if i in the_dict )
def format( line ):
def splitter():
k = 0
while True:
r = line[k:k+60] # take a 60 char block
if r: # if there are any chars left
yield "%04d %s" % (k+1, r) # format them
else:
break
k += 60
return '\n'.join(splitter()) # join all the numbered blocks
for line in lines:
print format(line)

I haven't tested it on actual data, but I believe the code below would do the job. It first builds up the whole string, then outputs it a 60-character line at a time. It uses the three-argument version of range() to count by 60.
s = ''.join(dict[i] for i in data if i in dict)
for i in range(0, len(s), 60):
print '%04d %s' % (i+1, s[i:i+60])

It seems like you're looking for textwrap
The textwrap module provides two convenience functions, wrap() and
fill(), as well as TextWrapper, the class that does all the work, and
a utility function dedent(). If you’re just wrapping or filling one or
two text strings, the convenience functions should be good enough;
otherwise, you should use an instance of TextWrapper for efficiency.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Replacing characters in python - python

Related

How do I read a multi-line list from a file in Python?

load parameters from a file in Python

changing the contents of a file applying different conditions

How to merge only the unique lines from file_a to file_b?

Python: File formatting

Categories

Resources