I don't know why the length of the string is 0 - Python

The following is my code:
import os
import math

filenames2 = ['BROWN1_L1.txt', 'BROWN1_M1.txt', 'BROWN1_N1.txt', 'BROWN1_P1.txt', 'BROWN1_R1.txt']
with open("C:/Python27/L1_R1_TRAINING.txt", 'w') as outfile:
    for fname in filenames2:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)
b = open("C:/Python27/L1_R1_TRAINING.txt", 'rU')

filenames3 = []
for path, dirs, files in os.walk("C:/Python27/Reutertest"):
    for file in files:
        file = os.path.join(path, file)
        filenames3.append(file)
with open("C:/Python27/REUTER.txt", 'w') as outfile:
    for fname in filenames3:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)
c = open("C:/Python27/REUTER.txt", 'rU')

def Cross_Entropy(x, y):
    filecontents1 = x.read()
    filecontents2 = y.read()
    sentence1 = filecontents1.upper()
    sentence2 = filecontents2.upper()
    count_A1 = sentence1.count('A')
    count_B1 = sentence1.count('B')
    count_C1 = sentence1.count('C')
    count_all1 = len(sentence1)
    prob_A1 = count_A1 / count_all1
    prob_B1 = count_B1 / count_all1
    prob_C1 = count_C1 / count_all1
    count_A2 = sentence2.count('A')
    count_B2 = sentence2.count('B')
    count_C2 = sentence2.count('C')
    count_all2 = len(sentence2)
    prob_A2 = count_A2 / count_all2
    prob_B2 = count_B2 / count_all2
    prob_C2 = count_C2 / count_all2
    Cross_Entropy = -(prob_A1 * math.log(prob_A2, 2) + prob_B1 * math.log(prob_B2, 2) + prob_C1 * math.log(prob_C2, 2))

Cross_Entropy(b, c)
Now I've got this error:
prob_A1 = count_A1 / count_all1
ZeroDivisionError: division by zero
What's wrong with my code? Is my orthography wrong?

I'm not quite sure what is behind your failure to read your strings from the files, but your cross-entropy can be computed much more succinctly:
def crossEntropy(s1, s2):
    s1 = s1.upper()
    s2 = s2.upper()
    probsOne = (s1.count(c) / float(len(s1)) for c in 'ABC')
    probsTwo = (s2.count(c) / float(len(s2)) for c in 'ABC')
    return -sum(p * math.log(q, 2) for p, q in zip(probsOne, probsTwo))
For example,
>>> crossEntropy('abbcabcba','abbabaccc')
1.584962500721156
If this is what you want to compute, you can now concentrate on assembling the strings to pass to crossEntropy. I would recommend getting rid of the read-write-read logic (unless you need the two files that you are trying to create) and instead reading the files in the two directories directly, joining them into two big strings, stripping all whitespace, and then passing those to crossEntropy.
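For example, a minimal sketch of that direct approach, assuming the filenames2 and filenames3 lists already built in the question and the crossEntropy function above:
def read_all(filenames):
    # read each file and join the contents into one big string with all whitespace removed
    parts = []
    for fname in filenames:
        with open(fname) as infile:
            parts.append(infile.read())
    return ''.join(''.join(parts).split())

s1 = read_all(filenames2)   # the BROWN files from the question
s2 = read_all(filenames3)   # the Reutertest files collected with os.walk above
print(crossEntropy(s1, s2))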
Another possible approach: if all you want are the counts of 'A', 'B', and 'C' in the two directories, just create two dictionaries, one for each directory, both keyed by 'A', 'B', and 'C'. Iterate through the files in each directory, reading each file in turn and taking the counts of those three characters without saving the resulting string, and create a version of crossEntropy that expects two dictionaries (a sketch of the directory-counting part follows the example below).
Something like:
def crossEntropy(d1, d2):
    countOne = sum(d1[c] for c in 'ABC')
    countTwo = sum(d2[c] for c in 'ABC')
    probsOne = (d1[c] / float(countOne) for c in 'ABC')
    probsTwo = (d2[c] / float(countTwo) for c in 'ABC')
    return -sum(p * math.log(q, 2) for p, q in zip(probsOne, probsTwo))
For example,
>>> d1 = {'A':3,'B':5,'C':2}
>>> d2 = {'A':2,'B':5,'C':3}
>>> crossEntropy(d1,d2)
1.54397154729945
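A sketch of the directory-counting part, reusing the os.walk pattern from the question; the second directory path is hypothetical:
import os

def count_letters(directory):
    # one dictionary per directory, keyed by 'A', 'B' and 'C'
    counts = {'A': 0, 'B': 0, 'C': 0}
    for path, dirs, files in os.walk(directory):
        for name in files:
            with open(os.path.join(path, name)) as infile:
                text = infile.read().upper()
            for c in 'ABC':
                counts[c] += text.count(c)
    return counts

d1 = count_letters("C:/Python27/Reutertest")       # directory from the question
d2 = count_letters("C:/Python27/SomeOtherCorpus")  # hypothetical second directory
print(crossEntropy(d1, d2))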


Difficulties with files in Python

For a homework assignment I have a file path called P and a string called S, which is equal to 'parrot'. I need to search the file at P for S and output the number of times S appears. I cannot use regexes.
This is my code:
matches = []
matches2 = []

def file_reading(P, S):
    file1 = open(P, 'r')
    matches.append(S)
    file1.close()
    for S in P:
        matches2.append(S)
    print (len(matches2))
The output should be 3, but this only outputs 1. Can someone point me in the right direction? If more details are needed, let me know and I will edit them in.
In order to search how many times S appears in P, you can simply do the following.
P = "/home/shan/shan/shan/editshanfile/exe"
S = "shan"
parts = P.split(S)
print (len(parts)-1)
Open the file using given path P
Read the file into a variable
Search that variable for the target string S
Close the file
Print the output
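A minimal sketch of those steps (keeping the question's file_reading signature; str.count counts non-overlapping occurrences):
def file_reading(P, S):
    file1 = open(P, 'r')        # open the file using the given path P
    contents = file1.read()     # read the file into a variable
    count = contents.count(S)   # search that variable for the target string S
    file1.close()               # close the file
    print(count)                # print the output

file_reading('/path/to/homework.txt', 'parrot')   # hypothetical file path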
I suspect string.count(string2) is what you're looking for:
>>> big_string = 'a' * 100 + 'parrot' + 'b' * 20 + 'parrot' + 'c' * 50 + 'parrot'
>>> len(big_string)
188
>>> big_string.count('parrot')
3
>>>

Python renaming duplicates

How do I solve this duplicate-renaming problem without resorting to something unique like "_DUPLICATED_#NO"? The names have to be unique when finished, preferably with incrementing numbers denoting the number of duplicates.
from collections import defaultdict

l = ["hello1", "hello2", "hello3",
     "hello", "hello", "hello"]

tally = defaultdict(lambda: -1)
for i in range(len(l)):
    e = l[i]
    tally[e] += 1
    if tally[e] > 0:
        e += str(tally[e])
    l[i] = e
print (l)
Results:
['hello1', 'hello2', 'hello3', 'hello', 'hello1', 'hello2']
As you can see, the names are not unique.
This seems simple enough. You start with a list of filenames:
l = ["hello1","hello2","hello3",
"hello","hello","hello"]
Then you iterate through them, building finished filenames and incrementing a trailing number whenever a duplicate is found.
result = {}
for fname in l:
    orig = fname
    i = 1
    while fname in result:
        fname = orig + str(i)
        i += 1
    result[fname] = orig
This should leave you with a dictionary like:
{"hello1": "hello1",
"hello2": "hello2",
"hello3": "hello3",
"hello": "hello",
"hello4": "hello",
"hello5": "hello"}
Of course if you don't care about mapping the originals to the duplicate names, you can drop that part.
result = set()
for fname in l:
    orig = fname
    i = 1
    while fname in result:
        fname = orig + str(i)
        i += 1
    result.add(fname)
If you want a list afterward, just cast it that way.
final = list(result)
Note that if you're creating files, this is exactly what the tempfile module is designed to do.
import tempfile

l = ["hello1", "hello2", "hello3",
     "hello", "hello", "hello"]
fs = [tempfile.NamedTemporaryFile(prefix=fname, delete=False, dir="/some/directory/")
      for fname in l]
This will not create nicely incrementing filenames, but they are guaranteed unique, and fs will be a list of the (open) file objects rather than a list of names, although NamedTemporaryFile.name will give you the filename.
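If you only need the generated names afterwards, a small sketch using the same fs list:
names = [f.name for f in fs]   # the unique filenames tempfile created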

Increase Python code speed

Does anyone have a clue how to increase the speed of this part of Python code?
It was designed to deal with small files (with just a few lines, for which it is very fast), but I want to run it on big files (~50 GB, with millions of lines).
The main goal of this code is to get strings from a file (.txt), search for them in an input file, and print the number of times each one occurs to the output file.
Here is the code (infile, seqList and out are set by optparse as options at the beginning of the code, not shown):
def novo(infile, seqList, out):
    uDic = dict()
    rDic = dict()
    nmDic = dict()
    with open(infile, 'r') as infile, open(seqList, 'r') as RADlist:
        samples = [line.strip() for line in RADlist]
        lines = [line.strip() for line in infile]
    # Create dictionaries with all the samples
    for i in samples:
        uDic[i.replace(" ", "")] = 0
        rDic[i.replace(" ", "")] = 0
        nmDic[i.replace(" ", "")] = 0
    for k in lines:
        l1 = k.split("\t")
        l2 = l1[0].split(";")
        l3 = l2[0].replace(">", "")
        if len(l1) < 2:
            continue
        if l1[4] == "U":
            for k in uDic.keys():
                if k == l3:
                    uDic[k] += 1
        if l1[4] == "R":
            for j in rDic.keys():
                if j == l3:
                    rDic[j] += 1
        if l1[4] == "NM":
            for h in nmDic.keys():
                if h == l3:
                    nmDic[h] += 1
    f = open(out, "w")
    f.write("Sample"+"\t"+"R"+"\t"+"U"+"\t"+"NM"+"\t"+"TOTAL"+"\t"+"%R"+"\t"+"%U"+"\t"+"%NM"+"\n")
    for i in samples:
        U = int()
        R = int()
        NM = int()
        for k, j in uDic.items():
            if k == i:
                U = j
        for o, p in rDic.items():
            if o == i:
                R = p
        for y, u in nmDic.items():
            if y == i:
                NM = u
        TOTAL = int(U + R + NM)
        try:
            f.write(i+"\t"+str(R)+"\t"+str(U)+"\t"+str(NM)+"\t"+str(TOTAL)+"\t"+str(float(R) / TOTAL)+"\t"+str(float(U) / TOTAL)+"\t"+str(float(NM) / TOTAL)+"\n")
        except:
            continue
    f.close()
With 50 GB files, the question is not how to make the code faster, but how to make it runnable at all.
The main problem is that you will run out of memory. You have to modify the code so that it processes the files without holding them entirely in memory, keeping only the single line that is currently needed.
The following code from your question reads all the lines from both files:
with open(infile, 'r') as infile, open(seqList, 'r') as RADlist:
    samples = [line.strip() for line in RADlist]
    lines = [line.strip() for line in infile]
    # at this moment you are likely to run out of memory already

# Create dictionaries with all the samples
for i in samples:
    uDic[i.replace(" ", "")] = 0
    rDic[i.replace(" ", "")] = 0
    nmDic[i.replace(" ", "")] = 0

# similar loop over `lines` comes later on
You should defer reading the lines until the latest possible moment, like this:
# Create dictionaries with all the samples
with open(seqList, 'r') as RADlist:
    for sampleline in RADlist:
        sample = sampleline.strip().replace(" ", "")
        uDic[sample] = 0
        rDic[sample] = 0
        nmDic[sample] = 0
Note: did you want to use line.strip() or line.split()?
This way, you do not have to keep all the content in memory.
There are many more options for optimization, but this one will let you take off and run.
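A similar sketch for the loop over the other file, assuming the uDic/rDic/nmDic dictionaries built above and the same tab-separated format (at least five fields per data line):
with open(infile, 'r') as inf:
    for rawline in inf:
        l1 = rawline.strip().split("\t")
        if len(l1) < 5:
            continue
        l3 = l1[0].split(";")[0].replace(">", "")
        # one dictionary lookup per line instead of a loop over all keys
        if l1[4] == "U" and l3 in uDic:
            uDic[l3] += 1
        elif l1[4] == "R" and l3 in rDic:
            rDic[l3] += 1
        elif l1[4] == "NM" and l3 in nmDic:
            nmDic[l3] += 1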
It would make things much easier if you provided some sample inputs. Because you haven't, I haven't tested this, but the idea is simple: iterate through each file only once, using iterators rather than reading the whole file into memory. Use the efficient collections.Counter object to handle the counting and minimise inner looping:
def novo(infile, seqList, out):
    from collections import Counter
    import csv
    # Count
    counts = Counter()
    with open(infile, 'r') as infile:
        for line in infile:
            l1 = line.strip().split("\t")
            l2 = l1[0].split(";")
            l3 = l2[0].replace(">", "")
            if len(l1) < 2:
                continue
            counts[(l1[4], l3)] += 1
    # Produce output
    types = ['R', 'U', 'NM']
    with open(seqList, 'r') as RADlist, open(out, 'w') as outfile:
        f = csv.writer(outfile, delimiter='\t')
        f.writerow(['Sample'] + types + ['TOTAL'] + ['%' + t for t in types])
        for sample in RADlist:
            sample = sample.strip()
            countrow = [counts[(t, sample)] for t in types]
            total = sum(countrow)
            f.writerow([sample] + countrow + [total] + [float(c) / total for c in countrow])
If you convert your script into functions (it makes profiling easier), you can then see what it does when you profile it. I suggest using runsnake: runsnakerun.
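If you prefer staying in the standard library, a quick profiling sketch with cProfile (assuming the novo function and its optparse-derived arguments from above):
import cProfile

cProfile.run('novo(infile, seqList, out)', sort='cumulative')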
I would try replacing your loops with list & dictionary comprehensions:
For example, instead of
for i in samples:
    uDic[i.replace(" ", "")] = 0
Try:
uDic = {i.replace(" ", ""): 0 for i in samples}
and similarly for the other dicts
I don't really follow what's going on in your "for k in lines" loop, but you only need l3 (and l2) when you have certain values for l1[4]. Why not check for those values before splitting and replacing?
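For example, a sketch of that reordering, keeping the same l1/l3 names as in the original loop:
for k in lines:
    l1 = k.split("\t")
    # skip lines we don't care about before doing the extra split/replace work
    if len(l1) < 5 or l1[4] not in ("U", "R", "NM"):
        continue
    l3 = l1[0].split(";")[0].replace(">", "")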
Lastly, instead of looping through all the keys of a dict to see if a given element is in that dict, try:
if x in myDict:
    myDict[x] = ...
for example:
for k in uDic.keys():
    if k == l3:
        uDic[k] += 1
can be replaced with:
if l3 in uDic:
    uDic[l3] += 1
Other than that, try profiling.
1) Look into profilers and adjust the code that is taking the most time.
2) You could try optimizing some methods with Cython - use the data from the profiler to modify the right thing.
3) It looks like you can use a Counter instead of a dict for the output file, and a set for the input file - look into them.
from collections import Counter

seen = set()          # a set gives fast membership tests for the input file
counter = Counter()   # essentially a modified dict that is optimized for counting...
                      # like counting occurrences of strings in a text file
4) If you are reading a 50 GB file, you won't be able to store it all in RAM (I'm assuming, since who knows what kind of computer you have), so generators should save you memory and time.
#change list comprehension to generators
samples = (line.strip() for line in RADlist)
lines = (line.strip() for line in infile)

Writing a line with input from different rows in Python

I am trying to write out a line to a new file based on input from a CSV file, with elements from different rows and different columns. For example:
test.csv:
name1, value1, integer1, integer1a
name2, value2, integer2, integer2a
name3, value3, integer3, integer3a
desired output:
command integer1:integer1a moretext integer2:integer2a
command integer2:integer2a moretext integer3:integer3a
I realize this will probably involve some type of loop; I am just getting lost in the references for loop iteration and Python maps.
This assumes Python 2, that you want the output in a text file, and that you have command and moretext saved as variables earlier in the code.
from csv import reader

f = reader(open('test.csv'))
data = [str(r[2]) + ':' + str(r[3]) for r in f]
out = open('out.txt', 'w')
for i in range(len(data) - 1):
    print >> out, command + data[i] + moretext + data[i + 1]
out.close()
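For instance, to match the desired output above, command and moretext might be defined as (hypothetical values):
command = 'command '
moretext = ' moretext '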
The easiest approach to adapt to your needs would be to build a list of tuples from your file:
data = []
for l in open('file.csv'):
    data.append(l.strip().split())
At this point data is a list of quadruples, so you can do your example like this:
for i in range(len(data) / 2):
    _, _, i1, i2 = data[2 * i]
    _, _, j1, j2 = data[2 * i + 1]
    print('command {}:{} moretext {}:{}'.format(i1, i2, j1, j2))
Here I use _ to say that I don't care about the first two values of the quadruple, so I don't even name them. It's a nice feature for writing clear code.
You can also do this in a single loop:
f = open('file.csv')
while True:
    l1 = f.readline()
    l2 = f.readline()
    if not l1 or not l2:
        break  # file ended
    _, _, i1, i2 = l1.strip().split()
    _, _, j1, j2 = l2.strip().split()
    print('command {}:{} moretext {}:{}'.format(i1, i2, j1, j2))
Here's a simple and general function that takes any iterable and generates all sequential pairs (e.g., 1, 2, 3 becomes (1, 2), (2, 3)):
def pairwise(iterable):
    it = iter(iterable)
    a = next(it)
    for b in it:
        yield a, b
        a = b
This can then be used to solve your particular problem as follows:
with open('outputfile', 'w') as out:
    for (_, _, a1, a2), (_, _, b1, b2) in pairwise(
            [w.strip() for w in l.split(',')] for l in open('test.csv')):
        out.write('command %s:%s moretext %s:%s\n' % (a1, a2, b1, b2))
One advantage of doing it this way is that you don't read the whole input into memory before starting the output, thus it will work well for streaming and arbitrarily large files.

Python loop through two files, do computation, then output 3 files

I have 2 tab-delimited files,
for example:
file1:
12 23 43 34
433 435 76 76
file2:
123 324 53 65
12 457 54 32
I would like to loop through these 2 files, comparing every line of file1 with file2 and vice versa.
If, for example, the 1st number of the 1st line in file1 is the same as the 1st number of the 2nd line in file2, I would like to put the 1st line from file1 into a file called output.
Then I would like to put all the lines from file1 that didn't find a match in file2 into a new file,
and all the lines from file2 that didn't find a match in file1 into another new file.
So far I have been able to find the matching lines and put them in a file, but I'm having trouble putting the lines that didn't match into 2 separate files.
one = open(file1, 'r').readlines()
two = open(file2, 'r').readlines()
output = open('output.txt', 'w')
count = 0
list1 = []  # list for lines in file1 that didn't find a match
list2 = []  # list for lines in file2 that didn't find a match
for i in one:
    for j in two:
        columns1 = i.strip().split('\t')
        num1 = int(columns1[0])
        columns2 = j.strip().split('\t')
        num2 = int(columns2[0])
        if num1 == num2:
            count += 1
            output.write(i + j)
        else:
            list1.append(i)
            list2.append(j)
The problem I have here is with the else part.
Can someone show me the right and better way to do this? I would greatly appreciate it.
EDIT: Thanks for the quick responses everyone
The 3 output files I'm looking for are:
Output_file1: #Matching results between the 2 files
12 23 43 34 #line from file1
12 457 54 32 #line from file2
Output_file2: #lines from the first file that didn't find a match
433 435 76 76
Output_file3: #lines from the second file that didn't find a match
123 324 53 65
I would suggest that you use the csv module to read your files like so (you might have to mess around with the dialect; see http://docs.python.org/library/csv.html for help):
import csv

one = csv.reader(open(file1, 'r'), dialect='excel')
two = csv.reader(open(file2, 'r'), dialect='excel')
Then you might find it easier to "zip" along the lines of both files at the same time, like so (see http://docs.python.org/library/itertools.html#itertools.izip_longest):
import itertools

file_match = open('match', 'w')
file_nomatch1 = open('nomatch1', 'w')
file_nomatch2 = open('nomatch2', 'w')
for i, j in itertools.izip_longest(one, two, fillvalue="-"):
    if i[0] == j[0]:
        file_match.write(str(i) + '\n')
    else:
        file_nomatch1.write(str(i) + '\n')
        file_nomatch2.write(str(j) + '\n')
        # and maybe handle the case where one is "-"
I reread the post and realized you are looking for a match between ANY two lines in both files. Maybe someone will find the above code useful, but it won't solve your particular problem.
I'd suggest using set operations:
from collections import defaultdict

def parse(filename):
    result = defaultdict(list)
    for line in open(filename):
        # take the first number and put it in result
        num = int(line.strip().split(' ')[0])
        result[num].append(line)
    return result

def select(selected, items):
    result = []
    for s in selected:
        result.extend(items[s])
    return result

one = parse('one.txt')
two = parse('two.txt')

one_s = set(one)
two_s = set(two)

intersection = one_s & two_s
one_only = one_s - two_s
two_only = two_s - one_s

one_two = defaultdict(list)
for e in one: one_two[e].extend(one[e])
for e in two: one_two[e].extend(two[e])

open('intersection.txt', 'w').writelines(select(intersection, one_two))
open('one_only.txt', 'w').writelines(select(one_only, one))
open('two_only.txt', 'w').writelines(select(two_only, two))
I think it is not the best way, but it works for me and looks pretty easy to understand:
# Sorry, but I was not able to check the code below
def get_diff(fileObj1, fileObj2):
    f1Diff = []
    f2Diff = []
    outputData = []
    # x is one row
    f1Data = set(x.strip() for x in fileObj1)
    f2Data = set(x.strip() for x in fileObj2)
    f1Column1 = set(x.split('\t')[0] for x in f1Data)
    f2Column1 = set(x.split('\t')[0] for x in f2Data)
    f1Col1Diff = f1Column1 - f2Column1   # keys only in file1
    f2Col1Diff = f2Column1 - f1Column1   # keys only in file2
    commonPart = f1Column1 & f2Column1   # keys in both
    for line in f1Data.union(f2Data):
        lineKey = line.split('\t')[0]
        if lineKey in commonPart:
            outputData.append(line)
        elif lineKey in f1Col1Diff:
            f1Diff.append(line)
        elif lineKey in f2Col1Diff:
            f2Diff.append(line)
    return outputData, f1Diff, f2Diff

outputData, file1Missed, file2Missed = get_diff(open(file1, 'r'), open(file2, 'r'))
I think that this code fits your purposes:
one = open(file1, 'r').readlines()
two = open(file2, 'r').readlines()
output = open('output.txt', 'w')

first = {x.split('\t')[0] for x in one}
second = {x.split('\t')[0] for x in two}
common = first.intersection(second)

list1 = filter(lambda x: not x.split('\t')[0] in common, one)
list2 = filter(lambda x: not x.split('\t')[0] in common, two)
res1 = filter(lambda x: x.split('\t')[0] in common, one)
res2 = filter(lambda x: x.split('\t')[0] in common, two)

count = len(res1)
for x in range(count):
    output.write(res1[x])
    output.write(res2[x])
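To also produce the two "no match" files the question asks for, a small follow-up sketch (hypothetical file names, reusing list1 and list2 from above):
with open('one_only.txt', 'w') as f1, open('two_only.txt', 'w') as f2:
    f1.writelines(list1)
    f2.writelines(list2)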
