In my code, I loop over a set of files and count pattern occurrences in each file. My code is as follows
from collections import defaultdict
import csv, os, re
from itertools import groupby
import glob
def count_kmers(read, k):
    """Count every overlapping k-mer (length-k substring) of *read*.

    Parameters
    ----------
    read : str
        The sequence to scan.
    k : int
        The k-mer length.

    Returns
    -------
    dict mapping each k-mer to the number of times it occurs.
    Empty dict when len(read) < k.
    """
    # defaultdict(int) starts every key at 0, so no membership check is
    # needed (the original used defaultdict(list) and re-initialised keys
    # by hand, then returned only the FIRST item from inside the loop).
    counts = defaultdict(int)
    for i in range(len(read) - k + 1):
        counts[read[i:i + k]] += 1
    return counts

for fasta_file in glob.glob('*.fasta'):
    basename = os.path.splitext(os.path.basename(fasta_file))[0]
    with open(fasta_file) as f_fasta:
        # groupby splits the file into alternating runs of header lines
        # ('>'-prefixed) and sequence lines.
        for is_header, group in groupby(f_fasta, lambda x: x.startswith('>')):
            if is_header:
                # sequence id taken from the header line, '>' and newline stripped
                sequence = next(group).strip('>\n')
            else:
                forward = ''.join(line.strip() for line in group)
                complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                # reverse-complement; unknown bases pass through unchanged
                reverse_complement = ''.join(complement.get(base, base)
                                             for base in reversed(forward))
                counts = count_kmers(forward + reverse_complement, 5)
                # Write one formatted line per k-mer instead of handing a
                # tuple to write() (the cause of the reported TypeError).
                with open('kmer.out', 'a') as text_file:
                    for kmer, count in counts.items():
                        text_file.write('{} {} {} {}\n'.format(
                            basename, sequence, kmer, count))
And my output looks like this
1035 1 GAGGA 2
1035 1 CGCAT 1
1035 1 TCCCG 1
1035 1 CTCAT 2
1035 1 CCTGG 2
1035 1 GTCCA 1
1035 1 CATGG 1
1035 1 TAGCC 2
1035 1 GCTGC 7
1035 1 TGCAT 1
The code works fine, but I cannot write my output to a file. I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-190-89e3487da562> in <module>()
37 counting = count_kmers(d5, 5)
38 with open('kmer.out', 'w') as text_file:
---> 39 text_file.write(counting)
TypeError: write() argument must be str, not tuple
What am I doing wrong and how can I solve this problem, to make sure that my code write the output to a txt file?
The original version of count_kmers() did not contain a return statement, which means it has an implicit return None.
As you assign this to counting all of your errors became self explanatory.
After your edit, the end of the function looked like this:
for item in counts:
return(basename, sequence, item, counts[item])
which returns a tuple with four values. It also exits the function on the first pass through the loop.
Related
I'm trying to find occurrences of several pairs of words in strings which are in a list in a tsv file. A list in a tsv file is below.
0 ILDIGCGRGRHARALVRRGWQVTGLDLSEDAVAAARSRVADDDLDV...
1 AELETLQAKINPHFLYNSLNSIASLVYTDPEKAEKMVLMLSKLFRV...
2 AQLSSLKEQLNPHFLFNTFNTLYGISLKYPERVPDLIMHTSQLMRY...
3 TEIKALQSQIKPHFLFNTLNAIRCTIINNNNDKAADLVYKLAMLLR...
4 SEMSRLNAQINPHFLFNTLNFFYSEVRTLHPKISESILLLSDIMRY...
...
...1000 SELSFLKAQINPHFFFNTLNNIYALTMMDVASAQEALHRLSRMMRY...
1001 ILEPGCGTGRLMLALAEHGHHVAGVDASATALEFCRERLTQHGLTG...
1002 IADLGAGEGTISQLMAQRAKRVIAIDNSEKMVEFGAELARKHGIAN...
1003 AELRALRAQISPHFIYNALAAIASFVRTDPERARELLLEFADFSRY...
1004 VVDLGCGSGASTDALVNSMGHRGETYAAIGIDASAGMLTEAHSKPW...
[1005 rows x 1 columns]
then, I'd like to get occurrences of AA, AB, AC, ...ZY, ZZ for each row. An example is below.
If there is a string "AEAETLQAKIN" in a row, then I'd like to get the result below.
(the definition of strings must be acid. ex)acid='AEAETLQAKIN')
IN[]......(I'd like to know how to describe codes which can get occurrences here. )
OUT[] AA: 0, AC: 0, AD: 0, AE: 2, ... AK: 1, ... EA: 1, ...
If you want a dict containing only existing pairs, use a defaultdict
from collections import defaultdict
from string import ascii_uppercase
def occurrences(content):
    """Map each overlapping 2-character substring of *content* to its count.

    Only pairs that actually occur appear as keys; absent pairs read as 0
    thanks to the defaultdict.
    """
    result = defaultdict(int)
    # zip pairs each character with its successor, covering the same
    # windows as indexing content[i:i+2].
    for first, second in zip(content, content[1:]):
        result[first + second] += 1
    return result
If you want to also have the 0, so 26x26=676 pairs, prepare one dict before
from itertools import product
# All 26x26 = 676 uppercase pairs, each starting at zero.
OCCURRENCE_DEFAULT = {a + b: 0 for a, b in product(ascii_uppercase, repeat=2)}

def occurrences(content):
    """Count overlapping letter pairs in *content*.

    The result always contains all 676 uppercase pairs, so unseen pairs
    report 0.  A pair outside A-Z raises KeyError (no key for it exists).
    """
    result = dict(OCCURRENCE_DEFAULT)
    # content[end-1:end+1] is the pair ending at index `end`.
    for end in range(1, len(content)):
        result[content[end - 1:end + 1]] += 1
    return result
Then apply on each string of your content
# Sample rows: column 0 is the row index, column 1 is the sequence.
value = [
    "0 ILDIGCGRGRHARALVRRGWQVTGLDLSEDAVAAARSRVADDDLDV",
    "4 SEMSRLNAQINPHFLFNTLNFFYSEVRTLHPKISESILLLSDIMRY",
]

for entry in value:
    # split()[1] drops the leading index and keeps the sequence
    sequence = entry.split()[1]
    print(occurrences(sequence))
Getting the below error on the code snippet that follows the error.
Any ideas on how to solve this?
Pretty much brand new to using Numpy - have spent most of my time using Pandas but trying to move away from using Pandas for numerous performance related issues.
End goal is to run a LEFT JOIN on the two structured arrays.
The error seems to be prompted by the ret[i] = tuple(row1[f1]) + tuple(row2[f1]) expression, but honestly not certain why i'd be getting this error.
Tested the row1 and row2 to check the number of fields vs. the f1 which contains the dtype keys, and it all seems to line up from what I can tell.
Any thoughts would be appreciated!
ERROR
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_43960/3997384146.py in <module>
66 # dtype=[('name', 'U10'), ('age', 'i4')])
67
---> 68 join_by_left(key='name', r1=struct_arr1, r2=struct_arr2, mask=True)
~\AppData\Local\Temp/ipykernel_43960/3997384146.py in join_by_left(key, r1, r2, mask)
43 print(row1[f1])
44 print(row2[f1])
---> 45 ret[i] = tuple(row1[f1]) + tuple(row2[f1])
46
47 i += 1
~\AppData\Roaming\Python\Python37\site-packages\numpy\ma\core.py in __setitem__(self, indx, value)
3379 elif not self._hardmask:
3380 # Set the data, then the mask
-> 3381 _data[indx] = dval
3382 _mask[indx] = mval
3383 elif hasattr(indx, 'dtype') and (indx.dtype == MaskType):
ValueError: could not assign tuple of length 6 to structure with 3 fields.
FULL CODE
import numpy as np
def join_by_left(key, r1, r2, mask=True):
    """LEFT JOIN two structured arrays on the field *key*.

    Every row of *r1* appears in the result at least once; a row with m
    matches in *r2* appears m times.  The result dtype is all of r1's
    fields followed by the fields of r2 whose names do NOT already occur
    in r1 (duplicate-named fields of r2 are dropped).

    Parameters
    ----------
    key : str
        Field name to join on.
    r1, r2 : numpy structured arrays
    mask : bool
        If True return a masked array, so r2-only fields of unmatched
        r1 rows stay masked.

    Returns
    -------
    np.recarray or np.ma.MaskedArray of the merged rows.
    """
    # dtype of the result: r1's fields plus r2's fields not in r1
    descr1 = r1.dtype.descr
    descr2 = [d for d in r2.dtype.descr if d[0] not in r1.dtype.names]
    descrm = descr1 + descr2

    # field names we'll pull from each array
    f1 = [d[0] for d in descr1]
    f2 = [d[0] for d in descr2]
    ncol1 = len(f1)

    # group the rows of r2 by key for O(1) match lookup
    rows2 = {}
    for row2 in r2:
        rows2.setdefault(row2[key], []).append(row2)

    # number of result rows: one per r1 row, times its r2 match count
    nrowm = 0
    for k1 in r1[key]:
        nrowm += len(rows2[k1]) if k1 in rows2 else 1

    # allocate the return array
    _ret = np.recarray(nrowm, dtype=descrm)
    ret = np.ma.array(_ret, mask=True) if mask else _ret

    # merge the data into the return array
    i = 0
    for row1 in r1:
        if row1[key] in rows2:
            for row2 in rows2[row1[key]]:
                # BUG FIX: take r2's *extra* fields (f2), not f1 again —
                # otherwise the tuple has more elements than the result
                # dtype has fields ("could not assign tuple of length 6
                # to structure with 3 fields").
                ret[i] = tuple(row1[n] for n in f1) + tuple(row2[n] for n in f2)
                i += 1
        else:
            # no match: fill r1's columns only; r2's columns stay masked
            for j in range(ncol1):
                ret[i][j] = row1[j]
            i += 1
    return ret
# Two structured arrays with IDENTICAL dtypes: every field of struct_arr2
# already exists in struct_arr1, so descr2 inside join_by_left comes out
# empty — which is what triggered the question's length mismatch.
struct_arr1 = np.array([('jason', 28, 'j#j.com'), ('jared', 31, 'jm#j.com')],
                       dtype=[('name', 'U10'), ('age', 'i4'), ('email', 'U10')])
struct_arr2 = np.array([('jason', 22, 'jm#j.com'), ('jason', 27, 'jmm#j.com'), ('george', 28, 'gmm#j.com'), ('jared', 22, 'm#j.com')],
                       dtype=[('name', 'U10'), ('age', 'i4'), ('email', 'U10')])

join_by_left(key='name', r1=struct_arr1, r2=struct_arr2, mask=True)
On the line where you're getting the error:
ret[i] = tuple(row1[f1]) + tuple(row2[f1])
The + operator concatenates two tuples together, so the result is a tuple with 6 elements, not 3 with the elements added pairwise (if that is what you were expecting).
Simple example:
tuple('abc') + tuple('def')
Results in:
('a', 'b', 'c', 'd', 'e', 'f')
I have two CSV files that I want to compare one looks like this:
"a" 1 6 3 1 8
"b" 15 6 12 5 6
"c" 7 4 1 4 8
"d" 14 8 12 11 4
"e" 1 8 7 13 12
"f" 2 5 4 13 9
"g" 8 6 9 3 3
"h" 5 12 8 2 3
"i" 5 9 2 11 11
"j" 1 9 2 4 9
So "a" possesses the numbers 1,6,3,1,8 etc. The actual CSV file is 1,000s of lines long so you know for efficiency sake when writing the code.
The second CSV file looks like this:
4
15
7
9
2
I have written some code to import these CSV files into lists in python.
# Load the winning numbers.  Python 2 code: csv wants binary mode 'rb';
# on Python 3 use text mode with newline='' instead.
with open('winningnumbers.csv', 'rb') as wn:
    reader = csv.reader(wn)
    winningnumbers = list(reader)

# NOTE(review): list(reader) yields a list of *rows* (each itself a list),
# so each wnN below is a one-element row like ['4'], not a bare number —
# confirm the file layout before comparing these to cell values.
wn1 = winningnumbers[0]
wn2 = winningnumbers[1]
wn3 = winningnumbers[2]
wn4 = winningnumbers[3]
wn5 = winningnumbers[4]
print(winningnumbers)

# Load the entries the same way.
with open('Entries#x.csv', 'rb') as en:
    readere = csv.reader(en)
    enl = list(readere)
How would I now search cross reference number 4 so wn1 of CSV file 2 with the first csv file. So that it returns that "b" has wn1 in it. I imported them as a list to see if I could figure out how to do it but just ended up running in circles. I also tried using dict() but had no success.
If I understood you correctly, you want to find the first index (or all indexes) of numbers in entries that are winning. If you want it, you can do that:
# Python 2 code ('rb' mode with csv).  Scans the entries file for the first
# line whose cells all equal the winning numbers.
with open('winningnumbers.csv', 'rb') as wn:
    reader = csv.reader(wn)
    winningnumbers = list(reader)

with open('Entries#x.csv', 'rb') as en:
    readere = csv.reader(en)
    winning_number_index = -1 # Default value which we will print if nothing is found
    current_index = 0 # Initial index
    # NOTE(review): winningnumbers is a list of csv rows (lists) while
    # line[i] is a string, so this comparison presumably expects the two
    # files to be parsed into the same shape — verify against the inputs.
    for line in readere: # Iterate over entries file
        all_numbers_match = True # Default value that will be set to False if any of the elements doesn't match with winningnumbers
        for i in range(len(line)):
            if line[i] != winningnumbers[i]: # If values of current line and winningnumbers with matching indexes are not equal
                all_numbers_match = False # Our default value is set to False
                break # Exit "for" without finishing
        if all_numbers_match == True: # If our default value is still True (which indicates that all numbers match)
            winning_number_index = current_index # Current index is written to winning_number_index
            break # Exit "for" without finishing
        else: # Not all numbers match
            current_index += 1

print(winning_number_index)
This will print the index of the first winning number in entries (if you want all the indexes, write about it in the comments).
Note: this is not the optimal code to solve your problem. It's just easier to understand and debug if you're not familiar with Python's more advanced features.
You should probably consider not abbreviating your variables. entries_reader takes just a second more to write and 5 seconds less to understand than readere.
This is the variant that is faster, shorter and more memory efficient, but may be harder to understand:
# Compact variant of the same scan.  Python 2 code: 'rb' mode and xrange
# (use range and text mode on Python 3).
with open('winningnumbers.csv', 'rb') as wn:
    reader = csv.reader(wn)
    winningnumbers = list(reader)

with open('Entries#x.csv', 'rb') as en:
    readere = csv.reader(en)
    for line_index, line in enumerate(readere):
        # all(...) is True only when every cell equals its counterpart
        if all((line[i] == winningnumbers[i] for i in xrange(len(line)))):
            winning_number_index = line_index
            break
    else:
        # for/else: runs only when the loop was NOT stopped by break
        winning_number_index = -1

print(winning_number_index)
The features that might be unclear are probably enumerate(), all() and using else with for rather than with if. Let's go through all of them one by one.
To understand this usage of enumerate, you'll need to understand that syntax:
a, b = [1, 2]
Variables a and b will be assigned according values from the list. In this case a will be 1 and b will be 2. Using this syntax we can do that:
for a, b in [[1, 2], [2, 3], ['spam', 'eggs']]:
# do something with a and b
in each iteration, a and b will be 1 and 2, 2 and 3, 'spam' and 'eggs' accordingly.
Let's assume we have a list a = ['spam', 'eggs', 'potatoes']. enumerate() just returns a "list" like that: [(0, 'spam'), (1, 'eggs'), (2, 'potatoes')]. So, when we use it like that,
for line_index, line in enumerate(readere):
# Do something with line_index and line
line_index will be 0, 1, 2, etc.
all() function accepts a sequence (list, tuple, etc.) and returns True if all the elements in it are truthy.
The list comprehension mylist = [line[i] == winningnumbers[i] for i in range(len(line))] returns a list and is similar to the following:
mylist = []
for i in range(len(line)):
mylist.append(line[i] == winningnumbers[i]) # a == b will return True if a is equal to b
So all() will return True only in cases when all the numbers from the entry match the winning numbers.
Code in else section of for is called only when for was not interrupted by break, so in our situation it's good for setting a default index to return.
Having duplicate numbers seems illogical but if you want to get the count of matched numbers for each row regardless of index then makes nums a set and sum the times a number from each row is in the set:
from itertools import islice, imap  # Python 2: imap; plain map on Python 3
import csv

with open("in.txt") as f,open("numbers.txt") as nums:
    # make a set of all winning nums
    nums = set(imap(str.rstrip, nums))
    r = csv.reader(f)
    # iterate over each row and sum how many matches we get
    for row in r:
        # islice(row, 1, None) skips the row label in column 0; summing the
        # boolean membership tests counts how many cells are winning numbers
        print("{} matched {}".format(row[0], sum(n in nums
                                                 for n in islice(row, 1, None))))
Which using your input will output:
a matched 0
b matched 1
c matched 2
d matched 1
e matched 0
f matched 2
g matched 0
h matched 1
i matched 1
j matched 2
presuming your file is comma separated and you have a number per line in your numbers file.
If you actually want to know which numbers if any are present then you need to iterate over the number and print each one that is in our set:
from itertools import islice, imap  # Python 2: imap; plain map on Python 3
import csv

with open("in.txt") as f, open("numbers.txt") as nums:
    # winning numbers, one per line, stripped of trailing newlines
    nums = set(imap(str.rstrip, nums))
    r = csv.reader(f)
    for row in r:
        # skip the label in column 0, report each cell found in the set
        for n in islice(row, 1, None):
            if n in nums:
                print("{} is in row {}".format(n, row[0]))
        # blank line between rows
        print("")
But again, I am not sure having duplicate numbers makes sense.
To group the rows based on how many matches, you can use a dict using the sum as the key and appending the first column value:
from itertools import islice, imap  # Python 2: imap; plain map on Python 3
import csv
from collections import defaultdict

with open("in.txt") as f,open("numbers.txt") as nums:
    # make a set of all winning nums
    nums = set(imap(str.rstrip, nums))
    r = csv.reader(f)
    results = defaultdict(list)
    # iterate over each row and sum how many matches we get
    for row in r:
        # key = how many cells matched, value = the row labels with that count
        results[sum(n in nums for n in islice(row, 1, None))].append(row[0])
results:
defaultdict(<type 'list'>,
{0: ['a', 'e', 'g'], 1: ['b', 'd', 'h', 'i'],
2: ['c', 'f', 'j']})
The keys are numbers match, the values are the rows ids that matched the n numbers.
I have 2 tab delimited files
for example:
file1:
12 23 43 34
433 435 76 76
file2:
123 324 53 65
12 457 54 32
I would like to loop through these 2 files, comparing every line of file1 with file2 and vice versa.
If, for example, the 1st number of 1st line in file1 is the same as the 1st number of 2nd line in file 2:
I would like to put from the 1st line in file1 in a file called output.
then I would like to put all the lines from file1 that didn't find a match in file 2 in a new file
and all the lines from file2 that didn't find a match in file1 in a new file.
so far I have been able to find the matching lines and put them in a file but I'm having trouble putting the lines that didn't match into 2 separate files.
# Compare the first column of every line of file1 against every line of file2.
one=open(file1, 'r').readlines()
two=open(file2, 'r').readlines()
output=open('output.txt', 'w')
count=0
list1=[] #list for lines in file1 that didn't find a match
list2=[] #list for lines in file2 that didn't find a match
for i in one:
    for j in two:
        columns1=i.strip().split('\t')
        num1=int(columns1[0])
        columns2=j.strip().split('\t')
        num2=int(columns2[0])
        if num1==num2:
            count+=1
            output.write(i+j)
        else:
            # NOTE(review): this branch runs once per (i, j) PAIR, so a line
            # is appended every time a single comparison fails — not only
            # when it has no match anywhere in the other file.  This is the
            # defect the question is asking about.
            list1.append(i)
            list2.append(j)
Problem I have here is with the else part.
Can someone show me the right and better way to do this, I would greatly appreciate.
EDIT: Thanks for the quick responses everyone
The 3 output I would be looking for is:
Output_file1: #Matching results between the 2 files
12 23 43 34 #line from file1
12 457 54 32 #line from file2
Output_file2: #lines from the first file that didn't find a match
433 435 76 76
Output_file3: #lines from the second file that didn't find a match
123 324 53 65
I would suggest that you use the csv module to read your files like so (you might have to mess around with the dialect, see http://docs.python.org/library/csv.html for help:
import csv

# Open both inputs with the csv module.  The built-in dialect is spelled
# 'excel' — the misspelled 'excell' makes csv.reader raise
# csv.Error("unknown dialect").
one = csv.reader(open(file1, 'r'), dialect='excel')
two = csv.reader(open(file2, 'r'), dialect='excel')
then you might find it easier to "zip" along the lines of both files at the same time like so (see http://docs.python.org/library/itertools.html#itertools.izip_longest):
import itertools

file_match = open('match', 'w')
file_nomatch1 = open('nomatch1', 'w')
file_nomatch2 = open('nomatch2', 'w')

# Walk both readers in lockstep; Python 2's izip_longest (zip_longest on 3)
# pads the shorter file with "-" rows so neither input is truncated.
for i,j in itertools.izip_longest(one, two, fillvalue="-"):
    # compare the first field of the two lines at the same position
    if i[0] == j[0]:
        file_match.write(str(i)+'\n')
    else:
        file_nomatch1.write(str(i)+'\n')
        file_nomatch2.write(str(j)+'\n')
        # and maybe handle the case where one is "-"
I reread the post and realized you are looking for a match between ANY two lines in both files. Maybe someone will find the above code useful, but it won't solve your particular problem.
I'd suggest using set operation
from collections import defaultdict
def parse(filename):
    """Group the lines of *filename* by their leading number.

    Returns a dict mapping the integer first column of each line to the
    list of full (newline-terminated) lines that start with it.
    """
    result = defaultdict(list)
    # "with" guarantees the file handle is closed (the original leaked it)
    with open(filename) as fp:
        for line in fp:
            # take the first number and use it as the grouping key;
            # split() (no argument) handles spaces AND the tab-delimited
            # input described in the question, unlike split(' ')
            num = int(line.split()[0])
            result[num].append(line)
    return result
def select(selected, items):
    """Concatenate items[key] for every key in *selected*, in order."""
    return [line for key in selected for line in items[key]]
one = parse('one.txt')
two = parse('two.txt')

# set() over a dict keeps its keys — here, the first-column numbers
one_s = set(one)
two_s = set(two)

# set algebra on the keys: shared keys and each file's exclusive keys
intersection = one_s & two_s
one_only = one_s - two_s
two_only = two_s - one_s

# merge both files' line lists under their shared keys
one_two = defaultdict(list)
for e in one: one_two[e].extend(one[e])
for e in two: one_two[e].extend(two[e])

open('intersection.txt', 'w').writelines(select(intersection, one_two))
open('one_only.txt', 'w').writelines(select(one_only, one))
open('two_only.txt', 'w').writelines(select(two_only, two))
I think it is not the best way, but it works for me and looks pretty easy to understand:
def get_diff(fileObj1, fileObj2):
    """Split two tab-delimited files by their first-column keys.

    Parameters are open file objects (or any iterables of lines).

    Returns (outputData, f1Diff, f2Diff):
      outputData - lines whose first column appears in BOTH files
      f1Diff     - lines of file 1 whose key is absent from file 2
      f2Diff     - lines of file 2 whose key is absent from file 1
    """
    f1Diff = []
    f2Diff = []
    outputData = []
    # x is one row, stripped of surrounding whitespace
    f1Data = set(x.strip() for x in fileObj1)
    f2Data = set(x.strip() for x in fileObj2)
    f1Column1 = set(x.split('\t')[0] for x in f1Data)
    f2Column1 = set(x.split('\t')[0] for x in f2Data)
    # One-sided set difference (the original used '^', the symmetric
    # difference, which made both "only" sets identical — and referenced
    # undefined names f2column1 / common / l1ColDiff / l2ColDiff).
    f1Col1Only = f1Column1 - f2Column1
    f2Col1Only = f2Column1 - f1Column1
    commonPart = f1Column1 & f2Column1
    for line in f1Data.union(f2Data):
        lineKey = line.split('\t')[0]
        if lineKey in commonPart:
            outputData.append(line)
        elif lineKey in f1Col1Only:
            f1Diff.append(line)
        elif lineKey in f2Col1Only:
            f2Diff.append(line)
    return outputData, f1Diff, f2Diff
# Run the diff; get_diff consumes the two open file objects.
outputData, file1Missed, file2Missed = get_diff(open(file1, 'r'), open(file2, 'r'))
I think that this code fits your purposes
one=open(file1, 'r').readlines()
two=open(file2, 'r').readlines()
output=open('output.txt', 'w')

# first-column keys of each file, then the shared keys
first = {x.split('\t')[0] for x in one}
second = {x.split('\t')[0] for x in two}
common = first.intersection( second )

# NOTE: Python 2 code — filter() returns a list there.  On Python 3 it
# returns an iterator, so wrap each call in list() before len()/indexing.
list1 = filter( lambda x: not x.split('\t')[0] in common, one )
list2 = filter( lambda x: not x.split('\t')[0] in common, two )
res1 = filter( lambda x: x.split('\t')[0] in common, one )
res2 = filter( lambda x: x.split('\t')[0] in common, two )

# interleave the matched lines from both files
count = len( res1 )
for x in range(count):
    output.write( res1[x] )
    output.write( res2[x] )
I have a minor problem while checking for elements in a list:
I have two files with contents something like this
file 1: file2:
47 358 47
48 450 49
49 56 50
I parsed both files into two lists and used the following code to check
for i in file_1:
for j in file_2:
j = j.split()
if i == j[1]:
x=' '.join(j)
print >> write_in, x
I am now trying to get a "0" if the value of file_1 is not there in file_2 for example, value "48" is not there is file_2 so I need to get the output like (with only one space in between the two numbers) Also both the conditions should produce only one output file:
output_file:
358 47
0 48
450 49
56 50
I tried using the dictionary approach but I didn't quite get what I wanted (actually I don't know how to use dictionary in python correctly ;)). Any help will be great.
r1=open('file1').read().split()
r2=open('file2').read().split()
# file2 tokens alternate "value key"; zip the odd tokens (keys) to the
# even tokens (values) to build {key: value}
d=dict(zip(r2[1::2],r2[::2]))
# old-style "cond and a or b" conditional (safe here because d[x]+' '+x is
# never falsy); on modern Python prefer "a if cond else b"
output='\n'.join(x in d and d[x]+' '+x or '0 '+x for x in r1)
# Python 2 code: writing a str to a 'wb' handle only works there
open('output_file','wb').write(output)
Test
>>> file1='47\n48\n49\n50'
>>> file2='358 47\n450 49\n56 50'
>>>
>>> r1=file1.split()
>>> r2=file2.split()
>>>
>>> d=dict(zip(r2[1::2],r2[::2])) #
>>> d
{'47': '358', '50': '56', '49': '450'}
>>>
>>> print '\n'.join(x in d and d[x]+' '+x or '0 '+x for x in r1)
358 47
0 48
450 49
56 50
>>>
You could modify your code quite easily:
# For each key in file_1, find its "value key" line in file_2; fall back
# to "0 <key>" when no line matches.
for i in file_1:
    x = None
    for j in file_2:
        j = j.split()
        if i == j[1]:
            x = ' '.join(j)
    if x is None:
        x = ' '.join(['0', i])
    # NOTE(review): x is built but never emitted here — presumably followed
    # by the question's "print >> write_in, x"; confirm.
Depending on your inputs, the whole task might be of course simplified even further. At the moment, your code is 0(n**2) complexity.
Here's a readable solution using a dictionary:
# Default every key from file1 to 0, then overwrite with file2's values.
d = {}
for k in file1:
    d[k] = 0
# file2 lines are "value key"
for line in file2:
    v, k = line.split()
    d[k] = v
# emit "value key" sorted by key (Python 2 print statement)
for k in sorted(d):
    print d[k], k
You can try something like:
# l1: bare keys; l2: ["value", "key"] pairs, assumed aligned by position.
l1 = open('file1').read().split()
l2 = [line.split() for line in open('file2')]
for x, y in zip(l1, l2):
    # key missing from the pair -> emit the zero line first
    if x not in y:
        print 0, x
    print ' '.join(y)
but if you follow your logic, the output should be
358 47
0 48
450 49
0 49
56 50
and not
358 47
0 48
450 49
56 50
def file_process(filename1, filename2):
    """Yield (value, key) pairs merged from the two files, sorted by key.

    Keys present only in *filename1* keep the default value 0; keys found
    in *filename2* ("value key" lines) take that file's value.
    """
    # read first file with zeroes as values
    with open(filename1) as fp:
        adict= dict( (line.rstrip(), 0) for line in fp)
    # read second file as "value key"
    with open(filename2) as fp:
        adict.update(
            # partition(" ") -> (pre, " ", post); slicing [2::-2] reverses
            # that to (post, pre) = (key, value) for dict.update
            line.rstrip().partition(" ")[2::-2] # tricky, read notes
            for line in fp)
    for key in sorted(adict):
        yield adict[key], key

fp= open("output_file", "w")
fp.writelines("%s %s\n" % items for items in file_process("file1", "file2"))
fp.close()
str.partition(" ") returns a tuple of (pre-space, space, post-space). By slicing the tuple, starting at item 2 (post-space) and moving by a step of -2, we return a tuple of (post-space, pre-space), which are (key, value) for the dictionary that describes the solution.
PS Um :) I just noticed that my answer is essentially the same as Daniel Stutzbach's.