tiny_reads = [
Sequence('CGTGCAA'),
Sequence('TGCAATG'),
Sequence('ATGGCGT'),
Sequence('GGCGTGC'),
Sequence('CAATGGC'),]
dictionary = {}
def kmers(reads, k):
for line in tiny_reads:
for kmer in line.iter_kmers(k, overlap=3):
dictionary[str(kmer)] = 1
print(dictionary)
if str(kmer) not in dictionary:
dictionary[str(kmer)] = 1
else:
dictionary[str(kmer)] += 1
#print(dict)
kmers(tiny_reads, 3)
print(dictionary)
My code go through sequences above and uses iter_kmer() to break the sequence into small reads of size 3 ('CGT'). I want to create a dictionary that will have all these small reads and the count of how many they are in the sequence. The result im getting is off and im not sure why.
Expected result:
kmers(tiny_reads, 3)
{'AAT': 2,'ATG': 3, ...'TGG': 2}
My result:
{'CAA': 2, 'GTG': 2, 'GCA': 2, 'GCG': 2, 'ATG': 2, 'TGC': 2, 'CGT': 2, 'AAT': 2, 'GGC': 2, 'TGG': 2}
My result is incorrect because 'ATG' is repeated 3 times. Can you guys help please this frustrating me.
You are resetting the counter in the dictionary with every line that you are iterating through:
With the code you already have, I would use a defaultdict.
from collections import defaultdict
def kmers(reads, k):
dictionary = defaultdict(int)
for line in tiny_reads:
for kmer in line.iter_kmers(k, overlap=3):
dictionary[str(kmer)] += 1
If I were writing the code I would probably concatenate all the lines together and then use Counter.
def kmers(reads, k):
accumlator = []
for line in tiny_reads:
accumlator += line.iter_kmers(k, overlap=3):
dictionary = Counter(accumlator)
Not sure exactly how iter_kmers works, but perhaps you are looking for something like the following?
tiny_reads = [
Sequence('CGTGCAA'),
Sequence('TGCAATG'),
Sequence('ATGGCGT'),
Sequence('GGCGTGC'),
Sequence('CAATGGC')
]
kmer_d = dict()
def kmers(reads, k):
for tiny_r in tiny_reads:
for kmer in tiny_r.iter_kmers(k, overlap=3):
d = kmer_d.get(str(kmer), 0)
kmer_d[str(kmer)] = d + 1
if __name__ == "__main__":
kmers(tiny_reads, 3)
print(kmer_d)
Bear in mind that this is not the fastest implementation probably, however it simply fixes the error with the minimal change.
When reading values from a dictionary, using the .get() method you can set a default value in case no entry is found
Related
I write a small program using comprehension list python and I need to assign a value to dictionary.
It gives me syntax error.
all_freq = {}
Input = 'google.com'
[all_freq[s] += 1 if s in Input else all_freq[s] = 1 for s in Input]
It says "[" was not closed.
Could you please help me.
Use a normal for loop, not a list comprehension, as you are not trying to create a list of anything.
all_freq = {}
for s in Input:
if s in all_freq:
all_freq[s] += 1
else:
all_freq[s] = 1
which can be simplified slightly to
all_freq = {}
for s in Input:
if s not in all_freq:
all_freq[s] = 0
all_freq[s] += 1
which can be replaced entirely with
from collections import Counter
all_freq = Counter(Input)
Just inspired by earlier post, you can also do this way:
Of course, Counter is the best to do quick tallying.
from collections import defaultdict
all_freq = defaultdict(int) # initialize the dict to take 0, if key is not existing yet
for s in 'google': # each for-loop, just increment the *count* for corresponding letter
all_freq[s] += 1
print(all_freq)
defaultdict(<class 'int'>, {'g': 2, 'o': 2, 'l': 1, 'e': 1})
I'd like to write a function that will take one argument (a text file) to use its contents as keys and assign values to the keys. But I'd like the keys to go from 1 to n:
{'A': 1, 'B': 2, 'C': 3, 'D': 4... }.
I tried to write something like this:
Base code which kind of works:
filename = 'words.txt'
with open(filename, 'r') as f:
text = f.read()
ready_text = text.split()
def create_dict(lst):
""" go through the arg, stores items in it as keys in a dict"""
dictionary = dict()
for item in lst:
if item not in dictionary:
dictionary[item] = 1
else:
dictionary[item] += 1
return dictionary
print(create_dict(ready_text))
The output: {'A': 1, 'B': 1, 'C': 1, 'D': 1... }.
Attempt to make the thing work:
def create_dict(lst):
""" go through the arg, stores items in it as keys in a dict"""
dictionary = dict()
values = list(range(100)) # values
for item in lst:
if item not in dictionary:
for value in values:
dictionary[item] = values[value]
else:
dictionary[item] = values[value]
return dictionary
The output: {'A': 99, 'B': 99, 'C': 99, 'D': 99... }.
My attempt doesn't work. It gives all the keys 99 as their value.
Bonus question: How can I optimaze my code and make it look more elegant/cleaner?
Thank you in advance.
You can use dict comprehension with enumerate (note the start parameter):
words.txt:
colorless green ideas sleep furiously
Code:
with open('words.txt', 'r') as f:
words = f.read().split()
dct = {word: i for i, word in enumerate(words, start=1)}
print(dct)
# {'colorless': 1, 'green': 2, 'ideas': 3, 'sleep': 4, 'furiously': 5}
Note that "to be or not to be" will result in {'to': 5, 'be': 6, 'or': 3, 'not': 4}, perhaps what you don't want. Having only one entry out of two (same) words is not the result of the algorithm here. Rather, it is inevitable as long as you use a dict.
Your program sends a list of strings to create_dict. For each string in the list, if that string is not in the dictionary, then the dictionary value for that key is set to 1. If that string has been encountered before, then the value of that key is increased by 1. So, since every key is being set to 1, then that must mean there are no repeat keys anywhere, meaning you're sending a list of unique strings.
So, in order to have the numerical values increase with each new key, you just have to increment some number during your loop:
num = 0
for item in lst:
num += 1
dictionary[item] = num
There's an easier way to loop through both numbers and list items at the same time, via enumerate():
for num, item in enumerate(lst, start=1): # start at 1 and not 0
dictionary[item] = num
You can use this code. If an item has been in the lst more than once, the idx is considered one time in dictionary!
def create_dict(lst):
""" go through the arg, stores items in it as keys in a dict"""
dictionary = dict()
idx = 1
for item in lst:
if item not in dictionary:
dictionary[item]=idx
idx += 1
return dictionary
I'm experimenting with python and am stuck trying to understand the error messages in the context of what I am doing.
I'm playing around with comprehensions and trying to find a pattern to create a list/dictionary comprehension with more than one input set (assuming this is possible):
Note: Here the word input set means the input area of the comprehension. In setbuilder notation, from where python derived its comprehensions [Y for X in LIST], Y is the output function, X is the variable and LIST is the input set.
Assume I have the following working code:
from random import randint
mydict = {k: 0 for k in range(10)}
result = {randint(0,9): v + 1 for v in mydict.values()}
I'm not trying to do anything special about it. This is not even useful code because it won't work as expected. All elements in the dictionary will have the value 1, instead of just those pointed at by the random generator. My only objective is to have a basis from where I start my attempt at working with a tuple of input sets.
from random import randint
mydict = {k: 0 for k in range(10)}
result = {k: v + 1 for k, v in (randint(0,9), mydict.values())}
This option gives me: TypeError: 'int' object is not iterable.
By swapping the input sets and unpacking I have:
result = {k: v + 1 for *v, k in (mydict.values(), randint(0,9))}
But this option gives me: TypeError: can only concatenate list (not "int") to list
Are these errors appearing because I am trying to do something the language grammar does not understand, or am I missing something and I could in fact fix the code?
You will have to create a separate comprehension for the random numbers, as it currently stands, you have only one random number. Also, you will then need to zip the results to get a combined entity:
>>> from random import randint
>>> mydict = {k: 0 for k in range(10)}
>>> result = {k: v + 1 for k, v in zip([randint(0,9) for _ in range(10)] , mydict.values())}
>>> result
{2: 1, 3: 1, 4: 1, 5: 1, 8: 1, 9: 1}
Note that since your initial dict has the value 0 for all its keys, all the values in the result dict are 1 (0+1).
Also, since we are making the keys random, there can be possible overlaps (say 2 was generated twice), so that's why we don't see all the keys in the result dictionary.
As #wim notes in comments below, a better way to generate this result dictionary would be to use:
>>> {randint(0,9): v+1 for v in mydict.values()}
{0: 1, 1: 1, 2: 1, 3: 1, 6: 1, 7: 1}
I have a problem concerning a comparison between a char key in a dict and a char within a list.
The Task is to read a text and count all beginning letters.
I have a list with chars:
bchars = ('i','g','h','n','h')
and a dict with the alphabet and frequency default to zero:
d = dict(dict())
for i in range(97,123):
d[i-97]={chr(i):0}
no i want to check like the following:
for i in range(len(bchars)):
for j in range(len(d)):
if(bchars[i] in d[j]):
d[j][chr(i+97)] +=1
else:
d[j][chr(i+97)] +=0
so if the char in the list is a key at the certain position then += 1 else += zero
I thought by using a if/else statement I can bypass the KeyError.
Is there any more elegant solution for that?
The specific problem is that you check whether bchars[i] is in d[j], but then the key you actually use is chr(i+97).
chr(i+97) is the index of the ith character in bchars, but mapped to ASCII characters starting from 'a'. Why would you want to use this as your key?
I think you really want to do:
for i in range(len(bchars)):
for j in range(len(d)):
if(bchars[i] in d[j]):
d[j][bchars[i]] += 1
else:
d[j][bchars[i]] = 1
Note that you can't use += in the else; remember how you literally just checked whether the key was there and decided it wasn't?
More broadly, though, your code doesn't make sense - it is overcomplicated and does not use the real power of Python's dictionaries. d looks like:
{0: {'a': 0}, 1: {'b': 0}, 2: {'c': 0}, ...}
It would be much more sensible to build a dictionary mapping character directly to count:
{'a': 0, 'b': 0, 'c': 0, ...}
then you can simply do:
for char in bchars:
if char in d:
d[char] += 1
Python even comes with a class just for doing this sort of thing.
The nested dictionary doesn't seem necessary:
d = [0] * 26
for c in bchars:
d[ord(c)-97] += 1
You might also want to look at the Counter class in the collections module.
from collections import Counter
bchars = ('i','g','h','n','h')
counts = Counter(bchars)
print(counts)
print(counts['h'])
prints
Counter({'h': 2, 'i': 1, 'g': 1, 'n': 1})
2
I have a dictionary where each key has a list of variable length, eg:
d = {
'a': [1, 3, 2],
'b': [6],
'c': [0, 0]
}
Is there a clean way to get a random dictionary key, weighted by the length of its value?
random.choice(d.keys()) will weight the keys equally, but in the case above I want 'a' to be returned roughly half the time.
This would work:
random.choice([k for k in d for x in d[k]])
Do you always know the total number of values in the dictionary? If so, this might be easy to do with the following algorithm, which can be used whenever you want to make a probabilistic selection of some items from an ordered list:
Iterate over your list of keys.
Generate a uniformly distributed random value between 0 and 1 (aka "roll the dice").
Assuming that this key has N_VALS values associated with it and there are TOTAL_VALS total values in the entire dictionary, accept this key with a probability N_VALS / N_REMAINING, where N_REMAINING is the number of items left in the list.
This algorithm has the advantage of not having to generate any new lists, which is important if your dictionary is large. Your program is only paying for the loop over K keys to calculate the total, a another loop over the keys which will on average end halfway through, and whatever it costs to generate a random number between 0 and 1. Generating such a random number is a very common application in programming, so most languages have a fast implementation of such a function. In Python the random number generator a C implementation of the Mersenne Twister algorithm, which should be very fast. Additionally, the documentation claims that this implementation is thread-safe.
Here's the code. I'm sure that you can clean it up if you'd like to use more Pythonic features:
#!/usr/bin/python
import random
def select_weighted( d ):
# calculate total
total = 0
for key in d:
total = total + len(d[key])
accept_prob = float( 1.0 / total )
# pick a weighted value from d
n_seen = 0
for key in d:
current_key = key
for val in d[key]:
dice_roll = random.random()
accept_prob = float( 1.0 / ( total - n_seen ) )
n_seen = n_seen + 1
if dice_roll <= accept_prob:
return current_key
dict = {
'a': [1, 3, 2],
'b': [6],
'c': [0, 0]
}
counts = {}
for key in dict:
counts[key] = 0
for s in range(1,100000):
k = select_weighted(dict)
counts[k] = counts[k] + 1
print counts
After running this 100 times, I get select keys this number of times:
{'a': 49801, 'c': 33548, 'b': 16650}
Those are fairly close to your expected values of:
{'a': 0.5, 'c': 0.33333333333333331, 'b': 0.16666666666666666}
Edit: Miles pointed out a serious error in my original implementation, which has since been corrected. Sorry about that!
Without constructing a new, possibly big list with repeated values:
def select_weighted(d):
offset = random.randint(0, sum(d.itervalues())-1)
for k, v in d.iteritems():
if offset < v:
return k
offset -= v
Given that your dict fits in memory, the random.choice method should be reasonable. But assuming otherwise, the next technique is to use a list of increasing weights, and use bisect to find a randomly chosen weight.
>>> import random, bisect
>>> items, total = [], 0
>>> for key, value in d.items():
total += len(value)
items.append((total, key))
>>> items[bisect.bisect_left(items, (random.randint(1, total),))][1]
'a'
>>> items[bisect.bisect_left(items, (random.randint(1, total),))][1]
'c'
Make a list in which each key is repeated a number of times equal to the length of its value. In your example: ['a', 'a', 'a', 'b', 'c', 'c']. Then use random.choice().
Edit: or, less elegantly but more efficiently, try this: take the sum of the lengths of all values in the dictionary, S (you can cache and invalidate this value, or keep it up to date as you edit the dictionary, depending on the exact usage pattern you anticipate). Generate a random number from 0 to S, and do a linear search through the dictionary keys to find the range into which your random number falls.
I think that's the best you can do without changing or adding to your data representation.
Here is some code that is based on a previous answer I gave for probability distribution in python but is using the length to set the weight. It uses an iterative markov chain so that it does not need to know what the total of all of the weights are. Currently it calculates the max length but if that is too slow just change
self._maxw = 1
to
self._maxw = max lenght
and remove
for k in self._odata:
if len(self._odata[k])> self._maxw:
self._maxw=len(self._odata[k])
Here is the code.
import random
class RandomDict:
"""
The weight is the length of each object in the dict.
"""
def __init__(self,odict,n=0):
self._odata = odict
self._keys = list(odict.keys())
self._maxw = 1 # to increase speed set me to max length
self._len=len(odict)
if n==0:
self._n=self._len
else:
self._n=n
# to increase speed set above max value and comment out next 3 lines
for k in self._odata:
if len(self._odata[k])> self._maxw:
self._maxw=len(self._odata[k])
def __iter__(self):
return self.next()
def next(self):
while (self._len > 0) and (self._n>0):
self._n -= 1
for i in range(100):
k=random.choice(self._keys)
rx=random.uniform(0,self._maxw)
if rx <= len(self._odata[k]): # test to see if that is the value we want
break
# if you do not find one after 100 tries then just get a random one
yield k
def GetRdnKey(self):
for i in range(100):
k=random.choice(self._keys)
rx=random.uniform(0,self._maxw)
if rx <= len(self._odata[k]): # test to see if that is the value we want
break
# if you do not find one after 100 tries then just get a random one
return k
#test code
d = {
'a': [1, 3, 2],
'b': [6],
'c': [0, 0]
}
rd=RandomDict(d)
dc = {
'a': 0,
'b': 0,
'c': 0
}
for i in range(100000):
k=rd.GetRdnKey()
dc[k]+=1
print("Key count=",dc)
#iterate over the objects
dc = {
'a': 0,
'b': 0,
'c': 0
}
for k in RandomDict(d,100000):
dc[k]+=1
print("Key count=",dc)
Test results
Key count= {'a': 50181, 'c': 33363, 'b': 16456}
Key count= {'a': 50080, 'c': 33411, 'b': 16509}
I'd say this:
random.choice("".join([k * len(d[k]) for k in d]))
This makes it clear that each k in d gets as many chances as the length of its value. Of course, it is relying on dictionary keys of length 1 that are characters....
Much later:
table = "".join([key * len(value) for key, value in d.iteritems()])
random.choice(table)
I modified some of the other answers to come up with this. It's a bit more configurable. It takes 2 arguments, a list and a lambda function to tell it how to generate a key.
def select_weighted(lst, weight):
""" Usage: select_weighted([0,1,10], weight=lambda x: x) """
thesum = sum([weight(x) for x in lst])
if thesum == 0:
return random.choice(lst)
offset = random.randint(0, thesum - 1)
for k in lst:
v = weight(k)
if offset < v:
return k
offset -= v
Thanks to sth for the base code for this.
import numpy as np
my_dict = {
"one": 5,
"two": 1,
"three": 25,
"four": 14
}
probs = []
elements = [my_dict[x] for x in my_dict.keys()]
total = sum(elements)
probs[:] = [x / total for x in elements]
r = np.random.choice(len(my_dict), p=probs)
print(list(my_dict.values())[r])
# 25
Need to mention random.choices for Python 3.6+:
import random
raffle_dict = {"Person 1": [1,2], "Person 2": [1]}
random.choices(list(raffle_dict.keys()), [len(w[1]) for w in raffle_dict.items()], k=1)[0]
random.choices returns a list of samples, so k=1 if you only need one and we'll take the first item in the list. If your dictionary already has the weights, just get rid of the len or better yet:
raffle_dict = {"Person 1": 1, "Person 2": 10}
random.choices(list(raffle_dict.keys()), raffle_dict.values(), k=1)[0]
See also this question and this tutorial,