I was trying to find the occurrence of every 2 consecutive characters from a string.
The result will be in a dictionary as key = 2 characters and value = number of occurrence.
I tried the following :
seq = "AXXTAGXXXTA"
d = {seq[i:i+2]:seq.count(seq[i:i+2]) for i in range(0, len(seq)-1)}
The problem is that the count for XX should be 3, not 2.
You can use collections.Counter.
from collections import Counter
seq = "AXXTAGXXXTA"
Counter((seq[i:i+2] for i in range(len(seq)-1)))
Output:
Counter({'XX': 3, 'XT': 2, 'TA': 2, 'AX': 1, 'AG': 1, 'GX': 1})
Or without additional libraries. You can use dict.setdefault.
seq = "AXXTAGXXXTA"
d = {}
# Slide a two-character window over seq. Every start position is visited,
# so overlapping pairs (the two "XX" windows inside "XXX") are both counted.
for i in range(len(seq) - 1):
    key = seq[i:i+2]
    # setdefault stores 0 the first time a pair is seen and returns the
    # current tally otherwise; the assignment then records tally + 1.
    d[key] = d.setdefault(key, 0) + 1
print(d)
Related
So, I have a dictionary like this:
dic_parsed_sentences = {'religion': {'david': 1, 'joslin': 1, 'apolog': 5, 'jim': 1, 'meritt': 2},
'sport': {'sari': 1, 'basebal': 1, 'kolang': 5, 'footbal': 1, 'baba': 2},
'education': {'madrese': 1, 'kelas': 1, 'yahyah': 5, 'dars': 1},
'computer': {'net': 1, 'internet': 1},
'windows': {'copy': 1, 'right': 1}}
I want to loop through it based on the length of the dictionaries within that dictionary.
For example,
it has two items with length 5, one item with length 4, and two items with length 2. I want to process the same length items together (something like a group by in pandas).
So the output of the first iteration will look like this (as you see only items with length 5 are available here):
[[david, joslin, apolog, jim, meritt],
[sari, basebal, kolang, footbal, baba]]
and next iteration it will make the next same length items:
[[madrese, kelas, yahyah, dars]]
And the last iteration:
[[net, internet],
[copy, right]]
Why do we only have three iterations here? Because we only have three different lengths of items within the dictionary dic_parsed_sentences.
I have done something like this, but I don't know how to iterate through the items that have the same length:
for i in dic_parsed_sentences.groupby(dic_parsed_sentences.same_length_items): # this line is sodoku line I dont know how to code it(I mean iterate through same length items in the dicts)
for index_file in dic_parsed_sentences:
temp_sentence = dic_parsed_sentences[index_file]
keys_words = list(temp_sentence.keys())
for index_word in range(len(keys_words)):
arr_sent_wids[index_sentence, index_word] =
keys_words[index_word]
index = index + 1
index_sentence = index_sentence + 1
Update:
for length, dics in itertools.groupby(dic_parsed_sentences, len):
for index_file in dics:
temp_sentence = dics[index_file]
keys_words = list(temp_sentence.keys())
for index_word in range(len(keys_words)):
test_sent_wids[index_sentence, index_word] = lookup_word2id(keys_words[index_word])
index = index + 1
index_sentence = index_sentence + 1
You can use itertools.groupby after sorting the dictionary elements by length.
# NOTE(review): the loop-body indentation was lost in this listing; re-indent
# the nested loops before running it.
import itertools
# groupby only merges *adjacent* items with equal keys, so the nested
# dictionaries are sorted by len first (longest first: reverse = True).
items = sorted(dic_parsed_sentences.values(), key = len, reverse = True)
# One iteration per distinct length; `length` is the len() shared by the group.
for length, dics in itertools.groupby(items, len):
# dics is all the nested dictionaries with this length
for temp_sentence in dics:
# The words are the keys of the nested dictionary.
keys_words = list(temp_sentence.keys())
for index_word in range(len(keys_words)):
# Store the looked-up id of each word at [sentence index, word index].
test_sent_wids[index_sentence, index_word] = lookup_word2id(keys_words[index_word])
index = index + 1
index_sentence = index_sentence + 1
# Bucket the word lists of the nested dictionaries by how many words each has.
bylen = {}
for v in dic_parsed_sentences.values():
    # setdefault creates the bucket the first time a given length is seen.
    bylen.setdefault(len(v), []).append(list(v.keys()))

# Visit the buckets from the longest length down to the shortest.
for k in sorted(bylen, reverse=True):
    # use bylen[k]: every word list whose length is k
    print(bylen[k])
You can do it using the following method:
# Pair every top-level key with the size of its nested dictionary, then order
# the pairs from the largest nested dictionary to the smallest.
finds = [[key, len(dic_parsed_sentences[key])] for key in dic_parsed_sentences]
finds.sort(reverse=True, key=lambda x: x[1])
# Guard against an empty dictionary, where finds[0] would raise IndexError.
previous = finds[0][1] if finds else None
res = []
for elem in finds:
    current = elem[1]
    if current != previous:
        # The length changed: emit the finished group and start a new one.
        previous = current
        print(res)
        res = []
    # Iterating a dict yields its keys, so this appends the list of words.
    res.append(list(dic_parsed_sentences[elem[0]]))
# Emit the last group, which the loop never prints on its own.
print(res)
I'm new to Python 3.
What I want to do is alternate between uppercase and lowercase, but only on the dictionary keys.
My dictionary is created from a list: each key is a word (a list element) and its value is the number of times that word appears in the list.
kb = str(input("Give me a string: "));
txt = kb.lower(); #Turn string into lowercase
cadena = txt.split(); #Turn string into list
dicc = {};
for word in cadena:
if (word in dicc):
dicc[word] = dicc[word] + 1
else:
dicc[word] = 1
print(dicc)
With this code i can get for example:
input: "Hi I like PYthon i am UsING python"
{'hi': 1, 'i': 2, 'like': 1, 'python': 2, 'am': 1, 'using': 1}
but what I am actually trying to get is:
{'hi': 1, 'I': 2, 'like': 1, 'PYTHON': 2, 'am': 1, 'USING': 1}
I tried using this:
for n in dicc.keys():
if (g%2 == 0):
n.upper()
else:
n.lower()
print(dicc)
But it seems that I have no idea of what I'm doing.
Any help would be appreciated.
Using itertools and collections.OrderedDict (to guarantee order in Python < 3.7)
Setup
import itertools
from collections import OrderedDict
s = 'Hi I like PYthon i am UsING python'
switcher = itertools.cycle((str.lower, str.upper))
d = OrderedDict()
final = OrderedDict()
First, create an OrderedDict just to count the occurrences of the strings in your list (lower-casing them first, since your output shows that matches should be case-insensitive):
for word in s.lower().split():
d.setdefault(word, 0)
d[word] += 1
Next, use itertools.cycle to call str.lower or str.upper on keys and create your final dictionary:
for k, v in d.items():
final[next(switcher)(k)] = v
print(final)
OrderedDict([('hi', 1), ('I', 2), ('like', 1), ('PYTHON', 2), ('am', 1), ('USING', 1)])
Your n in dicc.keys() line is wrong. You are trying to use n as both the position in the array of keys and the key itself.
Also the semicolons are unnecessary.
This should do what you want:
from collections import OrderedDict
# Receive user input
kb = str(input("Give me a string: "))
txt = kb.lower()
cadena = txt.split()
dicc = OrderedDict()
# Construct the word counter
for word in cadena:
if word in dicc:
dicc[word] += 1
else:
dicc[word] = 1
If you just want to print the output with alternating case, you can do something like this:
# Print the word counter with alternating case
elems = []
for i, (word, wordcount) in enumerate(dicc.items()):
    # Upper-case every second key (odd positions) so that the first key stays
    # lower-case, as in the requested output ('hi', 'I', 'like', 'PYTHON', ...).
    if i % 2 == 1:
        word = word.upper()
    # {!r} quotes the key so the line looks like a dictionary literal.
    elems.append('{!r}: {}'.format(word, wordcount))
print('{' + ', '.join(elems) + '}')
Or you can make a new OrderedDict with alternating case...
# Rebuild the counter with every second key (odd positions) upper-cased.
# enumerate supplies the position i that decides the case of each key.
dicc_alt_case = OrderedDict(
    (word.upper() if i % 2 == 1 else word, wordcount)
    for i, (word, wordcount) in enumerate(dicc.items())
)
If I have a list that consists of multiple paths, e.g.:
/project/task1/sub1/info1
/project/task1/sub1/info2
/project/task1/sub2/info1
/project/task1/sub2/info2
/project/task2/sub1/info1
/project/task2/sub1/info2
/project/task2/sub2/info1
/project/task2/sub2/info2
How could I count, for each index, the number of distinct strings that occur at that position in the paths? For the list above I am expecting to get this:
idx0 = 1 (only project exists)
idx1 = 2 (task1 & task2)
idx2 = 2 (sub1 & sub2)
idx3 = 2 (info1 & info2)
Thank you in advance
Solution via list and dictionary comprehensions:
# The sample paths from the question, one per entry.
lst = [
    '/project/task1/sub1/info1',
    '/project/task1/sub1/info2',
    '/project/task1/sub2/info1',
    '/project/task1/sub2/info2',
    '/project/task2/sub1/info1',
    '/project/task2/sub1/info2',
    '/project/task2/sub2/info1',
    '/project/task2/sub2/info2',
]
# Split each path on '/' and drop the empty string that precedes the
# leading slash, leaving only the real components.
lst_expanded = [path.split('/')[1:] for path in lst]
# zip(*...) regroups the components by position; the size of each set is the
# number of distinct names seen at that position.
idx = dict(enumerate(len(set(column)) for column in zip(*lst_expanded)))
# idx = {0: 1, 1: 2, 2: 2, 3: 2}
You can try this:
s = [
    '/project/task1/sub1/info1',
    '/project/task1/sub1/info2',
    '/project/task1/sub2/info1',
    '/project/task1/sub2/info2',
    '/project/task2/sub1/info1',
    '/project/task2/sub1/info2',
    '/project/task2/sub2/info1',
    '/project/task2/sub2/info2',
]
# split('/') yields an empty string before the leading slash; drop it so that
# idx0 refers to the first real path component ('project').
new_s = [path.split('/')[1:] for path in s]
# Transpose the components by position and count the distinct names at each.
final_output = {"idx{}".format(a): len(set(i)) for a, i in enumerate(zip(*new_s))}
# Output:
# {'idx0': 1, 'idx1': 2, 'idx2': 2, 'idx3': 2}
I am trying to create a dictionary that has a nested list inside of it.
The goal would be to have it be:
key : [x,y,z]
I am pulling the information from a csv file and counting the number of times a certain key shows up in each column. However I am getting the below error
> d[key][i] = 1
KeyError: 'owner'
Where owner is the title of my column.
if __name__ == '__main__':
d = {}
with open ('sample.csv','r') as f:
reader = csv.reader(f)
for i in range(0,3):
for row in reader:
key = row[0]
if key in d:
d[key][i] +=1
else:
d[key][i] = 1
for key,value in d.iteritems():
print key,value
What do I tweak in this loop to have it create a key if it doesn't exist and then add to it if it does?
The problem is that you index into a list (d[key][i]) before any list has been stored under d[key].
So you have to replace
d[key][i] = 1
with
d[key] = [0,0,0]
d[key][i] = 1
This would first create the list with three entries (so you can use [0], [1] and [2] afterward without error) and then assigns one to the correct entry in the list.
You can use defaultdict:
from collections import defaultdict

# Number of per-column counters kept for every key.
ncols = 3
# A missing key gets a fresh list of ncols zeros; the list is built on each
# miss, so keys never share a list.
d = defaultdict(lambda: [0] * ncols)
Use a try/except block to create the list for a new key, then increment as needed
if __name__ == '__main__':
    import csv  # imported here so the snippet runs on its own

    # Maps each cell value to [count in column 0, column 1, column 2].
    d = {}
    with open('sample.csv', 'r') as f:
        reader = csv.reader(f)
        # The reader can be consumed only once, so the rows form the outer
        # loop and the three columns the inner one; with the loops the other
        # way round, only column 0 would ever be counted.
        for row in reader:
            for i in range(3):
                key = row[i]
                try:
                    d[key][i] += 1
                except KeyError:
                    # First sighting of this value: create its three
                    # counters, then record the hit in column i.
                    d[key] = [0, 0, 0]
                    d[key][i] = 1
    for key, value in d.items():
        print('%s %s' % (key, value))
Using defaultdict and Counter you can come up with a dict that allows you to easily measure how many times a key appeared in a position (in this case 1st, 2nd or 3rd, by the slice)
# Sample rows standing in for the parsed file.
# NOTE: this name shadows the stdlib csv module if that is also imported.
csv = [
    ['a', 'b', 'c', 'd'],
    ['e', 'f', 'g', 4],
    ['a', 'b', 'c', 'd'],
]
from collections import Counter, defaultdict

# d[value][position] is the number of rows in which `value` sits at `position`.
d = defaultdict(Counter)
for row in csv:
    # Only the first three columns of each row are tallied.
    for idx, value in enumerate(row[0:3]):
        d[value][idx] += 1
example usage:
print d
print d['a'][0] #number of times 'a' has been found in the 1st position
print d['b'][2] #number of times 'b' found in the 3rd position
print d['f'][1] #number of times 'f' found in 2nd position
print [d['a'][n] for n in xrange(3)] # to match the format requested in your post
defaultdict(<class 'collections.Counter'>, {'a': Counter({0: 2}), 'c': Counter({2: 2}), 'b': Counter({1: 2}), 'e': Counter({0: 1}), 'g': Counter({2: 1}), 'f': Counter({1: 1})})
2
0
1
[2, 0, 0]
Or put into a function:
def occurrences(key, counts=None, positions=3):
    """Return how often *key* was seen at each of the first *positions* columns.

    Args:
        key: The value to look up.
        counts: Mapping of value -> per-position tallies. Defaults to the
            module-level table ``d``.
        positions: Number of leading positions to report (default 3).

    Returns:
        A list with one tally per position, in position order.
    """
    if counts is None:
        counts = d
    return [counts[key][n] for n in range(positions)]
print occurrences('a') # [2, 0, 0]
I have a dictionary that's two levels deep. That is, each key in the first dictionary is a url and the value is another dictionary with each key being words and each value being the number of times the word appeared on that url. It looks something like this:
dic = {
'http://www.cs.rpi.edu/news/seminars.html': {
'hyper': 1,
'summer': 2,
'expert': 1,
'koushk': 1,
'semantic': 1,
'feedback': 1,
'sandia': 1,
'lewis': 1,
'global': 1,
'yener': 1,
'laura': 1,
'troy': 1,
'session': 1,
'greenhouse': 1,
'human': 1
...and so on...
The dictionary itself is very long and has 25 urls in it, each url having another dictionary as its value with every word found within the url and the number of times its found.
I want to find the word or words that appear in the most different urls in the dictionary. So the output should look something like this:
The following words appear x times on y pages: list of words
It seems that you should use a Counter for this:
from collections import Counter
print sum((Counter(x) for x in dic.values()),Counter()).most_common()
Or the multiline version:
c = Counter()
for d in dic.values():
c += Counter(d)
print c.most_common()
To get the words which are common in all of the subdicts:
subdicts = iter(dic.values())
s = set(next(subdicts)).intersection(*subdicts)
Now you can use that set to filter the resulting counter, removing words which don't appear in every subdict:
# Build the filtered Counter from a mapping so the existing tallies are kept;
# passing an iterable of (word, count) tuples would instead count each tuple
# once and lose the tallies.
c = Counter({k: v for k, v in c.items() if k in s})
print(c.most_common())
A Counter isn't quite what you want. From the output you show, it looks like you want to keep track of both the total number of occurrences, and the number of pages the word occurs on.
data = {
'page1': {
'word1': 5,
'word2': 10,
'word3': 2,
},
'page2': {
'word2': 2,
'word3': 1,
}
}
from collections import defaultdict
class Entry(object):
    """Running totals for one word: pages it occurs on and total occurrences."""

    def __init__(self):
        # Both tallies start at zero, so Entry can serve as a defaultdict factory.
        self.pages = 0
        self.occurrences = 0

    def __iadd__(self, occurrences):
        """Record one more page on which the word occurs *occurrences* times.

        Returns self so that ``entry += n`` rebinds the name to this object.
        """
        self.pages += 1
        self.occurrences += occurrences
        return self

    def __str__(self):
        return '{} occurrences on {} pages'.format(self.occurrences, self.pages)

    def __repr__(self):
        return '<Entry {} occurrences, {} pages>'.format(self.occurrences, self.pages)
# A missing word gets a fresh Entry, so the += below never needs a key check.
counts = defaultdict(Entry)
for page_words in data.values():
    for word, count in page_words.items():
        # Entry.__iadd__ adds one page and `count` occurrences for this word.
        counts[word] += count
for word, entry in counts.items():
    print('{} : {}'.format(word, entry))
This produces the following output:
word1 : 5 occurrences on 1 pages
word3 : 3 occurrences on 2 pages
word2 : 12 occurrences on 2 pages
That would capture the information you want, the next step would be to find the most common n words. You could do that using a heapsort (which has the handy feature of not requiring that you sort the whole list of words by number of pages then occurrences - that might be important if you've got a lot of words in total, but n of 'top n' is relatively small).
from heapq import nlargest
def by_pages_then_occurrences(item):
    """Sort key for a (word, entry) pair: page count first, then occurrences.

    Returns:
        The tuple ``(entry.pages, entry.occurrences)``; tuples compare element
        by element, so occurrences only break ties between equal page counts.
    """
    entry = item[1]
    return entry.pages, entry.occurrences
print nlargest(2, counts.iteritems(), key=by_pages_then_occurrences)