Find most common word in a list of sets

Find most common word in a list of sets - python

I'm currently working on a university project in NLP. I'd like to display the most common words contained in this list of sets:
[{'allow', 'feel', 'fear', 'situat', 'properti', 'despit', 'face', 'ani'}, {'unpleas', 'someth', 'fear', 'make', 'abil', 'face', 'scar', 'us', 'feel'}]
This is what I've accomplished so far:
# Return the set of lower-cased, stemmed tokens of `sent` with stop words removed.
# `sent` is either a single string or an iterable of strings (see the isinstance check).
# Relies on `word_tokenize`, `stemmer` and `stopword_final`, which are defined outside this snippet.
def word_list(sent):
if isinstance(sent, str):
tokens = set(word_tokenize(sent.lower()))
else:
tokens = set([t for s in sent for t in word_tokenize(s.lower())])
# Stem every token; the set collapses tokens that share a stem.
tokens = set([stemmer.stem(t) for t in tokens])
# discard() is used so that stop words absent from the set are ignored silently.
for w in stopword_final:
tokens.discard(w)
return tokens
# Build the token set of every definition, then try to find the most frequent words.
def get_most_relevant_words(definitions):
list_of_words = list()
most_common_word_dict = dict()
# One token set (from word_list) per definition.
for d1 in definitions:
list_of_words.append(word_list(d1))
for elem in list_of_words:
for word in elem:
print(word)
# NOTE: Counter(word) is given a single string, so it counts the characters
# of that word instead of counting words across the sets.
word_counter = Counter(word)
most_occurrences = word_counter.most_common(3)
most_common_word_dict.update({word: most_occurrences})
return most_common_word_dict
The desired output should be: {fear: 2, feel: 2}
The output that it prints is: {'feel': [('e', 2), ('f', 1), ('l', 1)]}

Use collections.Counter:
from collections import Counter
list_of_sets = [{'allow', 'feel', 'fear', 'situat', 'properti', 'despit', 'face', 'ani'}, {'unpleas', 'someth', 'fear', 'make', 'abil', 'face', 'scar', 'us', 'feel'}]
# Feed the sets to the counter one after another; a word that occurs in
# several sets is counted once per set.
c = Counter()
for my_set in list_of_sets:
    c.update(my_set)
print(c)
output:
Counter({
'fear': 2,
'face': 2,
'feel': 2,
'properti': 1,
'despit': 1,
'allow': 1,
'situat': 1,
'ani': 1,
'someth': 1,
'unpleas': 1,
'make': 1,
'abil': 1,
'us': 1,
'scar': 1
})

You can simply iterate through the 2 sets, find common terms, and update the count in a dictionary. By the way, 'face' should also be included in your result.
lst = [{'allow', 'feel', 'fear', 'situat', 'properti', 'despit', 'face', 'ani'}, {'unpleas', 'someth', 'fear', 'make', 'abil', 'face', 'scar', 'us', 'feel'}]
dic = {}
# Tally in how many sets each word appears. A single pass over every set
# works for any number of sets and avoids comparing every pair of words.
for words in lst:
    for word in words:
        dic[word] = dic.get(word, 0) + 1
# Keep only the common terms, i.e. the words found in more than one set.
dic = {word: count for word, count in dic.items() if count > 1}
print(dic)
#{'fear': 2, 'feel': 2, 'face': 2}

Related

How do I remove newline from my key in a dictionary?

I have a file I am reading from and putting it into a dictionary. How do I remove the new line from the key?
# Count how many times each line of WorldSeriesWinners.txt occurs and return
# the counts as a dictionary keyed by the line text.
def main():
# Set up empty dictionary
counter = {}
# open text file
my_file = open("WorldSeriesWinners.txt", "r")
# NOTE: readlines() keeps the trailing "\n" of each line, which is why the
# keys in the output below end with a newline.
words = my_file.readlines()
# Add each unique word to dictionary with a counter of 0
unique_words = list(set(words))
for word in unique_words:
counter[word] = 0
# For each word in the text increase its counter in the dictionary
for item in words:
counter[item] += 1
return counter
counter = main()
print(counter)
OUTPUT:
{'Cleveland Indians\n': 2, 'Pittsburgh Pirates\n': 5, 'St. Louis Cardinals\n': 10, 'New York Giants\n': 5, 'Cincinnati Reds\n': 5, 'Boston Americans\n': 1, 'Chicago White Sox\n': 3, 'Toronto Blue Jays\n': 2, 'Detroit Tigers\n': 4, 'NONE\n': 2, 'Boston Red Sox\n': 6, 'Minnesota Twins\n': 2, 'Kansas City Royals\n': 1, 'Chicago Cubs\n': 2, 'Baltimore Orioles\n': 3, 'Arizona Diamondbacks\n': 1, 'Philadelphia Phillies': 1, 'Los Angeles Dodgers\n': 5, 'Brooklyn Dodgers\n': 1, 'Florida Marlins\n': 2, 'Washington Senators\n': 1, 'New York Yankees\n': 26, 'Philadelphia Athletics\n': 5, 'Boston Braves\n': 1, 'New York Mets\n': 2, 'Atlanta Braves\n': 1, 'Anaheim Angels\n': 1, 'Philadelphia Phillies\n': 1, 'Oakland Athletics\n': 4, 'Milwaukee Braves\n': 1}

Just use the replace function when defining your key. See below:
def main(path="WorldSeriesWinners.txt"):
    """Count how often each line occurs in a text file.

    Args:
        path: File to read. Defaults to the file name used in the question.

    Returns:
        A dict mapping each line, with its newline removed, to the number of
        times that line appears in the file.
    """
    # Set up empty dictionary
    counter = {}
    # Open the text file; the with-block closes it even if reading fails
    with open(path, "r") as my_file:
        for line in my_file:
            # Use the replace function so the newline is not part of the key
            line_no_lines = line.replace('\n', '')
            # get() supplies 0 for a line seen for the first time, so no
            # separate pass is needed to initialise the counters
            counter[line_no_lines] = counter.get(line_no_lines, 0) + 1
    return counter
counter = main()
print(counter)

List that resembles a dict to dict

I have a list that already quite resembles a dictionary:
l=["'S':'NP''VP'", "'NP':'DET''N'", "'VP':'V'", "'DET':'a'", "'DET':'an'", "'N':'elephant'", "'N':'elephants'", "'V':'talk'", "'V':'smile'"]
I want to create a dictionary keeping all information:
# Desired result: every left-hand symbol maps to a list of its productions,
# and each production is itself a list of right-hand symbols.
# NOTE: the name shadows the built-in dict; it is kept as written in the question.
dict = {
    'S': [['NP', 'VP']],
    'NP': [['DET', 'N']],
    'VP': [['V']],
    'DET': [['a'], ['an']],
    'N': [['elephant'], ['elephants']],
    'V': [['talk'], ['smile']],
}
I tried using this:
# Attempt 1: strip the double quotes, split on single quotes and pair up the pieces.
d = {}
# NOTE: l is a list, not a string, so l.replace(...) raises AttributeError.
elems = filter(str.isalnum,l.replace('"',"").split("'"))
values = elems[1::2]
keys = elems[0::2]
d.update(zip(keys,values))
and this:
# Attempt 2: split on commas, then split every piece on ":" into key and value.
# NOTE: l is a list, not a string, so l.split(",") raises AttributeError.
s = l.split(",")
dictionary = {}
for i in s:
# A plain assignment keeps only the last value stored under each key.
dictionary[i.split(":")[0].strip('\'').replace("\"", "")] = i.split(":")[1].strip('"\'')
print(dictionary)

You can use collections.defaultdict with re:
import re, collections
l = ["'S':'NP''VP'", "'NP':'DET''N'", "'VP':'V'", "'DET':'a'", "'DET':'an'", "'N':'elephant'", "'N':'elephants'", "'V':'talk'", "'V':'smile'"]
# Maps each left-hand symbol to the list of its productions.
d = collections.defaultdict(list)
for rule in l:
    # The first identifier is the left-hand symbol; the remaining ones form
    # a single production. A raw string keeps "\w" from being read as a
    # string escape sequence.
    head, *body = re.findall(r'\w+', rule)
    d[head].append(body)
print(dict(d))
Output:
{'S': [['NP', 'VP']], 'NP': [['DET', 'N']], 'VP': [['V']], 'DET': [['a'], ['an']], 'N': [['elephant'], ['elephants']], 'V': [['talk'], ['smile']]}

I want to find the length of my each words in text file

I'm trying to find the length of each word in my text file. I tried the following code, but it shows how many times each word is used in the file instead.
# Count how many times each space-separated, lower-cased word occurs in the file.
text = open(r"C:\Users\israr\Desktop\counter\Bigdata.txt")
# d maps a word to the number of times it has been seen.
d = dict()
for line in text:
line = line.strip()
line = line.lower()
words = line.split(" ")
for word in words:
if word in d:
d[word] = d[word] + 1
else:
# Add the word to dictionary with count 1
d[word] = 1
# Print every word together with its count.
for key in list(d.keys()):
print(key, ":", d[key])
And the output is something like this
china : 14
emerged : 1
as : 16
one : 5
of : 44
the : 108
world's : 7
first : 2
civilizations, : 1
in : 26
fertile : 1
basin : 1
yellow : 1
river : 1
north : 1
plain. : 1
Basically, I want lists of words that have the same length. For example, china, first, world : 5, where 5 is the length of all these words, and so on, with the words of other lengths in other lists.

If you need each word's total length separately (its length multiplied by the number of times it occurs), you can find it using this formula:
len(word) * count(word) for all word in words
Equivalent in Python:
d[key] * len(key)
Change last 2 lines with below:
for key in list(d.keys()):
print(key, ":", d[key] * len(key))
----EDIT----
This is what you asked for in the comments, I guess. The code below gives you groups whose members have the same length.
for word in words:
    # d maps a word length to the list of distinct words of that length;
    # setdefault creates the list the first time a length is seen.
    same_length = d.setdefault(len(word), [])
    if word not in same_length:
        same_length.append(word)
# Print every length together with the words that have it.
for length in list(d):
    print(length, ":", d[length])
Output of this code:
3 : ['the', 'bc,', '(c.', 'who', 'was', '100', 'bc)', 'and', 'xia', 'but', 'not', 'one', 'due', '8th', '221', 'qin', 'shi', 'for', 'his', 'han', '220', '206', 'has', 'war', 'all', 'far']
8 : ['earliest', 'describe', 'writings', 'indicate', 'commonly', 'however,', 'cultural', 'history,', 'regarded', 'external', 'internal', 'culture,', 'troubled', 'imperial', 'selected', 'replaced', 'republic', 'mainland', "people's", 'peoples,', 'multiple', 'kingdoms', 'xinjiang', 'present.', '(carried']
5 : ['known', 'china', 'early', 'shang', 'texts', 'grand', 'ruled', 'river', 'which', 'along', 'these', 'arose', 'years', 'their', 'rule.', 'began', 'first', 'those', 'huang', 'title', 'after', 'until', '1912,', 'tasks', 'elite', 'young', '1949.', 'unity', 'being', 'civil', 'parts', 'other', 'world', 'waves', 'basis']
7 : ['written', 'records', 'history', 'dynasty', 'ancient', 'century', 'mention', 'writing', 'period,', 'xia.[5]', 'valley,', 'chinese', 'various', 'centers', 'yangtze', "world's", 'cradles', 'concept', 'mandate', 'justify', 'central', 'country', 'smaller', 'period.', 'another', 'warring', 'created', 'himself', 'huangdi', 'marking', 'systems', 'enabled', 'emperor', 'control', 'routine', 'handled', 'special', 'through', "china's", 'between', 'periods', 'culture', 'western', 'foreign']
2 : ['of', 'as', 'wu', 'by', 'no', 'is', 'do', 'in', 'to', 'be', 'at', 'or', 'bc', '21', 'ad']
4 : ['date', 'from', '1250', 'bc),', 'king', 'such', 'book', '11th', '(296', 'held', 'both', 'with', 'zhou', 'into', 'much', 'qin,', 'fell', 'soon', '(206', 'ad).', 'that', 'vast', 'were', 'men,', 'last', 'qing', 'then', 'most', 'whom', 'eras', 'have', 'some', 'asia', 'form']
9 : ['1600–1046', 'mentioned', 'documents', 'chapters,', 'historian', '2070–1600', 'existence', 'neolithic', 'millennia', 'thousands', '(1046–256', 'pressures', 'following', 'developed', 'conquered', '"emperor"', 'beginning', 'dynasties', 'directly.', 'centuries', 'carefully', 'difficult', 'political', 'dominated', 'stretched', 'contact),']
6 : ['during', "ding's", '(early', 'bamboo', 'annals', 'before', 'shang,', 'yellow', 'cradle', 'river.', 'shang.', 'oldest', 'heaven', 'weaken', 'states', 'spring', 'autumn', 'became', 'warred', 'times.', 'china.', 'death,', 'peace,', 'failed', 'recent', 'steppe', 'china;', 'tibet,', 'modern']
12 : ['reign,[1][2]', 'twenty-first', 'longer-lived', 'bureaucratic', 'calligraphy,', '(1644–1912),', '(1927–1949).', 'occasionally', 'immigration,']
11 : ['same.[3][4]', 'independent', 'traditional', 'territories', 'well-versed', 'literature,', 'philosophy,', 'assimilated', 'population.', 'warlordism,']
10 : ['historical', 'originated', 'continuous', 'supplanted', 'introduced', 'government', 'eventually', 'splintered', 'literature', 'philosophy', 'oppressive', 'successive', 'alternated', 'influences', 'expansion,']
1 : ['a', '–']
13 : ['civilization.', 'civilizations', 'examinations.', 'statehood—the', 'assimilation,']
17 : ['civilizations,[6]']
16 : ['civilization.[7]']
0 : ['']
14 : ['administrative']
18 : ['scholar-officials.']
Below is full version of code.
def group_words_by_length(lines):
    """Group the distinct lower-cased words of ``lines`` by their length.

    Args:
        lines: An iterable of strings, for example an open text file.

    Returns:
        A dict mapping a word length to the list of distinct words of that
        length, in order of first appearance.
    """
    groups = {}
    seen = set()  # constant-time duplicate check instead of scanning the lists
    for line in lines:
        # split() without an argument splits on any run of whitespace, so
        # repeated spaces and blank lines do not produce empty "words"
        # (split(" ") would, giving a length-0 group).
        for word in line.lower().split():
            if word not in seen:
                seen.add(word)
                groups.setdefault(len(word), []).append(word)
    return groups


if __name__ == "__main__":
    # The with-block closes the file once all lines have been read.
    with open("bigdata.txt") as text:
        d = group_words_by_length(text)
    for key in d:
        print(key, ":", d[key])

When you look at the code for dealing with each word, you will see your problem..
for word in words:
if word in d:
d[word] = d[word] + 1
else:
# Add the word to dictionary with count 1
d[word] = 1
Here you are checking if a word is in the dictionary. If it is, you add 1 to its value each time you find it. If it is not, you initialize it at 1. This is the core concept for counting repetitions.
If you want to count the length of the word, you could simply do.
for word in words:
if word not in d:
d[word] = len(word)
And to output your dict, you can do
for k, v in d.items():
print(k, ":", v)

You can create a list of word lengths and then process them through python's built-in Counter:
from collections import Counter
# Read the whole file and split it into whitespace-separated words.
with open("mytext.txt", "r") as f:
    words = f.read().split()
# One length per word, in file order.
words_lengths = list(map(len, words))
# Tally how many words there are of each length.
counter = Counter(words_lengths)
The output would be something like:
In[1]:counter
Out[1]:Counter({7: 146, 9: 73, 5: 73, 4: 146, 1: 73})
Where keys are words lengths, and values are the number of times they occurred.
You can work with that as with usual dictionary.

How to search for the dictionary keys in a list

Say I have a dictionary of:
lst = {'adore': 10, 'hate': 10, 'hello': 10, 'pigeon': 1, 'would': 5, 'elections': 5}
And I have a list of:
mylist = [['a new', 'party'], ['to', 'lol'], ['compete'], ['in', 'adore', 'the 2013'], ['federal', 'elections'], ['The Free', 'Voters'], ['leadership', 'declined to'], ['join forces', 'according to', 'a leaked'], ['email from', 'Bernd Lucke'], ['Advocating', 'adore'] ]
I want to be able to search the list for the keys in the dictionary. If a word in the list is a key, then to take the value of that key and add it to a counter. In the end, to have a total sum of all the values.
Is there a way to do this?

Like this?
lst = {'adore': 10, 'hate': 10, 'hello': 10, 'pigeon': 1, 'would': 5, 'elections': 5}
mylist = [['a new', 'party'], ['to', 'lol'], ['compete'], ['in', 'adore', 'the 2013'], ['federal', 'elections'], ['The Free', 'Voters'], ['leadership', 'declined to'], ['join forces', 'according to', 'a leaked'], ['email from', 'Bernd Lucke'], ['Advocating', 'adore']]
# Collect the value of every word that is a key of lst. The list is built
# once and reused for both the printout and the sum.
values = [lst[word] for words in mylist for word in words if word in lst]
print(values)
print(sum(values))
Output:
[10, 5, 10]
25
If you don't like them in one line:
total = []
for words in mylist:
    for word in words:
        # Look up the word itself: the enclosing sub-list is unhashable and
        # cannot be used as a dictionary key.
        value = lst.get(word)
        if value is not None:
            total.append(value)
print(sum(total))

Probably you can do this in a more pythonic way.
lst = {'adore': 10, 'hate': 10, 'hello': 10, 'pigeon': 1, 'would': 5, 'elections': 5}
# One running total per key of lst, all starting at zero.
counter = dict.fromkeys(lst, 0)
mylist = [['a new', 'party'], ['to', 'lol'], ['compete'], ['in', 'adore', 'the 2013'], ['federal', 'elections'], ['The Free', 'Voters'], ['leadership', 'declined to'], ['join forces', 'according to', 'a leaked'], ['email from', 'Bernd Lucke'], ['Advocating', 'adore']]


def func():
    """Add lst[word] to counter[word] for every word of mylist that is a key of lst.

    Every occurrence of a word is counted. Updates the module-level
    ``counter`` in place and returns None.
    """
    for words in mylist:
        for word in words:
            if word in lst:
                counter[word] += lst[word]


func()
print(sum(counter.values()))

Adding list in form of tuples to a dictionary

Assuming there is a list with sublists like this
[[2013, 'Patric', 'M', 1356], [2013, 'Helena', 'F', 202], [2013, 'Patric', 'F', 6],[1993, 'Patric', 'F', 7]......]
which is an output of def list_of_names() where 2013 is year, M is gender and 1356 is number of M births etc.
And I want to create a dictionary which outputs the name as a key and values as tuples (year, number_of_males,number_of_females) . So for example:
{ .. ’Patric’:[... , (1993, 0, 7), (2013, 1356, 6), ... ], ... }.
Technically 1993 is year, 0 is number of males and 7 is number of females and the tuples should be arranged in order of the years.
and I'm stuck on how to add this info into a dictionary
def name_Index(names):
d = dict()
L = readNames() #the list with from previous def which outputs different names and info as above
newlist = []
for sublist in L:

from collections import defaultdict
def list_of_names():
    """Return sample records as [year, name, sex, count] lists."""
    return [[2013, 'Patric', 'M', 1356],
            [2013, 'Helena', 'F', 202],
            [2013, 'Patric', 'F', 6],
            [1993, 'Patric', 'F', 7]]


def name_Index():
    """Map each name to a list of (year, males, females) tuples ordered by year.

    Records whose sex is 'M' are added to the male total; every other record
    is added to the female total.
    """
    # name -> year -> [males, females]
    tmp = defaultdict(lambda: defaultdict(lambda: [0, 0]))
    for year, name, sex, N in list_of_names():
        i = 0 if sex == 'M' else 1
        tmp[name][year][i] += N
    d = {}
    for name, entries in tmp.items():
        # Years are unique keys, so sorting the items orders them by year.
        d[name] = [(year, M, F) for (year, (M, F)) in sorted(entries.items())]
    return d


print(name_Index())

This was my attempt at the problem:
from collections import defaultdict, namedtuple
from itertools import groupby
data = [[2013, 'Patric', 'M', 1356],
        [2013, 'Helena', 'F', 202],
        [2013, 'Patric', 'F', 6],
        [1993, 'Patric', 'F', 7]]
datum = namedtuple('datum', 'year gender number')

# Bucket the records by name. A plain loop is enough here: groupby only
# merges adjacent items, and the input is not sorted by name.
names = defaultdict(list)
for year, name, gender, number in data:
    names[name].append(datum(year, gender, number))

# name -> list of (year, males, females) tuples, ordered by year
final_dict = defaultdict(list)
for n, records in names.items():
    # Sort first (datum tuples compare by year first) so that each year forms
    # one contiguous run for groupby and the result is ordered by year.
    for k, g in groupby(sorted(records), lambda x: x.year):
        males = 0
        females = 0
        for l in g:
            if l.gender == 'M':
                males += l.number
            elif l.gender == 'F':
                females += l.number
        final_dict[n].append((k, males, females))
print(final_dict)

The most convenient way is to use collections.defaultdict. It returns a dictionary-like object that supplies a default value when it doesn't find a key. In your case, you use a nested defaultdict to accumulate the counts per name and year, and then collect the results into a list of tuples for each name:
from collections import defaultdict
names = [[2013, 'Patric', 'M', 1356],
         [2013, 'Helena', 'F', 202],
         [2013, 'Patric', 'F', 6],
         [1993, 'Patric', 'F', 7]]


def name_Index(data):
    """Map each name to a list of (year, males, females) tuples ordered by year.

    Args:
        data: An iterable of [year, name, sex, births] records, where sex is
            'M' or 'F'.

    Returns:
        A dict keyed by name.
    """
    # name => year => sex => births
    d = defaultdict(lambda: defaultdict(lambda: {'F': 0, 'M': 0}))
    for year, name, sex, births in data:
        d[name][year][sex] += births
    # if you are fine with defaultdict result: return d
    # else collect results into tuples, ordered by year:
    result = {}
    for name, by_year in d.items():
        result[name] = [(year, by_year[year]['M'], by_year[year]['F'])
                        for year in sorted(by_year)]
    return result


print(name_Index(names))
# {'Patric': [(1993, 0, 7), (2013, 1356, 6)], 'Helena': [(2013, 0, 202)]}

I didn't understand why you are taking names as an argument of the name_Index function and then calling readNames; I assume your work requires it. Hence, I just added a dummy readNames function and passed None as the argument to name_Index. Using classes is a good technique for handling complicated data structures. By the way, nicely written question, I must admit.
def readNames():
    """Return sample records as [year, name, gender, birth_count] lists."""
    return [[2013, 'Patric', 'M', 1356], [2013, 'Helena', 'F', 202], [2013, 'Patric', 'F', 6], [1993, 'Patric', 'F', 7]]


class YearOb(object):
    """Male and female birth totals for a single year."""

    def __init__(self):
        self.male = 0
        self.female = 0

    def add_birth_data(self, gender, birth_count):
        """Add birth_count to the male total if gender is "M", otherwise to the female total."""
        if gender == "M":
            self.male += birth_count
        else:
            self.female += birth_count


class NameOb(object):
    """Birth totals of a single name, kept per year."""

    def __init__(self):
        self.yearobs = dict()  # year -> YearOb

    def add_record(self, year, gender, birth_count):
        """Accumulate one record, creating the YearOb of a year on first use."""
        if year not in self.yearobs:
            self.yearobs[year] = YearOb()
        self.yearobs[year].add_birth_data(gender, birth_count)

    def get_as_list(self):
        """Return the totals as (year, males, females) tuples ordered by year."""
        return [(year, self.yearobs[year].male, self.yearobs[year].female)
                for year in sorted(self.yearobs)]


def name_Index(names):
    """Map each name to a year-ordered list of (year, males, females) tuples.

    ``names`` is kept for compatibility with the signature in the question
    but is not used; the records are taken from readNames().
    """
    by_name = dict()
    for year, name, gender, birth_count in readNames():
        if name not in by_name:
            by_name[name] = NameOb()
        by_name[name].add_record(year, gender, birth_count)
    # Build a fresh dict instead of overwriting values while iterating.
    return {name: nameob.get_as_list() for name, nameob in by_name.items()}


print(name_Index(None))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Find most common word in a list of sets - python

Related

How do I remove newline from my key in a dictionary?

List that resembles a dict to dict

I want to find the length of my each words in text file

How to search for the dictionary keys in a list

Adding list in form of tuples to a dictionary

Categories

Resources