Determining the most common word from a user's input. [Python]

Determining the most common word from a user's input. [Python] - python

The way I tried to solve this problem was by entering the words of a user into a list and then using .count() to see how many times the word is in the list. The problem is whenever there is a tie, I need to print all of the words that appear the most amount of times. It works only if the words that I use aren't inside of another word that appears the same amount of times. Ex: if I use Jimmy and Jim in that order, it will only print Jimmy.
for value in usrinput:
dict.append(value)
for val in range(len(dict)):
count = dict.count(dict[val])
print(dict[val],count)
if (count > max):
max = count
common= dict[val]
elif(count == max):
if(dict[val] in common):
pass
else:
common+= "| " + dict[val]

Use a collections.Counter class. I'll give you a hint.
>>> from collections import Counter
>>> a = Counter()
>>> a['word'] += 1
>>> a['word'] += 1
>>> a['test'] += 1
>>> a.most_common()
[('word', 2), ('test', 1)]
You can extract the word and the frequencies from here.
Using it to extract frequencies from user input.
>>> userInput = raw_input("Enter Something: ")
Enter Something: abc def ghi abc abc abc ghi
>>> testDict = Counter(userInput.split(" "))
>>> testDict.most_common()
[('abc', 4), ('ghi', 2), ('def', 1)]

Why not use a collections.defaultdict?
from collections import defaultdict
d = defaultdict(int)
for value in usrinput:
d[value] += 1
To get the most common words sorted descending order by the number of occurences:
print sorted(d.items(), key=lambda x: x[1])[::-1]

Rather that concatenating to common where "Jim" in "Fred|Jimmy|etc" is true use a list to store the found max values and then print "|".join(commonlist).

This is a quick and dirty solution, not elegant at all, and uses numpy.
import numpy as np
def print_common( usrinput ):
'''prints the most common entry of usrinput, printing all entries if there is a tie '''
usrinput = np.array( usrinput )
# np.unique returns the unique elements of usrinput
unique_inputs = np.unique( usrinput )
# an array to store the counts of each input
counts = np.array( [] )
# loop over the unique inputs and store the count for each item
for u in unique_inputs:
ind = np.where( usrinput == u )
counts = np.append( counts, len( usrinput[ ind ] ) )
# find the maximum counts and indices in the original input array
max_counts = np.max( counts )
max_ind = np.where( counts == max_counts )
# if there's a tie for most common, print all of the ties
if len( max_ind[0] ) > 1:
for i in max_ind[0]:
print unique_inputs[i], counts[i]
#otherwise just print the maximum
else:
print unique_inputs[max_ind][0], counts[max_ind][0]
return 1
# two test arrays which show desired results
usrinput = ['Jim','Jim','Jim', 'Jimmy','Jimmy','Matt','Matt','Matt']
print_common( usrinput )
usrinput = ['Jim','Jim','Jim', 'Jimmy','Jimmy','Matt','Matt']
print_common( usrinput )

Related

Check the most frequent letter(s) in a word. Python

My task is:
To write a function that gets a string as an argument and returns the letter(s) with the maximum appearance in it.
Example 1:
s = 'Astana'
Output:
a
Example 2:
s = 'Kaskelen'
Output:
ke
So far, I've got this code(click to run):
a = input()
def most_used(w):
a = list(w)
indexes = []
g_count_max = a.count(a[0])
for letter in a:
count = 0
i = int()
for index in range(len(a)):
if letter == a[index] or letter == a[index].upper():
count += 1
i = index
if g_count_max <= count: //here is the problem.
g_count_max = count
if i not in indexes:
indexes.append(i)
letters = str()
for i in indexes:
letters = letters + a[i].lower()
return letters
print(most_used(a))
The problem is that it automatically adds first letter to the array because the sum of appearance of the first element is actually equal to the starter point of appearance(which is basically the first element).
Example 1:
s = 'hheee'
Output:
he
Example 2:
s = 'malaysia'
Output:
ma

I think what you're trying to can be much simplified by using the standard library's Counter object
from collections import Counter
def most_used(word):
# this has the form [(letter, count), ...] ordered from most to least common
most_common = Counter(word.lower()).most_common()
result = []
for letter, count in most_common:
if count == most_common[0][1]:
result.append(letter) # if equal largest -- add to result
else:
break # otherwise don't bother looping over the whole thing
return result # or ''.join(result) to return a string

You can use a dictionary comprehension with a list comprehension and max():
s = 'Kaskelen'
s_lower = s.lower() #convert string to lowercase
counts = {i: s_lower.count(i) for i in s_lower}
max_counts = max(counts.values()) #maximum count
most_common = ''.join(k for k,v in counts.items() if v == max_counts)
Yields:
'ke'

try this code using list comprehensions:
word = input('word=').lower()
letters = set(list(word))
max_w = max([word.count(item) for item in letters])
out = ''.join([item for item in letters if word.count(item)==max_w])
print(out)

Also you can import Counter lib:
from collections import Counter
a = "dagsdvwdsbd"
print(Counter(a).most_common(3)[0][0])
Then it returns:
d

using min with list properly

text = input("enter string:")
text.lower()
counta = text.count ("a")
counte = text.count ("e")
counti = text.count ("i")
counto = text.count ("o")
countu = text.count ("u")
if counta > 0:
print ("'a'",counta)
if counte > 0:
print ("'e'",counte)
if counti> 0:
print ("'i'",counti)
if counto > 0:
print ("'o'",counto)
if countu > 0:
print ("'u':",countu)
leastFreq = [counta,counte,counti,counto,countu]
leastFreq.sort()
while 0 in leastFreq: leastFreq.remove(0)
print (leastFreq)
task = count vowels in word, print least frequent vowels that occur. in this case, "potato" would print:
'a' = 1
'0' = 2
how do I make it so that it prints just 'a'? I could use min(leastFreq) but that would only return the value "1". how do I do it so that it prints using the format 'a' = 1 or if there is more than one vowel with the same number of occurences.

You could use min with an additional filter condition, testing whether the element is > 0:
>>> leastFreq = [4, 2, 1, 0, 3, 0]
>>> min(x for x in leastFreq if x > 0)
1
But this way, you lose the information what character this count belongs to. Instead of your five different variables, you could create a dictionary, mapping vowels to their respective counts:
>>> text = "potato"
>>> counts = {c: text.count(c) for c in "aeiou"}
>>> counts
{'a': 1, 'i': 0, 'e': 0, 'u': 0, 'o': 2}
>>> counts["o"]
2
And then again use min with a generator expression and a specific key function (to sort by the count, not by the vowel itself).
>>> min((c for c in counts if counts[c] > 0), key=counts.get)
'a'
If you are interested in the counts of all the letters, you could also use collections.Counter.

To minimally change your code:
leastFreq = [(counta, 'a'),(counte, 'e'),(counti, 'i'),(counto, 'o'),(countu, 'u')]
leastvowel = min(leastFreq)[1]
But you should use collections.Counter instead
from collections import Counter
text = input("Enter text: ")
c = Counter(character for character in text if character in 'aeiou') #get counts of vowels
least_frequent = c.most_common()[-1]
least_frequent will then be a tuple like ('a', 1)
EDIT: If you want all of the most frequent items you can use itertools.groupby
lestfreq=list(next(itertools.groupby(c.most_common()[::-1], key=lambda x:x[1]))[1])
This looks complicated, but all it's saying is take a list in sorted order and take all the tuples with the same second value and put them in a list.

A combination of Counter and operator might do the trick:
from collections import Counter
import operator
text = raw_input("enter string:")
freq = Counter(i for i in text if i in 'aeiou')
items = sorted(freq.items(),key=operator.itemgetter(1))
min = items[0][1]
for character,frequency in items:
if frequency == min:
print character,frequency

Count consecutive characters

How would I count consecutive characters in Python to see the number of times each unique digit repeats before the next unique digit?
At first, I thought I could do something like:
word = '1000'
counter = 0
print range(len(word))
for i in range(len(word) - 1):
while word[i] == word[i + 1]:
counter += 1
print counter * "0"
else:
counter = 1
print counter * "1"
So that in this manner I could see the number of times each unique digit repeats. But this, of course, falls out of range when i reaches the last value.
In the example above, I would want Python to tell me that 1 repeats 1, and that 0 repeats 3 times. The code above fails, however, because of my while statement.
How could I do this with just built-in functions?

Consecutive counts:
You can use itertools.groupby:
s = "111000222334455555"
from itertools import groupby
groups = groupby(s)
result = [(label, sum(1 for _ in group)) for label, group in groups]
After which, result looks like:
[("1": 3), ("0", 3), ("2", 3), ("3", 2), ("4", 2), ("5", 5)]
And you could format with something like:
", ".join("{}x{}".format(label, count) for label, count in result)
# "1x3, 0x3, 2x3, 3x2, 4x2, 5x5"
Total counts:
Someone in the comments is concerned that you want a total count of numbers so "11100111" -> {"1":6, "0":2}. In that case you want to use a collections.Counter:
from collections import Counter
s = "11100111"
result = Counter(s)
# {"1":6, "0":2}
Your method:
As many have pointed out, your method fails because you're looping through range(len(s)) but addressing s[i+1]. This leads to an off-by-one error when i is pointing at the last index of s, so i+1 raises an IndexError. One way to fix this would be to loop through range(len(s)-1), but it's more pythonic to generate something to iterate over.
For string that's not absolutely huge, zip(s, s[1:]) isn't a a performance issue, so you could do:
counts = []
count = 1
for a, b in zip(s, s[1:]):
if a==b:
count += 1
else:
counts.append((a, count))
count = 1
The only problem being that you'll have to special-case the last character if it's unique. That can be fixed with itertools.zip_longest
import itertools
counts = []
count = 1
for a, b in itertools.zip_longest(s, s[1:], fillvalue=None):
if a==b:
count += 1
else:
counts.append((a, count))
count = 1
If you do have a truly huge string and can't stand to hold two of them in memory at a time, you can use the itertools recipe pairwise.
def pairwise(iterable):
"""iterates pairwise without holding an extra copy of iterable in memory"""
a, b = itertools.tee(iterable)
next(b, None)
return itertools.zip_longest(a, b, fillvalue=None)
counts = []
count = 1
for a, b in pairwise(s):
...

A solution "that way", with only basic statements:
word="100011010" #word = "1"
count=1
length=""
if len(word)>1:
for i in range(1,len(word)):
if word[i-1]==word[i]:
count+=1
else :
length += word[i-1]+" repeats "+str(count)+", "
count=1
length += ("and "+word[i]+" repeats "+str(count))
else:
i=0
length += ("and "+word[i]+" repeats "+str(count))
print (length)
Output :
'1 repeats 1, 0 repeats 3, 1 repeats 2, 0 repeats 1, 1 repeats 1, and 0 repeats 1'
#'1 repeats 1'

Totals (without sub-groupings)
#!/usr/bin/python3 -B
charseq = 'abbcccdddd'
distros = { c:1 for c in charseq }
for c in range(len(charseq)-1):
if charseq[c] == charseq[c+1]:
distros[charseq[c]] += 1
print(distros)
I'll provide a brief explanation for the interesting lines.
distros = { c:1 for c in charseq }
The line above is a dictionary comprehension, and it basically iterates over the characters in charseq and creates a key/value pair for a dictionary where the key is the character and the value is the number of times it has been encountered so far.
Then comes the loop:
for c in range(len(charseq)-1):
We go from 0 to length - 1 to avoid going out of bounds with the c+1 indexing in the loop's body.
if charseq[c] == charseq[c+1]:
distros[charseq[c]] += 1
At this point, every match we encounter we know is consecutive, so we simply add 1 to the character key. For example, if we take a snapshot of one iteration, the code could look like this (using direct values instead of variables, for illustrative purposes):
# replacing vars for their values
if charseq[1] == charseq[1+1]:
distros[charseq[1]] += 1
# this is a snapshot of a single comparison here and what happens later
if 'b' == 'b':
distros['b'] += 1
You can see the program output below with the correct counts:
➜ /tmp ./counter.py
{'b': 2, 'a': 1, 'c': 3, 'd': 4}

You only need to change len(word) to len(word) - 1. That said, you could also use the fact that False's value is 0 and True's value is 1 with sum:
sum(word[i] == word[i+1] for i in range(len(word)-1))
This produces the sum of (False, True, True, False) where False is 0 and True is 1 - which is what you're after.
If you want this to be safe you need to guard empty words (index -1 access):
sum(word[i] == word[i+1] for i in range(max(0, len(word)-1)))
And this can be improved with zip:
sum(c1 == c2 for c1, c2 in zip(word[:-1], word[1:]))

If we want to count consecutive characters without looping, we can make use of pandas:
In [1]: import pandas as pd
In [2]: sample = 'abbcccddddaaaaffaaa'
In [3]: d = pd.Series(list(sample))
In [4]: [(cat[1], grp.shape[0]) for cat, grp in d.groupby([d.ne(d.shift()).cumsum(), d])]
Out[4]: [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('a', 4), ('f', 2), ('a', 3)]
The key is to find the first elements that are different from their previous values and then make proper groupings in pandas:
In [5]: sample = 'abba'
In [6]: d = pd.Series(list(sample))
In [7]: d.ne(d.shift())
Out[7]:
0 True
1 True
2 False
3 True
dtype: bool
In [8]: d.ne(d.shift()).cumsum()
Out[8]:
0 1
1 2
2 2
3 3
dtype: int32

This is my simple code for finding maximum number of consecutive 1's in binaray string in python 3:
count= 0
maxcount = 0
for i in str(bin(13)):
if i == '1':
count +=1
elif count > maxcount:
maxcount = count;
count = 0
else:
count = 0
if count > maxcount: maxcount = count
maxcount

There is no need to count or groupby. Just note the indices where a change occurs and subtract consecutive indicies.
w = "111000222334455555"
iw = [0] + [i+1 for i in range(len(w)-1) if w[i] != w[i+1]] + [len(w)]
dw = [w[i] for i in range(len(w)-1) if w[i] != w[i+1]] + [w[-1]]
cw = [ iw[j] - iw[j-1] for j in range(1, len(iw) ) ]
print(dw) # digits
['1', '0', '2', '3', '4']
print(cw) # counts
[3, 3, 3, 2, 2, 5]
w = 'XXYXYYYXYXXzzzzzYYY'
iw = [0] + [i+1 for i in range(len(w)-1) if w[i] != w[i+1]] + [len(w)]
dw = [w[i] for i in range(len(w)-1) if w[i] != w[i+1]] + [w[-1]]
cw = [ iw[j] - iw[j-1] for j in range(1, len(iw) ) ]
print(dw) # characters
print(cw) # digits
['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'z', 'Y']
[2, 1, 1, 3, 1, 1, 2, 5, 3]

A one liner that returns the amount of consecutive characters with no imports:
def f(x):s=x+" ";t=[x[1] for x in zip(s[0:],s[1:],s[2:]) if (x[1]==x[0])or(x[1]==x[2])];return {h: t.count(h) for h in set(t)}
That returns the amount of times any repeated character in a list is in a consecutive run of characters.
alternatively, this accomplishes the same thing, albeit much slower:
def A(m):t=[thing for x,thing in enumerate(m) if thing in [(m[x+1] if x+1<len(m) else None),(m[x-1] if x-1>0 else None)]];return {h: t.count(h) for h in set(t)}
In terms of performance, I ran them with
site = 'https://web.njit.edu/~cm395/theBeeMovieScript/'
s = urllib.request.urlopen(site).read(100_000)
s = str(copy.deepcopy(s))
print(timeit.timeit('A(s)',globals=locals(),number=100))
print(timeit.timeit('f(s)',globals=locals(),number=100))
which resulted in:
12.528256356999918
5.351301653001428
This method can definitely be improved, but without using any external libraries, this was the best I could come up with.

In python
your_string = "wwwwweaaaawwbbbbn"
current = ''
count = 0
for index, loop in enumerate(your_string):
current = loop
count = count + 1
if index == len(your_string)-1:
print(f"{count}{current}", end ='')
break
if your_string[index+1] != current:
print(f"{count}{current}",end ='')
count = 0
continue
This will output
5w1e4a2w4b1n

#I wrote the code using simple loops and if statement
s='feeekksssh' #len(s) =11
count=1 #f:0, e:3, j:2, s:3 h:1
l=[]
for i in range(1,len(s)): #range(1,10)
if s[i-1]==s[i]:
count = count+1
else:
l.append(count)
count=1
if i == len(s)-1: #To check the last character sequence we need loop reverse order
reverse_count=1
for i in range(-1,-(len(s)),-1): #Lopping only for last character
if s[i] == s[i-1]:
reverse_count = reverse_count+1
else:
l.append(reverse_count)
break
print(l)

Today I had an interview and was asked the same question. I was struggling with the original solution in mind:
s = 'abbcccda'
old = ''
cnt = 0
res = ''
for c in s:
cnt += 1
if old != c:
res += f'{old}{cnt}'
old = c
cnt = 0 # default 0 or 1 neither work
print(res)
# 1a1b2c3d1
Sadly this solution always got unexpected edge cases result(is there anyone to fix the code? maybe i need post another question), and finally timeout the interview.
After the interview I calmed down and soon got a stable solution I think(though I like the groupby best).
s = 'abbcccda'
olds = []
for c in s:
if olds and c in olds[-1]:
olds[-1].append(c)
else:
olds.append([c])
print(olds)
res = ''.join([f'{lst[0]}{len(lst)}' for lst in olds])
print(res)
# [['a'], ['b', 'b'], ['c', 'c', 'c'], ['d'], ['a']]
# a1b2c3d1a1

Here is my simple solution:
def count_chars(s):
size = len(s)
count = 1
op = ''
for i in range(1, size):
if s[i] == s[i-1]:
count += 1
else:
op += "{}{}".format(count, s[i-1])
count = 1
if size:
op += "{}{}".format(count, s[size-1])
return op

data_input = 'aabaaaabbaaaaax'
start = 0
end = 0
temp_dict = dict()
while start < len(data_input):
if data_input[start] == data_input[end]:
end = end + 1
if end == len(data_input):
value = data_input[start:end]
temp_dict[value] = len(value)
break
if data_input[start] != data_input[end]:
value = data_input[start:end]
temp_dict[value] = len(value)
start = end
print(temp_dict)

PROBLEM: we need to count consecutive characters and return characters with their count.
def countWithString(input_string:str)-> str:
count = 1
output = ''
for i in range(1,len(input_string)):
if input_string[i]==input_string[i-1]:
count +=1
else:
output += f"{count}{input_string[i-1]}"
count = 1
# Used to add last string count (at last else condition will not run and data will not be inserted to ouput string)
output += f"{count}{input_string[-1]}"
return output
countWithString(input)
input:'aaabbbaabbcc'
output:'3a3b2a2b2c'
Time Complexity: O(n)
Space Complexity: O(1)

temp_str = "aaaajjbbbeeeeewwjjj"
def consecutive_charcounter(input_str):
counter = 0
temp_list = []
for i in range(len(input_str)):
if i==0:
counter+=1
elif input_str[i]== input_str[i-1]:
counter+=1
if i == len(input_str)-1:
temp_list.extend([input_str[i - 1], str(counter)])
else:
temp_list.extend([input_str[i-1],str(counter)])
counter = 1
print("".join(temp_list))
consecutive_charcounter(temp_str)

How to produce multiple modes in Python?

Basically I just need to figure out how to produce modes (numbers occurring most frequently) from a list in Python, whether or not that list has multiple modes?
Something like this:
def print_mode (thelist):
counts = {}
for item in thelist:
counts [item] = counts.get (item, 0) + 1
maxcount = 0
maxitem = None
for k, v in counts.items ():
if v > maxcount:
maxitem = k
maxcount = v
if maxcount == 1:
print "All values only appear once"
if counts.values().count (maxcount) > 1:
print "List has multiple modes"
else:
print "Mode of list:", maxitem
But instead of returning strings in the "All values only appear once," or "list has multiple modes," I would want it to return the actual integers that it's referencing?

Make a Counter, then pick off the most common elements:
from collections import Counter
from itertools import groupby
l = [1,2,3,3,3,4,4,4,5,5,6,6,6]
# group most_common output by frequency
freqs = groupby(Counter(l).most_common(), lambda x:x[1])
# pick off the first group (highest frequency)
print([val for val,count in next(freqs)[1]])
# prints [3, 4, 6]

def mode(arr):
if len(arr) == 0:
return []
frequencies = {}
for num in arr:
frequencies[num] = frequencies.get(num,0) + 1
mode = max([value for value in frequencies.values()])
modes = []
for key in frequencies.keys():
if frequencies[key] == mode:
modes.append(key)
return modes
This code can tackle with any list. Make sure, elements of the list are numbers.

new in python 3.8's statistics module there is a function for that:
import statistics as s
print("mode(s): ",s.multimode([1,1,2,2]))
output: mode(s): [1, 2]

Counting longest occurrence of repeated sequence in Python

What's the easiest way to count the longest consecutive repeat of a certain character in a string? For example, the longest consecutive repeat of "b" in the following string:
my_str = "abcdefgfaabbbffbbbbbbfgbb"
would be 6, since other consecutive repeats are shorter (3 and 2, respectively.) How can I do this in Python?

How about a regex example:
import re
my_str = "abcdefgfaabbbffbbbbbbfgbb"
len(max(re.compile("(b+b)*").findall(my_str))) #changed the regex from (b+b) to (b+b)*
# max([len(i) for i in re.compile("(b+b)").findall(my_str)]) also works
Edit, Mine vs. interjays
x=timeit.Timer(stmt='import itertools;my_str = "abcdefgfaabbbffbbbbbbfgbb";max(len(list(y)) for (c,y) in itertools.groupby(my_str) if c=="b")')
x.timeit()
22.759046077728271
x=timeit.Timer(stmt='import re;my_str = "abcdefgfaabbbffbbbbbbfgbb";len(max(re.compile("(b+b)").findall(my_str)))')
x.timeit()
8.4770550727844238

Here is a one-liner:
max(len(list(y)) for (c,y) in itertools.groupby(my_str) if c=='b')
Explanation:
itertools.groupby will return groups of consecutive identical characters, along with an iterator for all items in that group. For each such iterator, len(list(y)) will give the number of items in the group. Taking the maximum of that (for the given character) will give the required result.

Here's my really boring, inefficient, straightforward counting method (interjay's is much better). Note, I wrote this in this little text field, which doesn't have an interpreter, so I haven't tested it, and I may have made a really dumb mistake that a proof-read didn't catch.
my_str = "abcdefgfaabbbffbbbbbbfgbb"
last_char = ""
current_seq_len = 0
max_seq_len = 0
for c in mystr:
if c == last_char:
current_seq_len += 1
if current_seq_len > max_seq_len:
max_seq_len = current_seq_len
else:
current_seq_len = 1
last_char = c
print(max_seq_len)

Using run-length encoding:
import numpy as NP
signal = NP.array([4,5,6,7,3,4,3,5,5,5,5,3,4,2,8,9,0,1,2,8,8,8,0,9,1,3])
px, = NP.where(NP.ediff1d(signal) != 0)
px = NP.r_[(0, px+1, [len(signal)])]
# collect the run-lengths for each unique item in the signal
rx = [ (m, n, signal[m]) for (m, n) in zip(px[:-1], px[1:]) if (n - m) > 1 ]
# get longest:
rx2 = [ (b-a, c) for (a, b, c) in rx ]
rx2.sort(reverse=True)
# returns: [(4, 5), (3, 8)], ie, '5' occurs 4 times consecutively, '8' occurs 3 times consecutively

Here is my code, Not that efficient but seems to work:
def LongCons(mystring):
dictionary = {}
CurrentCount = 0
latestchar = ''
for i in mystring:
if i == latestchar:
CurrentCount += 1
if dictionary.has_key(i):
if CurrentCount > dictionary[i]:
dictionary[i]=CurrentCount
else:
CurrentCount = 1
dictionary.update({i: CurrentCount})
latestchar = i
k = max(dictionary, key=dictionary.get)
print(k, dictionary[k])
return

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Determining the most common word from a user's input. [Python] - python

Why not use a collections.defaultdict? from collections import defaultdict d = defaultdict(int) for value in usrinput: d[value] += 1 To get the most common words sorted descending order by the number of occurences: print sorted(d.items(), key=lambda x: x[1])[::-1]

Rather that concatenating to common where "Jim" in "Fred|Jimmy|etc" is true use a list to store the found max values and then print "|".join(commonlist).

Related

Check the most frequent letter(s) in a word. Python

using min with list properly

Count consecutive characters

How to produce multiple modes in Python?

Counting longest occurrence of repeated sequence in Python

Categories

Resources