create all possible combinations with multiple variants from list

create all possible combinations with multiple variants from list - python

Ok so the problem is as follows:
let's say I have a list like this [12R,102A,102L,250L] what I would want is a list of all possible combinations, however for only one combination/number. so for the example above, the output I would like is:
[12R,102A,250L]
[12R,102L,250L]
my actual problem is a lot more complex with many more sites. Thanks for your help
edit: after reading some comments I guess this is slightly unclear. I have 3 unique numbers here, [12, 102, and 250] and for some numbers, I have different variations, for example [102A, 102L]. what I need is a way to combine the different positions[12,102,250] and all possible variations within. just like the lists, I presented above. they are the only valid solutions. [12R] is not. neither is [12R,102A,102L,250L]. so far I have done this with nested loops, but I have a LOT of variation within these numbers, so I can't really do that anymore
ill edit this again: ok so it seems as though there is still some confusion so I might extend the point I made before. what I am dealing with there is DNA. 12R means the 12th position in the sequence was changed to an R. so the solution [12R,102A,250L] means that the amino acid on position 12 is R, 102 is A 250 is L.
this is why a solution like [102L, 102R, 250L] is not usable, because the same position can not be occupied by 2 different amino acids.
thank you

You can use a recursive generator function:
from itertools import groupby as gb
import re
def combos(d, c = []):
if not d:
yield c
else:
for a, b in d[0]:
yield from combos(d[1:], c + [a+b])
d = ['12R', '102A', '102L', '250L']
vals = [re.findall('^\d+|\w+$', i) for i in d]
new_d = [list(b) for _, b in gb(sorted(vals, key=lambda x:x[0]), key=lambda x:x[0])]
print(list(combos(new_d)))
Output:
[['102A', '12R', '250L'], ['102L', '12R', '250L']]

So it works with ["10A","100B","12C","100R"] (case 1) and ['12R','102A','102L','250L'] (case 2)
import itertools as it
liste = ['12R','102A','102L','250L']
comb = []
for e in it.combinations(range(4), 3):
e1 = liste[e[0]][:-1]
e2 = liste[e[1]][:-1]
e3 = liste[e[2]][:-1]
if e1 != e2 and e2 != e3 and e3 != e1:
comb.append([e1+liste[e[0]][-1], e2+liste[e[1]][-1], e3+liste[e[2]][-1]])
print(list(comb))
# case 1 : [['10A', '100B', '12C'], ['10A', '12C', '100R']]
# case 2 : [['12R', '102A', '250L'], ['12R', '102L', '250L']]

import re
def get_grouped_options(input):
options = {}
for option in input:
m = re.match('([\d]+)([A-Z])$', option)
if m:
position = int(m.group(1))
acid = m.group(2)
else:
continue
if position not in options:
options[position] = []
options[position].append(acid)
return options
def yield_all_combos(options):
n = len(options)
positions = list(options.keys())
indices = [0] * n
while True:
yield ["{}{}".format(position, options[position][indices[i]])
for i, position in enumerate(positions)]
j = 0
indices[j] += 1
while indices[j] == len(options[positions[j]]):
# carry
indices[j] = 0
j += 1
if j == n:
# overflow
return
indices[j] += 1
input = ['12R', '102A', '102L', '250L']
options = get_grouped_options(input)
for combo in yield_all_combos(options):
print("[{}]".format(",".join(combo)))
Gives:
[12R,102A,250L]
[12R,102L,250L]

Try this:
from itertools import groupby
import re
def __genComb(arr, res=[]):
for i in range(len(res), len(arr)):
el=arr[i]
if(len(el[1])==1):
res+=el[1]
else:
for el_2 in el[1]:
yield from __genComb(arr, res+[el_2])
break
if(len(res)==len(arr)): yield res
def genComb(arr):
res=[(k, list(v)) for k,v in groupby(sorted(arr), key=lambda x: re.match(r"(\d*)", x).group(1))]
yield from __genComb(res)
Sample output (using the input you provided):
test=["12R","102A","102L","250L"]
for el in genComb(test):
print(el)
# returns:
['102A', '12R', '250L']
['102L', '12R', '250L']

I believe this is what you're looking for!
This works by
generating a collection of all the postfixes each prefix can have
finding the total count of positions (multiply the length of each sublist together)
rotating through each postfix by basing the read index off of both its member postfix position in the collection and the absolute result index (known location in final results)
import collections
import functools
import operator
import re
# initial input
starting_values = ["12R","102A","102L","250L"]
d = collections.defaultdict(list) # use a set if duplicates are possible
for value in starting_values:
numeric, postfix = re.match(r"(\d+)(.*)", value).groups()
d[numeric].append(postfix) # .* matches ""; consider (postfix or "_") to give value a size
# d is now a dictionary of lists where each key is the prefix
# and each value is a list of possible postfixes
# each set of postfixes multiplies the total combinations by its length
total_combinations = functools.reduce(
operator.mul,
(len(sublist) for sublist in d.values())
)
results = collections.defaultdict(list)
for results_pos in range(total_combinations):
for index, (prefix, postfix_set) in enumerate(d.items()):
results[results_pos].append(
"{}{}".format( # recombine the values
prefix, # numeric prefix
postfix_set[(results_pos + index) % len(postfix_set)]
))
# results is now a dictionary mapping { result index: unique list }
displaying
# set width of column by longest prefix string
# need a collection for intermediate cols, but beyond scope of Q
col_width = max(len(str(k)) for k in results)
for k, v in results.items():
print("{:<{w}}: {}".format(k, v, w=col_width))
0: ['12R', '102L', '250L']
1: ['12R', '102A', '250L']
with a more advanced input
["12R","102A","102L","250L","1234","1234A","1234C"]
0: ['12R', '102L', '250L', '1234']
1: ['12R', '102A', '250L', '1234A']
2: ['12R', '102L', '250L', '1234C']
3: ['12R', '102A', '250L', '1234']
4: ['12R', '102L', '250L', '1234A']
5: ['12R', '102A', '250L', '1234C']
You can confirm the values are indeed unique with a set
final = set(",".join(x) for x in results.values())
for f in final:
print(f)
12R,102L,250L,1234
12R,102A,250L,1234A
12R,102L,250L,1234C
12R,102A,250L,1234
12R,102L,250L,1234A
12R,102A,250L,1234C
notes
in cPython, regexes are cached after their first compile
list member multiplier from "How can I multiply all items in a list together with Python?"

Related

Count occurance of an item in a list and store it in another list if it is exists more than once

Let's say I have the following list.
my_list = ['4/10', '8/-', '9/2', '4/11', '-/13', '19/10', '25/-', '26/-', '4/12', '10/16']
I would like to check the occurrence of each item and if it exists more than once I would like to store it in a new list.
For example from the above list, 4 is existed 3 times before / as 4/10, 4/11, 4/12. So I would like to create a new list called new list and store them as new_list = '4/10', '4/11', '4/12, 19/10'.
An additional example I want to consider also /. if 10 exist twice as 4/10 and 10/16 I don want to consider it as a duplicate since the position after and before / is different.
If there any way to count the existence of an item in a list and store them in a new list?
I tried the following but got an error.
new_list = []
d = Counter(my_list)
for v in d.items():
if v > 1:
new_list.append(v)
The error TypeError: '>' not supported between instances of 'tuple' and 'int'
Can anyone help with this?

I think below code is quite self-explanatory. It will work alright. If you have any issues or need clarification, feel free to ask.
NOTE : This code is not very efficient and can be improved a lot. But will work allright if you are not running this on extremely large data.
my_list = ['4/10', '8/-', '9/2', '4/11', '-/13', '19/10', '25/-', '26/-', '4/12', '10/16']
frequency = {}; new_list = [];
for string in my_list:
x = '';
for j in string:
if j == '/':
break;
x += j;
if x.isdigit():
frequency[x] = frequency.get(x, 0) + 1;
for string in my_list:
x = '';
for j in string:
if j == '/':
break;
x += j;
if x.isdigit():
if frequency[x] > 1:
new_list.append(string);
print(new_list);

.items() is not what you think - it returns a list of key-value pairs (tuples), not sole values. You want to:
d = Counter(node)
new_list = [ k for (k,v) in d.items() if v > 1 ]
Besides, I am not sure how node is related to my_list but I think there is some additional processing you didn't show.
Update: after reading your comment clarifying the problem, I think it requires two separate counters:
first_parts = Counter([x.split('/')[0] for x in my_list])
second_parts = Counter([x.split('/')[1] for x in my_list])
first_duplicates = { k for (k,v) in first_parts.items() if v > 1 and k != '-' }
second_duplicates = { k for (k,v) in second_parts.items() if v > 1 and k != '-' }
new_list = [ e for e in my_list if
e.split('/')[0] in first_duplicates or e.split('/')[1] in second_duplicates ]

this might help : create a dict to contain the pairings and then extract the pairings that have a length more than one. defaultdict helps with aggregating data, based on the common keys.
from collections import defaultdict
d = defaultdict(list)
e = defaultdict(list)
m = [ent for ent in my_list if '-' not in ent]
for ent in m:
front, back = ent.split('/')
d[front].append(ent)
e[back].append(ent)
new_list = []
for k,v in d.items():
if len(v) > 1:
new_list.extend(v)
for k,v in e.items():
if len(v) > 1:
new_list.extend(v)
sortr = lambda x: [int(ent) for ent in x.split("/")]
from operator import itemgetter
sorted(set(new_list), key = sortr)
print(new_list)
['4/10', '4/11', '4/12', '19/10']

Counting strings in lists and then filtering & matching, in python

I have a list of words, and with python3 I count the difference in letters between each combination of words (using a clever diff_summing algorithm from this site):
import itertools
def diff_letters(a,b):
return sum ( a[i] != b[i] for i in range(len(a)) )
w = ['AAHS','AALS','DAHS','XYZA']
for x,y in itertools.combinations(w,2):
if diff_letters(x,y) == 1:
print(x,y)
This prints:
AAHS AALS
AAHS DAHS
My question: How can I count and record that strings 'DAHS' and 'AALS' have exactly one partner, and 'AAHS' has two partners? I'll be filtering for directional combinations where each target_string has exactly one near_matching_word, so my final data would (as a JSON) look like this:
[
{
"target_word": "DAHS",
"near_matching_word": "AAHS"
},
{
"target_word": "AALS",
"near_matching_word": "AAHS"
}
]
(noticing that AAHS doesn't appear as a target_word)
I have one version using functools.reduce
import itertools
import functools
import operator
def diff_letters(a,b):
return sum ( a[i] != b[i] for i in range(len(a)) )
w = ['AAHS','AALS','DAHS','XYZA']
pairs = []
for x,y in itertools.combinations(w,2):
if diff_letters(x,y) == 1:
#print(x,y)
pairs.append((x,y))
full_list = functools.reduce(operator.add, pairs)
for x in full_list:
if full_list.count(x) == 1:
print (x)
which prints
AALS
DAHS
but then I would have to go back to my big list pairs to find the near_matching_word. Of course, in my final version, list pairs will be much larger, and the target_word could be either the 1st or 2nd item in the tuple (x,y).

The other answers keep all pairs even when more than one is found. Since they are not needed, that seems to waste memory. This answer only keeps at most one pair for each string.
import collections
import itertools
def diff_letters(a,b):
return sum ( a[i] != b[i] for i in range(len(a)) )
w = ['AAHS','AALS','DAHS','XYZA']
# Marker for pairs that have not been found yet.
NOT_FOUND = object()
# Collection of found pairs x => y. Each item is in one of three states:
# - y is NOT_FOUND if x has not been seen yet
# - y is a string if it is the only accepted pair for x
# - y is None if there is more than one accepted pair for x
pairs = collections.defaultdict(lambda: NOT_FOUND)
for x,y in itertools.combinations(w,2):
if diff_letters(x,y) == 1:
if pairs[x] is NOT_FOUND:
pairs[x] = y
else:
pairs[x] = None
if pairs[y] is NOT_FOUND:
pairs[y] = x
else:
pairs[y] = None
# Remove None's and change into normal dict.
pairs = {x: y for x, y in pairs.items() if y}
for x, y in pairs.items():
print("Target = {}, Only near matching word = {}".format(x, y))
Output:
Target = AALS, Only near matching word = AAHS
Target = DAHS, Only near matching word = AAHS

You could use a dictionary instead of a list of pairs:
pairs = {}
for x, y in itertools.combinations(w, 2):
if diff_letters(x, y) == 1:
pairs.setdefault(x, []).append(y)
pairs.setdefault(y, []).append(x)
result = [{ "target_word": key, "near_matching_word": head, } for key, (head, *tail) in pairs.items() if not tail]
print(result)
Output
[{'target_word': 'AALS', 'near_matching_word': 'AAHS'}, {'target_word': 'DAHS', 'near_matching_word': 'AAHS'}]
In the pairs dictionary the keys are the target_words and the values are the near_matching_words. Then use a list comprehension to filter out those that have more that 1 near_matching_word.

import itertools
import functools
import operator
def diff_letters(a, b):
return sum(a[i] != b[i] for i in range(len(a)))
w = ['AAHS', 'AALS', 'DAHS', 'XYZA']
pairs = []
for x, y in itertools.combinations(w, 2):
if diff_letters(x, y) == 1:
pairs.append((x, y))
full_list = functools.reduce(operator.add, pairs)
result = []
for x in set(full_list):
if full_list.count(x) == 1:
pair = next((i for i in pairs if x in i))
match = [i for i in pair if i != x][0]
result.append({
"target_word": x,
"near_matching_word": match
})
print(result)
Outputs:
[{'target_word': 'DAHS', 'near_matching_word': 'AAHS'}, {'target_word': 'AALS', 'near_matching_word': 'AAHS'}]

Remove adjacent duplicates given a condition

I'm trying to write a function that will take a string, and given an integer, will remove all the adjacent duplicates larger than the integer and output the remaining string. I have this function right now that removes all the duplicates in a string, and I'm not sure how to put the integer constraint into it:
def remove_duplicates(string):
s = set()
list = []
for i in string:
if i not in s:
s.add(i)
list.append(i)
return ''.join(list)
string = "abbbccaaadddd"
print(remove_duplicates(string))
This outputs
abc
What I would want is a function like
def remove_duplicates(string, int):
.....
Where if for the same string I input int=2, I want to remove my n characters without removing all the characters. Output should be
abbccaadd
I'm also concerned about run time and complexity for very large strings, so if my initial approach is bad, please suggest a different approach. Any help is appreciated!

Not sure I understand your question correctly. I think that, given m repetitions of a character, you want to remove up to k*n duplicates such that k*n < m.
You could try this, using groupby:
>>> from itertools import groupby
>>> string = "abbbccaaadddd"
>>> n = 2
>>> ''.join(c for k, g in groupby(string) for c in k * (len(list(g)) % n or n))
'abccadd'
Here, k * (len(list(g)) % n or n) means len(g) % n repetitions, or n if that number is 0.
Oh, you changed it... now my original answer with my "interpretation" of your output actually works. You can use groupby together with islice to get at most n characters from each group of duplicates.
>>> from itertools import groupby, islice
>>> string = "abbbccaaadddd"
>>> n = 2
>>> ''.join(c for _, g in groupby(string) for c in islice(g, n))
'abbccaadd'

Create group of letters, but compute the length of the groups, maxed out by your parameter.
Then rebuild the groups and join:
import itertools
def remove_duplicates(string,maxnb):
groups = ((k,min(len(list(v)),maxnb)) for k,v in itertools.groupby(string))
return "".join(itertools.chain.from_iterable(v*k for k,v in groups))
string = "abbbccaaadddd"
print(remove_duplicates(string,2))
this prints:
abbccaadd
can be a one-liner as well (cover your eyes!)
return "".join(itertools.chain.from_iterable(v*k for k,v in ((k,min(len(list(v)),maxnb)) for k,v in itertools.groupby(string))))
not sure about the min(len(list(v)),maxnb) repeat value which can be adapted to suit your needs with a modulo (like len(list(v)) % maxnb), etc...

You should avoid using int as a variable name as it is a python keyword.
Here is a vanilla function that does the job:
def deduplicate(string: str, treshold: int) -> str:
res = ""
last = ""
count = 0
for c in string:
if c != last:
count = 0
res += c
last = c
else:
if count < treshold:
res += c
count += 1
return res

Find values in list which differ from reference list by up to N characters

I have a list like the following:
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
And a reference list like this:
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA']
I want to extract the values from Test if they are N or less characters different from any one of the items in Ref.
For example, if N = 1, only the first two elements of Test should be output. If N = 2, all three elements fit this criteria and should be returned.
It should be noted that I am looking for same charcacter length values (ASDFGY -> ASDFG matching doesn't work for N = 1), so I want something more efficient than levensthein distance.
I have over 1000 values in ref and a couple hundred million in Test so efficiency is key.

Using a generation expression with sum:
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA']
from collections import Counter
def comparer(x, y, n):
return (len(x) == len(y)) and (sum(i != j for i, j in zip(x, y)) <= n)
res = [a for a, b in zip(Ref, Test) if comparer(a, b, 1)]
print(res)
['ASDFGY', 'QWERTYI']

Using difflib
Demo:
import difflib
N = 1
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA']
result = []
for i,v in zip(Test, Ref):
c = 0
for j,s in enumerate(difflib.ndiff(i, v)):
if s.startswith("-"):
c += 1
if c <= N:
result.append( i )
print(result)
Output:
['ASDFGH', 'QWERTYU']

The newer regex module offers a "fuzzy" match possibility:
import regex as re
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA', 'ASDFGI', 'ASDFGX']
for item in Test:
rx = re.compile('(' + item + '){s<=3}')
for r in Ref:
if rx.search(r):
print(rf'{item} is similar to {r}')
This yields
ASDFGH is similar to ASDFGY
ASDFGH is similar to ASDFGI
ASDFGH is similar to ASDFGX
QWERTYU is similar to QWERTYI
ZXCVB is similar to ZXCAA
You can control it via the {s<=3} part which allows three or less substitutions.
To have pairs, you could write
pairs = [(origin, difference)
for origin in Test
for rx in [re.compile(rf"({origin}){{s<=3}}")]
for difference in Ref
if rx.search(difference)]
Which would yield for
Test = ['ASDFGH', 'QWERTYU', 'ZXCVB']
Ref = ['ASDFGY', 'QWERTYI', 'ZXCAA', 'ASDFGI', 'ASDFGX']
the following output:
[('ASDFGH', 'ASDFGY'), ('ASDFGH', 'ASDFGI'),
('ASDFGH', 'ASDFGX'), ('QWERTYU', 'QWERTYI'),
('ZXCVB', 'ZXCAA')]

Find the most common element in a list

What is an efficient way to find the most common element in a Python list?
My list items may not be hashable so can't use a dictionary.
Also in case of draws the item with the lowest index should be returned. Example:
>>> most_common(['duck', 'duck', 'goose'])
'duck'
>>> most_common(['goose', 'duck', 'duck', 'goose'])
'goose'

A simpler one-liner:
def most_common(lst):
return max(set(lst), key=lst.count)

Borrowing from here, this can be used with Python 2.7:
from collections import Counter
def Most_Common(lst):
data = Counter(lst)
return data.most_common(1)[0][0]
Works around 4-6 times faster than Alex's solutions, and is 50 times faster than the one-liner proposed by newacct.
On CPython 3.6+ (any Python 3.7+) the above will select the first seen element in case of ties. If you're running on older Python, to retrieve the element that occurs first in the list in case of ties you need to do two passes to preserve order:
# Only needed pre-3.6!
def most_common(lst):
data = Counter(lst)
return max(lst, key=data.get)

With so many solutions proposed, I'm amazed nobody's proposed what I'd consider an obvious one (for non-hashable but comparable elements) -- [itertools.groupby][1]. itertools offers fast, reusable functionality, and lets you delegate some tricky logic to well-tested standard library components. Consider for example:
import itertools
import operator
def most_common(L):
# get an iterable of (item, iterable) pairs
SL = sorted((x, i) for i, x in enumerate(L))
# print 'SL:', SL
groups = itertools.groupby(SL, key=operator.itemgetter(0))
# auxiliary function to get "quality" for an item
def _auxfun(g):
item, iterable = g
count = 0
min_index = len(L)
for _, where in iterable:
count += 1
min_index = min(min_index, where)
# print 'item %r, count %r, minind %r' % (item, count, min_index)
return count, -min_index
# pick the highest-count/earliest item
return max(groups, key=_auxfun)[0]
This could be written more concisely, of course, but I'm aiming for maximal clarity. The two print statements can be uncommented to better see the machinery in action; for example, with prints uncommented:
print most_common(['goose', 'duck', 'duck', 'goose'])
emits:
SL: [('duck', 1), ('duck', 2), ('goose', 0), ('goose', 3)]
item 'duck', count 2, minind 1
item 'goose', count 2, minind 0
goose
As you see, SL is a list of pairs, each pair an item followed by the item's index in the original list (to implement the key condition that, if the "most common" items with the same highest count are > 1, the result must be the earliest-occurring one).
groupby groups by the item only (via operator.itemgetter). The auxiliary function, called once per grouping during the max computation, receives and internally unpacks a group - a tuple with two items (item, iterable) where the iterable's items are also two-item tuples, (item, original index) [[the items of SL]].
Then the auxiliary function uses a loop to determine both the count of entries in the group's iterable, and the minimum original index; it returns those as combined "quality key", with the min index sign-changed so the max operation will consider "better" those items that occurred earlier in the original list.
This code could be much simpler if it worried a little less about big-O issues in time and space, e.g....:
def most_common(L):
groups = itertools.groupby(sorted(L))
def _auxfun((item, iterable)):
return len(list(iterable)), -L.index(item)
return max(groups, key=_auxfun)[0]
same basic idea, just expressed more simply and compactly... but, alas, an extra O(N) auxiliary space (to embody the groups' iterables to lists) and O(N squared) time (to get the L.index of every item). While premature optimization is the root of all evil in programming, deliberately picking an O(N squared) approach when an O(N log N) one is available just goes too much against the grain of scalability!-)
Finally, for those who prefer "oneliners" to clarity and performance, a bonus 1-liner version with suitably mangled names:-).
from itertools import groupby as g
def most_common_oneliner(L):
return max(g(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]

What you want is known in statistics as mode, and Python of course has a built-in function to do exactly that for you:
>>> from statistics import mode
>>> mode([1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6])
3
Note that if there is no "most common element" such as cases where the top two are tied, this will raise StatisticsError on Python
<=3.7, and on 3.8 onwards it will return the first one encountered.

Without the requirement about the lowest index, you can use collections.Counter for this:
from collections import Counter
a = [1936, 2401, 2916, 4761, 9216, 9216, 9604, 9801]
c = Counter(a)
print(c.most_common(1)) # the one most common element... 2 would mean the 2 most common
[(9216, 2)] # a set containing the element, and it's count in 'a'

If they are not hashable, you can sort them and do a single loop over the result counting the items (identical items will be next to each other). But it might be faster to make them hashable and use a dict.
def most_common(lst):
cur_length = 0
max_length = 0
cur_i = 0
max_i = 0
cur_item = None
max_item = None
for i, item in sorted(enumerate(lst), key=lambda x: x[1]):
if cur_item is None or cur_item != item:
if cur_length > max_length or (cur_length == max_length and cur_i < max_i):
max_length = cur_length
max_i = cur_i
max_item = cur_item
cur_length = 1
cur_i = i
cur_item = item
else:
cur_length += 1
if cur_length > max_length or (cur_length == max_length and cur_i < max_i):
return cur_item
return max_item

This is an O(n) solution.
mydict = {}
cnt, itm = 0, ''
for item in reversed(lst):
mydict[item] = mydict.get(item, 0) + 1
if mydict[item] >= cnt :
cnt, itm = mydict[item], item
print itm
(reversed is used to make sure that it returns the lowest index item)

Sort a copy of the list and find the longest run. You can decorate the list before sorting it with the index of each element, and then choose the run that starts with the lowest index in the case of a tie.

A one-liner:
def most_common (lst):
return max(((item, lst.count(item)) for item in set(lst)), key=lambda a: a[1])[0]

I am doing this using scipy stat module and lambda:
import scipy.stats
lst = [1,2,3,4,5,6,7,5]
most_freq_val = lambda x: scipy.stats.mode(x)[0][0]
print(most_freq_val(lst))
Result:
most_freq_val = 5

# use Decorate, Sort, Undecorate to solve the problem
def most_common(iterable):
# Make a list with tuples: (item, index)
# The index will be used later to break ties for most common item.
lst = [(x, i) for i, x in enumerate(iterable)]
lst.sort()
# lst_final will also be a list of tuples: (count, index, item)
# Sorting on this list will find us the most common item, and the index
# will break ties so the one listed first wins. Count is negative so
# largest count will have lowest value and sort first.
lst_final = []
# Get an iterator for our new list...
itr = iter(lst)
# ...and pop the first tuple off. Setup current state vars for loop.
count = 1
tup = next(itr)
x_cur, i_cur = tup
# Loop over sorted list of tuples, counting occurrences of item.
for tup in itr:
# Same item again?
if x_cur == tup[0]:
# Yes, same item; increment count
count += 1
else:
# No, new item, so write previous current item to lst_final...
t = (-count, i_cur, x_cur)
lst_final.append(t)
# ...and reset current state vars for loop.
x_cur, i_cur = tup
count = 1
# Write final item after loop ends
t = (-count, i_cur, x_cur)
lst_final.append(t)
lst_final.sort()
answer = lst_final[0][2]
return answer
print most_common(['x', 'e', 'a', 'e', 'a', 'e', 'e']) # prints 'e'
print most_common(['goose', 'duck', 'duck', 'goose']) # prints 'goose'

Building on Luiz's answer, but satisfying the "in case of draws the item with the lowest index should be returned" condition:
from statistics import mode, StatisticsError
def most_common(l):
try:
return mode(l)
except StatisticsError as e:
# will only return the first element if no unique mode found
if 'no unique mode' in e.args[0]:
return l[0]
# this is for "StatisticsError: no mode for empty data"
# after calling mode([])
raise
Example:
>>> most_common(['a', 'b', 'b'])
'b'
>>> most_common([1, 2])
1
>>> most_common([])
StatisticsError: no mode for empty data

Simple one line solution
moc= max([(lst.count(chr),chr) for chr in set(lst)])
It will return most frequent element with its frequency.

You probably don't need this anymore, but this is what I did for a similar problem. (It looks longer than it is because of the comments.)
itemList = ['hi', 'hi', 'hello', 'bye']
counter = {}
maxItemCount = 0
for item in itemList:
try:
# Referencing this will cause a KeyError exception
# if it doesn't already exist
counter[item]
# ... meaning if we get this far it didn't happen so
# we'll increment
counter[item] += 1
except KeyError:
# If we got a KeyError we need to create the
# dictionary key
counter[item] = 1
# Keep overwriting maxItemCount with the latest number,
# if it's higher than the existing itemCount
if counter[item] > maxItemCount:
maxItemCount = counter[item]
mostPopularItem = item
print mostPopularItem

ans = [1, 1, 0, 0, 1, 1]
all_ans = {ans.count(ans[i]): ans[i] for i in range(len(ans))}
print(all_ans)
all_ans={4: 1, 2: 0}
max_key = max(all_ans.keys())
4
print(all_ans[max_key])
1

#This will return the list sorted by frequency:
def orderByFrequency(list):
listUniqueValues = np.unique(list)
listQty = []
listOrderedByFrequency = []
for i in range(len(listUniqueValues)):
listQty.append(list.count(listUniqueValues[i]))
for i in range(len(listQty)):
index_bigger = np.argmax(listQty)
for j in range(listQty[index_bigger]):
listOrderedByFrequency.append(listUniqueValues[index_bigger])
listQty[index_bigger] = -1
return listOrderedByFrequency
#And this will return a list with the most frequent values in a list:
def getMostFrequentValues(list):
if (len(list) <= 1):
return list
list_most_frequent = []
list_ordered_by_frequency = orderByFrequency(list)
list_most_frequent.append(list_ordered_by_frequency[0])
frequency = list_ordered_by_frequency.count(list_ordered_by_frequency[0])
index = 0
while(index < len(list_ordered_by_frequency)):
index = index + frequency
if(index < len(list_ordered_by_frequency)):
testValue = list_ordered_by_frequency[index]
testValueFrequency = list_ordered_by_frequency.count(testValue)
if (testValueFrequency == frequency):
list_most_frequent.append(testValue)
else:
break
return list_most_frequent
#tests:
print(getMostFrequentValues([]))
print(getMostFrequentValues([1]))
print(getMostFrequentValues([1,1]))
print(getMostFrequentValues([2,1]))
print(getMostFrequentValues([2,2,1]))
print(getMostFrequentValues([1,2,1,2]))
print(getMostFrequentValues([1,2,1,2,2]))
print(getMostFrequentValues([3,2,3,5,6,3,2,2]))
print(getMostFrequentValues([1,2,2,60,50,3,3,50,3,4,50,4,4,60,60]))
Results:
[]
[1]
[1]
[1, 2]
[2]
[1, 2]
[2]
[2, 3]
[3, 4, 50, 60]

Here:
def most_common(l):
max = 0
maxitem = None
for x in set(l):
count = l.count(x)
if count > max:
max = count
maxitem = x
return maxitem
I have a vague feeling there is a method somewhere in the standard library that will give you the count of each element, but I can't find it.

This is the obvious slow solution (O(n^2)) if neither sorting nor hashing is feasible, but equality comparison (==) is available:
def most_common(items):
if not items:
raise ValueError
fitems = []
best_idx = 0
for item in items:
item_missing = True
i = 0
for fitem in fitems:
if fitem[0] == item:
fitem[1] += 1
d = fitem[1] - fitems[best_idx][1]
if d > 0 or (d == 0 and fitems[best_idx][2] > fitem[2]):
best_idx = i
item_missing = False
break
i += 1
if item_missing:
fitems.append([item, 1, i])
return items[best_idx]
But making your items hashable or sortable (as recommended by other answers) would almost always make finding the most common element faster if the length of your list (n) is large. O(n) on average with hashing, and O(n*log(n)) at worst for sorting.

>>> li = ['goose', 'duck', 'duck']
>>> def foo(li):
st = set(li)
mx = -1
for each in st:
temp = li.count(each):
if mx < temp:
mx = temp
h = each
return h
>>> foo(li)
'duck'

I needed to do this in a recent program. I'll admit it, I couldn't understand Alex's answer, so this is what I ended up with.
def mostPopular(l):
mpEl=None
mpIndex=0
mpCount=0
curEl=None
curCount=0
for i, el in sorted(enumerate(l), key=lambda x: (x[1], x[0]), reverse=True):
curCount=curCount+1 if el==curEl else 1
curEl=el
if curCount>mpCount \
or (curCount==mpCount and i<mpIndex):
mpEl=curEl
mpIndex=i
mpCount=curCount
return mpEl, mpCount, mpIndex
I timed it against Alex's solution and it's about 10-15% faster for short lists, but once you go over 100 elements or more (tested up to 200000) it's about 20% slower.

def most_frequent(List):
counter = 0
num = List[0]
for i in List:
curr_frequency = List.count(i)
if(curr_frequency> counter):
counter = curr_frequency
num = i
return num
List = [2, 1, 2, 2, 1, 3]
print(most_frequent(List))

Hi this is a very simple solution, with linear time complexity
L = ['goose', 'duck', 'duck']
def most_common(L):
current_winner = 0
max_repeated = None
for i in L:
amount_times = L.count(i)
if amount_times > current_winner:
current_winner = amount_times
max_repeated = i
return max_repeated
print(most_common(L))
"duck"
Where number, is the element in the list that repeats most of the time

numbers = [1, 3, 7, 4, 3, 0, 3, 6, 3]
max_repeat_num = max(numbers, key=numbers.count) *# which number most* frequently
max_repeat = numbers.count(max_repeat_num) *#how many times*
print(f" the number {max_repeat_num} is repeated{max_repeat} times")

def mostCommonElement(list):
count = {} // dict holder
max = 0 // keep track of the count by key
result = None // holder when count is greater than max
for i in list:
if i not in count:
count[i] = 1
else:
count[i] += 1
if count[i] > max:
max = count[i]
result = i
return result
mostCommonElement(["a","b","a","c"]) -> "a"

The most common element should be the one which is appearing more than N/2 times in the array where N being the len(array). The below technique will do it in O(n) time complexity, with just consuming O(1) auxiliary space.
from collections import Counter
def majorityElement(arr):
majority_elem = Counter(arr)
size = len(arr)
for key, val in majority_elem.items():
if val > size/2:
return key
return -1

def most_common(lst):
if max([lst.count(i)for i in lst]) == 1:
return False
else:
return max(set(lst), key=lst.count)

def popular(L):
C={}
for a in L:
C[a]=L.count(a)
for b in C.keys():
if C[b]==max(C.values()):
return b
L=[2,3,5,3,6,3,6,3,6,3,7,467,4,7,4]
print popular(L)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

create all possible combinations with multiple variants from list - python

Related

Count occurance of an item in a list and store it in another list if it is exists more than once

Counting strings in lists and then filtering & matching, in python

Remove adjacent duplicates given a condition

Find values in list which differ from reference list by up to N characters

Find the most common element in a list

Categories

Resources