Fastest way to find Indexes of item in list? - python

If you want to find the indexes of an item in a list, there are a couple of different ways to do it. Here is what I know to be the fastest:
aList = [123, 'xyz', 'zara', 'xyz', 'abc']
indices = [i for i, x in enumerate(aList) if x == "xyz"]
print(indices)
Another way, not Pythonic and slower:
indices = []
aList = [123, 'xyz', 'zara', 'xyz', 'abc']
for i in range(len(aList)):
    if 'xyz' == aList[i]:
        indices.append(i)
print(indices)
The first method is undoubtedly faster, but what if you wanted to go even faster, is there a way? Using the index method for the first index:
aList = [123, 'xyz', 'zara', 'xyz', 'abc']
print("Index for xyz:", aList.index('xyz'))
is very fast but can't handle multiple indexes.
How might one go about speeding things up?

Use list.index(elem, start)! It runs a loop in C (see the list_index_impl function in CPython's listobject.c).
Avoid looping over all the elements in Python; it is slower than doing it in C.
def index_finder(lst, item):
    """A generator function, if you might not need all the indices"""
    start = 0
    while True:
        try:
            start = lst.index(item, start)
            yield start
            start += 1
        except ValueError:
            break
import array

def index_find_all(lst, item, results=None):
    """If you want all the indices.
    Pass results=[] if you explicitly need a list,
    or anything that supports .append(..)
    """
    if results is None:
        length = len(lst)
        results = (array.array('B') if length <= 2**8 else
                   array.array('H') if length <= 2**16 else
                   array.array('L') if length <= 2**32 else
                   array.array('Q'))
    start = 0
    while True:
        try:
            start = lst.index(item, start)
            results.append(start)
            start += 1
        except ValueError:
            return results
# Usage example
l = [1, 2, 3, 4, 5, 6, 7, 8] * 32
print(*index_finder(l, 1))
print(*index_find_all(l, 1))

def find(target, myList):
    for i in range(len(myList)):
        if myList[i] == target:
            yield i

def find_with_list(myList, target):
    inds = []
    for i in range(len(myList)):
        if myList[i] == target:
            inds.append(i)
    return inds
In [8]: x = range(50)*200
In [9]: %timeit [i for i,j in enumerate(x) if j == 3]
1000 loops, best of 3: 598 us per loop
In [10]: %timeit list(find(3,x))
1000 loops, best of 3: 607 us per loop
In [11]: %timeit find(3,x)
1000000 loops, best of 3: 375 ns per loop
In [55]: %timeit find_with_list(x,3)
1000 loops, best of 3: 618 us per loop
Assuming you want a list as your output:
All options seemed to exhibit similar time performance in my test, with the list comprehension being the fastest (barely).
If using a generator is acceptable, it's way faster than the other approaches. Though it doesn't account for actually iterating over the indices, nor does it store them, so the indices cannot be iterated over a second time.
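For instance, once the generator from find above has been consumed, a second pass yields nothing:
gen = find(3, x)
first_pass = list(gen)   # consumes the generator
second_pass = list(gen)  # now empty: []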

Simply create a dictionary of item -> index from the list of items using zip, like so:
items_as_dict = dict(zip(list_of_items, range(len(list_of_items))))
index = items_as_dict[item]
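Note that if an item occurs more than once, dict keeps only the last index. A quick sketch with the question's list:
aList = [123, 'xyz', 'zara', 'xyz', 'abc']
items_as_dict = dict(zip(aList, range(len(aList))))
print(items_as_dict['xyz'])  # 3; the earlier index 1 is lost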

To get the index of an item, you can use a dictionary.
import numpy as np

aList = [123, 'xyz', 'zara', 'xyz', 'abc']
# The following approach works only on lists with unique values
aList = list(np.unique(aList))
d = dict(enumerate(aList))  # maps index -> value
# get the inverse mapping of the above dictionary: value -> index
inv_dict = {v: k for k, v in d.items()}
# to get the index of an item by value, use 'inv_dict'; to get a value by index, use 'd'
valueofItemAtIndex0 = d[0]  # value = 123
indexofItemWithValue123 = inv_dict[123]  # index = 0

D = dict()
for i, item in enumerate(l):
    if item not in D:
        D[item] = [i]
    else:
        D[item].append(i)
Then simply call D[item] to get the indices that match. You'll give up initial calculation time but gain it during call time.
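For example, applied to the question's list (dict.setdefault is a compact alternative to the if/else above):
aList = [123, 'xyz', 'zara', 'xyz', 'abc']
D = {}
for i, item in enumerate(aList):
    D.setdefault(item, []).append(i)
print(D['xyz'])  # [1, 3]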

I used another way to find the index of an element in a list in Python 3:
def index_of(elem, a):
    a_e = enumerate(a)
    a_f = list(filter(lambda x: x[1] == elem, a_e))
    if a_f:
        return a_f[0][0]
    else:
        return -1
Some tests:
a=[1,2,3,4,2]
index_of(2,a)
This function always returns the first occurrence of the element. If the element isn't in the list, it returns -1. For my goals, that solution worked well.


Find two numbers from a list that add up to a specific number

This is super bad and messy, I am new to this, please help me.
Basically, I was trying to find two numbers from a list that add up to a target number.
I have set up an example with lst = [2, 4, 6, 10] and a target value of target = 8. The answer in this example would be (2, 6) and (6, 2).
Below is my code, but it is long and ugly, and I am sure there is a better way of doing it. Can you please see how I can improve on it?
from itertools import product, permutations
numbers = [2, 4, 6, 10]
target_number = 8
two_nums = (list(permutations(numbers, 2)))
print(two_nums)
result1 = (two_nums[0][0] + two_nums[0][1])
result2 = (two_nums[1][0] + two_nums[1][1])
result3 = (two_nums[2][0] + two_nums[2][1])
result4 = (two_nums[3][0] + two_nums[3][1])
result5 = (two_nums[4][0] + two_nums[4][1])
result6 = (two_nums[5][0] + two_nums[5][1])
result7 = (two_nums[6][0] + two_nums[6][1])
result8 = (two_nums[7][0] + two_nums[7][1])
result9 = (two_nums[8][0] + two_nums[8][1])
result10 = (two_nums[9][0] + two_nums[9][1])
my_list = (result1, result2, result3, result4, result5, result6, result7, result8, result9, result10)
print (my_list)
for i in my_list:
    if i == 8:
        print("Here it is: " + str(i))
For every number in the list, you can look for its complement (the number that, added to the current one, gives the required target sum). If it exists, output the pair and exit; otherwise move on.
This would look like the following:
numbers = [2, 4, 6, 10]
target_number = 8
for i, number in enumerate(numbers[:-1]):  # note 1
    complementary = target_number - number
    if complementary in numbers[i+1:]:  # note 2
        print("Solution Found: {} and {}".format(number, complementary))
        break
else:  # note 3
    print("No solutions exist")
which produces:
Solution Found: 2 and 6
Notes:
You do not have to check the last number; if there were a pair you would have already found it by then.
Notice that the membership check (which is quite costly for lists) is optimized since it considers only the slice numbers[i+1:]. The previous numbers have already been checked. A positive side effect of the slicing is that a single 4 in the list does not produce the pair (4, 4) for a target value of 8.
This is an excellent setup to explain the misunderstood and often confusing use of else in for loops. The else triggers only if the loop was not ended abruptly by a break.
If, e.g., the (4, 4) solution is acceptable to you even when there is only a single 4 in the list, you can modify the code as follows:
numbers = [2, 4, 6, 10]
target_number = 8
for i, number in enumerate(numbers):
    complementary = target_number - number
    if complementary in numbers[i:]:
        print("Solution Found: {} and {}".format(number, complementary))
        break
else:
    print("No solutions exist")
A list comprehension will work well here. Try this:
from itertools import permutations
numbers = [2, 4, 6, 10]
target_number = 8
solutions = [pair for pair in permutations(numbers, 2) if sum(pair) == target_number]
print('Solutions:', solutions)
Basically, this list comprehension looks at all the pairs that permutations(numbers, 2) returns, but keeps only the ones whose sum equals the target.
The simplest general way to do this is to iterate over the list and, for each item, iterate over the rest of the list to see if any pair adds up to the target value. The downside is that this is an O(n^2) operation. I don't know off the top of my head whether there is a more efficient solution. I'm not 100% sure my syntax is correct, but it should look something like the following:
done = False
for i, val in enumerate(numbers):
    if val >= target_number:  # assumes all numbers are positive
        continue
    # slice so the inner loop starts after i; plain enumerate(numbers, i+1)
    # would only shift the counter, not skip the earlier items
    for j, val2 in enumerate(numbers[i+1:], i+1):
        if val + val2 == target_number:
            print("Here it is: " + str(i) + "," + str(j))
            done = True
            break
    if done:
        break
Of course you should create this as a function that returns your result instead of just printing it. That would remove the need for the "done" variable.
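As a sketch of that suggestion (find_pair is a hypothetical name):
def find_pair(numbers, target_number):
    # Returning from inside the nested loops removes the need for a flag.
    for i, val in enumerate(numbers):
        for j in range(i + 1, len(numbers)):
            if val + numbers[j] == target_number:
                return i, j
    return None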
If you are trying to find the answer for multiple integers with a long list that has duplicate values, I would recommend using frozenset. The accepted answer only finds the first pair and then stops.
import numpy as np
numbers = np.random.randint(0, 100, 1000)
target = 17
def adds_to_target(base_list, target):
    return_list = []
    for i in range(len(base_list)):
        return_list.extend([list((base_list[i], b)) for b in base_list
                            if (base_list[i] + b) == target])
    return set(map(frozenset, return_list))
# sample output
{frozenset({7, 10}),
frozenset({4, 13}),
frozenset({8, 9}),
frozenset({5, 12}),
frozenset({2, 15}),
frozenset({3, 14}),
frozenset({0, 17}),
frozenset({1, 16}),
frozenset({6, 11})}
1) In the first for loop, lists containing two integers that sum to the target value are added to return_list, i.e. a list of lists is created.
2) Then frozenset removes all the duplicate pairs.
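A quick illustration of why frozenset removes reversed duplicates (sets ignore order, so both orderings compare equal):
frozenset((7, 10)) == frozenset((10, 7))  # True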
%timeit adds_to_target(numbers, target)
# 312 ms ± 8.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
You can do it in one line with a list comprehension, like below:
from itertools import permutations
numbers = [2, 4, 6, 10]
target_number = 8
two_nums = list(permutations(numbers, 2))
result = [i for i in two_nums if i[0] + i[1] == target_number]
# result: [(2, 6), (6, 2)]
If you want a way to do this efficiently without itertools -
numbers = [1,3,4,5,6,2,3,4,1]
target = 5
number_dict = {}
pairs = []
for num in numbers:
    number_dict[num] = number_dict.get(num, 0) + 1
    complement = target - num
    # a number can only pair with itself if it occurs at least twice
    if complement in number_dict and (complement != num or number_dict[num] > 1):
        pairs.append((num, complement))
        number_dict.pop(num, None)
        if complement != num:
            number_dict.pop(complement, None)
It is as simple as this :)
def func(array, target):
    flag = 0
    for x in array:
        for y in array:
            if (target - x) == y and x != y:
                print(x, y)
                flag = 1
                break
        if flag == 1:
            break
list_ = [1,2,4,6,8]
num = 10
for number in list_:
    num_add = number
    for number_ in list_:
        if number_ + num_add == num and number_ != num_add:
            print(number_, num_add)
Here n is the desired sum and L is the list. The outer loop picks an index i, and the inner loop iterates from i to the end of the list. If L[i] + L[j] adds up to n and L[i] != L[j], the pair is printed.
numbers=[1,2,3,4,9,8,5,10,20,30,6]
def two_no_summer(n, L):
    for i in range(len(L)):
        for j in range(i, len(L)):
            if L[i] + L[j] == n and L[i] != L[j]:
                print(L[i], L[j])
Execution: https://i.stack.imgur.com/Wu47x.jpg

fastest list index searching

What is the fastest way to find the index of an element in a list of integers?
Now I am doing
if value in mylist:
    return mylist.index(value)
but it seems I am doing the same thing twice: to know whether value is in mylist, I also have to find its index position. I also tried other solutions:
try:
    return mylist.index(value)
except ValueError:
    return None
or
for i, x in enumerate(mylist):
    if x == value:
        return i
return None
but all these solutions seem to be slower.
The list is unsorted and has only 4 elements.
As you have only four items, you can also try this:
if value == mylist[0]:
    return 0
elif value == mylist[1]:
    return 1
elif value == mylist[2]:
    return 2
elif value == mylist[3]:
    return 3
Let me know how it works in your case. I am curious. :)
You can use a set to check for membership; it will be more efficient than checking the list, but the greatest overhead is the indexing itself:
In [54]: l = [1,2,3,4]
In [55]: s = set([1,2,3,4])
In [56]: timeit l.index(6) if 6 in s else False
10000000 loops, best of 3: 79.9 ns per loop
In [57]: timeit l.index(6) if 6 in l else False
10000000 loops, best of 3: 141 ns per loop
In [58]: timeit l.index(4) if 4 in l else False
1000000 loops, best of 3: 381 ns per loop
In [59]: timeit l.index(4) if 4 in s else False
1000000 loops, best of 3: 333 ns per loop
Just using if-elses is fast, but if you are always searching on the same list (or your list doesn't change too often) you can be a bit faster by storing the element -> index mapping in a dict and then doing a dictionary lookup.
So your code should look something like this:
# Precompute the mapping.
mapping = {element: index for index, element in enumerate(TEST_LIST)}
# Search function:
def lookup(value):
    return mapping.get(value, None)
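Hypothetical usage, assuming TEST_LIST = [100, -2, 10007, 2**70 + 1] as in the test code below:
print(lookup(10007))  # 2
print(lookup(55))     # None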
I ran some tests comparing this with other approaches. Here's my test code:
import timeit
TEST_LIST = [100, -2, 10007, 2**70 + 1]
mapping = {element: index for index, element in enumerate(TEST_LIST)}
NUM_TIMES = 10**6
def by_if_else(lst, value):
    if lst[0] == value:
        return 0
    elif lst[1] == value:
        return 1
    elif lst[2] == value:
        return 2
    elif lst[3] == value:
        return 3
    else:
        return None

def by_index(lst, value):
    for i in xrange(4):
        if lst[i] == value:
            return i
    return None

def by_exception(lst, value):
    try:
        return lst.index(value)
    except ValueError:
        return None

def by_iter(lst, value):
    for index, element in enumerate(lst):
        if element == value:
            return index
    return None

def by_dict(lst, value):
    return mapping.get(value, None)
def TimeFunction(function_name, value):
    if 'dict' in function_name:
        return timeit.timeit(
            stmt='%s(mapping, %d)' % (function_name, value),
            setup='from __main__ import %s, mapping' % function_name,
            number=NUM_TIMES)
    else:
        return timeit.timeit(
            stmt='%s(TEST_LIST, %d)' % (function_name, value),
            setup='from __main__ import %s, TEST_LIST' % function_name,
            number=NUM_TIMES)
def RunTestsOn(value):
    print "Looking for %d in %s" % (value, str(TEST_LIST))
    function_names = [name for name in globals() if name.startswith('by_')]
    for function_name in function_names:
        print "Function: %s\nTime: %f" % (
            function_name, TimeFunction(function_name, value))

def main():
    values_to_look_for = TEST_LIST + [-10**70 - 1, 55, 29]
    for value in values_to_look_for:
        RunTestsOn(value)

if __name__ == '__main__':
    main()
It looks like the if-else approach is faster when the values being searched for are small and are present in the list (I removed runtimes for the other functions):
Looking for 10007 in [100, -2, 10007, 1180591620717411303425L]
Function: by_dict
Time: 0.213232
Function: by_if_else
Time: 0.181917
But slower if the value is large (i.e. comparison is expensive):
Looking for 1180591620717411303425 in [100, -2, 10007, 1180591620717411303425L]
Function: by_dict
Time: 0.223594
Function: by_if_else
Time: 0.380222
Or, when the value isn't present in the list at all (even if the value is small):
Looking for 29 in [100, -2, 10007, 1180591620717411303425L]
Function: by_dict
Time: 0.195733
Function: by_if_else
Time: 0.267689
While it is obvious that a dict should be faster, since its queries are O(1) as opposed to O(n) for the other approaches, for such a small list the interpreter probably creates optimized bytecode for the if-else version, and the overhead of pointer chases through a hashtable offsets much of the dict's speed advantage. Still, the dict appears to be slightly faster most of the time. I would suggest you test both approaches on your data and see which works better for you.

Python: how to search for a substring in a set the fast way?

I have a set containing ~300,000 tuples
In [26]: sa = set(o.node for o in vrts_l2_5)
In [27]: len(sa)
Out[27]: 289798
In [31]: random.sample(sa, 1)
Out[31]: [('835644', '4696507')]
Now I want to lookup elements based on a common substring, e.g. the first 4 'digits' (in fact the elements are strings). This is my approach:
def lookup_set(x_appr, y_appr):
    return [n for n in sa if n[0].startswith(x_appr) and n[1].startswith(y_appr)]
In [36]: lookup_set('6652','46529')
Out[36]: [('665274', '4652941'), ('665266', '4652956')]
Is there a more efficient, that is, faster way to do this?
You can do it in O(log(n) + m) time, where n is the number of tuples and m is the number of matching tuples, if you can afford to keep two sorted copies of the tuples.
Sorting itself will cost O(n log(n)), i.e. it will be asymptotically slower than your naive approach, but if you have to do a certain number of queries (more than log(n), which is almost certainly quite small) it will pay off.
The idea is that you can use bisection to find the candidates that have the correct first value and the correct second value and then intersect these sets.
However, note that you want a strange kind of comparison: you care about all strings starting with the given argument. This simply means that when searching for the right-most occurrence you should pad the key with 9s.
Complete working (although not tested very much) code:
from random import randint
from operator import itemgetter
first = itemgetter(0)
second = itemgetter(1)
sa = [(str(randint(0, 1000000)), str(randint(0, 1000000))) for _ in range(300000)]
f_sorted = sorted(sa, key=first)
s_sorted = sa
s_sorted.sort(key=second)
max_length = max(len(s) for _,s in sa)
# See: bisect module from stdlib
def bisect_right(seq, element, key):
    lo = 0
    hi = len(seq)
    element = element.ljust(max_length, '9')
    while lo < hi:
        mid = (lo + hi) // 2
        if element < key(seq[mid]):
            hi = mid
        else:
            lo = mid + 1
    return lo

def bisect_left(seq, element, key):
    lo = 0
    hi = len(seq)
    while lo < hi:
        mid = (lo + hi) // 2
        if key(seq[mid]) < element:
            lo = mid + 1
        else:
            hi = mid
    return lo

def lookup_set(x_appr, y_appr):
    # bisect_right already points one past the last match, so no +1 on the slices
    x_left = bisect_left(f_sorted, x_appr, key=first)
    x_right = bisect_right(f_sorted, x_appr, key=first)
    x_candidates = f_sorted[x_left:x_right]
    y_left = bisect_left(s_sorted, y_appr, key=second)
    y_right = bisect_right(s_sorted, y_appr, key=second)
    y_candidates = s_sorted[y_left:y_right]
    return set(x_candidates).intersection(y_candidates)
And the comparison with your initial solution:
In [2]: def lookup_set2(x_appr, y_appr):
   ...:     return [n for n in sa if n[0].startswith(x_appr) and n[1].startswith(y_appr)]
In [3]: lookup_set('123', '124')
Out[3]: set([])
In [4]: lookup_set2('123', '124')
Out[4]: []
In [5]: lookup_set('123', '125')
Out[5]: set([])
In [6]: lookup_set2('123', '125')
Out[6]: []
In [7]: lookup_set('12', '125')
Out[7]: set([('12478', '125908'), ('124625', '125184'), ('125494', '125940')])
In [8]: lookup_set2('12', '125')
Out[8]: [('124625', '125184'), ('12478', '125908'), ('125494', '125940')]
In [9]: %timeit lookup_set('12', '125')
1000 loops, best of 3: 589 us per loop
In [10]: %timeit lookup_set2('12', '125')
10 loops, best of 3: 145 ms per loop
In [11]: %timeit lookup_set('123', '125')
10000 loops, best of 3: 102 us per loop
In [12]: %timeit lookup_set2('123', '125')
10 loops, best of 3: 144 ms per loop
As you can see this solution is about 240-1400 times faster(in these examples) than your naive approach.
If you have a big set of matches:
In [19]: %timeit lookup_set('1', '2')
10 loops, best of 3: 27.1 ms per loop
In [20]: %timeit lookup_set2('1', '2')
10 loops, best of 3: 152 ms per loop
In [21]: len(lookup_set('1', '2'))
Out[21]: 3587
In [23]: %timeit lookup_set('', '2')
10 loops, best of 3: 182 ms per loop
In [24]: %timeit lookup_set2('', '2')
1 loops, best of 3: 212 ms per loop
In [25]: len(lookup_set2('', '2'))
Out[25]: 33053
As you can see this solution is faster even if the number of matches is about 10% of the total size. However, if you try to match all the data:
In [26]: %timeit lookup_set('', '')
1 loops, best of 3: 360 ms per loop
In [27]: %timeit lookup_set2('', '')
1 loops, best of 3: 221 ms per loop
It becomes (not so much) slower, although this is a quite peculiar case, and I doubt you'll frequently match almost all the elements.
Note that the time taken to sort the data is quite small:
In [13]: from random import randint
...: from operator import itemgetter
...:
...: first = itemgetter(0)
...: second = itemgetter(1)
...:
...: sa2 = [(str(randint(0, 1000000)), str(randint(0, 1000000))) for _ in range(300000)]
In [14]: %%timeit
...: f_sorted = sorted(sa2, key=first)
...: s_sorted = sorted(sa2, key=second)
...: max_length = max(len(s) for _,s in sa2)
...:
1 loops, best of 3: 881 ms per loop
As you can see, it takes less than one second to make the two sorted copies. Actually the above code would be slightly faster since it sorts the second copy in place (although timsort could still require O(n) memory).
This means that if you have to do more than about 6-8 queries this solution will be faster.
Note: Python's standard library provides a bisect module. However, it didn't allow a key parameter at the time of writing (support for key= was added in Python 3.10). Hence if you want to use it directly, you'll have to use the "decorate-sort-undecorate" idiom.
Instead of:
f_sorted = sorted(sa, key=first)
You should do:
f_sorted = sorted((first, (first,second)) for first,second in sa)
I.e. you explicitly insert the key as the first element of the tuple. Afterwards you could use ('123', '') as element to pass to the bisect_* functions and it should find the correct index.
I decided to avoid this. I copy-pasted the code from the sources of the module and slightly modified it to provide a simpler interface for your use case.
Final remark: if you could convert the tuple elements to integers, then the comparisons would be faster. However, most of the time would still be taken performing the intersection of the sets, so I don't know exactly how much it would improve performance.
You could use a trie data structure. It is possible to build one with a tree of dict objects (see How to create a TRIE in Python), but there is a package marisa-trie that implements a memory-efficient version by binding to C++ libraries.
I have not used this library before, but playing around with it, I got this working:
from random import randint
from marisa_trie import RecordTrie

sa = [(str(randint(1000000, 9999999)), str(randint(1000000, 9999999))) for i in range(100000)]
# make length of string in packed format big enough!
fmt = ">10p10p"
sa_tries = (RecordTrie(fmt, zip((unicode(first) for first, _ in sa), sa)),
            RecordTrie(fmt, zip((unicode(second) for _, second in sa), sa)))

def lookup_set(sa_tries, x_appr, y_appr):
    """lookup prefix in the appropriate trie and intersect the result"""
    return (set(item[1] for item in sa_tries[0].items(unicode(x_appr))) &
            set(item[1] for item in sa_tries[1].items(unicode(y_appr))))

lookup_set(sa_tries, "2", "4")
I went through and implemented the 4 suggested solutions to compare their efficiency. I ran the tests with different prefix lengths to see how the input would affect performance. The trie and sorted-list performance is definitely sensitive to the length of the input, with both getting faster as the input gets longer (I think it is actually sensitivity to the size of the output, since the output gets smaller as the prefix gets longer). However, the sorted set solution is definitely faster in all situations.
In these timing tests, there were 200000 tuples in sa and 10 runs for each method:
for prefix length 1
lookup_set_startswith : min=0.072107 avg=0.073878 max=0.077299
lookup_set_int : min=0.030447 avg=0.037739 max=0.045255
lookup_set_trie : min=0.111548 avg=0.124679 max=0.147859
lookup_set_sorted : min=0.012086 avg=0.013643 max=0.016096
for prefix length 2
lookup_set_startswith : min=0.066498 avg=0.069850 max=0.081271
lookup_set_int : min=0.027356 avg=0.034562 max=0.039137
lookup_set_trie : min=0.006949 avg=0.010091 max=0.032491
lookup_set_sorted : min=0.000915 avg=0.000944 max=0.001004
for prefix length 3
lookup_set_startswith : min=0.065708 avg=0.068467 max=0.079485
lookup_set_int : min=0.023907 avg=0.033344 max=0.043196
lookup_set_trie : min=0.000774 avg=0.000854 max=0.000929
lookup_set_sorted : min=0.000149 avg=0.000155 max=0.000163
for prefix length 4
lookup_set_startswith : min=0.065742 avg=0.068987 max=0.077351
lookup_set_int : min=0.026766 avg=0.034558 max=0.052269
lookup_set_trie : min=0.000147 avg=0.000167 max=0.000189
lookup_set_sorted : min=0.000065 avg=0.000068 max=0.000070
Here's the code:
import random

def random_digits(num_digits):
    return random.randint(10**(num_digits-1), (10**num_digits)-1)

sa = [(str(random_digits(6)), str(random_digits(7))) for _ in range(200000)]

### naive approach
def lookup_set_startswith(x_appr, y_appr):
    return [item for item in sa if item[0].startswith(x_appr) and item[1].startswith(y_appr)]

### trie approach
from marisa_trie import RecordTrie

# make length of string in packed format big enough!
fmt = ">10p10p"
sa_tries = (RecordTrie(fmt, zip([unicode(first) for first, second in sa], sa)),
            RecordTrie(fmt, zip([unicode(second) for first, second in sa], sa)))

def lookup_set_trie(x_appr, y_appr):
    # lookup prefix in the appropriate trie and intersect the result
    return set(item[1] for item in sa_tries[0].items(unicode(x_appr))) & \
           set(item[1] for item in sa_tries[1].items(unicode(y_appr)))

### int approach
sa_ints = [(int(first), int(second)) for first, second in sa]
sa_lens = tuple(map(len, sa[0]))

def lookup_set_int(x_appr, y_appr):
    x_limit = 10**(sa_lens[0]-len(x_appr))
    y_limit = 10**(sa_lens[1]-len(y_appr))
    x_int = int(x_appr) * x_limit
    y_int = int(y_appr) * y_limit
    return [sa[i] for i, int_item in enumerate(sa_ints)
            if (x_int <= int_item[0] and int_item[0] < x_int+x_limit) and
               (y_int <= int_item[1] and int_item[1] < y_int+y_limit)]

### sorted set approach
from operator import itemgetter

first = itemgetter(0)
second = itemgetter(1)
sa_sorted = (sorted(sa, key=first), sorted(sa, key=second))
max_length = max(len(s) for _, s in sa)

# See: bisect module from stdlib
def bisect_right(seq, element, key):
    lo = 0
    hi = len(seq)
    element = element.ljust(max_length, '9')
    while lo < hi:
        mid = (lo+hi)//2
        if element < key(seq[mid]):
            hi = mid
        else:
            lo = mid + 1
    return lo

def bisect_left(seq, element, key):
    lo = 0
    hi = len(seq)
    while lo < hi:
        mid = (lo+hi)//2
        if key(seq[mid]) < element:
            lo = mid + 1
        else:
            hi = mid
    return lo

def lookup_set_sorted(x_appr, y_appr):
    x_left = bisect_left(sa_sorted[0], x_appr, key=first)
    x_right = bisect_right(sa_sorted[0], x_appr, key=first)
    x_candidates = sa_sorted[0][x_left:x_right]
    y_left = bisect_left(sa_sorted[1], y_appr, key=second)
    y_right = bisect_right(sa_sorted[1], y_appr, key=second)
    y_candidates = sa_sorted[1][y_left:y_right]
    return set(x_candidates).intersection(y_candidates)

####
# test correctness
ntests = 10
candidates = [lambda x, y: set(lookup_set_startswith(x, y)),
              lambda x, y: set(lookup_set_int(x, y)),
              lookup_set_trie,
              lookup_set_sorted]
print "checking correctness (or at least consistency)..."
for dlen in range(1, 5):
    print "prefix length %d:" % dlen,
    for i in range(ntests):
        print " #%d" % i,
        prefix = map(str, (random_digits(dlen), random_digits(dlen)))
        answers = [c(*prefix) for c in candidates]
        for i, ans in enumerate(answers):
            for j, ans2 in enumerate(answers[i+1:]):
                assert ans == ans2, "answers for %s for #%d and #%d don't match" \
                    % (prefix, i, j+i+1)
    print

####
# time calls
import timeit
import numpy as np

ntests = 10
candidates = [lookup_set_startswith,
              lookup_set_int,
              lookup_set_trie,
              lookup_set_sorted]
print "timing..."
for dlen in range(1, 5):
    print "for prefix length", dlen
    times = [[] for c in candidates]
    for _ in range(ntests):
        prefix = map(str, (random_digits(dlen), random_digits(dlen)))
        for c, c_times in zip(candidates, times):
            tstart = timeit.default_timer()
            trash = c(*prefix)
            c_times.append(timeit.default_timer()-tstart)
    for c, c_times in zip(candidates, times):
        print " %-25s: min=%f avg=%f max=%f" % (c.func_name, min(c_times), np.mean(c_times), max(c_times))
Integer manipulation is much faster than string manipulation (and uses less memory as well).
So if you can compare integers instead, you'll be much faster.
I suspect something like this should work for you:
sa = set((int(x), int(y)) for x, y in (o.node for o in vrts_l2_5))
Then this may work for you:
def lookup_set(samples, x_appr, x_len, y_appr, y_len):
    """
    x_appr == SSS0000 where S is the digit to search for
    x_len == number of digits to S (if SSS0000 then x_len == 4)
    """
    return ((x, y) for x, y in samples
            if round(x, -x_len) == x_appr and round(y, -y_len) == y_appr)
Also, it returns a generator, so you're not loading all the results into memory at once.
Updated to use round method mentioned by Bakuriu
There may be, but not by terribly much. str.startswith and the and operator both short-circuit (they can return as soon as they find a failure), and indexing tuples is a fast operation. Most of the time spent here will be on object lookups, such as finding the startswith method for each string. Probably the most worthwhile option is to run it through PyPy.
A faster solution would be to create a dictionary, putting the first value of each tuple as the key and the second as the value.
Then you would search for keys matching x_appr in the ordered key list of the dictionary (the ordered list allows you to optimize the search in the key list with a binary search, for example). This provides a key list named, for example, k_list.
And then look up the values of the dictionary having a key in k_list and matching y_appr.
You can also include the second step (values that match y_appr) before appending to k_list, so that k_list contains all the keys of the correct elements of the dictionary, as sketched below.
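A rough sketch of that idea, assuming the first elements in sa are unique and reusing the '9'-padding trick from the accepted answer (names here are illustrative):
import bisect

d = dict(sa)                          # first element -> second element
keys = sorted(d)                      # the ordered key list
max_len = max(len(k) for k in keys)

def lookup_dict(x_appr, y_appr):
    # Binary search over the sorted keys for the x-prefix range,
    # then filter the matching entries on the y prefix.
    lo = bisect.bisect_left(keys, x_appr)
    hi = bisect.bisect_right(keys, x_appr.ljust(max_len, '9'))
    return [(k, d[k]) for k in keys[lo:hi] if d[k].startswith(y_appr)]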
Here I've just compared the 'in' method and the 'find' method:
The CSV input file contains a list of URLs
# -*- coding: utf-8 -*-
### test perfo str in set
import re
import csv
import timeit

cache = set()

#######################################################################
def checkinCache(c):
    global cache
    for s in cache:
        if c in s:
            return True
    return False

#######################################################################
def checkfindCache(c):
    global cache
    for s in cache:
        if s.find(c) != -1:
            return True
    return False

#######################################################################
print "1/3-loading pages..."
with open("liste_all_meta.csv.clean", "rb") as f:
    reader = csv.reader(f, delimiter=",")
    for i, line in enumerate(reader):
        cache.add(re.sub("'", "", line[2].strip()))
print " " + str(len(cache)) + " PAGES IN CACHE"

print "2/3-test IN..."
tstart = timeit.default_timer()
for i in range(0, 1000):
    checkinCache("string to find" + str(i))
print timeit.default_timer() - tstart

print "3/3-test FIND..."
tstart = timeit.default_timer()
for i in range(0, 1000):
    checkfindCache("string to find" + str(i))
print timeit.default_timer() - tstart

print "\n\nBYE\n"
results in seconds:
1/3-loading pages...
482897 PAGES IN CACHE
2/3-test IN...
107.765980005
3/3-test FIND...
167.788629055
BYE
So, the 'in' method is faster than the 'find' method :)
Have fun

Python: Check the occurrences in a list against a value

lst = [1,2,3,4,1]
I want to know whether 1 occurs twice in this list. Is there an efficient way to do this?
lst.count(1) would return the number of times it occurs. If you're going to be counting items in a list, O(n) is what you're going to get.
The general function on a list is list.count(x); it returns the number of times x occurs in the list.
Are you asking whether every item in the list is unique?
len(set(lst)) == len(lst)
Whether 1 occurs more than once?
lst.count(1) > 1
Note that the above is not maximally efficient, because it won't short-circuit -- even if 1 occurs twice, it will still count the rest of the occurrences. If you want it to short-circuit you will have to write something a little more complicated.
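For instance, a minimal short-circuiting scan (occurs_more_than is a hypothetical helper):
def occurs_more_than(lst, value, n):
    # Stop as soon as value has been seen more than n times.
    seen = 0
    for x in lst:
        if x == value:
            seen += 1
            if seen > n:
                return True
    return False

occurs_more_than([1, 2, 3, 4, 1], 1, 1)  # True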
Whether the first element occurs more than once?
lst[0] in lst[1:]
How often each element occurs?
import collections
collections.Counter(lst)
Something else?
For multiple occurrences, this gives you the index of each occurrence:
>>> lst=[1,2,3,4,5,1]
>>> tgt=1
>>> found=[]
>>> for index, suspect in enumerate(lst):
... if(tgt==suspect):
... found.append(index)
...
>>> print len(found), "found at index:",", ".join(map(str,found))
2 found at index: 0, 5
If you want the count of each item in the list:
>>> lst=[1,2,3,4,5,2,2,1,5,5,5,5,6]
>>> count={}
>>> for item in lst:
... count[item]=lst.count(item)
...
>>> count
{1: 2, 2: 3, 3: 1, 4: 1, 5: 5, 6: 1}
def valCount(lst):
    res = {}
    for v in lst:
        try:
            res[v] += 1
        except KeyError:
            res[v] = 1
    return res

u = [x for x, y in valCount(lst).iteritems() if y > 1]
u is now a list of all values which appear more than once.
Edit:
#katrielalex: thank you for pointing out collections.Counter, of which I was not previously aware. It can also be written more concisely using a collections.defaultdict, as demonstrated in the following tests. All three methods are roughly O(n) and reasonably close in run-time performance (using collections.defaultdict is in fact slightly faster than collections.Counter).
My intention was to give an easy-to-understand response to what seemed a relatively unsophisticated request. Given that, are there any other senses in which you consider it "bad code" or "done poorly"?
import collections
import random
import time
def test1(lst):
    res = {}
    for v in lst:
        try:
            res[v] += 1
        except KeyError:
            res[v] = 1
    return res

def test2(lst):
    res = collections.defaultdict(lambda: 0)
    for v in lst:
        res[v] += 1
    return res

def test3(lst):
    return collections.Counter(lst)

def rndLst(lstLen):
    r = random.randint
    return [r(0, lstLen) for i in xrange(lstLen)]

def timeFn(fn, *args):
    st = time.clock()
    res = fn(*args)
    return time.clock() - st

def main():
    reps = 5000
    res = []
    tests = [test1, test2, test3]
    for t in xrange(reps):
        lstLen = random.randint(10, 50000)
        lst = rndLst(lstLen)
        res.append([lstLen] + [timeFn(fn, lst) for fn in tests])
    res.sort()
    return res
And the results, for random lists containing up to 50,000 items, are as follows:
[Plot comparing the three methods. Vertical axis is time in seconds, horizontal axis is number of items in the list.]
Another way to get all items that occur more than once:
lst = [1, 2, 3, 4, 1]
d = {}
for x in lst:
    d[x] = x in d

print d[1]  # True
print d[2]  # False
print [x for x in d if d[x]]  # [1]
You could also sort the list, which is O(n*log(n)), and then check adjacent elements for equality, which is O(n). The total is O(n*log(n)). This has the disadvantage of requiring the entire list to be sorted before possibly bailing out when a duplicate is found.
For a large list with relatively rare duplicates, this could be about the best you can do. The best way to approach this really does depend on the size of the data involved and its nature.
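A sketch of that sort-then-scan check (has_duplicate is an illustrative name):
def has_duplicate(lst):
    # Sort a copy, then compare neighbours; any equal adjacent pair is a duplicate.
    s = sorted(lst)
    return any(a == b for a, b in zip(s, s[1:]))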

Find the most common element in a list

What is an efficient way to find the most common element in a Python list?
My list items may not be hashable, so I can't use a dictionary.
Also in case of draws the item with the lowest index should be returned. Example:
>>> most_common(['duck', 'duck', 'goose'])
'duck'
>>> most_common(['goose', 'duck', 'duck', 'goose'])
'goose'
A simpler one-liner:
def most_common(lst):
    return max(set(lst), key=lst.count)
Borrowing from here, this can be used with Python 2.7:
from collections import Counter

def Most_Common(lst):
    data = Counter(lst)
    return data.most_common(1)[0][0]
Works around 4-6 times faster than Alex's solutions, and is 50 times faster than the one-liner proposed by newacct.
On CPython 3.6+ (any Python 3.7+) the above will select the first seen element in case of ties. If you're running on older Python, to retrieve the element that occurs first in the list in case of ties you need to do two passes to preserve order:
# Only needed pre-3.6!
def most_common(lst):
    data = Counter(lst)
    return max(lst, key=data.get)
With so many solutions proposed, I'm amazed nobody's proposed what I'd consider an obvious one (for non-hashable but comparable elements): itertools.groupby. itertools offers fast, reusable functionality, and lets you delegate some tricky logic to well-tested standard library components. Consider for example:
import itertools
import operator

def most_common(L):
    # get an iterable of (item, iterable) pairs
    SL = sorted((x, i) for i, x in enumerate(L))
    # print 'SL:', SL
    groups = itertools.groupby(SL, key=operator.itemgetter(0))
    # auxiliary function to get "quality" for an item
    def _auxfun(g):
        item, iterable = g
        count = 0
        min_index = len(L)
        for _, where in iterable:
            count += 1
            min_index = min(min_index, where)
        # print 'item %r, count %r, minind %r' % (item, count, min_index)
        return count, -min_index
    # pick the highest-count/earliest item
    return max(groups, key=_auxfun)[0]
This could be written more concisely, of course, but I'm aiming for maximal clarity. The two print statements can be uncommented to better see the machinery in action; for example, with prints uncommented:
print most_common(['goose', 'duck', 'duck', 'goose'])
emits:
SL: [('duck', 1), ('duck', 2), ('goose', 0), ('goose', 3)]
item 'duck', count 2, minind 1
item 'goose', count 2, minind 0
goose
As you see, SL is a list of pairs, each pair an item followed by the item's index in the original list (to implement the key condition that, if there is more than one "most common" item with the same highest count, the result must be the earliest-occurring one).
groupby groups by the item only (via operator.itemgetter). The auxiliary function, called once per grouping during the max computation, receives and internally unpacks a group, a tuple with two items (item, iterable) where the iterable's items are also two-item tuples, (item, original index), i.e. the items of SL.
Then the auxiliary function uses a loop to determine both the count of entries in the group's iterable and the minimum original index; it returns those as a combined "quality key", with the min index sign-changed so the max operation will consider "better" those items that occurred earlier in the original list.
This code could be much simpler if it worried a little less about big-O issues in time and space, e.g....:
def most_common(L):
    groups = itertools.groupby(sorted(L))
    def _auxfun((item, iterable)):
        return len(list(iterable)), -L.index(item)
    return max(groups, key=_auxfun)[0]
same basic idea, just expressed more simply and compactly... but, alas, an extra O(N) auxiliary space (to embody the groups' iterables to lists) and O(N squared) time (to get the L.index of every item). While premature optimization is the root of all evil in programming, deliberately picking an O(N squared) approach when an O(N log N) one is available just goes too much against the grain of scalability!-)
Finally, for those who prefer "oneliners" to clarity and performance, a bonus 1-liner version with suitably mangled names:-).
from itertools import groupby as g
def most_common_oneliner(L):
    return max(g(sorted(L)), key=lambda (x, v): (len(list(v)), -L.index(x)))[0]
What you want is known in statistics as mode, and Python of course has a built-in function to do exactly that for you:
>>> from statistics import mode
>>> mode([1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6])
3
Note that if there is no "most common element", such as cases where the top two are tied, this will raise StatisticsError on Python <= 3.7; from 3.8 onwards it will return the first one encountered.
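On Python 3.8+ there is also statistics.multimode, which returns all tied modes:
>>> from statistics import multimode
>>> multimode([1, 1, 2, 2, 3])
[1, 2]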
Without the requirement about the lowest index, you can use collections.Counter for this:
from collections import Counter
a = [1936, 2401, 2916, 4761, 9216, 9216, 9604, 9801]
c = Counter(a)
print(c.most_common(1)) # the one most common element... 2 would mean the 2 most common
[(9216, 2)] # a list containing one (element, count) tuple: 9216 occurs twice in 'a'
If they are not hashable, you can sort them and do a single loop over the result counting the items (identical items will be next to each other). But it might be faster to make them hashable and use a dict.
def most_common(lst):
    cur_length = 0
    max_length = 0
    cur_i = 0
    max_i = 0
    cur_item = None
    max_item = None
    for i, item in sorted(enumerate(lst), key=lambda x: x[1]):
        if cur_item is None or cur_item != item:
            if cur_length > max_length or (cur_length == max_length and cur_i < max_i):
                max_length = cur_length
                max_i = cur_i
                max_item = cur_item
            cur_length = 1
            cur_i = i
            cur_item = item
        else:
            cur_length += 1
    if cur_length > max_length or (cur_length == max_length and cur_i < max_i):
        return cur_item
    return max_item
This is an O(n) solution.
mydict = {}
cnt, itm = 0, ''
for item in reversed(lst):
    mydict[item] = mydict.get(item, 0) + 1
    if mydict[item] >= cnt:
        cnt, itm = mydict[item], item
print(itm)
(reversed is used to make sure that it returns the lowest index item)
Sort a copy of the list and find the longest run. You can decorate the list before sorting it with the index of each element, and then choose the run that starts with the lowest index in the case of a tie.
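A sketch of that decorate-and-scan idea (most_common_sorted is an illustrative name; it only requires the elements to be comparable):
def most_common_sorted(lst):
    # Decorate with original indices; the secondary sort key guarantees the
    # first tuple of each run carries the smallest original index.
    s = sorted((x, i) for i, x in enumerate(lst))
    best_item, best_count, best_first = None, 0, len(lst)
    j = 0
    while j < len(s):
        k = j
        while k < len(s) and s[k][0] == s[j][0]:
            k += 1
        count, first = k - j, s[j][1]
        # Longest run wins; the lowest original index breaks ties.
        if count > best_count or (count == best_count and first < best_first):
            best_item, best_count, best_first = s[j][0], count, first
        j = k
    return best_item

most_common_sorted(['goose', 'duck', 'duck', 'goose'])  # 'goose'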
A one-liner:
def most_common(lst):
    return max(((item, lst.count(item)) for item in set(lst)), key=lambda a: a[1])[0]
I am doing this using scipy stat module and lambda:
import scipy.stats
lst = [1,2,3,4,5,6,7,5]
most_freq_val = lambda x: scipy.stats.mode(x)[0][0]
print(most_freq_val(lst))
Result:
5
# use Decorate, Sort, Undecorate to solve the problem
def most_common(iterable):
    # Make a list with tuples: (item, index)
    # The index will be used later to break ties for most common item.
    lst = [(x, i) for i, x in enumerate(iterable)]
    lst.sort()
    # lst_final will also be a list of tuples: (count, index, item)
    # Sorting on this list will find us the most common item, and the index
    # will break ties so the one listed first wins. Count is negative so
    # largest count will have lowest value and sort first.
    lst_final = []
    # Get an iterator for our new list...
    itr = iter(lst)
    # ...and pop the first tuple off. Setup current state vars for loop.
    count = 1
    tup = next(itr)
    x_cur, i_cur = tup
    # Loop over sorted list of tuples, counting occurrences of item.
    for tup in itr:
        # Same item again?
        if x_cur == tup[0]:
            # Yes, same item; increment count
            count += 1
        else:
            # No, new item, so write previous current item to lst_final...
            t = (-count, i_cur, x_cur)
            lst_final.append(t)
            # ...and reset current state vars for loop.
            x_cur, i_cur = tup
            count = 1
    # Write final item after loop ends
    t = (-count, i_cur, x_cur)
    lst_final.append(t)
    lst_final.sort()
    answer = lst_final[0][2]
    return answer

print(most_common(['x', 'e', 'a', 'e', 'a', 'e', 'e']))  # prints 'e'
print(most_common(['goose', 'duck', 'duck', 'goose']))   # prints 'goose'
Building on Luiz's answer, but satisfying the "in case of draws the item with the lowest index should be returned" condition:
from statistics import mode, StatisticsError

def most_common(l):
    try:
        return mode(l)
    except StatisticsError as e:
        # will only return the first element if no unique mode found
        if 'no unique mode' in e.args[0]:
            return l[0]
        # this is for "StatisticsError: no mode for empty data"
        # after calling mode([])
        raise
Example:
>>> most_common(['a', 'b', 'b'])
'b'
>>> most_common([1, 2])
1
>>> most_common([])
StatisticsError: no mode for empty data
A simple one-line solution:
moc = max((lst.count(c), c) for c in set(lst))
It returns the most frequent element together with its frequency, as a (frequency, element) tuple.
You probably don't need this anymore, but this is what I did for a similar problem. (It looks longer than it is because of the comments.)
itemList = ['hi', 'hi', 'hello', 'bye']
counter = {}
maxItemCount = 0
for item in itemList:
    try:
        # Referencing this will cause a KeyError exception
        # if it doesn't already exist
        counter[item]
        # ... meaning if we get this far it didn't happen so
        # we'll increment
        counter[item] += 1
    except KeyError:
        # If we got a KeyError we need to create the
        # dictionary key
        counter[item] = 1
    # Keep overwriting maxItemCount with the latest number,
    # if it's higher than the existing itemCount
    if counter[item] > maxItemCount:
        maxItemCount = counter[item]
        mostPopularItem = item
print(mostPopularItem)
ans = [1, 1, 0, 0, 1, 1]
all_ans = {ans.count(ans[i]): ans[i] for i in range(len(ans))}
print(all_ans)            # {4: 1, 2: 0}
max_key = max(all_ans.keys())   # 4
print(all_ans[max_key])   # 1
# This will return the list sorted by frequency:
import numpy as np

def orderByFrequency(list):
    listUniqueValues = np.unique(list)
    listQty = []
    listOrderedByFrequency = []
    for i in range(len(listUniqueValues)):
        listQty.append(list.count(listUniqueValues[i]))
    for i in range(len(listQty)):
        index_bigger = np.argmax(listQty)
        for j in range(listQty[index_bigger]):
            listOrderedByFrequency.append(listUniqueValues[index_bigger])
        listQty[index_bigger] = -1
    return listOrderedByFrequency

# And this will return a list with the most frequent values in a list:
def getMostFrequentValues(list):
    if len(list) <= 1:
        return list
    list_most_frequent = []
    list_ordered_by_frequency = orderByFrequency(list)
    list_most_frequent.append(list_ordered_by_frequency[0])
    frequency = list_ordered_by_frequency.count(list_ordered_by_frequency[0])
    index = 0
    while index < len(list_ordered_by_frequency):
        index = index + frequency
        if index < len(list_ordered_by_frequency):
            testValue = list_ordered_by_frequency[index]
            testValueFrequency = list_ordered_by_frequency.count(testValue)
            if testValueFrequency == frequency:
                list_most_frequent.append(testValue)
            else:
                break
    return list_most_frequent

# tests:
print(getMostFrequentValues([]))
print(getMostFrequentValues([1]))
print(getMostFrequentValues([1,1]))
print(getMostFrequentValues([2,1]))
print(getMostFrequentValues([2,2,1]))
print(getMostFrequentValues([1,2,1,2]))
print(getMostFrequentValues([1,2,1,2,2]))
print(getMostFrequentValues([3,2,3,5,6,3,2,2]))
print(getMostFrequentValues([1,2,2,60,50,3,3,50,3,4,50,4,4,60,60]))
Results:
[]
[1]
[1]
[1, 2]
[2]
[1, 2]
[2]
[2, 3]
[3, 4, 50, 60]
Here:
def most_common(l):
    max = 0
    maxitem = None
    for x in set(l):
        count = l.count(x)
        if count > max:
            max = count
            maxitem = x
    return maxitem
I have a vague feeling there is a method somewhere in the standard library that will give you the count of each element, but I can't find it. (It is collections.Counter, shown in other answers.)
This is the obvious slow solution (O(n^2)) if neither sorting nor hashing is feasible, but equality comparison (==) is available:
def most_common(items):
    if not items:
        raise ValueError
    fitems = []
    best_idx = 0
    for item in items:
        item_missing = True
        i = 0
        for fitem in fitems:
            if fitem[0] == item:
                fitem[1] += 1
                d = fitem[1] - fitems[best_idx][1]
                if d > 0 or (d == 0 and fitems[best_idx][2] > fitem[2]):
                    best_idx = i
                item_missing = False
                break
            i += 1
        if item_missing:
            fitems.append([item, 1, i])
    return fitems[best_idx][0]
But making your items hashable or sortable (as recommended by other answers) would almost always make finding the most common element faster if the length of your list (n) is large. O(n) on average with hashing, and O(n*log(n)) at worst for sorting.
>>> li = ['goose', 'duck', 'duck']
>>> def foo(li):
...     st = set(li)
...     mx = -1
...     for each in st:
...         temp = li.count(each)
...         if mx < temp:
...             mx = temp
...             h = each
...     return h
...
>>> foo(li)
'duck'
I needed to do this in a recent program. I'll admit it, I couldn't understand Alex's answer, so this is what I ended up with.
def mostPopular(l):
    mpEl = None
    mpIndex = 0
    mpCount = 0
    curEl = None
    curCount = 0
    for i, el in sorted(enumerate(l), key=lambda x: (x[1], x[0]), reverse=True):
        curCount = curCount + 1 if el == curEl else 1
        curEl = el
        if curCount > mpCount \
           or (curCount == mpCount and i < mpIndex):
            mpEl = curEl
            mpIndex = i
            mpCount = curCount
    return mpEl, mpCount, mpIndex
I timed it against Alex's solution and it's about 10-15% faster for short lists, but once you go over 100 elements or more (tested up to 200000) it's about 20% slower.
def most_frequent(List):
    counter = 0
    num = List[0]
    for i in List:
        curr_frequency = List.count(i)
        if curr_frequency > counter:
            counter = curr_frequency
            num = i
    return num

List = [2, 1, 2, 2, 1, 3]
print(most_frequent(List))
Hi, this is a very simple solution, though note that it is quadratic rather than linear in time, since count() rescans the list on every iteration:
L = ['goose', 'duck', 'duck']

def most_common(L):
    current_winner = 0
    max_repeated = None
    for i in L:
        amount_times = L.count(i)
        if amount_times > current_winner:
            current_winner = amount_times
            max_repeated = i
    return max_repeated

print(most_common(L))
# 'duck'
Here max_repeat_num is the element in the list that repeats most of the time:
numbers = [1, 3, 7, 4, 3, 0, 3, 6, 3]
max_repeat_num = max(numbers, key=numbers.count)  # which number occurs most frequently
max_repeat = numbers.count(max_repeat_num)        # how many times
print(f"the number {max_repeat_num} is repeated {max_repeat} times")
def mostCommonElement(list):
    count = {}      # dict holder
    max = 0         # keep track of the count by key
    result = None   # holder when count is greater than max
    for i in list:
        if i not in count:
            count[i] = 1
        else:
            count[i] += 1
        if count[i] > max:
            max = count[i]
            result = i
    return result

mostCommonElement(["a", "b", "a", "c"])  # -> "a"
This finds a majority element, i.e. one appearing more than N/2 times in the array, where N is len(array). The technique below does it in O(n) time, with the Counter consuming auxiliary space proportional to the number of distinct elements.
from collections import Counter

def majorityElement(arr):
    majority_elem = Counter(arr)
    size = len(arr)
    for key, val in majority_elem.items():
        if val > size / 2:
            return key
    return -1
def most_common(lst):
    if max([lst.count(i) for i in lst]) == 1:
        return False
    else:
        return max(set(lst), key=lst.count)
def popular(L):
    C = {}
    for a in L:
        C[a] = L.count(a)
    for b in C.keys():
        if C[b] == max(C.values()):
            return b

L = [2, 3, 5, 3, 6, 3, 6, 3, 6, 3, 7, 467, 4, 7, 4]
print(popular(L))
