Function for Simple Moving Average (SMA) - Python

I have the following function for calculating SMA in python:
import numpy as np

def calcSma(data, smaPeriod):
    sma = []
    count = 0
    for i in xrange(data.size):
        if data[i] is None:
            sma.append(None)
        else:
            count += 1
            if count < smaPeriod:
                sma.append(None)
            else:
                sma.append(np.mean(data[i-smaPeriod+1:i+1]))
    return np.array(sma)
This function works, but I don't find it very Pythonic. I don't like the indexing and counting I'm doing, nor the way I have to append to a list and then turn it into a numpy array before returning it.
The reason I have to deal with all these Nones is that I want to return an array of the same size as the input array. That makes it easier to plot and deal with on a general level later. I can easily do stuff such as this:
sma = calcSma(data=data, smaPeriod=20)
sma2 = calcSma(data=sma, smaPeriod=10)
plt.plot(data)
plt.plot(sma)
plt.plot(sma2)
plt.show()
So, any ideas on how this can be done prettier and more pythonic?

Pythonic enough, I hope:
import numpy as np

def calcSma(data, smaPeriod):
    j = next(i for i, x in enumerate(data) if x is not None)
    our_range = range(len(data))[j + smaPeriod - 1:]
    empty_list = [None] * (j + smaPeriod - 1)
    sub_result = [np.mean(data[i - smaPeriod + 1: i + 1]) for i in our_range]
    return np.array(empty_list + sub_result)
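If you're happy to represent the missing leading values as NaN instead of None (NaN plots as a gap in matplotlib, so the chaining example above still behaves), a fully vectorized sketch using np.convolve could look like this. Note that, unlike the original, it does not skip leading None entries, and the calc_sma name here is just illustrative:

import numpy as np

def calc_sma(data, sma_period):
    # uniform filter via convolution; 'valid' yields len(data) - sma_period + 1 means
    means = np.convolve(data, np.ones(sma_period) / sma_period, mode="valid")
    # pad the head with NaN so the output has the same size as the input
    return np.concatenate([np.full(sma_period - 1, np.nan), means])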

Here is another implementation of a moving average, using only the standard Python library:
from collections import deque
import itertools

def moving_average(iterable, n=3):
    # http://en.wikipedia.org/wiki/Moving_average
    it = iter(iterable)                   # create an iterator from the input argument
    d = deque(itertools.islice(it, n-1))  # seed the deque with the first n-1 items
    d.appendleft(0)
    s = sum(d)
    for elem in it:
        s += elem - d.popleft()
        d.append(elem)
        yield s / n

# example on how to use it
for i in moving_average([40, 30, 50, 46, 39, 44]):
    print(i)
# 40.0
# 42.0
# 45.0
# 43.0

Related

Numpy, how can I index an array to keep items that are smaller than the previous 5 and next 5 items?

I'm making a trading strategy that uses support and resistance levels. One of the ways I'm finding those is by searching for maxima/minima (prices that are higher/lower than the previous and next 5 prices).
I have an array of smoothed closing prices, and I first tried to find them with a for loop:
def find_max_min(smoothed_prices):  # smoothed_prices = np.array([1.873, ...])
    avg_delta = np.diff(smoothed_prices).mean()
    maximas = []
    minimas = []
    for index in range(len(smoothed_prices)):
        if index < 5 or index > len(smoothed_prices) - 6:
            continue
        current_value = smoothed_prices[index]
        previous_points = smoothed_prices[index - 5:index]
        next_points = smoothed_prices[index + 1:index + 6]
        previous_are_higher = all(x > current_value for x in previous_points)
        next_are_higher = all(x > current_value for x in next_points)
        previous_are_smaller = all(x < current_value for x in previous_points)
        next_are_smaller = all(x < current_value for x in next_points)
        previous_delta_is_enough = abs(previous_points[0] - current_value) > avg_delta
        next_delta_is_enough = abs(next_points[-1] - current_value) > avg_delta
        delta_is_enough = previous_delta_is_enough and next_delta_is_enough
        if previous_are_higher and next_are_higher and delta_is_enough:
            minimas.append(current_value)
        elif previous_are_smaller and next_are_smaller and delta_is_enough:
            maximas.append(current_value)
        else:
            continue
    return maximas, minimas
(This isn't the actual code I used, because I erased it; it may not work, but it was something like that.)
This code could find the maxima and minima, but it was way too slow, and I need to use the function multiple times per second on huge arrays.
My question is: is it possible to do it with a numpy mask, in a similar way to this:
s = smoothed_prices
minimas = s[all(x > s[index] for x in s[index-5:index]) and all(x > s[index] for x in s[index+1:index+6])]
maximas = ...
or do you know how I could do it in another efficient numpy way?
I have thought of a way; it should be faster than the for loop you presented, but it uses more memory. Simply put, it creates an intermediate matrix of windows, then takes the max and min of each window:
def find_max_min(arr, win_pad_size=5):
    windows = np.zeros((len(arr) - 2 * win_pad_size, 2 * win_pad_size + 1))
    for i in range(2 * win_pad_size + 1):
        windows[:, i] = arr[i:i+windows.shape[0]]
    return windows.max(axis=1), windows.min(axis=1)
Edit: I found a faster way to calculate the sub-sequences (which I had called windows) in Split Python sequence into subsequences. It doesn't use more memory; instead, it creates a view of the array.
def subsequences(ts, window):
    shape = (ts.size - window + 1, window)
    strides = ts.strides * 2
    return np.lib.stride_tricks.as_strided(ts, shape=shape, strides=strides)

def find_max_min(arr, win_pad_size=5):
    windows = subsequences(arr, 2 * win_pad_size + 1)
    return windows.max(axis=1), windows.min(axis=1)
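To turn those per-window extrema into actual local minima/maxima, one possible follow-up (my addition, not part of the original answer; it assumes win_pad_size=5, a numpy input, and no ties) is to compare each center element with its own window:

maxs, mins = find_max_min(smoothed_prices)
centers = smoothed_prices[5:-5]    # elements with 5 neighbours on each side
minima = centers[centers == mins]  # local minima values
maxima = centers[centers == maxs]  # local maxima values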
You can do it easily by:
from skimage.util import view_as_windows
a = smoothed_prices[4:-5]
a[a == view_as_windows(smoothed_prices, (10)).min(-1)]
Please note that since you are looking at minima within +/- 5 of the index, they can only be at indices [4:-5] of your array.
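The maxima case is symmetric (this line is my addition, not part of the original answer):

a[a == view_as_windows(smoothed_prices, (10)).max(-1)]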

Any better/faster way to calculate the relative rank of each element in an array?

I want to calculate the relative rank of each element in an array among elements before it. For example in an array [2,1,4,3], the relative rank (from small to large) of the second element (1) among a subset array of [2,1] is 1. The relative rank of the third element (4) among a subset array of [2,1,4] is 3. The final relative rank of each element should be [1,1,3,3].
I'm using the following python code:
x = np.array([2, 1, 4, 3])
rr = np.ones(4)
for i in range(1, 4):
    rr[i] = sum(x[i] >= x[:i+1])
Are there any other faster ways?
Not sure if it's faster, but you can do this with a list comprehension, which always brightens my day:
[sorted(x[:i+1]).index(v)+1 for i, v in enumerate(x)]
Here's a vectorized way with broadcasting -
n = len(x)
rr = np.ones(n, dtype=int)
m1 = x[1:,None] >= x
m2 = np.tri(n-1, n, k=1, dtype=bool)
rr[1:] = (m1 & m2).sum(1)
Alternatively, we could bring in einsum or np.matmul to do the last step of sum-reduction -
rr[1:] = (m1.astype(np.float32)[:,None,:] @ m2[:,:,None])[:,0,0]
rr[1:] = np.einsum('ij,ij->i', m1.astype(np.float32), m2)
Your current algorithm takes quadratic time, which isn't going to scale to large inputs. You can do a lot better.
One way to do better would be to use a sorted data structure, like sortedcontainers.SortedList, and perform a series of lookups and insertions. The following example implementation returns a list, assumes no ties, and starts ranks from 0:
import sortedcontainers

def rank(nums):
    sortednums = sortedcontainers.SortedList()
    ranks = []
    for num in nums:
        ranks.append(sortednums.bisect_left(num))
        sortednums.add(num)
    return ranks
Most of the work is inside the SortedList implementation, and SortedList is pretty fast, so this shouldn't have too much Python overhead. The existence of sortedcontainers definitely makes this more convenient than the next option, if not necessarily more efficient.
This option runs in... O(n log n)-ish time. SortedList uses a two-layer hierarchy instead of a traditional tree structure, making a deliberate tradeoff of more data movement for less pointer chasing, so insertion isn't theoretically O(log n), but it's efficient in practice.
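As a quick check against the question's example (ranks here start from 0, so add 1 to match the expected [1,1,3,3]):

print(rank([2, 1, 4, 3]))                   # [0, 0, 2, 2]
print([r + 1 for r in rank([2, 1, 4, 3])])  # [1, 1, 3, 3]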
The next option would be to use an augmented mergesort. If you do this, you're going to want to use Numba or Cython, because you'll have to write the loops manually.
The basic idea is to do a mergesort, but tracking the rank of each element in its subarray as you go. When you merge two sorted subarrays, each element on the left side keeps its old rank, while the rank values for elements on the right side get adjusted upward for how many elements on the left were less than them.
This option runs in O(n log n).
An unoptimized implementation operating on Python lists, assuming no ties, and starting ranks at 0, would look like this:
def rank(nums):
    _, indexes, ranks = _augmented_mergesort(nums)
    result = [None] * len(nums)
    for i, rank_ in zip(indexes, ranks):
        result[i] = rank_
    return result

def _augmented_mergesort(nums):
    # returns sorted nums, indexes of sorted nums in original nums, and corresponding ranks
    if len(nums) == 1:
        return nums, [0], [0]
    left, right = nums[:len(nums)//2], nums[len(nums)//2:]
    return _merge(*_augmented_mergesort(left), *_augmented_mergesort(right))

def _merge(lnums, lindexes, lranks, rnums, rindexes, rranks):
    nums, indexes, ranks = [], [], []
    i_left = i_right = 0

    def add_from_left():
        nonlocal i_left
        nums.append(lnums[i_left])
        indexes.append(lindexes[i_left])
        ranks.append(lranks[i_left])
        i_left += 1

    def add_from_right():
        nonlocal i_right
        nums.append(rnums[i_right])
        indexes.append(rindexes[i_right] + len(lnums))
        ranks.append(rranks[i_right] + i_left)
        i_right += 1

    while i_left < len(lnums) and i_right < len(rnums):
        if lnums[i_left] < rnums[i_right]:
            add_from_left()
        elif lnums[i_left] > rnums[i_right]:
            add_from_right()
        else:
            raise ValueError("Tie detected")
    if i_left < len(lnums):
        nums += lnums[i_left:]
        indexes += lindexes[i_left:]
        ranks += lranks[i_left:]
    else:
        while i_right < len(rnums):
            add_from_right()
    return nums, indexes, ranks
For an optimized implementation, you'd want an insertion sort base case, you'd want to use Numba or Cython, you'd want to operate on arrays, and you'd want to not do so much allocation.
You are all my heroes, doing a great job! I'd like to show you a comparison of your solutions:
import numpy as np
import time
import sortedcontainers
def John(x):
    n = len(x)
    rr = np.ones(n)
    for i in range(1, n):
        rr[i] = sum(x[i] >= x[:i+1])
    return rr

def Matvei(x):
    return [sorted(x[:i+1]).index(v)+1 for i, v in enumerate(x)]

def Divarkar1(x):
    n = len(x)
    rr = np.ones(n, dtype=int)
    m1 = x[1:,None] >= x
    m2 = np.tri(n-1, n, k=1, dtype=bool)
    rr[1:] = (m1 & m2).sum(1)
    return rr

def Divarkar2(x):
    n = len(x)
    rr = np.ones(n, dtype=int)
    m1 = x[1:,None] >= x
    m2 = np.tri(n-1, n, k=1, dtype=bool)
    (m1.astype(np.float32)[:,None,:] @ m2[:,:,None])[:,0,0]
    rr[1:] = np.einsum('ij,ij->i', m1.astype(np.float32), m2)
    return rr

def Monica(x):
    sortednums = sortedcontainers.SortedList()
    ranks = []
    for num in x:
        ranks.append(sortednums.bisect_left(num))
        sortednums.add(num)
    return np.array(ranks) + 1
x=np.random.rand(4000)
t1=time.time()
rr=John(x)
t2=time.time()
print(t2-t1)
#print(rr)
t1=time.time()
rr=Matvei(x)
t2=time.time()
print(t2-t1)
#print(rr)
t1=time.time()
rr=Divarkar1(x)
t2=time.time()
print(t2-t1)
#print(rr)
t1=time.time()
rr=Divarkar2(x)
t2=time.time()
print(t2-t1)
#print(rr)
t1=time.time()
rr=Monica(x)
t2=time.time()
print(t2-t1)
#print(rr)
The results are:
19.5
2.9
0.079
0.25
0.017
I ran it several times and the results were similar. The best one is Monica's algorithm!
Many thanks to everyone!
John
When I converted all the algorithms to work on 2D numpy arrays, I found that my algorithm is the best. Of course, performance also depends on the dimensions of the 2D array, but 380x900 is my case. I think numpy array calculation benefits it a lot. Here is the code:
import numpy as np
import time
import sortedcontainers
def John(x):  # x is a 1D array
    n = len(x)
    rr = []
    for i in range(n):
        rr.append(np.sum(x[i] >= x[:i+1]))
    return np.array(rr)

def John_2D(rv):  # rv is a 2D numpy array; rank it along axis 1!
    nr, nc = rv.shape
    rr = []
    for i in range(nc):
        rr.append(np.sum((rv[:,:i+1] <= rv[:,i:i+1]), axis=1))
    return np.array(rr).T

def Matvei(x):  # x is a 1D array
    return [sorted(x[:i+1]).index(v)+1 for i, v in enumerate(x)]

def Divarkar1(x):  # x is a 1D array
    n = len(x)
    rr = np.ones(n, dtype=int)
    m1 = x[1:,None] >= x
    m2 = np.tri(n-1, n, k=1, dtype=bool)
    rr[1:] = (m1 & m2).sum(1)
    return rr

def Divarkar2(x):  # x is a 1D array
    n = len(x)
    rr = np.ones(n, dtype=int)
    m1 = x[1:,None] >= x
    m2 = np.tri(n-1, n, k=1, dtype=bool)
    (m1.astype(np.float32)[:,None,:] @ m2[:,:,None])[:,0,0]
    rr[1:] = np.einsum('ij,ij->i', m1.astype(np.float32), m2)
    return rr

def Monica1(nums):  # nums is a 1D array
    sortednums = sortedcontainers.SortedList()
    ranks = []
    for num in nums:
        ranks.append(sortednums.bisect_left(num))
        sortednums.add(num)
    return np.array(ranks) + 1

def Monica2(nums):  # nums is a 1D array
    _, indexes, ranks = _augmented_mergesort(nums)
    result = [None] * len(nums)
    for i, rank_ in zip(indexes, ranks):
        result[i] = rank_
    return np.array(result) + 1

def _augmented_mergesort(nums):  # nums is a 1D array
    # returns sorted nums, indexes of sorted nums in original nums, and corresponding ranks
    if len(nums) == 1:
        return nums, [0], [0]
    left, right = nums[:len(nums)//2], nums[len(nums)//2:]  # split the array in half
    return _merge(*_augmented_mergesort(left), *_augmented_mergesort(right))

def _merge(lnums, lindexes, lranks, rnums, rindexes, rranks):
    nums, indexes, ranks = [], [], []
    i_left = i_right = 0

    def add_from_left():
        nonlocal i_left
        nums.append(lnums[i_left])
        indexes.append(lindexes[i_left])
        ranks.append(lranks[i_left])
        i_left += 1

    def add_from_right():
        nonlocal i_right
        nums.append(rnums[i_right])
        indexes.append(rindexes[i_right] + len(lnums))
        ranks.append(rranks[i_right] + i_left)
        i_right += 1

    while i_left < len(lnums) and i_right < len(rnums):
        if lnums[i_left] < rnums[i_right]:
            add_from_left()
        elif lnums[i_left] > rnums[i_right]:
            add_from_right()
        else:
            raise ValueError("Tie detected")
    if i_left < len(lnums):
        while i_left < len(lnums):
            add_from_left()
        #nums += lnums[i_left:]
        #indexes += lindexes[i_left:]
        #ranks += lranks[i_left:]
    else:
        while i_right < len(rnums):
            add_from_right()
    return nums, indexes, ranks

def rank_2D(f, nums):  # f is a method, nums is a 2D numpy array
    result = []
    for x in nums:
        result.append(f(x))
    return np.array(result)

x = np.random.rand(6000)
for f in [John, Matvei, Divarkar1, Divarkar2, Monica1, Monica2]:
    t1 = time.time()
    rr = f(x)
    t2 = time.time()
    print(f'{f.__name__+"_1D: ":16} {(t2-t1):.3f}')
print()

x = np.random.rand(380, 900)
t1 = time.time()
rr = John_2D(x)
t2 = time.time()
print(f'{"John_2D:":16} {(t2-t1):.3f}')
#print(rr)
for f in [Matvei, Divarkar1, Divarkar2, Monica1, Monica2]:
    t1 = time.time()
    rr = rank_2D(f, x)
    t2 = time.time()
    print(f'{f.__name__+"_2D: ":16} {(t2-t1):.3f}')
    #print(rr)
The typical results are:
John_1D: 0.069
Matvei_1D: 7.208
Divarkar1_1D: 0.163
Divarkar2_1D: 0.488
Monica1_1D: 0.032
Monica2_1D: 0.082
John_2D: 0.409
Matvei_2D: 49.044
Divarkar1_2D: 1.276
Divarkar2_2D: 4.065
Monica1_2D: 1.090
Monica2_2D: 3.571
For 1D arrays, Monica1's method is the best, but my numpy version is not too bad.
For 2D arrays, my numpy version is the best.
You're welcome to test and comment.
Thanks
John

python itertools permutations with tied values

I want to efficiently find the permutations of a vector which has tied values.
E.g., if perm_vector = [0,0,1,2] I would want to obtain as output all distinct permutations [0,0,1,2], [0,0,2,1], [0,1,2,0] and so on, but I don't want to obtain [0,0,1,2] twice, which is what the standard itertools.permutations(perm_vector) would give.
I tried the following, but it gets really SLOW as perm_vector grows in length:
vectors_list = []
for it in itertools.permutations(perm_vector):
    vectors_list.append(list(it))
df_vectors_list = pd.DataFrame(vectors_list)
df_gb = df_vectors_list.groupby(list(df_vectors_list.columns))
vectors_list = pd.DataFrame(df_gb.groups.keys()).T
The question is of a more general "speed-up" nature, actually. Most of the time is spent on creating the permutations of long vectors - even without the duplicates, creating the permutations of a vector of 12 unique values takes an "infinity". Is there a possibility to call itertools iteratively, not accessing the entire permutations data at once but working on chunks of it?
Try this if perm_vector is small:
import itertools as iter
{x for x in iter.permutations(perm_vector)}
This should give you unique values, because it becomes a set, which by default removes duplicates.
If perm_vector is large, you might want to try backtracking:
def permu(L, left, right, cache):
    for i in range(left, right):
        L[left], L[i] = L[i], L[left]  # place element i at position left
        L_tuple = tuple(L)
        if L_tuple not in cache:       # prune states already fully explored
            permu(L, left + 1, right, cache)
        L[left], L[i] = L[i], L[left]  # swap back (backtrack)
        cache[L_tuple] = 0

cache = {}
permu(perm_vector, 0, len(perm_vector), cache)
cache.keys()
How about this:
from collections import Counter

def starter(l):
    cnt = Counter(l)
    res = [None] * len(l)
    return worker(cnt, res, len(l) - 1)

def worker(cnt, res, n):
    if n < 0:
        yield tuple(res)
    else:
        for k in cnt.keys():
            if cnt[k] != 0:
                cnt[k] = cnt[k] - 1
                res[n] = k
                for r in worker(cnt, res, n - 1):
                    yield r
                cnt[k] = cnt[k] + 1
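A quick usage sketch for the generator above (my example): [0,0,1,2] has 4!/2! = 12 distinct permutations, versus 24 tuples from itertools.permutations.

perms = list(starter([0, 0, 1, 2]))
print(len(perms))       # 12
print(len(set(perms)))  # 12, i.e. no duplicates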

Inverse of random.shuffle()?

I have a function, for simplicity I'll call it shuffler, and it takes a list, seeds random with 17, and then prints that list shuffled.
def shuffler(n):
    import random
    random.seed(17)
    print(random.shuffle(n))  # note: random.shuffle shuffles in place and returns None
How would I create another function called unshuffler that "unshuffles" the list returned by shuffler(), bringing it back to the list I passed into shuffler(), assuming that I know the seed?
Just wanted to contribute an answer that's more compatible with functional patterns commonly used with numpy. Ultimately this solution should perform the fastest as it will take advantage of numpy's internal optimizations, which themselves can be further optimized via the use of projects like numba. It ought to be much faster than using conventional loop structures in python.
import numpy as np
original_data = np.array([23, 44, 55, 19, 500, 201]) # Some random numbers to represent the original data to be shuffled
data_length = original_data.shape[0]
# Here we create an array of shuffled indices
shuf_order = np.arange(data_length)
np.random.shuffle(shuf_order)
shuffled_data = original_data[shuf_order] # Shuffle the original data
# Create an inverse of the shuffled index array (to reverse the shuffling operation, or to "unshuffle")
unshuf_order = np.zeros_like(shuf_order)
unshuf_order[shuf_order] = np.arange(data_length)
unshuffled_data = shuffled_data[unshuf_order] # Unshuffle the shuffled data
print(f"original_data: {original_data}")
print(f"shuffled_data: {shuffled_data}")
print(f"unshuffled_data: {unshuffled_data}")
assert np.all(np.equal(unshuffled_data, original_data))
Here are two functions that do what you need:
import random
import numpy as np

def shuffle_forward(l):
    order = range(len(l)); random.shuffle(order)
    return list(np.array(l)[order]), order

def shuffle_backward(l, order):
    l_out = [0] * len(l)
    for i, j in enumerate(order):
        l_out[j] = l[i]
    return l_out
Example
l = range(10000); random.shuffle(l)
l_shuf, order = shuffle_forward(l)
l_unshuffled = shuffle_backward(l_shuf, order)
print l == l_unshuffled
#True
Reseed the random generator with the seed in question and then shuffle the list 1, 2, ..., n. This tells you exactly what ended up where in the shuffle.
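A minimal sketch of that idea, assuming the list was shuffled in place after random.seed(17) as in the question (the unshuffle name is mine):

import random

def unshuffle(shuffled, seed=17):
    # shuffling range(n) under the same seed reproduces the permutation:
    # perm[i] is the original index of the element that ended up at position i
    perm = list(range(len(shuffled)))
    random.seed(seed)
    random.shuffle(perm)
    original = [None] * len(shuffled)
    for i, j in enumerate(perm):
        original[j] = shuffled[i]
    return original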
In Python3:
import random
import numpy as np

def shuffle_forward(l):
    order = list(range(len(l))); random.shuffle(order)
    return list(np.array(l)[order]), order

def shuffle_backward(l, order):
    l_out = [0] * len(l)
    for i, j in enumerate(order):
        l_out[j] = l[i]
    return l_out

Fast way to remove a few items from a list/queue

This is a follow-up to a similar question which asked the best way to write
for item in somelist:
    if determine(item):
        code_to_remove_item
and it seems the consensus was on something like
somelist[:] = [x for x in somelist if not determine(x)]
However, I think if you are only removing a few items, most of the items are being copied into the new list, and perhaps that is slow. In an answer to another related question, someone suggests:
for item in reversed(somelist):
    if determine(item):
        somelist.remove(item)
However, here list.remove will search for the item, which is O(N) in the length of the list. Maybe we are limited in that the list is represented as an array, rather than a linked list, so removing items will need to move everything after it. However, it is suggested here that collections.deque is represented as a doubly linked list. It should then be possible to remove in O(1) while iterating. How would we actually accomplish this?
Update:
I did some time testing as well, with the following code:
import timeit

setup = """
import random
random.seed(1)
b = [(random.random(), random.random()) for i in xrange(1000)]
c = []
def tokeep(x):
    return (x[1] > .45) and (x[1] < .5)
"""
listcomp = """
c[:] = [x for x in b if tokeep(x)]
"""
filt = """
c = filter(tokeep, b)
"""
print "list comp = ", timeit.timeit(listcomp, setup, number = 10000)
print "filtering = ", timeit.timeit(filt, setup, number = 10000)
and got:
list comp = 4.01255393028
filtering = 3.59962391853
The list comprehension is the asymptotically optimal solution:
somelist = [x for x in somelist if not determine(x)]
It only makes one pass over the list, so runs in O(n) time. Since you need to call determine() on each object, any algorithm will require at least O(n) operations. The list comprehension does have to do some copying, but it's only copying references to the objects not copying the objects themselves.
Removing items from a list in Python is O(n), so anything with a remove, pop, or del inside the loop will be O(n**2).
Also, in CPython list comprehensions are faster than for loops.
If you need to remove items in O(1), you can use hash maps.
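Presumably that means keeping the items in a set or dict keyed by item, where deletion by key is O(1) on average; a sketch under that assumption (it requires hashable items and gives up ordering and duplicates):

items = set(somelist)
for item in list(items):      # iterate over a copy so we can mutate the set
    if determine(item):
        items.discard(item)   # O(1) average-case removal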
Since list.remove is equivalent to del list[list.index(x)], you could do:
for idx, item in enumerate(somelist):
    if determine(item):
        del somelist[idx]
But: you should not modify the list while iterating over it. It will bite you, sooner or later. Use filter or list comprehension first, and optimise later.
A deque is optimized for head and tail removal, not for arbitrary removal in the middle. The removal itself is fast, but you still have to traverse the list to the removal point. If you're iterating through the entire length, then the only difference between filtering a deque and filtering a list (using filter or a comprehension) is the overhead of copying, which at worst is a constant multiple; it's still an O(n) operation. Also, note that the objects in the list aren't being copied -- just the references to them. So it's not that much overhead.
It's possible that you could avoid copying like so, but I have no particular reason to believe this is faster than a straightforward list comprehension -- it's probably not:
write_i = 0
for read_i in range(len(L)):
    L[write_i] = L[read_i]
    if L[read_i] not in ['a', 'c']:
        write_i += 1
del L[write_i:]
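For example, starting from L = list('abcde') and removing 'a' and 'c', the loop above leaves L == ['b', 'd', 'e']: each surviving element is shifted left over the removed ones in a single pass, and the final del truncates the tail, so it stays O(n) without allocating a second list.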
I took a stab at this. My solution is slower, but requires less memory overhead (i.e. it doesn't create a new array). It might even be faster in some circumstances!
This code has been edited since its first posting
I had problems with timeit, I might be doing this wrong.
import timeit

setup = """
import random
random.seed(1)
global b
setup_b = [(random.random(), random.random()) for i in xrange(1000)]
c = []
def tokeep(x):
    return (x[1] > .45) and (x[1] < .5)

# define and call to turn into psyco bytecode (if using psyco)
b = setup_b[:]
def listcomp():
    c[:] = [x for x in b if tokeep(x)]
listcomp()

b = setup_b[:]
def filt():
    c = filter(tokeep, b)
filt()

b = setup_b[:]
def forfilt():
    marked = (i for i, x in enumerate(b) if tokeep(x))
    shift = 0
    for n in marked:
        del b[n - shift]
        shift += 1
forfilt()

b = setup_b[:]
def forfiltCheating():
    marked = (i for i, x in enumerate(b) if (x[1] > .45) and (x[1] < .5))
    shift = 0
    for n in marked:
        del b[n - shift]
        shift += 1
forfiltCheating()
"""

listcomp = """
b = setup_b[:]
listcomp()
"""
filt = """
b = setup_b[:]
filt()
"""
forfilt = """
b = setup_b[:]
forfilt()
"""
forfiltCheating = '''
b = setup_b[:]
forfiltCheating()
'''
psycosetup = '''
import psyco
psyco.full()
'''

print "list comp = ", timeit.timeit(listcomp, setup, number = 10000)
print "filtering = ", timeit.timeit(filt, setup, number = 10000)
print 'forfilter = ', timeit.timeit(forfilt, setup, number = 10000)
print 'forfiltCheating = ', timeit.timeit(forfiltCheating, setup, number = 10000)
print '\nnow with psyco \n'
print "list comp = ", timeit.timeit(listcomp, psycosetup + setup, number = 10000)
print "filtering = ", timeit.timeit(filt, psycosetup + setup, number = 10000)
print 'forfilter = ', timeit.timeit(forfilt, psycosetup + setup, number = 10000)
print 'forfiltCheating = ', timeit.timeit(forfiltCheating, psycosetup + setup, number = 10000)
And here are the results
list comp = 6.56407690048
filtering = 5.64738512039
forfilter = 7.31555104256
forfiltCheating = 4.8994679451
now with psyco
list comp = 8.0485959053
filtering = 7.79016900063
forfilter = 9.00477004051
forfiltCheating = 4.90830993652
I must be doing something wrong with psyco, because it is actually running slower.
Elements are not copied by list comprehension
This took me a while to figure out. See the example code below to experiment yourself with different approaches.
code
You can specify how long a list element takes to copy and how long it takes to evaluate. The time to copy is irrelevant for list comprehension, as it turned out.
import time
import timeit
import numpy as np

def ObjectFactory(time_eval, time_copy):
    """
    Creates a class

    Parameters
    ----------
    time_eval : float
        time to evaluate (True or False, i.e. keep in list or not) an object
    time_copy : float
        time to (shallow-) copy an object. Used by list comprehension.

    Returns
    -------
    New class with defined copy-evaluate performance
    """
    class Object:
        def __init__(self, id_, keep):
            self.id_ = id_
            self._keep = keep

        def __repr__(self):
            return f"Object({self.id_}, {self.keep})"

        @property
        def keep(self):
            time.sleep(time_eval)
            return self._keep

        def __copy__(self):  # list comprehension does not copy the object
            time.sleep(time_copy)
            return self.__class__(self.id_, self._keep)
    return Object

def remove_items_from_list_list_comprehension(lst):
    return [el for el in lst if el.keep]

def remove_items_from_list_new_list(lst):
    new_list = []
    for el in lst:
        if el.keep:
            new_list += [el]
    return new_list

def remove_items_from_list_new_list_by_ind(lst):
    new_list_inds = []
    for ee in range(len(lst)):
        if lst[ee].keep:
            new_list_inds += [ee]
    return [lst[ee] for ee in new_list_inds]

def remove_items_from_list_del_elements(lst):
    """WARNING: Modifies lst"""
    new_list_inds = []
    for ee in range(len(lst)):
        if lst[ee].keep:
            new_list_inds += [ee]
    for ind in new_list_inds[::-1]:
        if not lst[ind].keep:
            del lst[ind]

if __name__ == "__main__":
    ClassSlowCopy = ObjectFactory(time_eval=0, time_copy=0.1)
    ClassSlowEval = ObjectFactory(time_eval=1e-8, time_copy=0)
    keep_ratio = .8
    n_runs_timeit = int(1e2)
    n_elements_list = int(1e2)
    lsts_to_tests = dict(
        list_slow_copy_remove_many = [ClassSlowCopy(ii, np.random.rand() > keep_ratio) for ii in range(n_elements_list)],
        list_slow_copy_keep_many = [ClassSlowCopy(ii, np.random.rand() > keep_ratio) for ii in range(n_elements_list)],
        list_slow_eval_remove_many = [ClassSlowEval(ii, np.random.rand() > keep_ratio) for ii in range(n_elements_list)],
        list_slow_eval_keep_many = [ClassSlowEval(ii, np.random.rand() > keep_ratio) for ii in range(n_elements_list)],
    )
    for lbl, lst in lsts_to_tests.items():
        print()
        for fct in [
            remove_items_from_list_list_comprehension,
            remove_items_from_list_new_list,
            remove_items_from_list_new_list_by_ind,
            remove_items_from_list_del_elements,
        ]:
            lst_loc = lst.copy()
            t = timeit.timeit(lambda: fct(lst_loc), number=n_runs_timeit)
            print(f"{fct.__name__}, {lbl}: {t=}")
output
remove_items_from_list_list_comprehension, list_slow_copy_remove_many: t=0.0064229519994114526
remove_items_from_list_new_list, list_slow_copy_remove_many: t=0.006507338999654166
remove_items_from_list_new_list_by_ind, list_slow_copy_remove_many: t=0.006562008995388169
remove_items_from_list_del_elements, list_slow_copy_remove_many: t=0.0076057760015828535
remove_items_from_list_list_comprehension, list_slow_copy_keep_many: t=0.006243691001145635
remove_items_from_list_new_list, list_slow_copy_keep_many: t=0.007145451003452763
remove_items_from_list_new_list_by_ind, list_slow_copy_keep_many: t=0.007032064997474663
remove_items_from_list_del_elements, list_slow_copy_keep_many: t=0.007690364996960852
remove_items_from_list_list_comprehension, list_slow_eval_remove_many: t=1.2495998149970546
remove_items_from_list_new_list, list_slow_eval_remove_many: t=1.1657221479981672
remove_items_from_list_new_list_by_ind, list_slow_eval_remove_many: t=1.2621939050004585
remove_items_from_list_del_elements, list_slow_eval_remove_many: t=1.4632593330024974
remove_items_from_list_list_comprehension, list_slow_eval_keep_many: t=1.1344162709938246
remove_items_from_list_new_list, list_slow_eval_keep_many: t=1.1323430630000075
remove_items_from_list_new_list_by_ind, list_slow_eval_keep_many: t=1.1354237199993804
remove_items_from_list_del_elements, list_slow_eval_keep_many: t=1.3084568729973398
import collections

list1 = collections.deque(list1)
for i in list2:
    try:
        list1.remove(i)
    except ValueError:
        pass
Instead of checking whether the element is there, this uses try/except. I guess this is faster.
