I have an array and need the max of the rolling difference with a dynamic window.
a = np.array([8, 18, 5,15,12])
print (a)
[ 8 18 5 15 12]
So first I create the differences of the array with itself:
b = a - a[:, None]
print (b)
[[ 0 10 -3 7 4]
[-10 0 -13 -3 -6]
[ 3 13 0 10 7]
[ -7 3 -10 0 -3]
[ -4 6 -7 3 0]]
Then I replace the upper triangle with 0:
c = np.tril(b)
print (c)
[[ 0 0 0 0 0]
[-10 0 0 0 0]
[ 3 13 0 0 0]
[ -7 3 -10 0 0]
[ -4 6 -7 3 0]]
Finally I need the max values per diagonal, which means:
max([0,0,0,0,0]) = 0
max([-10,13,-10,3]) = 13
max([3,3,-7]) = 3
max([-7,6]) = 6
max([-4]) = -4
So expected output is:
[0, 13, 3, 6, -4]
What is a nice vectorized solution? Or is there another way to get the expected output?
Use ndarray.diagonal
v = [max(c.diagonal(-i)) for i in range(b.shape[0])]
print(v) # [0, 13, 3, 6, -4]
Not sure exactly how efficient this is considering the advanced indexing involved, but this is one way to do that:
import numpy as np
a = np.array([8, 18, 5, 15, 12])
b = a[:, None] - a
# Fill lower triangle with largest negative
b[np.tril_indices(len(a))] = np.iinfo(b.dtype).min # np.finfo for float
# Put diagonals as rows
s = b.strides[1]
diags = np.ndarray((len(a) - 1, len(a) - 1), b.dtype, b, offset=s, strides=(s, (len(a) + 1) * s))
# Get maximum from each row and add initial zero
c = np.r_[0, diags.max(1)]
print(c)
# [ 0 13 3 6 -4]
EDIT:
Another alternative, which may not be what you were looking for though, is just using Numba, for example like this:
import numpy as np
import numba as nb
def max_window_diffs_jdehesa(a):
    a = np.asarray(a)
    dtinf = np.iinfo(a.dtype) if np.issubdtype(a.dtype, np.integer) else np.finfo(a.dtype)
    out = np.full_like(a, dtinf.min)
    _pwise_diffs(a, out)
    return out

@nb.njit(parallel=True)
def _pwise_diffs(a, out):
    out[0] = 0
    for w in nb.prange(1, len(a)):
        for i in range(len(a) - w):
            out[w] = max(a[i] - a[i + w], out[w])

a = np.array([8, 18, 5, 15, 12])
print(max_window_diffs_jdehesa(a))
# [ 0 13 3 6 -4]
Comparing these methods to the original:
import numpy as np
import numba as nb
def max_window_diffs_orig(a):
    a = np.asarray(a)
    b = a - a[:, None]
    out = np.zeros(len(a), b.dtype)
    out[-1] = b[-1, 0]
    for i in range(1, len(a) - 1):
        out[i] = np.diag(b, -i).max()
    return out

def max_window_diffs_jdehesa_np(a):
    a = np.asarray(a)
    b = a[:, None] - a
    dtinf = np.iinfo(b.dtype) if np.issubdtype(b.dtype, np.integer) else np.finfo(b.dtype)
    b[np.tril_indices(len(a))] = dtinf.min
    s = b.strides[1]
    diags = np.ndarray((len(a) - 1, len(a) - 1), b.dtype, b, offset=s, strides=(s, (len(a) + 1) * s))
    return np.concatenate([[0], diags.max(1)])

def max_window_diffs_jdehesa_nb(a):
    a = np.asarray(a)
    dtinf = np.iinfo(a.dtype) if np.issubdtype(a.dtype, np.integer) else np.finfo(a.dtype)
    out = np.full_like(a, dtinf.min)
    _pwise_diffs(a, out)
    return out

@nb.njit(parallel=True)
def _pwise_diffs(a, out):
    out[0] = 0
    for w in nb.prange(1, len(a)):
        for i in range(len(a) - w):
            out[w] = max(a[i] - a[i + w], out[w])
np.random.seed(0)
a = np.random.randint(0, 100, size=100)
r = max_window_diffs_orig(a)
print((max_window_diffs_jdehesa_np(a) == r).all())
# True
print((max_window_diffs_jdehesa_nb(a) == r).all())
# True
%timeit max_window_diffs_orig(a)
# 348 µs ± 986 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit max_window_diffs_jdehesa_np(a)
# 91.7 µs ± 1.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit max_window_diffs_jdehesa_nb(a)
# 19.7 µs ± 88.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
np.random.seed(0)
a = np.random.randint(0, 100, size=10000)
%timeit max_window_diffs_orig(a)
# 651 ms ± 26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit max_window_diffs_jdehesa_np(a)
# 1.61 s ± 6.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit max_window_diffs_jdehesa_nb(a)
# 22 ms ± 967 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The NumPy-based one may be a bit better for smaller arrays, but it doesn't scale well to bigger ones. Numba, on the other hand, is pretty good in all cases.
You can use numpy.diagonal:
a = np.array([8, 18, 5,15,12])
b = a - a[:, None]
c = np.tril(b)
for i in range(b.shape[0]):
    print(max(c.diagonal(-i)))
Output:
0
13
3
6
-4
Here's a vectorized solution with strides -
from skimage.util import view_as_windows
n = len(a)
z = np.zeros(n-1,dtype=a.dtype)
p = np.concatenate((a,z))
s = view_as_windows(p,n)
mask = np.tri(n,k=-1,dtype=bool)[:,::-1]
v = s[0]-s
out = np.where(mask,v.min()-1,v).max(1)
With one loop, for memory efficiency -
n = len(a)
out = [max(a[:-i+n]-a[i:]) for i in range(n)]
Use np.max in place of max for better use of array-memory.
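For example, a minimal sketch of that suggestion (reusing a from the question; the comment shows the expected values):
import numpy as np
a = np.array([8, 18, 5, 15, 12])
n = len(a)
# np.max keeps the reduction inside NumPy instead of Python's builtin max
out = [np.max(a[:n - i] - a[i:]) for i in range(n)]
# out -> 0, 13, 3, 6, -4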
You can abuse the fact that reshaping non-square arrays of shape (N+1, N) to (N, N+1) will make diagonals appear as columns
from scipy.linalg import toeplitz
a = toeplitz([1,2,3,4], [1,4,3])
# array([[1, 4, 3],
# [2, 1, 4],
# [3, 2, 1],
# [4, 3, 2]])
a.reshape(3, 4)
# array([[1, 4, 3, 2],
# [1, 4, 3, 2],
# [1, 4, 3, 2]])
Which you can then use like this (note that I've swapped the sign and masked the lower triangle with a large negative value):
smallv = -10000 # replace this with np.nan if you have floats
a = np.array([8, 18, 5,15,12])
b = a[:, None] - a
b[np.tril_indices(len(b), -1)] = smallv
d = np.vstack((b, np.full(len(b), smallv)))
d.reshape(len(d) - 1, -1).max(0)[:-1]
# array([ 0, 13, 3, 6, -4])
I have an array in numpy. I want to roll the first column by 1, second column by 2, etc.
Here is an example.
>>> x = np.reshape(np.arange(15), (5, 3))
>>> x
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
What I want to do:
>>> y = roll(x)
>>> y
array([[12, 10, 8],
[ 0, 13, 11],
[ 3, 1, 14],
[ 6, 4, 2],
[ 9, 7, 5]])
What is the best way to do it?
The real array will be very big. I'm using CuPy, the GPU version of numpy. I'd prefer the solution that is fastest on the GPU, but of course any idea is welcome.
You could use advanced indexing:
import numpy as np
x = np.reshape(np.arange(15), (5, 3))
h, w = x.shape
rows, cols = np.arange(h), np.arange(w)
offsets = cols + 1
shifted = np.subtract.outer(rows, offsets) % h
y = x[shifted, cols]
y:
array([[12, 10, 8],
[ 0, 13, 11],
[ 3, 1, 14],
[ 6, 4, 2],
[ 9, 7, 5]])
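Since the question mentions CuPy, here is a hedged sketch of the same indexing approach on the GPU (assuming CuPy's NumPy-compatible API; plain broadcasting is used in place of np.subtract.outer):
import cupy as cp
x = cp.arange(15).reshape(5, 3)
h, w = x.shape
rows, cols = cp.arange(h), cp.arange(w)
offsets = cols + 1
# broadcasting (h, 1) - (w,) gives the same (h, w) index grid as subtract.outer
shifted = (rows[:, None] - offsets) % h
y = x[shifted, cols]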
I implemented a naive solution (roll_for) and compared it to @Chrysophylaxs' solution (roll_indexing).
Conclusion: roll_indexing is faster for small arrays, but the difference shrinks as the array gets bigger, and it is eventually slower than roll_for for very large arrays.
Implementations:
import numpy as np
def roll_for(x, shifts=None, axis=-1):
    if shifts is None:
        shifts = np.arange(1, x.shape[axis] + 1)  # OP requirement
    xt = x.swapaxes(axis, 0)  # https://stackoverflow.com/a/31094758/13636407
    yt = np.empty_like(xt)
    for idx, shift in enumerate(shifts):
        yt[idx] = np.roll(xt[idx], shift=shift)
    return yt.swapaxes(0, axis)

def roll_indexing(x):
    h, w = x.shape
    rows, cols = np.arange(h), np.arange(w)
    offsets = cols + 1
    shifted = np.subtract.outer(rows, offsets) % h  # fix
    return x[shifted, cols]
Tests:
M, N = 5, 3
x = np.arange(M * N).reshape(M, N)
expected = np.array([[12, 10, 8], [0, 13, 11], [3, 1, 14], [6, 4, 2], [9, 7, 5]])
assert np.array_equal(expected, roll_for(x))
assert np.array_equal(expected, roll_indexing(x))
M, N = 100, 200
# roll_indexing didn't work when M < N before the fix
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
Benchmark:
M, N = 100, 100
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
%timeit roll_for(x) # 859 µs ± 2.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit roll_indexing(x) # 81 µs ± 255 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
M, N = 1_000, 1_000
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
%timeit roll_for(x) # 12.7 ms ± 56.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit roll_indexing(x) # 12.4 ms ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
M, N = 10_000, 10_000
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
%timeit roll_for(x) # 1.3 s ± 6.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit roll_indexing(x) # 1.61 s ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
How can I increase the performance of a Python script using numpy and numba?
I'm trying to convert decimal numbers to a base-21 number system.
Input: [15, 18, 28, 11, 7, 5, 41, 139, 6, 507]
Output: [[15], [18], [1, 7], [11], [7], [5], [1, 20], [6, 13], [6], [1, 3, 3]]
My script works well on the CPU.
How can I modify it to increase performance using the GPU?
import numpy as np
from timeit import default_timer as timer
from numba import vectorize
import numba as nb
elements = [
"n|0",
"n|1",
"n|2",
"n|3",
"n|4",
"n|5",
"n|6",
"n|7",
"n|8",
"n|9",
"n|10",
"o|+",
"o|*",
"o|/",
"om|-",
"bl|(",
"br|)",
"e|**2",
"e|**3",
"e|**0.5",
"e|**(1/3)",
]
elements_len = len(elements)
def decimal_to_custom(number):
    x = (number % elements_len)
    ch = [x]
    if (number - x != 0):
        return decimal_to_custom(number // elements_len) + ch
    else:
        return ch
decimal_numbers = np.array([15, 18, 28, 11, 7, 5, 41, 139, 6, 507]) #very big array
custom_numers = []
for decimal_number in decimal_numbers:
    custom_numer = decimal_to_custom(decimal_number)
    custom_numers.append(custom_numer)
print(custom_numers)
Your code can be summarized as:
import numpy as np
def decimal_to_custom(number, k):
    x = (number % k)
    ch = [x]
    if (number - x != 0):
        return decimal_to_custom(number // k, k) + ch
    else:
        return ch

def remainders_OP(arr, k):
    result = []
    for value in arr:
        result.append(decimal_to_custom(value, k))
    return result
decimal_numbers = np.array([15, 18, 28, 11, 7, 5, 41, 139, 6, 507]) #very big array
print(remainders_OP(decimal_numbers, elements_len))
# [[15], [18], [1, 7], [11], [7], [5], [1, 20], [6, 13], [6], [1, 3, 3]]
This code can be sped up already by replacing the costly recursive implementation of decimal_to_custom() with a simpler iterative version, mod_list(), which appends and reverses rather than doing the very expensive head insert (equivalent to list.insert(0, x)) implemented in the OP:
def mod_list(x, k):
    result = []
    while x >= k:
        result.append(x % k)
        x //= k
    result.append(x)
    return result[::-1]

def remainders(arr, k):
    result = []
    for x in arr:
        result.append(mod_list(x, k))
    return result
print(remainders(decimal_numbers, elements_len))
# [[15], [18], [1, 7], [11], [7], [5], [1, 20], [6, 13], [6], [1, 3, 3]]
Now, both can be accelerated with Numba, to obtain some speed-up:
import numba as nb
@nb.njit
def mod_list_nb(x, k):
    result = []
    while x >= k:
        result.append(x % k)
        x //= k
    result.append(x)
    return result[::-1]

@nb.njit
def remainders_nb(arr, k):
    result = []
    for x in arr:
        result.append(mod_list_nb(x, k))
    return result
print(remainders_nb(decimal_numbers, elements_len))
# [[15], [18], [1, 7], [11], [7], [5], [1, 20], [6, 13], [6], [1, 3, 3]]
A number of options can be passed on to the decorator, including target_backend="cuda" to have the computation run on the GPU.
As we shall see with the benchmarks, it is not going to be beneficial.
The reason is that list.append() (as well as list.insert()) is not easy to run in parallel, and hence you cannot easily exploit the massive parallelism of GPUs!
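For illustration, a hedged sketch of how the decorator might be configured for the CUDA backend (an assumption about what the _cunb variants benchmarked below look like; the loop body is unchanged):
import numba as nb

@nb.njit(target_backend="cuda")
def mod_list_cunb(x, k):
    # same append-and-reverse loop as mod_list_nb above
    result = []
    while x >= k:
        result.append(x % k)
        x //= k
    result.append(x)
    return result[::-1]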
Anyway, the above solutions are slowed down by the choice of the underlying data container.
If one uses fixed-size arrays instead of dynamically growing a list at each iteration, this results in much faster execution:
def remainders_fixed_np(arr, k, m):
    arr = arr.copy()
    n = len(arr)
    result = np.empty((n, m), dtype=np.int_)
    for i in range(m - 1, -1, -1):
        result[:, i] = arr % k
        arr //= k
    return result
print(remainders_fixed_np(decimal_numbers, elements_len, 3).T)
# [[ 0 0 0 0 0 0 0 0 0 1]
# [ 0 0 1 0 0 0 1 6 0 3]
# [15 18 7 11 7 5 20 13 6 3]]
or, with Numba acceleration (and avoiding unnecessary computation):
@nb.njit
def remainders_fixed_nb(arr, k, m):
    n = len(arr)
    result = np.zeros((n, m), dtype=np.int_)
    for i in range(n):
        j = m - 1
        x = arr[i]
        while x >= k:
            q, r = divmod(x, k)
            result[i, j] = r
            x = q
            j -= 1
        result[i, j] = x
    return result
print(remainders_fixed_nb(decimal_numbers, elements_len, 3).T)
# [[ 0 0 0 0 0 0 0 0 0 1]
# [ 0 0 1 0 0 0 1 6 0 3]
# [15 18 7 11 7 5 20 13 6 3]]
Some Benchmarks
Now some benchmarks run on Google Colab show some indicative timings, where:
the _nb ending indicates Numba acceleration
the _pnb ending indicates Numba acceleration with parallel=True and the outermost range() replaced with nb.prange()
the _cunb ending indicates Numba acceleration with the CUDA target (target_backend="cuda")
the _cupnb ending indicates Numba acceleration with both parallelization and the CUDA target (a sketch of one such variant is shown below)
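For reference, a hedged sketch of what the _pnb fixed-size variant might look like (an assumption: remainders_fixed_nb from above with parallel=True and the outer loop switched to nb.prange):
import numpy as np
import numba as nb

@nb.njit(parallel=True)
def remainders_fixed_pnb(arr, k, m):
    n = len(arr)
    result = np.zeros((n, m), dtype=np.int_)
    for i in nb.prange(n):  # outermost range() replaced with nb.prange()
        j = m - 1
        x = arr[i]
        while x >= k:
            q, r = divmod(x, k)
            result[i, j] = r
            x = q
            j -= 1
        result[i, j] = x
    return result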
k = elements_len  # assumption: k is the base of the target number system (21)
m = 4
n = 100000
arr = np.random.randint(1, k ** m - 1, n)
funcs = remainders_OP, remainders, remainders_nb, remainders_cunb
base = funcs[0](arr, k)
for func in funcs:
    res = func(arr, k)
    is_good = base == res
    print(f"{func.__name__:>16s} {is_good!s:>5s} ", end="")
    %timeit -n 4 -r 4 func(arr, k)
# remainders_OP True 333 ms ± 4.38 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
# remainders True 268 ms ± 5.11 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
# remainders_nb True 46.9 ms ± 3.16 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
# remainders_cunb True 46.4 ms ± 1.71 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
fixed_funcs = remainders_fixed_np, remainders_fixed_nb, remainders_fixed_pnb, remainders_fixed_cunb, remainders_fixed_cupnb
base = fixed_funcs[0](arr, k, m)
for func in fixed_funcs:
    res = func(arr, k, m)
    is_good = np.all(base == res)
    print(f"{func.__name__:>24s} {is_good!s:>5s} ", end="")
    %timeit -n 8 -r 8 func(arr, k, m)
# remainders_fixed_np True 10 ms ± 2.09 ms per loop (mean ± std. dev. of 8 runs, 8 loops each)
# remainders_fixed_nb True 3.6 ms ± 315 µs per loop (mean ± std. dev. of 8 runs, 8 loops each)
# remainders_fixed_pnb True 2.68 ms ± 550 µs per loop (mean ± std. dev. of 8 runs, 8 loops each)
# remainders_fixed_cunb True 3.49 ms ± 192 µs per loop (mean ± std. dev. of 8 runs, 8 loops each)
# remainders_fixed_cupnb True 2.63 ms ± 314 µs per loop (mean ± std. dev. of 8 runs, 8 loops each)
This indicates that running on the GPU has minimal effect.
The greatest speed-up is obtained by changing the data container to a pre-allocated one.
The Numba acceleration provides some acceleration both with the dynamic allocation and with the pre-allocated versions.
Consider a list of numpy arrays with values of either -1 or 1 allocated in random positions.
a = np.array([1,-1,1,1,-1,1,-1,-1,1,-1])
b = np.array([-1,-1,1,-1,1,1,-1,1,-1,-1])
I need to perform operations on these arrays like sums and pointwise multiplication.
For example, after summing two arrays I will have a new one with values -2, 0 and 2.
c = a + b
c = [ 0 -2 2 0 0 2 -2 0 0 -2]
Now I would like to “normalize” it back to -1’s and 1’s.
For the 2’s and -2’s it is easy:
c[c < 0] = -1
c[c > 0] = 1
The problem is the 0s. For them I would like to randomly choose either a -1 or a 1.
The desired output would be like:
c = [ 1 -1 1 -1 -1 1 -1 1 -1 -1]
In generalized terms, my question is how to find all N values equal to x in an array, then substitute each with a random number.
My question is how to do this in the most “pythonic”, and fastest, way?
Thanks!
Just posting the final results from the answers I got so far.
If anyone in the future has a better solution please share it!
I timed the three solutions I found plus one of my own.
def Norm1(HV):
    HV[HV > 0] = 1
    HV[HV < 0] = -1
    zind = np.where(HV == 0)[0]
    HV[zind] = np.array([np.random.choice([1, -1]) for _ in zind])
    return HV

def norm2(HV):
    if HV == 0:
        return np.random.choice(np.array([-1, 1]))
    else:
        return HV / HV * np.sign(HV)

Norm2 = np.vectorize(norm2)

def Norm3(HV):
    HV[HV > 0] = 1
    HV[HV < 0] = -1
    mask = HV == 0
    HV[mask] = np.random.choice((-1, 1), HV[mask].shape)
    return HV

def generate(size):
    return np.random.binomial(1, 0.5, size=size) * 2 - 1

def Norm4(arr):
    np.floor_divide(arr, 2, out=arr)
    positions = (arr == 0)
    size = sum(positions)
    np.add.at(arr, positions, generate(size))
    return arr
The timings were:
%%timeit
d = Norm1(c)
203 µs ± 5.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%%timeit
d = Norm2(c)
33.4 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
d = Norm3(c)
217 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%%timeit
d = Norm4(c)
21 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
So as it stands it looks like answers 1 and 3 are the best ones. The difference between them looks minimal, but after trying some more runs, number 1 always comes out slightly on top.
Thanks for the help, guys!
I will add some references to HD computing in the question, as this is a core problem in that application, so it will be easier for someone to find if needed.
I'm not in any way claiming this is the fastest or most efficient approach.
c = np.array([ 0, -2, 2, 0, 0, 2, -2, 0, 0, -2])
def norm(a):
    if a == 0:
        return np.random.choice(np.array([-1, 1]))
    else:
        return a / a * np.sign(a)
v_norm = np.vectorize(norm)
norm_arr = v_norm(c)
Result:
In [64]: norm_arr
Out[64]: array([ 1, -1, 1, 1, -1, 1, -1, 1, -1, -1])
You might use:
>>> c = [0, -2, 2, 0, 0, 2, -2, 0, 0, -2]
>>> c = np.array([0, -2, 2, 0, 0, 2, -2, 0, 0, -2])
>>> zind = np.where(c==0)[0]
>>> c[zind] = np.array([np.random.choice([1, -1]) for _ in zind])
>>> c
array([ 1, -2, 2, -1, -1, 2, -2, -1, 1, -2])
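Note that this only fills the zeros; a minimal follow-up sketch (not part of the original answer) combining it with np.sign to also map the ±2 values:
>>> out = np.sign(c)  # maps -2/2 to -1/1, keeps zeros
>>> zind = np.where(out == 0)[0]
>>> out[zind] = np.random.choice([-1, 1], size=len(zind))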
I have a sorted integer array, e.g., [0, 0, 1, 1, 1, 2, 4, 4], and I would like to determine where the integer blocks start and how long the blocks are. The block sizes are small but the array itself can be very large, so efficiency is important. The total number of blocks is also known.
numpy.unique does the trick:
import numpy
a = numpy.array([0, 0, 1, 1, 1, 2, 4, 4])
num_blocks = 4
print(a)
_, idx_start, count = numpy.unique(a, return_index=True, return_counts=True)
print(idx_start)
print(count)
[0 0 1 1 1 2 4 4]
[0 2 5 6]
[2 3 1 2]
but it is slow. I would assume that, given the specific structure of the input array, there's a more efficient solution.
For example, something as simple as
import numpy
a = numpy.array([0, 0, 1, 1, 1, 2, 3, 3])
num_blocks = 4
k = 0
z = a[k]
block_idx = 0
counts = numpy.empty(num_blocks, dtype=int)
count = 0
while k < len(a):
    if z == a[k]:
        count += 1
    else:
        z = a[k]
        counts[block_idx] = count
        count = 1
        block_idx += 1
    k += 1
counts[block_idx] = count
print(counts)
gives the block sizes, and a simple numpy.cumsum would give idx_start. Using a Python loop is of course slow.
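For example, a minimal sketch of that cumsum step (assuming counts comes from the loop above):
idx_start = numpy.concatenate(([0], numpy.cumsum(counts)[:-1]))
print(idx_start)
# [0 2 5 6]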
Any hints?
Here's one with some masking and slicing -
def grp_start_len(a):
    m = np.r_[True, a[:-1] != a[1:], True]  # np.concatenate for a bit more boost
    idx = np.flatnonzero(m)
    return idx[:-1], np.diff(idx)
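For reference, a sketch of the np.concatenate variant hinted at in the comment:
m = np.concatenate(([True], a[:-1] != a[1:], [True]))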
Sample run -
In [18]: a
Out[18]: array([0, 0, 1, 1, 1, 2, 4, 4])
In [19]: grp_start_len(a)
Out[19]: (array([0, 2, 5, 6]), array([2, 3, 1, 2]))
Timings (setup from @AGN Gazer's solution) -
In [24]: np.random.seed(0)
In [25]: a = np.sort(np.random.randint(1, 10000, 10000))
In [26]: %timeit _, idx_start, count = np.unique(a, return_index=True, return_counts=True)
1000 loops, best of 3: 411 µs per loop
# @AGN Gazer's solution
In [27]: %timeit st = np.where(np.ediff1d(a, a[-1] + 1, a[0] + 1))[0]; idx = st[:-1]; cnt = np.ediff1d(st)
10000 loops, best of 3: 81.2 µs per loop
In [28]: %timeit grp_start_len(a)
10000 loops, best of 3: 60.1 µs per loop
Bumping up the sizes 10x more -
In [40]: np.random.seed(0)
In [41]: a = np.sort(np.random.randint(1, 100000, 100000))
In [42]: %timeit _, idx_start, count = np.unique(a, return_index=True, return_counts=True)
...: %timeit st = np.where(np.ediff1d(a, a[-1] + 1, a[0] + 1))[0]; idx = st[:-1]; cnt = np.ediff1d(st)
...: %timeit grp_start_len(a)
100 loops, best of 3: 5.34 ms per loop
1000 loops, best of 3: 792 µs per loop
1000 loops, best of 3: 463 µs per loop
np.where(np.ediff1d(a, None, a[0]))[0]
If you want to have the first "0" as in your answer, add a non-zero number to a[0]:
np.where(np.ediff1d(a, None, a[0] + 1))[0]
EDIT (Block length):
Ah, just noticed that you also want to get block length. Then, modify the above code:
st = np.where(np.ediff1d(a, a[-1] + 1, a[0] + 1))[0]
idx = st[:-1]
cnt = np.ediff1d(st)
Then,
>>> print(idx)
[0 2 5 6]
>>> print(cnt)
[2 3 1 2]
EDIT 2 (Timing tests)
In [69]: a = np.sort(np.random.randint(1, 10000, 10000))
In [70]: %timeit _, idx_start, count = np.unique(a, return_index=True, return_counts=True)
240 µs ± 7.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [71]: %timeit st = np.where(np.ediff1d(a, a[-1] + 1, a[0] + 1))[0]; idx = st[:-1]; cnt = np.ediff1d(st)
74.3 µs ± 816 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
"Motelling" is a way to smooth response to a signal.
For example: given a time-varying signal S[t] that takes integer values 1-5, and a response function F[t]({S[0..t]}) that assigns [-1, 0, +1] to each signal, a standard motelling response function would return:
-1 if S[t] = 1, or if (S[t] = 2) & (F[t-1] = -1)
+1 if S[t] = 5, or if (S[t] = 4) & (F[t-1] = +1)
0 otherwise
If I have a DataFrame by time of the signal {S}, is there a vectorized way to apply this motelling function?
E.g., if DataFrame df['S'].values = [1, 2, 2, 2, 3, 5, 3, 4, 1]
then is there a vectorized approach that would produce:
df['F'].values = [-1, -1, -1, -1, 0, 1, 0, 0, -1]
Or, absent a vectorized solution, is there something obviously faster than the following DataFrame.itertuples() approach I am using now?
df = pd.DataFrame(np.random.random_integers(1,5,100000), columns=['S'])
# First set response for time t
df['F'] = np.where(df['S'] == 5, 1, np.where(df['S'] == 1, -1, 0))
# Now loop to apply motelling
previousF = 0
for row in df.itertuples():
    df.at[row.Index, 'F'] = np.where((row.S >= 4) & (previousF == 1), 1,
                                     np.where((row.S <= 2) & (previousF == -1), -1, row.F))
    previousF = row.F
With a complex DataFrame the loop portion takes O(minute per million rows)!
You can try regex.
The patterns we are looking for are
(1) a 1 followed by 1s or 2s. (We select this rule because any 2 that comes after a 1 can be considered a 1 and keeps influencing the next row's result.)
(2) a 5 followed by 4s or 5s. (Similarly, any 4 that comes after a 5 can be considered a 5.)
(1) will result in consecutive -1s and (2) will result in consecutive 1s. Anything that does not match will be 0.
Using these rules, the rest of the work is doing the replacement. In particular, we use lambda m: "x"*len(m.group(0)) to replace each match with a run of the same length. (See reference.)
import re
s = [1, 2, 2, 2, 3, 5, 3, 4, 1]
str_s = "".join(str(i) for i in s)
s1 = re.sub("5[45]*", lambda m: "x"*len(m.group(0)),str_s)
s2 = re.sub("1[12]*", lambda m: "y"*len(m.group(0)),s1)
l = list(s2)
l2 = [v if v in ["x", "y"] else 0 for v in l]
l3 = [1 if v == 'x' else v for v in l2]
l4 = [-1 if v == 'y' else v for v in l3]
[-1, -1, -1, -1, 0, 1, 0, 0, -1]
Bigger dataset
def tai(s):
    str_s = "".join(str(i) for i in s)
    s1 = re.sub("5[45]*", lambda m: "x"*len(m.group(0)), str_s)
    s2 = re.sub("1[12]*", lambda m: "y"*len(m.group(0)), s1)
    l = list(s2)
    l2 = [v if v in ["x", "y"] else 0 for v in l]
    l3 = [1 if v == 'x' else v for v in l2]
    l4 = [-1 if v == 'y' else v for v in l3]
    return l4
s = np.random.randint(1,6,100000)
%timeit tai(s)
104 ms ± 6.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
df = pd.DataFrame(np.random.randint(1,6,100000), columns=['S'])
# First set response for time t
df['F'] = np.where(df['S'] == 5, 1, np.where(df['S'] == 1, -1, 0))
# Now loop to apply motelling
%%timeit # (OP's answer)
previousF = 0
for row in df.itertuples():
    df.at[row.Index, 'F'] = np.where((row.S >= 4) & (previousF == 1), 1,
                                     np.where((row.S <= 2) & (previousF == -1), -1, row.F))
    previousF = row.F
1.11 s ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Reference
Replace substrings in python with the length of each substring
You may notice that since consecutive elements of F[t] depend on one another, this doesn't vectorize well. I'm partial to using numba in these cases. Your function is simple, it works on a numpy array (a Series is just an array under the hood), and it's not easy to vectorize -> numba is ideal for this.
Imports and function:
import numpy as np
import pandas as pd
def motel(S):
    F = np.zeros_like(S)
    for t in range(S.shape[0]):
        if (S[t] == 1) or (S[t] == 2 and F[t-1] == -1):
            F[t] = -1
        elif (S[t] == 5) or (S[t] == 4 and F[t-1] == 1):
            F[t] = 1
        # no else required since it's already set to zero
    return F
Here we can just jit-compile the function
import numba
jit_motel = numba.jit(nopython=True)(motel)
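Equivalently, since numba.njit is shorthand for numba.jit(nopython=True), the same thing could be written as:
jit_motel = numba.njit(motel)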
And ensure that the normal and jit versions return expected values
S = pd.Series([1, 2, 2, 2, 3, 5, 3, 4, 1])
print("motel(S) = ", motel(S))
print("jit_motel(S)", jit_motel(S.values))
result:
motel(S) = [-1 -1 -1 -1 0 1 0 0 -1]
jit_motel(S) [-1 -1 -1 -1 0 1 0 0 -1]
For timing, let's scale:
N = 10**4
S = pd.Series( np.random.randint(1, 5, N) )
%timeit jit_motel(S.values)
%timeit motel(S.values)
result:
82.7 µs ± 1.03 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
7.75 ms ± 77.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
For your million data points (didn't time normal function because I didn't wanna wait =) )
N = 10**6
S = pd.Series( np.random.randint(1, 5, N) )
%timeit motel(S.values)
result:
768 ms ± 7.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Boom! Less than a second for a million entries. This approach is simple, readable, and fast. The only downside is the Numba dependency, but it's included in Anaconda and easily available via conda (maybe pip, I'm not sure).
To aggregate the other answers, first I should note that apparently DataFrame.itertuples() does not iterate deterministically, or as expected, so the sample in the OP doesn't always produce the correct result on large samples.
Thanks to the other answers, I realized that a mechanical application of the motelling logic not only produces correct results, but does so surprisingly quickly when we use DataFrame.fill functions:
def dfmotel(df):
    # We'll copy results into column F as we build them
    df['F'] = np.nan
    # This algo is destructive, so we operate on a copy of the signal
    df['temp'] = df['S']
    # Fill forward the negative signal
    df.loc[df['temp'] == 2, 'temp'] = np.nan
    df['temp'].ffill(inplace=True)
    df.loc[df['temp'] == 1, 'F'] = -1
    # Fill forward the positive signal
    df.loc[df['temp'] == 4, 'temp'] = np.nan
    df['temp'].ffill(inplace=True)
    df.loc[df['temp'] == 5, 'F'] = 1
    # All other signals are zero
    df['F'].fillna(0, inplace=True)
For all timing tests we will operate on the same input:
df = pd.DataFrame(np.random.randint(1,5,1000000), columns=['S'])
For the DataFrame-based function above we get:
%timeit dfmotel(df.copy())
123 ms ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
This is quite acceptable performance.
tai was first to present this very clever solution using RegEx (which is what inspired my function above), but it can't match the speed of staying in number space:
import re
def tai(s):
    str_s = "".join(str(i) for i in s)
    s1 = re.sub("5[45]*", lambda m: "x"*len(m.group(0)), str_s)
    s2 = re.sub("1[12]*", lambda m: "y"*len(m.group(0)), s1)
    l = list(s2)
    l2 = [v if v in ["x", "y"] else 0 for v in l]
    l3 = [1 if v == 'x' else v for v in l2]
    l4 = [-1 if v == 'y' else v for v in l3]
    return l4
%timeit tai(df['S'].values)
899 ms ± 9.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
But nothing beats compiled code. Thanks to evamicur for this solution using the convenient numba in-line compiler:
import numba
def motel(S):
    F = np.zeros_like(S)
    for t in range(S.shape[0]):
        if (S[t] == 1) or (S[t] == 2 and F[t-1] == -1):
            F[t] = -1
        elif (S[t] == 5) or (S[t] == 4 and F[t-1] == 1):
            F[t] = 1
    return F
jit_motel = numba.jit(nopython=True)(motel)
%timeit jit_motel(df['S'].values)
9.06 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)