Extract longest non-NaN sequence from array - Python

I want to extract the longest sequence of consecutive non-NaN values from an array in Python. So from this one:
a = [NaN, NaN, NaN, 1, 4, NaN, NaN, NaN, 4, 6, 8, 4, 6, 6, 4, 3, 2, NaN, NaN, NaN, 2, NaN, NaN, NaN]
I would like to get
a_nonNaN_long = [4, 6, 8, 4, 6, 6, 4, 3, 2]
So the way I was thinking of going about this is to get the first non-NaN value using this function:
import math

def firstNonNan(listfloats):
    i = 0
    for item in listfloats:
        i += 1
        if not math.isnan(item):
            return i  # note: i was already incremented, so this is a 1-based position
And then use the index from this function in a while loop to get a subsection of the array, until I find the longest consecutive sequence of non-NaN values. I wonder if anybody has another/better way to do it?
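For what it's worth, here is a rough single-pass sketch of that scanning idea (illustrative only, without the helper above):
import math

def longest_non_nan_run(values):
    best, current = [], []
    for v in values:
        if math.isnan(v):
            if len(current) > len(best):
                best = current
            current = []
        else:
            current.append(v)
    return best if len(best) >= len(current) else current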

You can use itertools.groupby to get non-NaN stretches, then max for the longest:
from itertools import groupby
import math
out = max((list(g) for k,g in groupby(a, math.isnan) if not k), key=len)
Output:
[4, 6, 8, 4, 6, 6, 4, 3, 2]
Used input:
NaN = float('nan')
a = [NaN, NaN, NaN, 1, 4, NaN, NaN, NaN, 4, 6, 8, 4, 6, 6, 4, 3, 2, NaN, NaN, NaN, 2, NaN, NaN, NaN]

To see whether numpy-based vectorized approaches or pure-python algorithms for finding the bounds of non-NaN sequences could be performance-competitive with a python groupby answer such as the one by @mozway, I used perfplot to benchmark four strategies: pure python with groupby, pure python without groupby, numpy and numba.
In one run, I used a python list as input (as in the question), which puts the onus on the numpy and numba runs to do the round-trip conversion from list to np.array and back, as well as to deal with heterogeneous data types given that NaN is float and the values are int.
Here's how it looks:
In the second run, I used a python list as input for the non-numpy solutions, and an np.array as input (with NaN replaced by an integer sentinel outside the range of actual values, to allow a dtype of int32) and output, to see whether recasting the problem in numpy-friendly array types would help the numpy/numba solutions.
Here are the results:
Conclusion:
The numpy and numba solutions are about 1.5 orders of magnitude faster if they are allowed to use homogeneous numpy input and output. Otherwise (if they must incur the round-trip overhead of list-to-homogeneous-numpy conversion and back) they are more or less on par with the pure python solutions.
Pure python groupby beats non-groupby by a bit, and in the second case, numba beats numpy by a little.
Note that none of the others beats the groupby solution by @mozway for simplicity of expression.
Here is the benchmark code:
NaN = float("nan")
a = [NaN, NaN, NaN, 1, 4, NaN, NaN, NaN, 4, 6, 8, 4, 6, 6, 4, 3, 2, NaN, NaN, NaN, 2, NaN, NaN, NaN]
import numpy as np
aNp = np.array([0 if v is NaN else v for v in a], np.int32)

def foo_1(a):
    mnMinus1 = min(v for v in a if not v is NaN) - 1
    np_a = np.array([mnMinus1 if v is NaN else v for v in a], np.int32)
    return list(np_foo_1(np_a, mnMinus1))

def foo_2(a):
    mnMinus1 = min(v for v in a if not v is NaN) - 1
    np_a = np.array([mnMinus1 if v is NaN else v for v in a], np.int32)
    return list(np_foo_2(np_a, mnMinus1))
def np_foo_1(b, mnMinus1):
    R = np.concatenate((np.array([mnMinus1]), b[:-1]))
    bIsnan = b == mnMinus1
    RIsnan = R == mnMinus1
    nonNanL = (~bIsnan) & RIsnan
    nonNanR = bIsnan & ~RIsnan
    index = np.arange(len(b))
    left, right = index[nonNanL], index[nonNanR]
    lens = right - left
    i = lens.argmax()
    result = b[left[i]:right[i]]
    return result
from numba import njit

@njit
def np_foo_2(b, mnMinus1):
    R = np.concatenate((np.array([mnMinus1]), b[:-1]))
    bIsnan = b == mnMinus1
    RIsnan = R == mnMinus1
    nonNanL = (~bIsnan) & RIsnan
    nonNanR = bIsnan & ~RIsnan
    index = np.arange(len(b))
    left, right = index[nonNanL], index[nonNanR]
    lens = right - left
    i = lens.argmax()
    result = b[left[i]:right[i]]
    return result
def foo_3(a):
    LR = []
    left, right = 0, 0
    while left < len(a):
        while right < len(a) and a[right] is not NaN:
            right += 1
        LR.append((left, right))
        left = right + 1
        while left < len(a) and a[left] is NaN:
            left += 1
        right = left + 1
    #i = max(zip(LR, range(len(LR))), key = lambda x: x[0][1] - x[0][0])[-1]
    i, mx = 0, 0
    for j in range(len(LR)):
        cur = LR[j][1] - LR[j][0]
        if cur > mx:
            i, mx = j, cur
    return a[LR[i][0]:LR[i][1]]
    # alternative variant (unreachable; the function returns above)
    left = [i for i, v in enumerate(a) if v is not NaN and (not i or a[i-1] is NaN)]
    right = [i for i, v in enumerate(a) if v is NaN and (False if not i else a[i-1] is not NaN)]
    #i = max(zip(left, right, range(len(left))), key = lambda x: x[1] - x[0])[-1]
    i, mx = 0, 0
    for j in range(len(left)):
        cur = right[j] - left[j]
        if cur > mx:
            i, mx = j, cur
    return a[left[i]:right[i]]
from itertools import groupby
import math

def foo_4(a):
    out = max((list(g) for k, g in groupby(a, math.isnan) if not k), key=len)
    return out

def dual2_foo_1(a, np_a):
    return foo_1(a)
def dual2_foo_2(a, np_a):
    return foo_2(a)
def dual2_foo_3(a, np_a):
    return foo_3(a)
def dual2_foo_4(a, np_a):
    return foo_4(a)
def dual_foo_1(a, np_a):
    return np_foo_1(np_a, 0)
def dual_foo_2(a, np_a):
    return np_foo_2(np_a, 0)
def dual_foo_3(a, np_a):
    return foo_3(a)
def dual_foo_4(a, np_a):
    return foo_4(a)

foo_count = 4
foo_names = ['foo_' + str(i + 1) for i in range(foo_count)]
foo_labels = ['numpy', 'numpy_numba', 'python_find_seq_bounds', 'python_groupby']
exec("foo_funcs=[" + ','.join(f"foo_{str(i + 1)}" for i in range(foo_count)) + "]")
exec("dual_foo_funcs=[" + ','.join(f"dual_foo_{str(i + 1)}" for i in range(foo_count)) + "]")
exec("dual2_foo_funcs=[" + ','.join(f"dual2_foo_{str(i + 1)}" for i in range(foo_count)) + "]")

for foo in foo_names:
    print(f'{foo} output:')
    print(eval(f"{foo}(a)"))
import perfplot
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams["figure.autolayout"] = True

perfplot.show(
    setup=lambda n: (a * n, np.array([0 if v is NaN else v for v in a * n], np.int32)),
    kernels=dual_foo_funcs,
    labels=foo_labels,
    n_range=[2 ** k for k in range(11)],
    equality_check=np.allclose,
    xlabel='n/24',
    logx="auto",
    logy="auto"
)
perfplot.show(
    setup=lambda n: (a * n, np.array([0 if v is NaN else v for v in a * n], np.int32)),
    kernels=dual2_foo_funcs,  # switch to dual_foo_funcs for second benchmark
    labels=foo_labels,
    n_range=[2 ** k for k in range(11)],
    equality_check=np.allclose,
    xlabel='n/24',
    logx="auto",
    logy="auto"
)

Related

Calculate distance between arrays that contain NaN

consider array1 and array2, with:
array1 = [a1 a2 NaN ... an]
array2 = [[NaN b2 b3 ... bn],
          [b21 NaN b23 ... b2n],
          ...]
Both arrays are numpy arrays. There is an easy way to compute the Euclidean distance between array1 and each row of array2:
EuclideanDistance = np.sqrt(((array1 - array2)**2).sum(axis=1))
What messes up this computation are the NaN values. Of course, I could easily replace NaN with some number. But instead, I want to do the following:
When I compare array1 with row_x of array2, I count the columns in which one of the arrays has NaN and the other doesn't. Let's assume the count is 3. I will then delete these columns from both arrays and compute the Euclidean distance between the two. In the end, I add a minus_value * count to the calculated distance.
Now, I cannot think of a fast and efficient way to do this. Can somebody help me?
Here are a few of my ideas:
minus = 1000
dist = np.zeros(shape=(array1.shape[0]))  # this array will store the distance of array1 to each row of array2
array1 = np.repeat(array1, array2.shape[0], axis=0)  # now array1 has the same dimensions as array2
for i in range(0, array1.shape[0]):
    boolarray = np.logical_or(np.isnan(array1[i]), np.isnan(array2[i]))
    count = boolarray.sum()
    deleteIdxs = boolarray.nonzero()  # this should give the indices where boolarray is True
    dist[i] = np.sqrt(((np.delete(array1[i], deleteIdxs, axis=0) - np.delete(array2[i], deleteIdxs, axis=0))**2).sum(axis=0))
    dist[i] = dist[i] + count*minus
These lines look more than ugly to me, however. Also, I keep getting an index error: Apparently deleteIdxs contains an index that is out of range for array1. Don't know how this can even be.
You can find all the indices where the value is NaN using:
indices_1 = np.isnan(array1)
indices_2 = np.isnan(array2)
Which you can combine to:
indices_total = indices_1 + indices_2
And you can keep all the not nan values using:
array_1_not_nan = array1[~indices_total]
array_2_not_nan = array2[~indices_total]
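From there, a minimal per-row sketch of the full computation might look like this (hedged: it combines the masks with OR, as in your own attempt, assumes import numpy as np and the 1-D array1 / 2-D array2 from the question, and minus is the penalty constant):
minus = 1000
dist = np.empty(len(array2))
for i, row in enumerate(array2):
    # columns where either value is NaN cannot contribute to the distance
    bad = np.isnan(array1) | np.isnan(row)
    dist[i] = np.sqrt(((array1[~bad] - row[~bad]) ** 2).sum()) + minus * bad.sum()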
I would write a function to handle the distance calculation. I am sure there is a faster and more efficient way to write this (list comprehensions, aggregations, etc.), but readability counts, right? :)
import numpy as np
def calculate_distance(fixed_arr, var_arr, penalty):
    s_sum = 0.0
    counter = 0
    for num_1, num_2 in zip(fixed_arr, var_arr):
        if np.isnan(num_1) or np.isnan(num_2):
            counter += 1
        else:
            s_sum += (num_1 - num_2) ** 2
    return np.sqrt(s_sum) + penalty * counter, counter

array1 = np.array([1, 2, 3, np.NaN, 5, 6])
array2 = np.array(
    [
        [3, 4, 9, 3, 4, 8],
        [3, 4, np.NaN, 3, 4, 8],
        [np.NaN, 9, np.NaN, 3, 4, 8],
        [np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN],
    ]
)
dist = np.zeros(len(array2))
minus = 10
for index, arr in enumerate(array2):
    dist[index], _ = calculate_distance(array1, arr, minus)
print(dist)
You have to think about the value for the minus variable very carefully. Is adding a random value really useful?
As @Nathan suggested, a more resource-efficient version can easily be implemented.
fixed_arr = array1
penalty = minus
dist = [
    (
        lambda indices=(np.isnan(fixed_arr) + np.isnan(var_arr)): np.linalg.norm(
            fixed_arr[~indices] - var_arr[~indices]
        )
        + (indices == True).sum() * penalty
    )()
    for var_arr in array2
]
print(dist)
However I would only try to implement something like this if I absolutely needed to (if it's the bottleneck). For all other times I would be happy to sacrifice some resources in order to gain some readability and extensibility.
You can filter out the columns containing nan with:
mask1 = np.isnan(arr1)
mask2 = np.isnan(arr2).any(0)
mask = ~(mask1 | mask2)
# the two filtered arrays
arr1[mask], arr2[:, mask]

Vectorising the padding of ragged arrays [duplicate]

consider the list of lists l
l = [[1, 2, 3], [1, 2]]
if I convert this to a np.array I'll get a one dimensional object array with [1, 2, 3] in the first position and [1, 2] in the second position.
print(np.array(l))
[[1, 2, 3] [1, 2]]
I want this instead
print(np.array([[1, 2, 3], [1, 2, np.nan]]))
[[ 1. 2. 3.]
[ 1. 2. nan]]
I can do this with a loop, but we all know how unpopular loops are
def box_pir(l):
    lengths = [i for i in map(len, l)]
    shape = (len(l), max(lengths))
    a = np.full(shape, np.nan)
    for i, r in enumerate(l):
        a[i, :lengths[i]] = r
    return a
print(box_pir(l))
[[ 1. 2. 3.]
[ 1. 2. nan]]
how do I do this in a fast, vectorized way?
timing
setup functions
%%cython
import numpy as np
def box_pir_cython(l):
    lengths = [len(item) for item in l]
    shape = (len(l), max(lengths))
    a = np.full(shape, np.nan)
    for i, r in enumerate(l):
        a[i, :lengths[i]] = r
    return a

from itertools import zip_longest
import pandas as pd

def box_divikar(v):
    lens = np.array([len(item) for item in v])
    mask = lens[:, None] > np.arange(lens.max())
    out = np.full(mask.shape, np.nan)
    out[mask] = np.concatenate(v)
    return out

def box_hpaulj(LoL):
    return np.array(list(zip_longest(*LoL, fillvalue=np.nan))).T

def box_simon(LoL):
    max_len = len(max(LoL, key=len))
    return np.array([x + [np.nan]*(max_len-len(x)) for x in LoL])

def box_dawg(LoL):
    cols = len(max(LoL, key=len))
    rows = len(LoL)
    AoA = np.empty((rows, cols,))
    AoA.fill(np.nan)
    for idx in range(rows):
        AoA[idx, 0:len(LoL[idx])] = LoL[idx]
    return AoA

def box_pir(l):
    lengths = [len(item) for item in l]
    shape = (len(l), max(lengths))
    a = np.full(shape, np.nan)
    for i, r in enumerate(l):
        a[i, :lengths[i]] = r
    return a

def box_pandas(l):
    return pd.DataFrame(l).values
This seems to be a close relative of this question, where the padding was with zeros instead of NaNs. Interesting approaches were posted there, along with mine based on broadcasting and boolean-indexing. So, I would just modify one line from my post there to solve this case like so -
def boolean_indexing(v, fillval=np.nan):
    lens = np.array([len(item) for item in v])
    mask = lens[:, None] > np.arange(lens.max())
    out = np.full(mask.shape, fillval)
    out[mask] = np.concatenate(v)
    return out
Sample run -
In [32]: l
Out[32]: [[1, 2, 3], [1, 2], [3, 8, 9, 7, 3]]
In [33]: boolean_indexing(l)
Out[33]:
array([[  1.,   2.,   3.,  nan,  nan],
       [  1.,   2.,  nan,  nan,  nan],
       [  3.,   8.,   9.,   7.,   3.]])
In [34]: boolean_indexing(l,-1)
Out[34]:
array([[ 1,  2,  3, -1, -1],
       [ 1,  2, -1, -1, -1],
       [ 3,  8,  9,  7,  3]])
I have posted a few runtime results there for all the posted approaches on that Q&A, which could be useful.
Probably the fastest list version uses itertools.zip_longest (may be izip_longest in Py2):
In [747]: np.array(list(itertools.zip_longest(*ll,fillvalue=np.nan))).T
Out[747]:
array([[ 1., 2., 3.],
[ 1., 2., nan]])
Plain zip_longest (without a fillvalue) produces:
In [748]: list(itertools.zip_longest(*ll))
Out[748]: [(1, 1), (2, 2), (3, None)]
another zip 'transposes':
In [751]: list(zip(*itertools.zip_longest(*ll)))
Out[751]: [(1, 2, 3), (1, 2, None)]
Often when starting with lists (or even an object array of lists), it is faster to stick with list methods. There's a substantial overhead in creating an array or dataframe.
This isn't the first time this question has been asked.
How can I pad and/or truncate a vector to a specified length using numpy?
My answer there includes both this zip_longest and your box_pir
I think there's also a fast numpy version using a flattened array, but I don't recall the details. It was probably given by Warren or Divakar.
I think the 'flattened' version works something along this line:
In [809]: ll
Out[809]: [[1, 2, 3], [1, 2]]
In [810]: sll=np.hstack(ll) # all values in a 1d array
In [816]: res=np.empty((2,3)); res.fill(np.nan) # empty target
Get the flattened indices where the values go. This is the crucial step. Here the use of r_ is iterative; the fast version probably uses cumsum (a sketch follows after this session).
In [817]: idx=np.r_[0:3, 3:3+2]
In [818]: idx
Out[818]: array([0, 1, 2, 3, 4])
In [819]: res.flat[idx]=sll
In [820]: res
Out[820]:
array([[ 1., 2., 3.],
[ 1., 2., nan]])
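For reference, a hedged sketch of that cumsum-based index construction (it assumes ll = [[1, 2, 3], [1, 2]] and import numpy as np, as in this session):
lens = np.array([len(i) for i in ll])          # [3, 2]
ncols = lens.max()
row_starts = np.arange(len(ll)) * ncols        # flat start of each padded row: [0, 3]
# offsets 0..len-1 within each row, built without a per-row Python loop
offsets = np.arange(lens.sum()) - np.repeat(np.cumsum(lens) - lens, lens)
idx = np.repeat(row_starts, lens) + offsets    # [0, 1, 2, 3, 4]
res = np.full((len(ll), ncols), np.nan)
res.flat[idx] = np.hstack(ll)                  # [[1., 2., 3.], [1., 2., nan]]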
================
so the missing link is the lens[:,None] > np.arange() broadcasting
In [897]: lens=np.array([len(i) for i in ll])
In [898]: mask=lens[:,None]>np.arange(lens.max())
In [899]: mask
Out[899]:
array([[ True, True, True],
[ True, True, False]], dtype=bool)
In [900]: idx=np.where(mask.ravel())
In [901]: idx
Out[901]: (array([0, 1, 2, 3, 4], dtype=int32),)
I might write this as a form of slice assignment on each of the sub arrays that have been filled with a default:
def to_numpy(LoL, default=np.nan):
    cols = len(max(LoL, key=len))
    rows = len(LoL)
    AoA = np.empty((rows, cols,))
    AoA.fill(default)
    for idx in range(rows):
        AoA[idx, 0:len(LoL[idx])] = LoL[idx]
    return AoA
I added in Divakar's Boolean Indexing as f4 and added it to the timing testing. At least in my testing (Python 2.7 and Python 3.5; Numpy 1.11), it is not the fastest.
Timing shows that izip_longest or f2 is slightly faster for most lists, but slice assignment (which is f1) is faster for larger lists:
from __future__ import print_function
import numpy as np
try:
    from itertools import izip_longest as zip_longest
except ImportError:
    from itertools import zip_longest

def f1(LoL):
    cols = len(max(LoL, key=len))
    rows = len(LoL)
    AoA = np.empty((rows, cols,))
    AoA.fill(np.nan)
    for idx in range(rows):
        AoA[idx, 0:len(LoL[idx])] = LoL[idx]
    return AoA

def f2(LoL):
    return np.array(list(zip_longest(*LoL, fillvalue=np.nan))).T

def f3(LoL):
    max_len = len(max(LoL, key=len))
    return np.array([x + [np.nan]*(max_len-len(x)) for x in LoL])

def f4(LoL):
    lens = np.array([len(item) for item in LoL])
    mask = lens[:, None] > np.arange(lens.max())
    out = np.full(mask.shape, np.nan)
    out[mask] = np.concatenate(LoL)
    return out

if __name__ == '__main__':
    import timeit
    for case, LoL in (('small', [list(range(20)), list(range(30))] * 1000),
                      ('medium', [list(range(20)), list(range(30))] * 10000),
                      ('big', [list(range(20)), list(range(30))] * 100000),
                      ('huge', [list(range(20)), list(range(30))] * 1000000)):
        print(case)
        for f in (f1, f2, f3, f4):
            print("   ", f.__name__, timeit.timeit("f(LoL)", setup="from __main__ import f, LoL", number=100))
Prints:
small
    f1 0.245459079742
    f2 0.209980010986
    f3 0.350691080093
    f4 0.332141160965
medium
    f1 2.45869493484
    f2 2.32307982445
    f3 3.65722203255
    f4 3.55545687675
big
    f1 25.8796288967
    f2 26.6177148819
    f3 41.6916451454
    f4 41.3140149117
huge
    f1 262.429639101
    f2 295.129109859
    f3 427.606887817
    f4 441.810388088
Maybe something like this? I don't know about your hardware, but it comes in at about 16 ms for 100 loops with l2 = [list(range(20)), list(range(30))] * 10000.
from numpy import nan
def box(l):
    max_length = len(max(l, key=len))
    return [x + [nan]*(max_length-len(x)) for x in l]
If this is only for a 2D list, this might be your answer:
from numpy import nan
def even(data):
    maxlen = max(len(l) for l in data)
    for l in data:
        l.extend([nan] * (maxlen - len(l)))
And if you don't want to modify the actual list:
from numpy import nan
def even(data):
    # copy each row; a plain data.copy() is shallow and would still extend the original inner lists
    res = [list(l) for l in data]
    maxlen = max(len(l) for l in res)
    for l in res:
        l.extend([nan] * (maxlen - len(l)))
    return res

Python: how to avoid loop?

I have a list of entries
l = [5, 3, 8, 12, 24]
and a matrix M
M:
12 34 5 8 7
0 24 12 3 1
I want to find the indices of the matrix where the numbers in l appear. For the k-th entry of l I want to save a random pair of indices i, j where M[i][j] == l[k]. I am doing the following:
indI = []
indJ = []
for i in l:
    tmp = np.where(M == i)
    rd = randint(len(tmp))
    indI.append(tmp[0][rd])
    indJ.append(tmp[1][rd])
I would like to see if there is a way to avoid that loop
One way in which you should be able to significantly speed up your code is to avoid duplicate work:
tmp = np.where(M == i)
As this gives you a list of all locations in M where the value is equal to i, it must be searching through the entire matrix. So for each element in l, you are searching through the full matrix.
Instead of doing that, try indexing your matrix as a first step:
matrix_index = {}
for i in range(len(M)):
    for j in range(len(M[i])):
        if M[i][j] not in matrix_index:
            matrix_index[M[i][j]] = [(i, j)]
        else:
            matrix_index[M[i][j]].append((i, j))
Then for each value in l, instead of doing a costly search through the full matrix, you can just get it straight from your matrix index.
Note: I haven't worked with numpy very much, so I may have gotten the specific syntax incorrect. There may also be a more idiomatic way of doing this in numpy.
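A hedged usage sketch of that index (using random.choice from the standard library; it assumes every value of l occurs somewhere in M):
import random

indI, indJ = [], []
for value in l:
    i, j = random.choice(matrix_index[value])  # any stored (i, j) with M[i][j] == value
    indI.append(i)
    indJ.append(j)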
If both l and M are not large matrices, like the following:
In: l0 = [5, 3, 8, 12, 34, 1, 12]
In: M0 = [[12, 34, 5, 8, 7],
In:       [ 0, 24, 12, 3, 1]]
In: l = np.asarray(l0)
In: M = np.asarray(M0)
You can try this:
In: np.where(l[None, None, :] == M[:, :, None])
Out:
(array([0, 0, 0, 0, 0, 1, 1, 1, 1]),  <- i
 array([0, 0, 1, 2, 3, 2, 2, 3, 4]),  <- j
 array([3, 6, 4, 0, 2, 3, 6, 1, 5]))  <- k
The three rows are i, j, k respectively; read down the columns to get every (i, j, k) you need. For example, the 1st column [0, 0, 3] means M[0, 0] == l[3], and the 2nd column [0, 0, 6] says M[0, 0] == l[6], and so on. I think these are what you want.
However, the numpy trick cannot be extended to very large matrices, such as 2M elements in l or 2500x2500 elements in M. They need quite a lot of memory and a very long time to compute... if they are lucky enough not to crash from running out of memory. :)
One solution that does not use the word for is
c = np.apply_along_axis(lambda row: np.random.choice(np.argwhere(row).ravel()), 1, M.ravel()[np.newaxis, :] == l[:, np.newaxis])
indI, indJ = c // M.shape[1], c % M.shape[1]
Note that while that solves the problem, M.ravel()[np.newaxis, :] == l[:, np.newaxis] will quickly produce MemoryErrors. A more pragmatic approach would be to get the indices of interest through something like
s = np.argwhere(M.ravel()[np.newaxis, :] == l[:, np.newaxis])
and then do the random choice post-processing by hand. This, however, probably does not yield any significant performance improvements over your search.
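For completeness, a rough sketch of that post-processing (hedged; it assumes every value of l occurs somewhere in M):
# s[:, 0] indexes into l, s[:, 1] is a flat index into M
c = np.array([np.random.choice(s[s[:, 0] == k, 1]) for k in range(len(l))])
indI, indJ = c // M.shape[1], c % M.shape[1]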
What makes it slow, though, is that you search through the entire matrix in every step of your loop; pre-sorting the matrix (at a certain cost) gives you a straightforward way of making each individual search much faster:
In [312]: %paste
def direct_search(M, l):
    indI = []
    indJ = []
    for i in l:
        tmp = np.where(M == i)
        rd = np.random.randint(len(tmp[0]))  # Note the fix here
        indI.append(tmp[0][rd])
        indJ.append(tmp[1][rd])
    return indI, indJ

def using_presorted(M, l):
    a = np.argsort(M.ravel())
    M_sorted = M.ravel()[a]
    def find_indices(i):
        s = np.searchsorted(M_sorted, i)
        j = 0
        while M_sorted[s + j] == i:
            yield a[s + j]
            j += 1
    indices = [list(find_indices(i)) for i in l]
    c = np.array([np.random.choice(i) for i in indices])
    return c // M.shape[1], c % M.shape[1]
## -- End pasted text --
In [313]: M = np.random.randint(0, 1000000, (1000, 1000))
In [314]: l = np.random.choice(M.ravel(), 1000)
In [315]: %timeit direct_search(M, l)
1 loop, best of 3: 4.76 s per loop
In [316]: %timeit using_presorted(M, l)
1 loop, best of 3: 208 ms per loop
In [317]: indI, indJ = using_presorted(M, l) # Let us check that it actually works
In [318]: np.all(M[indI, indJ] == l)
Out[318]: True

How to make numpy.cumsum start after the first value

I have:
import numpy as np
position = np.array([4, 4.34, 4.69, 5.02, 5.3, 5.7, ..., 4])
x = (B/position**2)*dt
A = np.cumsum(x)
assert A[0] == 0 # I want this to be true.
Where B and dt are scalar constants. This is for a numerical integration problem with initial condition of A[0] = 0. Is there a way to set A[0] = 0 and then do a cumsum for everything else?
I don't understand what exactly your problem is, but here are some things you can do to have A[0] = 0.
You can create A to be longer by one index to have the zero as the first entry:
# initialize example data
import numpy as np
B = 1
dt = 1
position = np.array([4, 4.34, 4.69, 5.02, 5.3, 5.7])
# do calculation
A = np.zeros(len(position) + 1)
A[1:] = np.cumsum((B/position**2)*dt)
Result:
A = [ 0. 0.0625 0.11559096 0.16105356 0.20073547 0.23633533 0.26711403]
len(A) == len(position) + 1
Alternatively, you can manipulate the calculation to subtract the first entry of the result:
# initialize example data
import numpy as np
B = 1
dt = 1
position = np.array([4, 4.34, 4.69, 5.02, 5.3, 5.7])
# do calculation
A = np.cumsum((B/position**2)*dt)
A = A - A[0]
Result:
[ 0. 0.05309096 0.09855356 0.13823547 0.17383533 0.20461403]
len(A) == len(position)
As you see, the results have different lengths. Is one of them what you expect?
1D cumsum
A wrapper around np.cumsum that sets first element to 0:
def cumsum(pmf):
    cdf = np.empty(len(pmf) + 1, dtype=pmf.dtype)
    cdf[0] = 0
    np.cumsum(pmf, out=cdf[1:])
    return cdf
Example usage:
>>> np.arange(1, 11)
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> cumsum(np.arange(1, 11))
array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55])
N-D cumsum
A wrapper around np.cumsum that sets first element to 0, and works with N-D arrays:
def cumsum(pmf, axis=None, dtype=None):
    if axis is None:
        pmf = pmf.reshape(-1)
        axis = 0
    if dtype is None:
        dtype = pmf.dtype
    idx = [slice(None)] * pmf.ndim
    # Create array with extra element along cumsummed axis.
    shape = list(pmf.shape)
    shape[axis] += 1
    cdf = np.empty(shape, dtype)
    # Set first element to 0.
    idx[axis] = 0
    cdf[tuple(idx)] = 0
    # Perform cumsum on remaining elements.
    idx[axis] = slice(1, None)
    np.cumsum(pmf, axis=axis, dtype=dtype, out=cdf[tuple(idx)])
    return cdf
Example usage:
>>> np.arange(1, 11).reshape(2, 5)
array([[ 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10]])
>>> cumsum(np.arange(1, 11).reshape(2, 5), axis=-1)
array([[ 0, 1, 3, 6, 10, 15],
[ 0, 6, 13, 21, 30, 40]])
I totally understand your pain; I wonder why NumPy doesn't allow this with np.cumsum. Anyway, though I'm really late and there's already another good answer, I prefer this one a bit more:
np.cumsum(np.pad(array, (1, 0), "constant"))
where array in your case is (B/position**2)*dt. You can change the order of np.pad and np.cumsum as well. I'm just adding a zero to the start of the array and calling np.cumsum.
You can use np.roll (shift everything right by 1) and then set the first entry to zero.
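For example, a minimal sketch of that idea (hedged; B, dt and position are the constants and array from the question):
import numpy as np

x = (B / position**2) * dt
A = np.roll(np.cumsum(x), 1)  # shift everything one place to the right
A[0] = 0                      # overwrite the wrapped-around total with the initial condition
Note that, like the second result above, this keeps len(A) == len(position), so the final cumulative value is dropped.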

Summing values of numpy array based on indices in other array

Assume I have the following arrays:
N = 8
M = 4
a = np.zeros(M)
b = np.random.randint(M, size=N) # contains indices for a
c = np.random.rand(N) # contains random values
I want to sum the values of c according to the indices provided in b, and store them in a. Writing a loop for this is trivial:
for i, v in enumerate(b):
    a[v] += c[i]
Since N can get quite big in my real-world problem I'd like to avoid using python loops, but I can't figure out how to write it as a numpy-statement. Can anyone help me out?
Ok, here are some example values:
In [27]: b
Out[27]: array([0, 1, 2, 0, 2, 3, 1, 1])
In [28]: c
Out[28]:
array([ 0.15517108, 0.84717734, 0.86019899, 0.62413489, 0.24357903,
0.86015187, 0.85813481, 0.7071174 ])
In [30]: a
Out[30]: array([ 0.77930596, 2.41242955, 1.10377802, 0.86015187])
import numpy as np
N = 8
M = 4
b = np.array([0, 1, 2, 0, 2, 3, 1, 1])
c = np.array([ 0.15517108, 0.84717734, 0.86019899, 0.62413489, 0.24357903, 0.86015187, 0.85813481, 0.7071174 ])
a = ((np.mgrid[:M,:N] == b)[0] * c).sum(axis=1)
returns
array([ 0.77930597, 2.41242955, 1.10377802, 0.86015187])
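A quick hedged check that this matches the original loop (the (M, N) grid of row indices is compared against b, so row m of the product holds c[n] wherever b[n] == m):
a_loop = np.zeros(M)
for i, v in enumerate(b):
    a_loop[v] += c[i]
print(np.allclose(a, a_loop))  # True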
