Iterating within a numpy array - python

I have 3 2x2 Matrices, P1, P2, and P3, which are populated with randomly generated integers. I want to make sure that these matrices are positive definite (i.e. All eigenvalues are all greater than 0). My code is below.
P1 = np.random.randint(10, size=(m,n))
P2 = np.random.randint(10, size=(m,n))
P3 = np.random.randint(10, size=(m,n))
lambda1 = np.linalg.eigvals(P1)
lambda2 = np.linalg.eigvals(P2)
lambda3 = np.linalg.eigvals(P3)
for i in lambda1:
if (i <= 0): P1 = np.random.randint(10, size=(m,n))
for i in lambda2:
if (i <= 0): P2 = np.random.randint(10, size=(m,n))
for i in lambda3:
if (i <= 0): P3 = np.random.randint(10, size=(m,n))
print('Eigenvalue output to to verify that matrices are positive definite:\n')
print(u'\u03BB(P\u2081) = ' + str(np.linalg.eigvals(P1)))
print(u'\u03BB(P\u2082) = ' + str(np.linalg.eigvals(P2)))
print(u'\u03BB(P\u2083) = ' + str(np.linalg.eigvals(P3)))
Right now, the if statement will pretty much re-generate the matrix once or twice if the eigenvalues are not positive, but it will not verify that the eigenvalues are always positive. My first guess was to nest a while loop within the for loop, but I could not figure out a way to get that to work, and I'm unsure if that is the most efficient way.

This function creates an array with positive eigenvalues:
def create_arr_with_pos_ev(m,n):
ev = np.array([-1,-1])
while not all(ev>0):
arr = np.random.randint(10, size=(m,n))
ev = np.linalg.eigvals(arr)
return arr, ev
First I define dummy eigenvalues, that are lower than 0. Then I create a new array and calculate its eigenvalues. If there is a negative eigenvalue (while not all(ev>0)), create a new one.

As a supplement to the answer above, this can also be simplified a little further by taking out any input arguments to the function, and just defining the original matrix within the function:
def create_arr_with_pos_ev():
arr = np.random.randint(10, size=(m,n))
ev = np.linalg.eigvals(arr)
while not all (ev >0):
arr = np.random.randint(10, size=(m,n))
ev = np.linalg.eigvals(arr)
print('\nMatrix: \n' + str(arr) + '\nEigenvalues: \n',ev)
return arr, ev
Print:
P1,eig1=create_arr_with_pos_ev()
P2,eig2=create_arr_with_pos_ev()
P3,eig3=create_arr_with_pos_ev()
Output:
Matrix:
[[6 0]
[3 7]]
Eigenvalues:
[7. 6.]
Matrix:
[[9 3]
[4 2]]
Eigenvalues:
[10.4244289 0.5755711]
Matrix:
[[5 6]
[3 8]]
Eigenvalues:
[ 2. 11.]

Related

Is there a vectorized way to sample multiples times with np.random.choice() with differents p?

I'm trying to implement a variation ratio, and I need T samples from an array C, but each sample has different weights p_t.
I'm using this:
import numpy as np
from scipy import stats
batch_size = 1
T = 3
C = np.array(['A', 'B', 'C'])
# p_batch_T dimensions: (batch, sample, class)
p_batch_T = np.array([[[0.01, 0.98, 0.01],
[0.3, 0.15, 0.55],
[0.85, 0.1, 0.05]]])
def variation_ratio(C, p_T):
# This function works only with one sample from the batch.
Y_T = np.array([np.random.choice(C, size=1, p=p_t) for p_t in p_T]) # vectorize this
C_mode, frecuency = stats.mode(Y_T)
T = len(Y_T)
return 1.0 - (f/T)
def variation_ratio_batch(C, p_batch_T):
return np.array([variation_ratio(C, p_T) for p_T in p_batch_T]) # and vectorize this
Is there a way to implement these functions with any for?
In stead of sampling with the given distribution p_T, we can sample uniformly between [0,1] and compare that to the cumulative distribution:
Let's start with Y_T, say for p_T = p_batch_T[0]
cum_dist = p_batch_T.cumsum(axis=-1)
idx_T = (np.random.rand(len(C),1) < cum_dist[0]).argmax(-1)
Y_T = C[idx_T[...,None]]
_, f = stats.mode(Y_T) # here axis=0 is default
Now let take that to the variation_ratio_batch:
idx_T = (np.random.rand(len(p_batch_T), len(C),1) < cum_dist).argmax(-1)
Y = C[idx_T[...,None]]
f = stats.mode(Y, axis=1) # notice axis 0 is batch
out = 1 - (f/T)
You could do it this way:
First, create a 2D weights array of shape (T, len(C)) and take the cumulative sum:
n_rows = 5
n_cols = 3
weights = np.random.rand(n_rows, n_cols)
cum_weights = (weights / weights.sum(axis=1, keepdims=True)).cumsum(axis=1)
cum_weights might look like this:
array([[0.09048919, 0.58962127, 1. ],
[0.36333997, 0.58380885, 1. ],
[0.28761923, 0.63413879, 1. ],
[0.39446498, 0.98760834, 1. ],
[0.27862476, 0.79715149, 1. ]])
Next, we can compare cum_weights to the appropriately sized output of np.random.rand. By taking argmin, we find the index in each row where the random number generated is greater than the cumulative weight:
indices = (cum_weights < np.random.rand(n_rows, 1)).argmin(axis=1)
We can then use indices to index an array of values of shape (n_cols,), which is len(C) in your original example.
np.vectorize should work:
from functools import partial
import numpy as np
#partial(np.vectorize, excluded=['rng'], signature='(),(k)->()')
def choice_batched(rng, probs):
return rng.choice(a=probs.shape[-1], p=probs)
then
num_classes = 3
batch_size = 5
alpha = .5 # Dirichlet prior hyperparameter.
rng = np.random.default_rng()
probs = np.random.dirichlet(alpha=np.full(fill_value=alpha, shape=num_classes), size=batch_size)
# Check each row sums to 1.
assert np.allclose(probs.sum(axis=-1), 1)
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
gives
[2 0 0 0 1]
[1 0 0 0 1]
[2 0 2 0 1]
[1 0 0 0 0]
Here is my implementation of Quang's and gmds' solutions:
def sample(ws, k):
"""Weighted sample k elements along the last axis.
ws -- Tensor of probabilities, shape (*, n)
k -- Number of elements to sample.
Returns tensor of shape (*, k) with values in {0, ..., n-1}.
"""
assert np.allclose(ws.sum(-1), 1)
cs = ws.cumsum(-1)
ps = np.random.random(ws.shape[:-1] + (k,))
return (cs[..., None, :] < ps[..., None]).sum(-1)
Say we have some stuff
>>> stuff = array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
And some weights / sampling probabilities.
>>> ws = array([[0.41296038, 0.36070229, 0.22633733],
[0.37576672, 0.14518771, 0.47904557],
[0.14742326, 0.29182459, 0.56075215]])
And we want to sample 2 elements along each row. Then we do
>>> ids = sample(ws, 2)
[[2, 0],
[1, 2],
[2, 2]]
And we can retrieve the sampled values from stuff using np.take_along_axis:
>>> np.take_along_axis(stuff, ids)
[[2, 0],
[4, 5],
[8, 8]]
The code could be generalized to sampling along an axis other than the last one, but I got confused about broadcasting, so somebody else should have a stab at it!

Aggregate elements based on position vector

I'm trying to vectorize a very simple operation but can't seem to figure out how.
Given a very large numerical vector (over 1M positions) and another array of size n with a given set of positions, I would like to get back a vector of size n with elements being the average of the values of the first vector as specified by the second
a = np.array([1,2,3,4,5,6,7])
b = np.array([[0,1],[2],[3,5],[4,6]])
c = [1.5,3,5,6]
I need to repeat this operation many times so performance is an issue.
Vanilla python solution:
import numpy as np
import time
a = np.array([1,2,3,4,5,6,7])
b = np.array([[0,1],[2],[3,5],[4,6]])
begin = time.time()
for i in range(100000):
c = []
for d in b:
c.append(np.mean(a[d]))
print(time.time() - begin, c)
# 3.7529971599578857 [1.5, 3.0, 5.0, 6.0]
I'm not sure if this is necessarily faster but you may as well try:
import numpy as np
a = np.array([1, 2, 3, 4, 5, 6, 7])
b = np.array([[0, 1], [2], [3, 5], [4, 6]])
# Get the length of each subset of indices
lens = np.fromiter((len(bi) for bi in b), count=len(b), dtype=np.int32)
# Compute reduction indices
reduce_idx = np.roll(np.cumsum(lens), 1)
reduce_idx[0] = 0
# Make flattened array of index lists
idx = np.fromiter((i for bi in b for i in bi), count=lens.sum(), dtype=np.int32)
# Reorder according to indices
a2 = a[idx]
# Sum reordered array at reduction indices and divide by number of indices
c = np.add.reduceat(a2, reduce_idx) / lens
print(c)
# [1.5 3. 5. 6. ]

How to find the nearest neighbour index from one series to another

I have a target array A, which represents isobaric pressure levels in NCEP reanalysis data.
I also have the pressure at which a cloud is observed as a long time series, B.
What I am looking for is a k-nearest neighbour lookup that returns the indices of those nearest neighbours, something like knnsearch in Matlab that could be represented the same in python such as: indices, distance = knnsearch(A, B, n)
where indices is the nearest n indices in A for every value in B, and distance is how far removed the value in B is from the nearest value in A, and A and B can be of different lengths (this is the bottleneck that I have found with most solutions so far, whereby I would have to loop each value in B to return my indices and distance)
import numpy as np
A = np.array([1000, 925, 850, 700, 600, 500, 400, 300, 250, 200, 150, 100, 70, 50, 30, 20, 10]) # this is a fixed 17-by-1 array
B = np.array([923, 584.2, 605.3, 153.2]) # this can be any n-by-1 array
n = 2
What I would like returned from indices, distance = knnsearch(A, B, n) is this:
indices = [[1, 2],[4, 5] etc...]
where 923 in A is matched to first A[1]=925 and then A[2]=850
and 584.2 in A is matched to first A[4]=600 and then A[5]=500
distance = [[72, 77],[15.8, 84.2] etc...]
where 72 represents the distance between queried value in B to the nearest value in A e.g. distance[0, 0] == np.abs(B[0] - A[1])
The only solution I have been able to come up with is:
import numpy as np
def knnsearch(A, B, n):
indices = np.zeros((len(B), n))
distances = np.zeros((len(B), n))
for i in range(len(B)):
a = A
for N in range(n):
dif = np.abs(a - B[i])
ind = np.argmin(dif)
indices[i, N] = ind + N
distances[i, N] = dif[ind + N]
# remove this neighbour from from future consideration
np.delete(a, ind)
return indices, distances
array_A = np.array([1000, 925, 850, 700, 600, 500, 400, 300, 250, 200, 150, 100, 70, 50, 30, 20, 10])
array_B = np.array([923, 584.2, 605.3, 153.2])
neighbours = 2
indices, distances = knnsearch(array_A, array_B, neighbours)
print(indices)
print(distances)
returns:
[[ 1. 2.]
[ 4. 5.]
[ 4. 3.]
[10. 11.]]
[[ 2. 73. ]
[ 15.8 84.2]
[ 5.3 94.7]
[ 3.2 53.2]]
There must be a way to remove the for loops, as I need the performance should my A and B arrays contain many thousands of elements with many nearest neighbours...
Please help! Thanks :)
The second loop can easily be vectorized. The most straightforward way to do it is to use np.argsort and select the indices corresponding to the n smallest dif values. However, for large arrays, as only n values should be sorted, it is better to use np.argpartition.
Therefore, the code would look like something like that:
def vector_knnsearch(A, B, n):
indices = np.empty((len(B), n))
distances = np.empty((len(B), n))
for i,b in enumerate(B):
dif = np.abs(A - b)
min_ind = np.argpartition(dif,n)[:n] # Returns the indexes of the 3 smallest
# numbers but not necessarily sorted
ind = min_ind[np.argsort(dif[min_ind])] # sort output of argpartition just in case
indices[i, :] = ind
distances[i, :] = dif[ind]
return indices, distances
As said in the comments, the first loop can also be removed using a meshgrid, however, the extra use of memory and computation time to construct the meshgrid makes this approach slower for the dimensions I tried (and this will probably get worse for large arrays and end up in Memory Error). In addition, the readability of the code decreases. Overall, this would probably do this approach less pythonic.
def mesh_knnsearch(A, B, n):
m = len(B)
rng = np.arange(m).reshape((m,1))
Amesh, Bmesh = np.meshgrid(A,B)
dif = np.abs(Amesh-Bmesh)
min_ind = np.argpartition(dif,n,axis=1)[:,:n]
ind = min_ind[rng,np.argsort(dif[rng,min_ind],axis=1)]
return ind, dif[rng,ind]
Not that it is important to define this rng as a 2d array in order to retrieve a[rng[0],ind[0]], a[rng[1],ind[1]], etc and maintain the dimensions of the array, as opposed to a[:,ind] which retrieves a[:,ind[0]], a[:,ind[1]], etc.

Compute pairwise distance in a batch without replicating tensor in Tensorflow?

I want to compute the pairwise square distance of a batch of feature in Tensorflow. I have a simple implementation using + and * operations by
tiling the original tensor :
def pairwise_l2_norm2(x, y, scope=None):
with tf.op_scope([x, y], scope, 'pairwise_l2_norm2'):
size_x = tf.shape(x)[0]
size_y = tf.shape(y)[0]
xx = tf.expand_dims(x, -1)
xx = tf.tile(xx, tf.pack([1, 1, size_y]))
yy = tf.expand_dims(y, -1)
yy = tf.tile(yy, tf.pack([1, 1, size_x]))
yy = tf.transpose(yy, perm=[2, 1, 0])
diff = tf.sub(xx, yy)
square_diff = tf.square(diff)
square_dist = tf.reduce_sum(square_diff, 1)
return square_dist
This function takes as input two matrices of size (m,d) and (n,d) and compute the squared distance between each row vector. The output is a matrix of size (m,n) with element 'd_ij = dist(x_i, y_j)'.
The problem is that I have a large batch and high dim features 'm, n, d' replicating the tensor consume a lot of memory.
I'm looking for another way to implement this without increasing the memory usage and just only store the final distance tensor. Kind of double looping the original tensor.
You can use some linear algebra to turn it into matrix ops. Note that what you need matrix D where a[i] is the ith row of your original matrix and
D[i,j] = (a[i]-a[j])(a[i]-a[j])'
You can rewrite that into
D[i,j] = r[i] - 2 a[i]a[j]' + r[j]
Where r[i] is squared norm of ith row of the original matrix.
In a system that supports standard broadcasting rules you can treat r as a column vector and write D as
D = r - 2 A A' + r'
In TensorFlow you could write this as
A = tf.constant([[1, 1], [2, 2], [3, 3]])
r = tf.reduce_sum(A*A, 1)
# turn r into column vector
r = tf.reshape(r, [-1, 1])
D = r - 2*tf.matmul(A, tf.transpose(A)) + tf.transpose(r)
sess = tf.Session()
sess.run(D)
result
array([[0, 2, 8],
[2, 0, 2],
[8, 2, 0]], dtype=int32)
Using squared_difference:
def squared_dist(A):
expanded_a = tf.expand_dims(A, 1)
expanded_b = tf.expand_dims(A, 0)
distances = tf.reduce_sum(tf.squared_difference(expanded_a, expanded_b), 2)
return distances
One thing I noticed is that this solution using tf.squared_difference gives me out of memory (OOM) for very large vectors, while the approach by #YaroslavBulatov doesn't. So, I think decomposing the operation yields a smaller memory footprint (which I thought squared_difference would handle better under the hood).
Here is a more general solution for two tensors of coordinates A and B:
def squared_dist(A, B):
assert A.shape.as_list() == B.shape.as_list()
row_norms_A = tf.reduce_sum(tf.square(A), axis=1)
row_norms_A = tf.reshape(row_norms_A, [-1, 1]) # Column vector.
row_norms_B = tf.reduce_sum(tf.square(B), axis=1)
row_norms_B = tf.reshape(row_norms_B, [1, -1]) # Row vector.
return row_norms_A - 2 * tf.matmul(A, tf.transpose(B)) + row_norms_B
Note that this is the square distance. If you want to change this to the Euclidean distance, perform a tf.sqrt on the result. If you want to do that, don't forget to add a small constant to compensate for the floating point instabilities: dist = tf.sqrt(squared_dist(A, B) + 1e-6).
If you want compute other method , then change the order of the tf modules.
def compute_euclidean_distance(x, y):
size_x = x.shape.dims[0]
size_y = y.shape.dims[0]
for i in range(size_x):
tile_one = tf.reshape(tf.tile(x[i], [size_y]), [size_y, -1])
eu_one = tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(tile_one, y), 2), axis=1)), axis=0)
if i == 0:
d = eu_one
else:
d = tf.concat([d, eu_one], axis=0)
return d

Partial convolution / correlation with numpy [duplicate]

I am learning numpy/scipy, coming from a MATLAB background. The xcorr function in Matlab has an optional argument "maxlag" that limits the lag range from –maxlag to maxlag. This is very useful if you are looking at the cross-correlation between two very long time series but are only interested in the correlation within a certain time range. The performance increases are enormous considering that cross-correlation is incredibly expensive to compute.
In numpy/scipy it seems there are several options for computing cross-correlation. numpy.correlate, numpy.convolve, scipy.signal.fftconvolve. If someone wishes to explain the difference between these, I'd be happy to hear, but mainly what is troubling me is that none of them have a maxlag feature. This means that even if I only want to see correlations between two time series with lags between -100 and +100 ms, for example, it will still calculate the correlation for every lag between -20000 and +20000 ms (which is the length of the time series). This gives a 200x performance hit! Do I have to recode the cross-correlation function by hand to include this feature?
Here are a couple functions to compute auto- and cross-correlation with limited lags. The order of multiplication (and conjugation, in the complex case) was chosen to match the corresponding behavior of numpy.correlate.
import numpy as np
from numpy.lib.stride_tricks import as_strided
def _check_arg(x, xname):
x = np.asarray(x)
if x.ndim != 1:
raise ValueError('%s must be one-dimensional.' % xname)
return x
def autocorrelation(x, maxlag):
"""
Autocorrelation with a maximum number of lags.
`x` must be a one-dimensional numpy array.
This computes the same result as
numpy.correlate(x, x, mode='full')[len(x)-1:len(x)+maxlag]
The return value has length maxlag + 1.
"""
x = _check_arg(x, 'x')
p = np.pad(x.conj(), maxlag, mode='constant')
T = as_strided(p[maxlag:], shape=(maxlag+1, len(x) + maxlag),
strides=(-p.strides[0], p.strides[0]))
return T.dot(p[maxlag:].conj())
def crosscorrelation(x, y, maxlag):
"""
Cross correlation with a maximum number of lags.
`x` and `y` must be one-dimensional numpy arrays with the same length.
This computes the same result as
numpy.correlate(x, y, mode='full')[len(a)-maxlag-1:len(a)+maxlag]
The return vaue has length 2*maxlag + 1.
"""
x = _check_arg(x, 'x')
y = _check_arg(y, 'y')
py = np.pad(y.conj(), 2*maxlag, mode='constant')
T = as_strided(py[2*maxlag:], shape=(2*maxlag+1, len(y) + 2*maxlag),
strides=(-py.strides[0], py.strides[0]))
px = np.pad(x, maxlag, mode='constant')
return T.dot(px)
For example,
In [367]: x = np.array([2, 1.5, 0, 0, -1, 3, 2, -0.5])
In [368]: autocorrelation(x, 3)
Out[368]: array([ 20.5, 5. , -3.5, -1. ])
In [369]: np.correlate(x, x, mode='full')[7:11]
Out[369]: array([ 20.5, 5. , -3.5, -1. ])
In [370]: y = np.arange(8)
In [371]: crosscorrelation(x, y, 3)
Out[371]: array([ 5. , 23.5, 32. , 21. , 16. , 12.5, 9. ])
In [372]: np.correlate(x, y, mode='full')[4:11]
Out[372]: array([ 5. , 23.5, 32. , 21. , 16. , 12.5, 9. ])
(It will be nice to have such a feature in numpy itself.)
Until numpy implements the maxlag argument, you can use the function ucorrelate from the pycorrelate package. ucorrelate operates on numpy arrays and has a maxlag keyword. It implements the correlation from using a for-loop and optimizes the execution speed with numba.
Example - autocorrelation with 3 time lags:
import numpy as np
import pycorrelate as pyc
x = np.array([2, 1.5, 0, 0, -1, 3, 2, -0.5])
c = pyc.ucorrelate(x, x, maxlag=3)
c
Result:
Out[1]: array([20, 5, -3])
The pycorrelate documentation contains a notebook showing perfect match between pycorrelate.ucorrelate and numpy.correlate:
matplotlib.pyplot provides matlab like syntax for computating and plotting of cross correlation , auto correlation etc.
You can use xcorr which allows to define the maxlags parameter.
import matplotlib.pyplot as plt
import numpy as np
data = np.arange(0,2*np.pi,0.01)
y1 = np.sin(data)
y2 = np.cos(data)
coeff = plt.xcorr(y1,y2,maxlags=10)
print(*coeff)
[-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
8 9 10] [ -9.81991753e-02 -8.85505028e-02 -7.88613080e-02 -6.91325329e-02
-5.93651264e-02 -4.95600447e-02 -3.97182508e-02 -2.98407146e-02
-1.99284126e-02 -9.98232812e-03 -3.45104289e-06 9.98555430e-03
1.99417667e-02 2.98641953e-02 3.97518558e-02 4.96037706e-02
5.94189688e-02 6.91964864e-02 7.89353663e-02 8.86346584e-02
9.82934198e-02] <matplotlib.collections.LineCollection object at 0x00000000074A9E80> Line2D(_line0)
#Warren Weckesser's answer is the best as it leverages numpy to get performance savings (and not just call corr for each lag). Nonetheless, it returns the cross-product (eg the dot product between the inputs at various lags). To get the actual cross-correlation I modified his answer w/ an optional mode argument, which if set to 'corr' returns the cross-correlation as such:
def crosscorrelation(x, y, maxlag, mode='corr'):
"""
Cross correlation with a maximum number of lags.
`x` and `y` must be one-dimensional numpy arrays with the same length.
This computes the same result as
numpy.correlate(x, y, mode='full')[len(a)-maxlag-1:len(a)+maxlag]
The return vaue has length 2*maxlag + 1.
"""
py = np.pad(y.conj(), 2*maxlag, mode='constant')
T = as_strided(py[2*maxlag:], shape=(2*maxlag+1, len(y) + 2*maxlag),
strides=(-py.strides[0], py.strides[0]))
px = np.pad(x, maxlag, mode='constant')
if mode == 'dot': # get lagged dot product
return T.dot(px)
elif mode == 'corr': # gets Pearson correlation
return (T.dot(px)/px.size - (T.mean(axis=1)*px.mean())) / \
(np.std(T, axis=1) * np.std(px))
I encountered the same problem some time ago, I paid more attention to the efficiency of calculation.Refer to the source code of MATLAB's function xcorr.m, I made a simple one.
import numpy as np
from scipy import signal, fftpack
import math
import time
def nextpow2(x):
if x == 0:
y = 0
else:
y = math.ceil(math.log2(x))
return y
def xcorr(x, y, maxlag):
m = max(len(x), len(y))
mx1 = min(maxlag, m - 1)
ceilLog2 = nextpow2(2 * m - 1)
m2 = 2 ** ceilLog2
X = fftpack.fft(x, m2)
Y = fftpack.fft(y, m2)
c1 = np.real(fftpack.ifft(X * np.conj(Y)))
index1 = np.arange(1, mx1+1, 1) + (m2 - mx1 -1)
index2 = np.arange(1, mx1+2, 1) - 1
c = np.hstack((c1[index1], c1[index2]))
return c
if __name__ == "__main__":
s = time.clock()
a = [1, 2, 3, 4, 5]
b = [6, 7, 8, 9, 10]
c = xcorr(a, b, 3)
e = time.clock()
print(c)
print(e-c)
Take the results of a certain run as an exmple:
[ 29. 56. 90. 130. 110. 86. 59.]
0.0001745000000001884
comparing with MATLAB code:
clear;close all;clc
tic
a = [1, 2, 3, 4, 5];
b = [6, 7, 8, 9, 10];
c = xcorr(a, b, 3)
toc
29.0000 56.0000 90.0000 130.0000 110.0000 86.0000 59.0000
时间已过 0.000279 秒。
If anyone can give a strict mathematical derivation about this,that would be very helpful.
I think I have found a solution, as I was facing the same problem:
If you have two vectors x and y of any length N, and want a cross-correlation with a window of fixed len m, you can do:
x = <some_data>
y = <some_data>
# Trim your variables
x_short = x[window:]
y_short = y[window:]
# do two xcorrelations, lagging x and y respectively
left_xcorr = np.correlate(x, y_short) #defaults to 'valid'
right_xcorr = np.correlate(x_short, y) #defaults to 'valid'
# combine the xcorrelations
# note the first value of right_xcorr is the same as the last of left_xcorr
xcorr = np.concatenate(left_xcorr, right_xcorr[1:])
Remember you might need to normalise the variables if you want a bounded correlation
Here is another answer, sourced from here, seems faster on the margin than np.correlate and has the benefit of returning a normalised correlation:
def rolling_window(self, a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
def xcorr(self, x,y):
N=len(x)
M=len(y)
meany=np.mean(y)
stdy=np.std(np.asarray(y))
tmp=self.rolling_window(np.asarray(x),M)
c=np.sum((y-meany)*(tmp-np.reshape(np.mean(tmp,-1),(N-M+1,1))),-1)/(M*np.std(tmp,-1)*stdy)
return c
as I answered here, https://stackoverflow.com/a/47897581/5122657
matplotlib.xcorr has the maxlags param. It is actually a wrapper of the numpy.correlate, so there is no performance saving. Nevertheless it gives exactly the same result given by Matlab's cross-correlation function. Below I edited the code from matplotlib so that it will return only the correlation. The reason is that if we use matplotlib.corr as it is, it will return the plot as well. The problem is, if we put complex data type as the arguments into it, we will get "casting complex to real datatype" warning when matplotlib tries to draw the plot.
<!-- language: python -->
import numpy as np
import matplotlib.pyplot as plt
def xcorr(x, y, maxlags=10):
Nx = len(x)
if Nx != len(y):
raise ValueError('x and y must be equal length')
c = np.correlate(x, y, mode=2)
if maxlags is None:
maxlags = Nx - 1
if maxlags >= Nx or maxlags < 1:
raise ValueError('maxlags must be None or strictly positive < %d' % Nx)
c = c[Nx - 1 - maxlags:Nx + maxlags]
return c

Categories

Resources