How to test if two sparse arrays are (almost) equal? - python

I want to check if two sparse arrays are (almost) equal. Whereas for numpy arrays you can do:
import numpy as np
a = np.ones(200)
np.testing.assert_array_almost_equal(a, a)
This does not work for sparse arrays, which I can understand (it either raises AttributeError: ravel not found for smaller matrices, or errors related to the size of the array). Is there a scipy equivalent to test sparse matrices? I could convert my sparse matrices to dense matrices and use the numpy testing function, but sometimes this is not possible due to memory/size constraints. E.g.:
from scipy import sparse
b = sparse.rand(80000,8000,density=0.01)
type(b) # <class 'scipy.sparse.coo.coo_matrix'>
c = b.toarray() # ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.
Is it possible to test these larger scipy arrays for equality, or should I test smaller samples?

Assuming we are not concerned with non-zeros in one array whose counterparts in the other merely fall within the tolerance of zero, we can simply get the row and column indices and the corresponding values, require exact matches between the indices, and use allclose() for the values.
Hence, the implementation would be -
import numpy as np
from scipy.sparse import find

def allclose(A, B, atol=1e-8):
    # If you want to check matrix shapes as well
    if not np.array_equal(A.shape, B.shape):
        return False
    r1, c1, v1 = find(A)
    r2, c2, v2 = find(B)
    index_match = np.array_equal(r1, r2) & np.array_equal(c1, c2)
    if not index_match:
        return False
    else:
        return np.allclose(v1, v2, atol=atol)
Here's another version that uses the nonzero and data methods in place of the find function -
def allclose_v2(A, B, atol=1e-8):
    # If you want to check matrix shapes as well
    if not np.array_equal(A.shape, B.shape):
        return False
    r1, c1 = A.nonzero()
    r2, c2 = B.nonzero()
    lidx1 = np.ravel_multi_index((r1, c1), A.shape)
    lidx2 = np.ravel_multi_index((r2, c2), B.shape)
    sidx1 = lidx1.argsort()
    sidx2 = lidx2.argsort()
    index_match = np.array_equal(lidx1[sidx1], lidx2[sidx2])
    if not index_match:
        return False
    else:
        v1 = A.data
        v2 = B.data
        V1 = v1[sidx1]
        V2 = v2[sidx2]
        return np.allclose(V1, V2, atol=atol)
We can short-circuit at a few places to speed it up further. Performance-wise, I am focusing on cases where only the values differ.
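For a quick sanity check, here is a usage sketch (the matrix sizes mirror the question; the tiny perturbation is made up purely for illustration):
from scipy import sparse
b = sparse.rand(80000, 8000, density=0.01, format='csr')
c = b.copy()
c.data += 1e-10                  # perturb the stored values only, well within atol
print(allclose(b, c))            # True: indices match exactly, values within tolerance
print(allclose_v2(b, c))         # True
print(allclose(b, b[:10, :10]))  # False: shapes differ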

Related

Efficiently setting 1D range values in a DataFrame (or an ndarray) with a boolean array

PREREQUISITE
import numpy as np
import pandas as pd
INPUT 1: boolean 2D array (a sample array as below)
x = np.array(
[[False,False,False,False,True],
[True,False,False,False,False],
[False,False,True,False,True],
[False,True,True,False,False],
[False,False,False,False,False]])
INPUT 2: 1D range values (a sample as below)
y=np.array([1,2,3,4])
EXPECTED OUTPUT: 2D ndarray
[[0,0,0,0,1],
[1,0,0,0,2],
[2,0,1,0,1],
[3,1,1,0,2],
[4,2,2,0,3]]
I want to efficiently set a range of values (a vertical vector) starting at each True in the 2D ndarray (INPUT 1). Are there any useful APIs or solutions for this purpose?
Unfortunately I couldn't come up with an elegant solution, so I came up with multiple inelegant ones. The two main approaches I could think of are
brute-force looping over each True value and assigning slices, and
using a single indexed assignment to replace the necessary values.
It turns out that the relative performance of these approaches is non-trivial, so depending on the size of your array either can be faster.
Using your example input:
import numpy as np

x = np.array(
    [[False, False, False, False, True ],
     [True,  False, False, False, False],
     [False, False, True,  False, True ],
     [False, True,  True,  False, False],
     [False, False, False, False, False]])
y = np.array([1, 2, 3, 4])
refout = np.array([[0, 0, 0, 0, 1],
                   [1, 0, 0, 0, 2],
                   [2, 0, 1, 0, 1],
                   [3, 1, 1, 0, 2],
                   [4, 2, 2, 0, 3]])
# alternative input with arbitrary size:
# N = 100; x = np.random.rand(N,N) < 0.2; y = np.arange(1,N)

def looping_clip(x, y):
    """Loop over Trues, use clipped slices"""
    nmax = x.shape[0]
    n = y.size
    # initialize output
    out = np.zeros_like(x, dtype=y.dtype)
    # loop over True values
    for i, j in zip(*x.nonzero()):
        # truncate right-hand side where necessary
        out[i:i+n, j] = y[:nmax-i]
    return out

def looping_expand(x, y):
    """Loop over Trues, use an expanded buffer"""
    n = y.size
    nmax, mmax = x.shape
    ivals, jvals = x.nonzero()
    # initialize buffed-up output
    out = np.zeros((nmax + max(n + ivals.max() - nmax, 0), mmax), dtype=y.dtype)
    # loop over True values
    for i, j in zip(ivals, jvals):
        # slice will always be complete, i.e. of length y.size
        out[i:i+n, j] = y
    return out[:nmax, :].copy()  # rather not return a view to an auxiliary array

def index_2d(x, y):
    """Assign directly with 2d indices, use an expanded buffer"""
    n = y.size
    nmax, mmax = x.shape
    ivals, jvals = x.nonzero()
    # initialize buffed-up output
    out = np.zeros((nmax + max(n + ivals.max() - nmax, 0), mmax), dtype=y.dtype)
    # now we can safely index for each "(ivals:ivals+n, jvals)" so to speak
    upped_ivals = ivals[:, None] + np.arange(n)        # shape (ntrues, n)
    upped_jvals = jvals.repeat(y.size).reshape(-1, n)  # shape (ntrues, n)
    out[upped_ivals, upped_jvals] = y  # right-hand side of shape (n,) broadcasts
    return out[:nmax, :].copy()  # rather not return a view to an auxiliary array

def index_1d(x, y):
    """Assign using linear indices, use an expanded buffer"""
    n = y.size
    nmax, mmax = x.shape
    ivals, jvals = x.nonzero()
    # initialize buffed-up output
    out = np.zeros((nmax + max(n + ivals.max() - nmax, 0), mmax), dtype=y.dtype)
    # grab linear indices corresponding to Trues in a buffed-up array
    inds = np.ravel_multi_index((ivals, jvals), out.shape)
    # now all we need to do is start stepping along rows for each item and assign y
    upped_inds = inds[:, None] + mmax*np.arange(n)  # shape (ntrues, n)
    out.flat[upped_inds] = y  # y of shape (n,) broadcasts to (ntrues, n)
    return out[:nmax, :].copy()  # rather not return a view to an auxiliary array

# check that the results are correct
print(all([np.array_equal(refout, looping_clip(x, y)),
           np.array_equal(refout, looping_expand(x, y)),
           np.array_equal(refout, index_2d(x, y)),
           np.array_equal(refout, index_1d(x, y))]))
I tried to document each function, but here's a synopsis:
looping_clip loops over every True value in the input and assigns to a corresponding slice in the output. We take care on the right-hand side to shorten the assigned array for when part of the slice would go beyond the edge of the array along the first dimension.
looping_expand loops over every True value in the input and assigns to a corresponding full slice in the output after allocating a padded output array ensuring that every slice will be full. We do more work when allocating a larger output array, but we don't have to shorten the right-hand side on assignment. We could omit the .copy() call in the last step, but I prefer not to return a nontrivially strided array (i.e. a view to an auxiliary array rather than a proper copy) as this might lead to obscure surprises for the user.
index_2d computes the 2d indices of every value to be assigned to, and assumes that duplicate indices will be handled in order. This is not guaranteed! (More on this a bit later.)
index_1d does the same using linearized indices and indexing into the flatiter of the output.
I timed the above methods using random arrays (see the commented line near the start of the code).
What we can see is that for small and large arrays the looping versions are faster, but for linear sizes between roughly 10 and 150 the indexing versions are better. The reason I didn't go to higher sizes is that the indexing cases start to use a lot of memory, and I didn't want to have to worry about this messing with timings.
Just to make the above worse, note that the indexing versions assume that duplicate indices in a fancy indexing scenario are handled in order, so when True values are handled which are "lower" in the array, previous values will be overwritten as per your requirements. There's only one problem: this is not guaranteed:
For advanced assignments, there is in general no guarantee for the iteration order. This means that if an element is set more than once, it is not possible to predict the final result.
This doesn't sound very encouraging. While in my experiments it seems that the indices are handled in order (according to C order), this can also be coincidence, or an implementation detail. So if you want to use the indexing versions, make sure that on your specific version and specific dimensions and shapes this still holds true.
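As a tiny made-up illustration of that caveat (the result shown is what current NumPy typically produces, but the docs do not promise it):
import numpy as np
out = np.zeros(3)
# index 0 appears twice; which of its two candidate values "wins" is not guaranteed
out[np.array([0, 0, 1])] = np.array([10., 20., 30.])
print(out)  # typically [20. 30.  0.], but the ordering is an implementation detail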
We can make the assignment safer by getting rid of duplicate indices ourselves. For this we can make use of this answer by Divakar on a corresponding question:
def index_1d_safe(x, y):
    """Same as index_1d but use Divakar's safe solution for reducing duplicates"""
    n = y.size
    nmax, mmax = x.shape
    ivals, jvals = x.nonzero()
    # initialize buffed-up output
    out = np.zeros((nmax + max(n + ivals.max() - nmax, 0), mmax), dtype=y.dtype)
    # grab linear indices corresponding to Trues in a buffed-up array
    inds = np.ravel_multi_index((ivals, jvals), out.shape)
    # now all we need to do is start stepping along rows for each item and assign y
    upped_inds = inds[:, None] + mmax*np.arange(n)  # shape (ntrues, n)
    # now comes https://stackoverflow.com/a/44672126
    # need additional step: flatten upped_inds and corresponding y values for selection
    upped_flat_inds = upped_inds.ravel()  # shape (ntrues, n) -> (ntrues*n,)
    y_vals = np.broadcast_to(y, upped_inds.shape).ravel()  # shape (ntrues, n) -> (ntrues*n,)
    sidx = upped_flat_inds.argsort(kind='mergesort')
    sindex = upped_flat_inds[sidx]
    idx = sidx[np.r_[np.flatnonzero(sindex[1:] != sindex[:-1]), upped_flat_inds.size-1]]
    out.flat[upped_flat_inds[idx]] = y_vals[idx]
    return out[:nmax, :].copy()  # rather not return a view to an auxiliary array
This still reproduces your expected output. The problem is that now the function takes much longer to finish.
Bummer. Considering how my indexing versions are only faster for an intermediate range of array sizes, and how their faster (unsafe) variants are not guaranteed to work, perhaps it's simplest to just use one of the looping versions. This is not to say, of course, that there aren't optimal vectorized solutions that I missed.

Why does my matrix vector multiplication in NumPy yield a two dimensional array instead of a one dimensional vector?

I have a matrix called inverseJ, which is a 2x2 matrix ([[0.07908312, 0.03071918], [-0.12699082, -0.0296126]]), and a one dimensional vector deltaT of length two ([-31.44630082, -16.9922145]). In NumPy, multiplying these should yield a one dimensional vector again, as in this example. However, when I multiply these using inverseJ.dot(deltaT), I get a two dimensional array ([[-3.00885838, 4.49657509]]) with the only element being the vector I am actually looking for. Does anyone know why I am not simply getting a vector? Any help is greatly appreciated!
Whole script for reference
from __future__ import division
import sys
import io
import os
from math import *
import numpy as np

if __name__ == "__main__":
    # Fingertip position
    x = float(sys.argv[1])
    y = float(sys.argv[2])
    # Initial guesses
    q = np.array([0., 0.])
    q[0] = float(sys.argv[3])
    q[1] = float(sys.argv[4])
    error = 0.01
    while error > 0.001:
        # Configuration matrix
        T = np.array([17.3*cos(q[0] + (5/3)*q[1]) + 25.7*cos(q[0] + q[1]) + 41.4*cos(q[0]),
                      17.3*sin(q[0] + (5/3)*q[1]) + 25.7*sin(q[0] + q[1]) + 41.4*sin(q[0])])
        # Deviation
        deltaT = np.subtract(np.array([x, y]), T)
        error = deltaT[0]**2 + deltaT[1]**2
        # Jacobian
        J = np.matrix([[-25.7*sin(q[0]+q[1]) - 17.3*sin(q[0]+(5/3)*q[1]) - 41.4*sin(q[0]),
                        -25.7*sin(q[0]+q[1]) - 28.8333*sin(q[0]+(5/3)*q[1])],
                       [ 25.7*cos(q[0]+q[1]) + 17.3*cos(q[0]+(5/3)*q[1]) + 41.4*cos(q[0]),
                         25.7*cos(q[0]+q[1]) + 28.8333*cos(q[0]+(5/3)*q[1])]])
        # Inverse of the Jacobian
        det = J.item((0, 0))*J.item((1, 1)) - J.item((0, 1))*J.item((1, 0))
        inverseJ = 1/det * np.matrix([[ J.item((1, 1)), -J.item((0, 1))],
                                      [-J.item((1, 0)),  J.item((0, 0))]])
        ### THE PROBLEMATIC MATRIX VECTOR MULTIPLICATION IN QUESTION
        q = q + inverseJ.dot(deltaT)
When a matrix is involved in an operation, the output is another matrix. matrix objects are matrices in the strict linear-algebra sense: they are always 2D, even if they have only one element.
By contrast, the example you mention uses arrays, not matrices. Arrays are more "loosely behaved". One of the differences is that "useless" dimensions are removed, yielding a 1D vector in this example.
This simply seems to be the way numpy.dot() works: since one of the operands is a two-dimensional matrix, the result is two-dimensional as well. dot() is not a smart method; it just does what it's told, without sanity checks, from what I can gather in the documentation. Note that this is not an error in your code, but you will have to extract the inner vector yourself.
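For illustration, here is a minimal sketch of the two usual workarounds, using the numbers from the question (convert the matrix to a plain ndarray before multiplying, or flatten the matrix result afterwards):
import numpy as np
inverseJ_m = np.matrix([[ 0.07908312,  0.03071918],
                        [-0.12699082, -0.0296126 ]])
deltaT = np.array([-31.44630082, -16.9922145])
print(inverseJ_m.dot(deltaT).shape)                # (1, 2): still a matrix, hence 2D
inverseJ_a = np.asarray(inverseJ_m)                # plain 2D ndarray instead
print(inverseJ_a.dot(deltaT).shape)                # (2,): the 1D vector you expect
print(np.asarray(inverseJ_m.dot(deltaT)).ravel())  # or flatten the matrix result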

Python NumPy error: setting an array element with a sequence

I'm quite new to Python and Numpy, so I apologize if I'm missing something obvious here.
I have a function that solves a system of 2 differential equations:
import numpy as np
import numpy.linalg as la
def solve_ode(x0, a0, beta, t):
    At = np.array([[0.23*t, (-10**5)*t], [0, -beta*t]], dtype=np.float32)
    # get eigenvalues and eigenvectors
    evals, V = la.eig(At)
    Vi = la.inv(V)
    # get e^At coeff
    eAt = V @ np.exp(evals) @ Vi
    xt = eAt*x0
    return xt
However, running it with this code :
import matplotlib.pyplot as plt
# initial values
x0 = 10**6
a0 = 2.5
beta = 0.05
t = np.linspace(0, 3600, 360)
plt.semilogy(t, solve_ode(x0, a0, beta, t))
... throws this error :
ValueError: setting an array element with a sequence.
At this line :
At = np.array([[0.23*t, (-10**5)*t], [0, -beta*t]], dtype=np.float32)
Note that t and beta are supposed to be floats. I think Python might not be able to infer this, but I don't know how to make that explicit...
Thanks in advance for your help.
You are supplying t as a NumPy array of shape (360,) from linspace, not simply a float. The resulting At array you are trying to create is then ill-formed, as all of its entries must have the same length. In Python there is an important difference between lists and NumPy arrays. For example, you could keep what you have here as a list of lists, e.g.
At = [[0.23*t, (-10**5)*t], [0, -beta*t]]
whose entries are a mix of length-360 arrays and a scalar.
Alternatively, if all elements of At have the same length as t, the array would work:
At = np.array([[0.23*t, (-10**5)*t], [t, -beta*t]], dtype=np.float32)
with shape (2, 2, 360).
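For instance, a minimal reproduction of the two cases (following the question's setup):
import numpy as np
t = np.linspace(0, 3600, 360)
beta = 0.05
# ragged nesting: two length-360 arrays next to a scalar 0 cannot form a float array
# np.array([[0.23*t, (-10**5)*t], [0, -beta*t]], dtype=np.float32)
# -> ValueError: setting an array element with a sequence.
# consistent nesting: every entry is a length-360 array
At = np.array([[0.23*t, (-10**5)*t], [t, -beta*t]], dtype=np.float32)
print(At.shape)  # (2, 2, 360)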
When you give a list, or a list of lists, or in this case a list of lists of lists, all of them should have the same length, so that NumPy can automatically infer the dimensions (shape) of the resulting array.
In your example everything is consistent except the part where you put a plain 0 as one entry (I'm not sure what to call it, since your expected output is a cube of sorts).
You can fix it by supplying the correct number of zeros, as below:
At = np.array([[0.23*t, (-10**5)*t], [np.zeros(len(t)), -beta*t]], dtype=np.float32)
But check the .shape of the resulting array, and make sure it's what you want.
As others note, the problem is the 0 in the inner list: it doesn't match the length-360 arrays generated by the other expressions. np.array can make an object-dtype array from that (2x2), but it can't make a float one.
At = np.array([[0.23*t, (-10**5)*t], [0*t, -beta*t]])
produces a (2,2,360) array. But I suspect the rest of that function is built around the assumption that At is (2,2) - a 2d square array with eig, inv etc.
What is the return xt supposed to be?
Does this work?
S = np.array([solve_ode(x0, a0, beta, i) for i in t])
giving a 1d array with the same number of values as in t?
I'm not suggesting this is the fastest way of solving the problem, but it's the simplest, especially if you are only generating 360 values.

np.bincount for 1 line, vectorized multidimensional averaging

I am trying to vectorize an operation using NumPy. I use it in a Python script that I have profiled, and this operation turned out to be the bottleneck, so it needs to be optimized since I will run it many times.
The operation works on a data set with two parts. First, a large set (n) of 1D vectors of different lengths (with maximum length Lmax) whose elements are integers from 1 to maxvalue. The set of vectors is arranged in a 2D array, data, of size (num_samples, Lmax), with trailing elements in each row zeroed. The second part is a set of scalar floats, one associated with each vector, that I have computed and which depend on its length and the integer value at each position. The set of scalars is made into a 1D array, Y, of size num_samples.
The desired operation is to form the average of Y over the n samples, as a function of (value,position along length,length).
This entire operation can be vectorized in matlab with use of the accumarray function: by using 3 2D arrays of the same size as data, whose elements are the corresponding value, position, and length indices of the desired final array:
sz_Y = num_samples;
sz_len = Lmax
sz_pos = Lmax
sz_val = maxvalue
ind_len = repmat( 1:sz_len ,1 ,sz_samples);
ind_pos = repmat( 1:sz_pos ,sz_samples,1 );
ind_val = data
ind_Y = repmat((1:sz_Y)',1 ,Lmax );
copiedY=Y(ind_Y);
mask = data>0;
finalarr=accumarray({ind_val(mask),ind_pos(mask),ind_len(mask)},copiedY(mask), [sz_val sz_pos sz_len])/sz_val;
I was hoping to emulate this implementation with np.bincount. However, np.bincount differs from accumarray in two relevant ways:
both arguments must be of same 1D size, and
there is no option to choose the shape of the output array.
In the above usage of accumarray, the list of indices, {ind_val(mask),ind_pos(mask),ind_len(mask)}, is a cell array of index vectors used as index tuples, while in np.bincount the input must be a single 1D array of scalars as far as I understand. I expect np.ravel may be useful, but I am not sure how to use it here to do what I want. I am coming to Python from MATLAB and some things do not translate directly, e.g. the colon operator, which ravels in the opposite (column-major) order to ravel. So my question is: how might I use np.bincount or any other NumPy method to achieve an efficient Python implementation of this operation?
EDIT: To avoid wasting time: for these multi-dimensional index problems with complicated index manipulation, is the recommended route to just use Cython to implement the loops explicitly?
EDIT2: Alternative Python implementation I just came up with.
Here is a heavy-RAM solution:
First precalculate:
Using index units for length (i.e., length 1 maps to index 0), make a 4D bool array of size (num_samples, Lmax+1, Lmax+1, maxvalue+1), holding where the conditions are satisfied for each value in Y.
ALLcond = np.zeros((num_samples, Lmax+1, Lmax+1, maxvalue+1), dtype='bool')
for l in range(Lmax+1):
    for i in range(Lmax+1):
        for v in range(maxvalue+1):
            ALLcond[:, l, i, v] = (data[:, i] == v) & (Lvec == l)
Where Lvec=[len(row) for row in data]. Then get the indices for these using np.where and initialize a 4D float array into which you will assign the values of Y:
ind_Y, ind_len, ind_pos, ind_val = np.where(ALLcond)
Yval = np.zeros(np.shape(ALLcond), dtype='float')
Now in the loop in which I have to perform the operation, I compute it with the two lines:
Yval[ind_Y, ind_len, ind_pos, ind_val] = Y[ind_Y]
Y_avg = np.sum(Yval, axis=0)/num_samples
This gives a factor of 4 or so speed-up over the direct loop implementation. I was expecting more. Perhaps this is a more tangible implementation for Python heads to digest. Any faster suggestions are welcome :)
One way is to convert the 3 "indices" to a linear index and then apply bincount. Numpy's ravel_multi_index is essentially the same as MATLAB's sub2ind. So the ported code could be something like:
shape = (Lmax+1, Lmax+1, maxvalue+1)
posvec = np.arange(1, Lmax+1)
ind_len = np.tile(Lvec[:,None], [1, Lmax])
ind_pos = np.tile(posvec, [n, 1])
ind_val = data
Y_copied = np.tile(Y[:,None], [1, Lmax])
mask = posvec <= Lvec[:,None] # fill-value independent
lin_idx = np.ravel_multi_index((ind_len[mask], ind_pos[mask], ind_val[mask]), shape)
Y_avg = np.bincount(lin_idx, weights=Y_copied[mask], minlength=np.prod(shape)) / n
Y_avg.shape = shape
This is assuming data has shape (n, Lmax), Lvec is Numpy array, etc. You may need to adapt the code a little to get rid of off-by-one errors.
One could argue that the tile operations are not very efficient and not very "numpythonic". Something with broadcast_arrays could be nice, but I think I prefer this way:
shape = (Lmax+1, Lmax+1, maxvalue+1)
posvec = np.arange(1, Lmax+1)
mask = posvec <= Lvec[:,None] # fill-value independent
len_idx = np.repeat(Lvec, Lvec)
pos_idx = np.broadcast_to(posvec, data.shape)[mask]
val_idx = data[mask]
Y_copied = np.repeat(Y, Lvec)
lin_idx = np.ravel_multi_index((len_idx, pos_idx, val_idx), shape)
Y_avg = np.bincount(lin_idx, weights=Y_copied, minlength=np.prod(shape)) / n
Y_avg.shape = shape
Note broadcast_to was added in Numpy 1.10.0.
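To make the indexing mechanics concrete, here is a small self-contained toy example of the same bincount/ravel_multi_index pattern (the data below is made up purely for illustration):
import numpy as np
# 3 samples, rows zero-padded to Lmax = 4, values in 1..maxvalue = 2
data = np.array([[1, 2, 1, 0],
                 [2, 2, 0, 0],
                 [1, 1, 1, 1]])
Lvec = np.array([3, 2, 4])        # true length of each row
Y = np.array([0.5, 1.0, 2.0])     # one scalar per sample
Lmax, maxvalue, n = 4, 2, len(Y)
shape = (Lmax+1, Lmax+1, maxvalue+1)
posvec = np.arange(1, Lmax+1)
mask = posvec <= Lvec[:, None]    # keep only the "real" (non-padded) entries
lin_idx = np.ravel_multi_index(
    (np.repeat(Lvec, Lvec),                      # length index per kept entry
     np.broadcast_to(posvec, data.shape)[mask],  # position index per kept entry
     data[mask]),                                # value index per kept entry
    shape)
Y_avg = np.bincount(lin_idx, weights=np.repeat(Y, Lvec),
                    minlength=np.prod(shape)).reshape(shape) / n
print(Y_avg[3, 1, 1])  # contribution of Y for entries with length 3, value 1 at position 1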

pointwise operations on scipy.sparse matrices

Is it possible to apply for example numpy.exp or similar pointwise operators to all elements in a scipy.sparse.lil_matrix or another sparse matrix format?
import numpy
from scipy.sparse import lil_matrix
x = numpy.ones((10,10))
y = numpy.exp(x)
x = lil_matrix(numpy.ones((10,10)))
# y = ????
numpy.exp(x) or scipy.exp(x) yields an AttributeError, and numpy.exp(x.data) yields the same.
thanks!
I do not know the full details, but converting to another type works, at least when operating on the array of non-zero elements:
xcsc = x.tocsc()
numpy.exp(xcsc.data) # works
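If you actually want a sparse matrix back rather than just the transformed data array, here is a sketch along the same lines (note it exponentiates only the stored non-zeros; a true elementwise exp would produce a dense result, since exp(0) = 1):
import numpy
from scipy.sparse import lil_matrix
x = lil_matrix(numpy.ones((10, 10)))
y = x.tocsc()                 # CSC/CSR store the non-zeros in a flat .data array
y.data = numpy.exp(y.data)    # apply exp to the stored entries only
print(y.toarray()[0, 0])      # 2.718..., i.e. exp(1.0)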
