How to condense a sparse matrix efficiently - python

In Python, for my application it is usually best to create a sparse matrix by creating a sparse COO matrix with rows, columns and values arrays and then changing it to CSC (CSR) format.
Now, say I want to condense the CSC matrix. What is an efficient way to do so? The condensation rows/columns vary during the code and are much smaller than the dimensions of the sparse matrix, so I do not believe rebuilding the COO matrix is efficient.
The following MWE shows an example for creating the condensed matrix, but without any optimization attempts. There is a sparse efficiency warning because the number of nonzeros is increased. In this MWE I use dia_array to create the sparse matrix for simplicity.
import numpy as np
from scipy.sparse import dia_array
def main():
    """Build a small banded matrix, condense some rows/columns, print both.

    The rows/columns listed in ``cond_rowscols`` are merged into the first
    listed one: their entries are added to that row/column and the other
    listed rows/columns are then removed from the matrix.
    """
    n = 10
    m = 6
    # Each row of `data` is one constant diagonal of the banded matrix.
    data = np.tile(np.concatenate((np.arange(1, m+1),
                                   np.arange(m-1, 0, -1)))[:, np.newaxis],
                   (1, n))
    offsets = np.arange(-m+1, m)
    A = dia_array((data, offsets), shape=(n, n)).tocsc()
    print("Matrix A:")
    print(repr(A.toarray()))
    # condense these rows/columns
    cond_rowscols = np.arange(n-8, n, 2)
    print("Condensed rows/columns of A:")
    print(repr(cond_rowscols))
    # IMPROVE HERE
    # condensation algorithm
    B = A.copy()
    # Accumulate the merged rows into the first one, then do the same for
    # the columns of the already row-updated matrix.
    B[[cond_rowscols[0]]] += B[cond_rowscols[1:]].sum(axis=0)
    B[:, [cond_rowscols[0]]] += B[:, cond_rowscols[1:]].sum(axis=1)[:, np.newaxis]
    # Keep every row/column except the ones that were merged away.
    free_rowscols = np.ones(n, dtype=bool)
    free_rowscols[cond_rowscols[1:]] = False
    B = B[np.ix_(free_rowscols, free_rowscols)]
    print("Condensed matrix A:")
    print(repr(B.toarray()))
    print('Done')


if __name__ == "__main__":
    main()
The output is:
Matrix A:
array([[6, 5, 4, 3, 2, 1, 0, 0, 0, 0],
[5, 6, 5, 4, 3, 2, 1, 0, 0, 0],
[4, 5, 6, 5, 4, 3, 2, 1, 0, 0],
[3, 4, 5, 6, 5, 4, 3, 2, 1, 0],
[2, 3, 4, 5, 6, 5, 4, 3, 2, 1],
[1, 2, 3, 4, 5, 6, 5, 4, 3, 2],
[0, 1, 2, 3, 4, 5, 6, 5, 4, 3],
[0, 0, 1, 2, 3, 4, 5, 6, 5, 4],
[0, 0, 0, 1, 2, 3, 4, 5, 6, 5],
[0, 0, 0, 0, 1, 2, 3, 4, 5, 6]])
Condensed rows/columns of A:
array([2, 4, 6, 8])
Condensed matrix A:
array([[ 6, 5, 6, 3, 1, 0, 0],
[ 5, 6, 9, 4, 2, 0, 0],
[ 6, 9, 56, 14, 16, 14, 9],
[ 3, 4, 14, 6, 4, 2, 0],
[ 1, 2, 16, 4, 6, 4, 2],
[ 0, 0, 14, 2, 4, 6, 4],
[ 0, 0, 9, 0, 2, 4, 6]])
Done
Edit: As per hpaulj's comment, we can create a condensation matrix T:
# condensation with matrix multiplication
n_conds = cond_rowscols.shape[0] # number of condensed rows/cols
t_vals = np.ones(n, dtype=int)
t_rows = np.arange(n)
t_cols = np.empty_like(t_rows)
t_cols[free_rowscols] = np.arange(n-n_conds+1)
t_cols[cond_rowscols[1:]] = cond_rowscols[0]
T = csc_array((t_vals, (t_rows, t_cols)), shape=(n, n-n_conds+1))
Such that the condensed matrix A is B = T.T @ A @ T.
T is:
>>>print(repr(T.toarray()))
array([[1, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0],
[0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0],
[0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1]], dtype=int32)
This does not yield any sparse efficiency warnings! I still have to time it, though, for larger problems. Does scipy.sparse use sparse BLAS for sparse matrix multiplications?

I have created a test case for both options shown in the MWE with a simple performance comparison, using a smaller set of values than I intend to use in practice.
It seems that the in-place sum option takes about six times as long as the multiplication option (roughly 9.0 s versus 1.5 s in the run shown below).
Since solving linear systems is always much more expensive, I think I'll settle with the sparse matrix multiplication option, for now.
import numpy as np
from scipy.sparse import dia_array, csr_array
from time import perf_counter
def build_sp_mat(n=None, m=None):
    """Generate a square sparse CSC matrix of size `n` and half bandwidth `m`.

    Parameters
    ----------
    n : int, optional
        Matrix size; defaults to 1000.
    m : int, optional
        Half bandwidth; defaults to 50. The main diagonal holds `m` and the
        values decrease by one per diagonal down to 1 on the outermost ones.

    Returns
    -------
    The banded ``n`` x ``n`` matrix converted to CSC format.
    """
    if n is None:
        n = 1_000  # matrix size
    if m is None:
        m = 50  # half bandwidth
    # One constant row per diagonal: 1, 2, ..., m, ..., 2, 1.
    diag_values = np.concatenate((np.arange(1, m+1), np.arange(m-1, 0, -1)))
    data = np.tile(diag_values[:, np.newaxis], (1, n))
    offsets = np.arange(-m+1, m)
    A = dia_array((data, offsets), shape=(n, n)).tocsc()
    return A
def condensed_rowscols(n, n_conds=None):
    """Return the indices of the condensed rows/columns.

    Also returns a boolean mask of the non-condensed rows/columns.

    Parameters
    ----------
    n : int
        Size of the (square) matrix.
    n_conds : int, optional
        Total number of condensed rows/columns; defaults to 50.

    Returns
    -------
    cond_rowscols : ndarray of int
        Exactly `n_conds` evenly spaced indices, in increasing order.
    free_rowscols : ndarray of bool, length `n`
        True everywhere except at ``cond_rowscols[1:]``; the first condensed
        index stays True because it is the row/column the others merge into.

    The chosen indices are also printed.
    """
    if n_conds is None:
        n_conds = 50  # total number of condensed rows/columns
    # condense these rows/columns
    step = n // n_conds
    # When `n` is not a multiple of `n_conds` the range holds more than
    # `n_conds` entries (e.g. n=15, n_conds=4 gives 5), so cut it to size.
    cond_rowscols = np.arange(step-1, n, step)[:n_conds]
    free_rowscols = np.ones(n, dtype=bool)
    free_rowscols[cond_rowscols[1:]] = False
    print(repr(cond_rowscols))
    return cond_rowscols, free_rowscols
def condense_in_place_sum(A, cond_rowscols, free_rowscols):
    """Perform condensation via a sum of the condensed rows and columns.

    Parameters
    ----------
    A : sparse array
        Square matrix to condense. It is MODIFIED IN PLACE: the row and
        column ``cond_rowscols[0]`` receive the accumulated sums. Pass a copy
        if the original is still needed.
    cond_rowscols : array of int
        Rows/columns to merge; all are merged into the first entry.
    free_rowscols : array of bool
        Mask of the rows/columns to keep in the result.

    Returns
    -------
    The sparse submatrix of the updated `A` restricted to the free
    rows/columns.
    """
    # Rows first; the column sum below then runs on the row-updated matrix,
    # so the corner entry collects the full block sum.
    A[[cond_rowscols[0]]] += A[cond_rowscols[1:]].sum(axis=0)
    A[:, [cond_rowscols[0]]] += A[:, cond_rowscols[1:]].sum(axis=1)[:, np.newaxis]
    Acond = A[np.ix_(free_rowscols, free_rowscols)]  # indices are sorted
    return Acond
def condense_mat_mult(A, cond_rowscols, free_rowscols, n, n_conds):
    """Perform condensation via sparse matrix multiplications.

    Builds a 0/1 condensation matrix ``T`` of shape ``(n, n - n_conds + 1)``
    with a single one per row and returns ``T.T @ A @ T``.

    Parameters
    ----------
    A : sparse array
        Square ``n`` x ``n`` matrix to condense; it is not modified.
    cond_rowscols : array of int
        Rows/columns to merge; all are merged into the first entry.
    free_rowscols : array of bool, length `n`
        Mask of the rows/columns kept in the result; it must contain
        ``n - n_conds + 1`` True entries.
    n : int
        Size of `A`.
    n_conds : int
        Number of entries in `cond_rowscols`.

    Returns
    -------
    Acond : sparse array
        The condensed matrix, with sorted indices.
    T : csr_array
        The condensation matrix.
    """
    t_vals = np.ones(n, dtype=int)
    t_rows = np.arange(n)
    t_cols = np.empty_like(t_rows)
    # Kept rows map, in order, onto the columns of the condensed matrix.
    t_cols[free_rowscols] = np.arange(n-n_conds+1)
    # Merged rows share the condensed column of the row they merge into.
    # Look that column up instead of reusing the raw index, which is only
    # the same number when cond_rowscols[0] precedes every merged row.
    t_cols[cond_rowscols[1:]] = t_cols[cond_rowscols[0]]
    T = csr_array((t_vals, (t_rows, t_cols)), shape=(n, n-n_conds+1))
    Acond = T.T @ A @ T
    Acond.sort_indices()  # indices have to be sorted
    return Acond, T
# Timing comparison of the two condensation strategies on one large matrix.
if __name__ == "__main__":
    n, m, n_conds = 500_000, 54, 1000
    A = build_sp_mat(n, m)
    cond_rowscols, free_rowscols = condensed_rowscols(n, n_conds)
    A.sort_indices()  # this is important
    # The in-place variant overwrites its input, so give it its own copy
    # (made before the clock starts, so the copy is not part of the timing).
    B = A.copy()
    stime = perf_counter()
    Acond1 = condense_in_place_sum(B, cond_rowscols, free_rowscols)
    print(f"Condensation via in-place sum took {perf_counter()-stime} s")
    # Acond1 comes with its indices array sorted automatically
    stime = perf_counter()
    Acond2, _ = condense_mat_mult(A, cond_rowscols, free_rowscols, n, n_conds)
    print(f"Condensation via sparse multiplication took {perf_counter()-stime} s")
    # Acond2's indices array has to be sorted, as shown in the function
    print("Done")
The output is
Condensation via in-place sum took 9.0079096 s
Condensation via sparse multiplication took 1.4657909 s
Done

Related

Is there any fast way to find identical rows of two sparse matrices with different sizes?

Consider A, an n by j matrix, and B, an m by j matrix, both in SciPy with m<n. Is there any way that I can find the indices of the rows of A which are identical to rows of B?
I have tried for loops and tried to convert them into Numpy arrays. In my case, they're not working because I'm dealing with huge matrices.
Here is the link to the same question for Numpy arrays.
Edit:
An Example for A, B, and the desired output:
>>> import numpy as np
>>> from scipy.sparse import csc_matrix
>>> row = np.array([0, 2, 2, 0, 1, 2])
>>> col = np.array([0, 0, 1, 2, 2, 2])
>>> data = np.array([1, 3, 3, 4, 5, 6])
>>> A = csc_matrix((data, (row, col)), shape=(5, 3))
>>> A.toarray()
array([[1, 0, 4],
[0, 0, 5],
[3, 3, 6],
[0, 0, 0],
[0, 0, 0]])
>>> row = np.array([0, 2, 2, 0, 1, 2])
>>> col = np.array([0, 0, 1, 2, 2, 2])
>>> data = np.array([1, 2, 3, 4, 5, 6])
>>> B = csc_matrix((data, (row, col)), shape=(4, 3))
>>> B.toarray()
array([[1, 0, 4],
[0, 0, 5],
[2, 3, 6],
[0, 0, 0]])
Desired output:
def some_function(A,B):
# Some operations
return indices
>>> some_function(A,B)
[0, 1, 3, 4]

Python Numpy - Create 2d array where length is based on 1D array

Sorry for confusing title, but not sure how to make it more concise. Here's my requirements:
arr1 = np.array([3,5,9,1])
arr2 = ?(arr1)
arr2 would then be:
[
[0,1,2,0,0,0,0,0,0],
[0,1,2,3,4,0,0,0,0],
[0,1,2,3,4,5,6,7,8],
[0,0,0,0,0,0,0,0,0]
]
It doesn't need to vary based on the max, the shape is known in advance. So to start I've been able to get a shape of zeros:
arr2 = np.zeros((len(arr1),max_len))
And then of course I could do a for loop over arr1 like this:
for i, element in enumerate(arr1):
arr2[i,0:element] = np.arange(element)
but that would likely take a long time and both dimensions here are rather large (arr1 is a few million rows, max_len is around 500). Is there a clean optimized way to do this in numpy?
Building on a 'padding' idea posted by @Divakar some years ago:
In [161]: res = np.arange(9)[None,:].repeat(4,0)
In [162]: res[res>=arr1[:,None]] = 0
In [163]: res
Out[163]:
array([[0, 1, 2, 0, 0, 0, 0, 0, 0],
[0, 1, 2, 3, 4, 0, 0, 0, 0],
[0, 1, 2, 3, 4, 5, 6, 7, 8],
[0, 0, 0, 0, 0, 0, 0, 0, 0]])
Try this with itertools.zip_longest -
import itertools

import numpy as np

# One range(k) per entry k of arr1.
ranges = map(range, arr1)
# zip_longest yields one tuple per output column and pads the shorter ranges
# with 0, so the result is only as wide as the largest entry of arr1.
# The tuples are collected first because NumPy's stacking functions expect
# a sequence (list/tuple) rather than a bare iterator.
arr2 = np.column_stack(tuple(itertools.zip_longest(*ranges, fillvalue=0)))
print(arr2)
array([[0, 1, 2, 0, 0, 0, 0, 0, 0],
[0, 1, 2, 3, 4, 0, 0, 0, 0],
[0, 1, 2, 3, 4, 5, 6, 7, 8],
[0, 0, 0, 0, 0, 0, 0, 0, 0]])
I am adding a slight variation on @hpaulj's answer because you mentioned that max_len is around 500 and you have millions of rows. In this case, you can precompute a 500 by 500 matrix containing all possible rows and index into it using arr1:
import numpy as np
# Fixed seed so both approaches are compared on the same random input.
np.random.seed(0)
max_len = 500
arr = np.random.randint(0, max_len, size=10**5)
# generate all unique rows first, then index
# can be faster if max_len << len(arr)
# 53 ms (author's timing)
# Row i of `template` is 0, 1, ..., i-1 followed by zeros: every row starts
# as arange(max_len) and tril(k=-1) keeps only the columns below i.
template = np.tril(np.arange(max_len)[None,:].repeat(max_len,0), k=-1)
# Fancy indexing copies the precomputed row for each entry of `arr`.
res = template[arr,:]
# 173 ms (author's timing)
# Reference approach: one arange row per entry, then zero out every
# position at or beyond that entry's length.
res1 = np.arange(max_len)[None,:].repeat(arr.size,0)
res1[res1>=arr[:,None]] = 0
# Both constructions must give the same array.
assert (res == res1).all()

setting the values of sliding windows of an array in numpy

Suppose I have a 2D array with shape (3, 3), call it a, and an array of zeros with shape (7, 7, 5, 5), call it b. I want to modify b in the following way:
for p in range(5):
for q in range(5):
b[p:p + 3, q:q + 3, p, q] = a
Given:
a = np.array([[4, 2, 2],
[9, 0, 5],
[9, 9, 4]])
b = np.zeros((7, 7, 5, 5), dtype=int)
b would end up something like:
>>> b[:, :, 0, 0]
array([[4, 2, 2, 0, 0, 0, 0],
[9, 0, 5, 0, 0, 0, 0],
[9, 9, 4, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0]])
>>> b[:, :, 0, 1]
array([[0, 4, 2, 2, 0, 0, 0],
[0, 9, 0, 5, 0, 0, 0],
[0, 9, 9, 4, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0]])
One way to think about this is to make a sliding window view of b (6D), slice out the parts you want (3D or 4D), and assign a to them.
However, there is a simpler way to do this altogether. The way a sliding window view works is by creating a dimension that steps along less than the full size of the dimension you are viewing. For example:
>>> x = np.array([1, 2, 3, 4])
array([1, 2, 3, 4])
>>> window = np.lib.stride_tricks.as_strided(
x, shape=(x.shape[0] - 2, 3),
strides=x.strides * 2)
[[1 2 3]
[2 3 4]]
I'm deliberately using np.lib.stride_tricks.as_strided rather than np.lib.stride_tricks.sliding_window_view here because it has a certain flexibility that you need.
You can have a stride that is larger than the axis you are viewing, as long as you are careful. Contiguous arrays are more forgiving in this case, but by no means a requirement. An example of this is np.diag. You can implement it something like this:
>>> x = np.arange(12).reshape(3, 4)
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
>>> diag = np.lib.stride_tricks.as_strided(
x, shape=(min(x.shape),),
strides=(sum(x.strides),))
array([ 0, 5, 10])
The trick is to make a view of only the parts of b you care about in a way that makes the assignment easy. Because of broadcasting rules, you will want the last two dimensions of the view to be a.shape, and the strides to be b.strides[:2], since that's where you want to place a.
The first two dimensions of the view will be responsible for making the copies of a. You want 25 copies, so the shape will be (5, 5). The strides are the trickier part. Let's take a look at a 2D case, just because that's easier to visualize, and then attempt to generalize:
>>> a0 = np.array([1, 2])
>>> b0 = np.zeros((4, 3), dtype=int)
>>> b0[0:2, 0] = b0[1:3, 1] = b0[2:4, 2] = a0
The goal is to make a view that strides along the diagonal of b0 in the first axis. So:
>>> np.lib.stride_tricks.as_strided(
b0, shape=(b0.shape[0] - a0.shape[0] + 1, a0.shape[0]),
strides=(sum(b0.strides), b0.strides[0]))[:] = a0
>>> b0
array([[1, 0, 0],
[2, 1, 0],
[0, 2, 1],
[0, 0, 2]])
So that's what you do for b, but adding up every second dimension:
a = np.array([[4, 2, 2],
[9, 0, 5],
[9, 9, 4]])
b = np.zeros((7, 7, 5, 5), dtype=int)
vshape = (*np.subtract(b.shape[:a.ndim], a.shape) + 1,
*a.shape)
vstrides = (*np.add(b.strides[:a.ndim], b.strides[a.ndim:]),
*b.strides[:a.ndim])
np.lib.stride_tricks.as_strided(b, shape=vshape, strides=vstrides)[:] = a
TL;DR
def emplace_window(a, b):
    """Write `a` into the diagonal sliding windows of `b`, in place.

    With ``k = a.ndim``, this performs for every window position ``p``
    (a k-tuple) the assignment ``b[p : p + a.shape, p] = a``; for 2-D `a`::

        b[p:p + a.shape[0], q:q + a.shape[1], p, q] = a

    Parameters
    ----------
    a : ndarray
        The block to place.
    b : ndarray
        Target array, modified in place. It must have ``2 * a.ndim``
        dimensions, with ``b.shape[k:] == b.shape[:k] - a.shape + 1``.

    Raises
    ------
    ValueError
        If the shapes of `a` and `b` do not satisfy the relation above.
        Without this check the strided view could address memory outside
        `b`.
    """
    k = a.ndim
    if b.ndim != 2 * k:
        raise ValueError(
            f"b must have {2 * k} dimensions for a {k}-D a, got {b.ndim}")
    # Number of window positions along each of the first k axes.
    counts = tuple(int(c) for c in np.subtract(b.shape[:k], a.shape) + 1)
    if tuple(b.shape[k:]) != counts:
        raise ValueError(
            f"b.shape[{k}:] must be {counts}, got {tuple(b.shape[k:])}")
    vshape = (*counts, *a.shape)
    # Stepping a window index moves along a leading axis and its paired
    # trailing axis at once; the last k axes walk inside the window.
    vstrides = (*(int(s) for s in np.add(b.strides[:k], b.strides[k:])),
                *b.strides[:k])
    np.lib.stride_tricks.as_strided(b, shape=vshape, strides=vstrides)[:] = a
I've phrased it this way because now you can apply it to any number of dimensions. The only expectations are that 2 * a.ndim == b.ndim and that b.shape[a.ndim:] == b.shape[:a.ndim] - a.shape + 1.

How to set the smallest k values of each row of Numpy array to 0?

I want to set the smallest k values in each row to 0, without using a for loop.
Here is my Code with for loop:
import numpy as np

k = 2
sims = np.array([[3,1,2,9],[5,9,1,7],[1,8,6,2], [1,5,8,9]])
# Row-by-row baseline: zero the k smallest entries of each row.
for i, row in enumerate(sims):
    indices_argsort = np.argsort(row)
    # The first k positions of the ascending argsort are the k smallest.
    # (Slicing with [:-k] would instead zero everything except the k
    # largest, which only coincides when a row has exactly 2*k entries.)
    sims[i, indices_argsort[:k]] = 0
print(sims)
The output is :
array([[3, 0, 0, 9],
[0, 9, 0, 7],
[0, 8, 6, 0],
[0, 0, 8, 9]])
In the output, the smallest k values are set to 0. But a for loop is slow when dealing with large matrices. So, is there a solution that does not use a for loop?
Use
mask = np.apply_along_axis(lambda x: x < np.partition(x, k)[k], 1, sims)
sims[mask] = 0
Output
array([[3, 0, 0, 9],
[0, 9, 0, 7],
[0, 8, 6, 0],
[0, 0, 8, 9]])
I have solved it with the broadcasting mechanism as follows:
import numpy as np

sims = np.array([[3,1,2,9],[5,9,1,7],[1,8,6,2], [1,5,8,9]])
k = 2
# Column order of every row, from smallest to largest value.
indices_argsort = np.argsort(sims, axis=-1)
# A column vector of row numbers broadcasts against the (rows, k) block of
# column indices, selecting the k smallest entries of every row at once.
# The row count comes from the array itself rather than a literal.
row_ids = np.arange(sims.shape[0])[:, None]
sims[row_ids, indices_argsort[:, :k]] = 0
print(sims)
output:
array([[3, 0, 0, 9],
[0, 9, 0, 7],
[0, 8, 6, 0],
[0, 0, 8, 9]])

Numpy resize and fill with specific value

How can i resize a numpy array and fill it with a specific value (if some dimension is extended) ?
I find a way to extend my array with np.pad but I can't shorten it:
>>> import numpy as np
>>> a = np.ndarray((5, 5), dtype=np.uint16)
>>> a
array([[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]], dtype=uint16)
>>> np.pad(a, ((0, 1), (0,3)), mode='constant', constant_values=9)
array([[0, 0, 0, 0, 0, 9, 9, 9],
[0, 0, 0, 0, 0, 9, 9, 9],
[0, 0, 0, 0, 0, 9, 9, 9],
[0, 0, 0, 0, 0, 9, 9, 9],
[0, 0, 0, 0, 0, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9]], dtype=uint16)
And if i use resize i can't specify the value that I want to use.
>>> a.fill(5)
>>> a.resize((2, 7))
>>> a
array([[5, 5, 5, 5, 5, 5, 5],
[5, 5, 5, 5, 5, 5, 5]], dtype=uint16)
But i would like
>>> a
array([[5, 5, 5, 5, 5, 9, 9],
[5, 5, 5, 5, 5, 9, 9]], dtype=uint16)
After some tests I created this function, but it only works when you change x_value or lower y_value; if you need to increase the y dimension it doesn't work. Why?
VALUE_TO_FILL = 9  # value written into cells that did not exist before


def resize(self, x_value, y_value):
    """Resize ``self.np_array`` to ``(x_value, y_value)``, padding with 9s.

    The overlapping top-left region keeps its values; rows or columns
    beyond the new shape are dropped, and cells that did not exist before
    are set to ``VALUE_TO_FILL``.

    ``self.np_array`` is rebound to a newly allocated array. An in-place
    ``ndarray.resize`` is not used because it reinterprets the flat data
    buffer, which moves values to other rows whenever the number of columns
    changes.

    Parameters
    ----------
    x_value : int
        New number of rows.
    y_value : int
        New number of columns.
    """
    # Imported locally because this snippet carries no import of its own.
    import numpy as np

    old = self.np_array
    resized = np.full((x_value, y_value), VALUE_TO_FILL, dtype=old.dtype)
    # Slices may not exceed either array, so copy only the common region.
    rows = min(old.shape[0], x_value)
    cols = min(old.shape[1], y_value)
    resized[:rows, :cols] = old[:rows, :cols]
    self.np_array = resized
Your array has a fixed size data buffer. You can reshape the array without changing that buffer. You can take a slice (view) without changing the buffer. But you can't add values to the array without changing the buffer.
In general resize returns a new array with a new data buffer.
pad is a complex function to handle general cases. But the simplest approach is to create the empty target array, fill it, and then copy the input into the right place.
Alternatively pad could create the fill arrays and concatenate them with the original. But concatenate also makes the empty return and copies.
A do it yourself pad with clipping could be structured as:
n,m = X.shape
R = np.empty((k,l))
R.fill(value)
<calc slices from n,m,k,l>
R[slice1] = X[slice2]
Calculating the slices may require if-else tests or equivalent min/max. You can probably work out those details.
This may be all that is needed
R[:X.shape[0],:X.shape[1]]=X[:R.shape[0],:R.shape[1]]
That's because there's no problem if a slice is larger than the dimension.
In [37]: np.arange(5)[:10]
Out[37]: array([0, 1, 2, 3, 4])
Thus, for example:
In [38]: X=np.ones((3,4),int)
In [39]: R=np.empty((2,5),int)
In [40]: R.fill(9)
In [41]: R[:X.shape[0],:X.shape[1]]=X[:R.shape[0],:R.shape[1]]
In [42]: R
Out[42]:
array([[1, 1, 1, 1, 9],
[1, 1, 1, 1, 9]])
To shorten it, you can use negative values in slice :
>>> import numpy as np
>>> a = np.ndarray((5, 5), dtype=np.uint16)
>>> a
array([[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]], dtype=uint16)
>>> b = a[0:-1,0:-3]
>>> b
array([[0, 0],
[0, 0],
[0, 0],
[0, 0]], dtype=uint16)

Categories

Resources