given permuted arrays, find permutation - python

I have two numpy integer arrays,
import numpy
a = numpy.array([1, 3, 5, 0])
b = numpy.array([3, 5, 0, 1])
which I know are permutations of each other. How can I find the permutation, i.e., the integer array i such that
a[i] == b
? Explicit for loops with comparisons across the entire arrays works, but seems inefficient.
Bonus points if it works of permutation of row-arrays like
import numpy
a = numpy.array([
[1, 2],
[3, 7],
[5, 12],
[0, 4],
# ...
])
b = numpy.array([
[3, 7],
[5, 12],
[0, 4],
[1, 2],
# ...
])

Here's one using argsort twice. It seems a few percent faster than #Divakar's:
from simple_benchmark import BenchmarkBuilder, MultiArgument
import numpy as np
B = BenchmarkBuilder()
#B.add_function()
def div(A,B):
sidx = A.argsort()
return sidx[np.searchsorted(A,B,sorter=sidx)]
#B.add_function()
def pp(A,B):
oa,ob = (x.argsort() for x in (A,B))
o = np.empty_like(oa)
o[ob] = oa
return o
#B.add_arguments('array size')
def argument_provider():
for exp in range(8, 30):
dim_size = int(1.4**exp)
a = np.random.permutation(dim_size)
b = np.random.permutation(dim_size)
yield dim_size, MultiArgument([a,b])
r = B.run()
r.plot()
import pylab
pylab.savefig('bm.png')

Here's one with np.searchsorted on 1D-views for 2D arrays -
# https://stackoverflow.com/a/45313353/ #Divakar
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
b = np.ascontiguousarray(b)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
A,B = view1D(a,b)
sidx = A.argsort()
idx = sidx[np.searchsorted(A,B,sorter=sidx)]
For 1D input arrays, we can directly feed in a and b as A and B respectively.
Alternatively, for positive integers, we can use dimensionality-reduction with grid-mapping, keeping rest of it same as earlier -
s = np.r_[1,a[:,:-1].max(0)+1]
A,B = a.dot(s),b.dot(s)
Or use np.ravel_multi_index to do the mapping on 2D-grid -
shp = a.max(0)+1
A,B = np.ravel_multi_index(a.T,shp),np.ravel_multi_index(b.T,shp)
Another for positive integers with sparse-matrix using the same grid-mapping technique -
from scipy.sparse import csr_matrix
R = np.arange(1,len(a)+1)
c = csr_matrix((R, a.T), shape=a.max(0)+1)
idx = c[b[:,0],b[:,1]].A1-1

Related

linear algebra (partial) solving in python

I have a dictionary with a tuple as key and a result as value.
I'm looking for a way to "solve" as many keys as possible, even if it's not possible to "solve" all of then.
input : {(A, C):1,(A, B, C): 1}
output : {(A, C):1, (A, B, C): 1, B:0}
in other word :
modified input :
1*A + 0*B + 1*C = 1
1*A + 1*B + 1*C = 1
output :
A = ?
B = 0
C = ?
I can only use numpy and scipy.
I tried this, but it must be square matrix :
import numpy as np
a = np.array([[1, 0, 1], [1, 1, 1]])
b = np.array([1, 1])
from scipy import linalg
x = linalg.solve(a, b)
print(x)
Do you have ideas where I should look at ?
this code does the trick, but it's not very 'clean'
import numpy as np
A=np.array([[1, 0, 1], [1, 1, 1]])
B=np.array([1,1])
s = solutionNonSquare = np.linalg.lstsq(A, B)[0]
for i,val in enumerate(s):
if val < 0.0001:
print('x[',i,'] = 0')
else:
print('x[',i,'] = ?')
print(s)
Thanks a lot for your smartness
Matrix a is Non square Matrix, so it does not have full Rank. However, we can compute pseudo inverse, using np.linalg.pinv and then we can compute x = np.matmul(a_psudo_inv, b)
import numpy as np
a = np.array([[1, 0, 1], [1, 1, 1]])
b = np.array([1, 1])
from numpy import linalg
a_psudo_inv = np.linalg.pinv(a)
print(a_psudo_inv)
print(a_psudo_inv.shape)
x = np.matmul(a_psudo_inv, b)
print(x)
Solution >> [5.00000000e-01 5.55111512e-16 5.00000000e-01]

Pythonic way of finding indexes of unique elements in two arrays

I have two sorted, numpy arrays similar to these ones:
x = np.array([1, 2, 8, 11, 15])
y = np.array([1, 8, 15, 17, 20, 21])
Elements never repeat in the same array. I want to figure out a way of pythonicaly figuring out a list of indexes that contain the locations in the arrays at which the same element exists.
For instance, 1 exists in x and y at index 0. Element 2 in x doesn't exist in y, so I don't care about that item. However, 8 does exist in both arrays - in index 2 in x but index 1 in y. Similarly, 15 exists in both, in index 4 in x, but index 2 in y. So the outcome of my function would be a list that in this case returns [[0, 0], [2, 1], [4, 2]].
So far what I'm doing is:
def get_indexes(x, y):
indexes = []
for i in range(len(x)):
# Find index where item x[i] is in y:
j = np.where(x[i] == y)[0]
# If it exists, save it:
if len(j) != 0:
indexes.append([i, j[0]])
return indexes
But the problem is that arrays x and y are very large (millions of items), so it takes quite a while. Is there a better pythonic way of doing this?
Without Python loops
Code
def get_indexes_darrylg(x, y):
' darrylg answer '
# Use intersect to find common elements between two arrays
overlap = np.intersect1d(x, y)
# Indexes of common elements in each array
loc1 = np.searchsorted(x, overlap)
loc2 = np.searchsorted(y, overlap)
# Result is the zip two 1d numpy arrays into 2d array
return np.dstack((loc1, loc2))[0]
Usage
x = np.array([1, 2, 8, 11, 15])
y = np.array([1, 8, 15, 17, 20, 21])
result = get_indexes_darrylg(x, y)
# result[0]: array([[0, 0],
[2, 1],
[4, 2]], dtype=int64)
Timing Posted Solutions
Results show that darrlg code has the fastest run time.
Code Adjustment
Each posted solution as a function.
Slight mod so that each solution outputs an numpy array.
Curve named after poster
Code
import numpy as np
import perfplot
def create_arr(n):
' Creates pair of 1d numpy arrays with half the elements equal '
max_val = 100000 # One more than largest value in output arrays
arr1 = np.random.randint(0, max_val, (n,))
arr2 = arr1.copy()
# Change half the elements in arr2
all_indexes = np.arange(0, n, dtype=int)
indexes = np.random.choice(all_indexes, size = n//2, replace = False) # locations to make changes
np.put(arr2, indexes, np.random.randint(0, max_val, (n//2, ))) # assign new random values at change locations
arr1 = np.sort(arr1)
arr2 = np.sort(arr2)
return (arr1, arr2)
def get_indexes_lllrnr101(x,y):
' lllrnr101 answer '
ans = []
i=0
j=0
while (i<len(x) and j<len(y)):
if x[i] == y[j]:
ans.append([i,j])
i += 1
j += 1
elif (x[i]<y[j]):
i += 1
else:
j += 1
return np.array(ans)
def get_indexes_joostblack(x, y):
'joostblack'
indexes = []
for idx,val in enumerate(x):
idy = np.searchsorted(y,val)
try:
if y[idy]==val:
indexes.append([idx,idy])
except IndexError:
continue # ignore index errors
return np.array(indexes)
def get_indexes_mustafa(x, y):
indices_in_x = np.flatnonzero(np.isin(x, y)) # array([0, 2, 4])
indices_in_y = np.flatnonzero(np.isin(y, x[indices_in_x])) # array([0, 1, 2]
return np.array(list(zip(indices_in_x, indices_in_y)))
def get_indexes_darrylg(x, y):
' darrylg answer '
# Use intersect to find common elements between two arrays
overlap = np.intersect1d(x, y)
# Indexes of common elements in each array
loc1 = np.searchsorted(x, overlap)
loc2 = np.searchsorted(y, overlap)
# Result is the zip two 1d numpy arrays into 2d array
return np.dstack((loc1, loc2))[0]
def get_indexes_akopcz(x, y):
' akopcz answer '
return np.array([
[i, j]
for i, nr in enumerate(x)
for j in np.where(nr == y)[0]
])
perfplot.show(
setup = create_arr, # tuple of two 1D random arrays
kernels=[
lambda a: get_indexes_lllrnr101(*a),
lambda a: get_indexes_joostblack(*a),
lambda a: get_indexes_mustafa(*a),
lambda a: get_indexes_darrylg(*a),
lambda a: get_indexes_akopcz(*a),
],
labels=["lllrnr101", "joostblack", "mustafa", "darrylg", "akopcz"],
n_range=[2 ** k for k in range(5, 21)],
xlabel="Array Length",
# More optional arguments with their default values:
# logx="auto", # set to True or False to force scaling
# logy="auto",
equality_check=None, #np.allclose, # set to None to disable "correctness" assertion
# show_progress=True,
# target_time_per_measurement=1.0,
# time_unit="s", # set to one of ("auto", "s", "ms", "us", or "ns") to force plot units
# relative_to=1, # plot the timings relative to one of the measurements
# flops=lambda n: 3*n, # FLOPS plots
)
What you are doing is O(nlogn) which is decent enough.
If you want, you can do it in O(n) by iterating on both arrays with two pointers and since they are sorted, increase the pointer for the array with smaller object.
See below:
x = [1, 2, 8, 11, 15]
y = [1, 8, 15, 17, 20, 21]
def get_indexes(x,y):
ans = []
i=0
j=0
while (i<len(x) and j<len(y)):
if x[i] == y[j]:
ans.append([i,j])
i += 1
j += 1
elif (x[i]<y[j]):
i += 1
else:
j += 1
return ans
print(get_indexes(x,y))
which gives me:
[[0, 0], [2, 1], [4, 2]]
Although, this function will search for all the occurances of x[i] in the y array, if duplicates are not allowed in y it will find x[i] exactly once.
def get_indexes(x, y):
return [
[i, j]
for i, nr in enumerate(x)
for j in np.where(nr == y)[0]
]
You can use numpy.searchsorted:
def get_indexes(x, y):
indexes = []
for idx,val in enumerate(x):
idy = np.searchsorted(y,val)
if y[idy]==val:
indexes.append([idx,idy])
return indexes
One solution is to first look from x's side to see what values are included in y by getting their indices through np.isin and np.flatnonzero, and then use the same procedure from the other side; but instead of giving x entirely, we give only the (already found) intersected elements to gain time:
indices_in_x = np.flatnonzero(np.isin(x, y)) # array([0, 2, 4])
indices_in_y = np.flatnonzero(np.isin(y, x[indices_in_x])) # array([0, 1, 2])
Now you can zip them to get the result:
result = list(zip(indices_in_x, indices_in_y)) # [(0, 0), (2, 1), (4, 2)]

Aggregate elements based on position vector

I'm trying to vectorize a very simple operation but can't seem to figure out how.
Given a very large numerical vector (over 1M positions) and another array of size n with a given set of positions, I would like to get back a vector of size n with elements being the average of the values of the first vector as specified by the second
a = np.array([1,2,3,4,5,6,7])
b = np.array([[0,1],[2],[3,5],[4,6]])
c = [1.5,3,5,6]
I need to repeat this operation many times so performance is an issue.
Vanilla python solution:
import numpy as np
import time
a = np.array([1,2,3,4,5,6,7])
b = np.array([[0,1],[2],[3,5],[4,6]])
begin = time.time()
for i in range(100000):
c = []
for d in b:
c.append(np.mean(a[d]))
print(time.time() - begin, c)
# 3.7529971599578857 [1.5, 3.0, 5.0, 6.0]
I'm not sure if this is necessarily faster but you may as well try:
import numpy as np
a = np.array([1, 2, 3, 4, 5, 6, 7])
b = np.array([[0, 1], [2], [3, 5], [4, 6]])
# Get the length of each subset of indices
lens = np.fromiter((len(bi) for bi in b), count=len(b), dtype=np.int32)
# Compute reduction indices
reduce_idx = np.roll(np.cumsum(lens), 1)
reduce_idx[0] = 0
# Make flattened array of index lists
idx = np.fromiter((i for bi in b for i in bi), count=lens.sum(), dtype=np.int32)
# Reorder according to indices
a2 = a[idx]
# Sum reordered array at reduction indices and divide by number of indices
c = np.add.reduceat(a2, reduce_idx) / lens
print(c)
# [1.5 3. 5. 6. ]

Stacking Numpy arrays of different length using padding

a = np.array([1,2,3])
b = np.array([4,5])
l = [a,b]
I want a function stack_padding such that:
assert(stack_padding(l) == np.array([[1,2,3],[4,5,0]])
Is there a standard way in numpy of achieving
EDIT: l could have potentially many more elements
I think itertools.zip_longest with fill_value=0 can work for you:
import itertools
a = np.array([1,2,3])
b = np.array([4,5])
l = [a,b]
def stack_padding(l):
return np.column_stack((itertools.zip_longest(*l, fillvalue=0)))
>>> stack_padding(l)
array([[1, 2, 3],
[4, 5, 0]])
With numpy.pad:
a = np.array([1,2,3])
b = np.array([4,5])
l = [a,b]
max_len = max([len(arr) for arr in l])
padded = np.array([np.lib.pad(arr, (0, max_len - len(arr)), 'constant', constant_values=0) for arr in l])
If you don't want to use itertools and column_stack, numpy.ndarray.resize will also do the job perfectly. As mentioned by jtweeder, you just need to know to resulting size of each rows. The advantage to use resize is that numpy.ndarray is contiguous in memory. Resizing is faster when each row differs alot in size. The performance difference is observable between the two approaches.
import numpy as np
import timeit
import itertools
def stack_padding(it):
def resize(row, size):
new = np.array(row)
new.resize(size)
return new
# find longest row length
row_length = max(it, key=len).__len__()
mat = np.array( [resize(row, row_length) for row in it] )
return mat
def stack_padding1(l):
return np.column_stack((itertools.zip_longest(*l, fillvalue=0)))
if __name__ == "__main__":
n_rows = 200
row_lengths = np.random.randint(30, 50, size=n_rows)
mat = [np.random.randint(0, 100, size=s) for s in row_lengths]
def test_stack_padding():
global mat
stack_padding(mat)
def test_itertools():
global mat
stack_padding1(mat)
t1 = timeit.timeit(test_stack_padding, number=1000)
t2 = timeit.timeit(test_itertools, number=1000)
print('With ndarray.resize: ', t1)
print('With itertool and vstack: ', t2)
The resize method wins in the above comparison:
>>> With ndarray.resize: 0.30080295499647036
>>> With itertool and vstack: 1.0151802329928614

Vectorized groupby with NumPy

Pandas has a widely-used groupby facility to split up a DataFrame based on a corresponding mapping, from which you can apply a calculation on each subgroup and recombine the results.
Can this be done flexibly in NumPy without a native Python for-loop? With a Python loop, this would look like:
>>> import numpy as np
>>> X = np.arange(10).reshape(5, 2)
>>> groups = np.array([0, 0, 0, 1, 1])
# Split up elements (rows) of `X` based on their element wise group
>>> np.array([X[groups==i].sum() for i in np.unique(groups)])
array([15, 30])
Above 15 is the sum of the first three rows of X, and 30 is the sum of the remaining two.
By "flexibly,” I just mean that we aren't focusing on one particular computation such as sum, count, maximum, etc, but rather passing any computation to the grouped arrays.
If not, is there a faster approach than the above?
How about using scipy sparse matrix
import numpy as np
from scipy import sparse
import time
x_len = 500000
g_len = 100
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
# original
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
# using scipy sparse matrix
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
(
x_sum,
(groups, np.arange(len(x_sum)))
),
shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
result on my PC
0.15915322303771973
0.012875080108642578
0
More than 10 times faster.
Update!
Let's benchmark answers of #Paul Panzer and #Daniel F. It is summation only benchmark.
import numpy as np
from scipy import sparse
import time
# by #Daniel F
def groupby_np(X, groups, axis = 0, uf = np.add, out = None, minlength = 0, identity = None):
if minlength < groups.max() + 1:
minlength = groups.max() + 1
if identity is None:
identity = uf.identity
i = list(range(X.ndim))
del i[axis]
i = tuple(i)
n = out is None
if n:
if identity is None: # fallback to loops over 0-index for identity
assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassinged groups"
s = [slice(None)] * X.ndim
for i_ in i:
s[i_] = 0
out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
else:
out = np.full((minlength,), identity, dtype = X.dtype)
uf.at(out, groups, uf.reduce(X, i))
if n:
return out
x_len = 500000
g_len = 200
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
print("original")
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
print("use scipy coo matrix")
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
(
x_sum,
(groups, np.arange(len(x_sum)))
),
shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
print("use scipy csr matrix #Daniel F")
s = time.time()
x_sum = X.sum(axis=1)
c = np.array(sparse.csr_matrix(
(
x_sum,
groups,
np.arange(len(groups)+1)
),
shape=(len(groups), g_len)
).sum(axis=0)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-c)).sum())
print("use bincount #Paul Panzer #Daniel F")
s = time.time()
d = np.bincount(groups, X.sum(axis=1), g_len)
print(time.time() - s)
#compare
print(np.abs((a-d)).sum())
print("use ufunc #Daniel F")
s = time.time()
e = groupby_np(X, groups)
print(time.time() - s)
#compare
print(np.abs((a-e)).sum())
STDOUT
original
0.2882847785949707
use scipy coo matrix
0.012301445007324219
0
use scipy csr matrix #Daniel F
0.01046299934387207
0
use bincount #Paul Panzer #Daniel F
0.007468223571777344
0.0
use ufunc #Daniel F
0.04431319236755371
0
The winner is the bincount solution. But the csr matrix solution is also very interesting.
#klim's sparse matrix solution would at first sight appear to be tied to summation. We can, however, use it in the general case by converting between the csr and csc formats:
Let's look at a small example:
>>> m, n = 3, 8
>>> idx = np.random.randint(0, m, (n,))
>>> data = np.arange(n)
>>>
>>> M = sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m))
>>>
>>> idx
array([0, 2, 2, 1, 1, 2, 2, 0])
>>>
>>> M = M.tocsc()
>>>
>>> M.indptr, M.indices
(array([0, 2, 4, 8], dtype=int32), array([0, 7, 3, 4, 1, 2, 5, 6], dtype=int32))
As we can see after conversion the internal representation of the sparse matrix yields the indices grouped and sorted:
>>> groups = np.split(M.indices, M.indptr[1:-1])
>>> groups
[array([0, 7], dtype=int32), array([3, 4], dtype=int32), array([1, 2, 5, 6], dtype=int32)]
>>>
We could have obtained the same using a stable argsort:
>>> np.argsort(idx, kind='mergesort')
array([0, 7, 3, 4, 1, 2, 5, 6])
>>>
But sparse matrices are actually faster, even when we allow argsort to use a faster non-stable algorithm:
>>> m, n = 1000, 100000
>>> idx = np.random.randint(0, m, (n,))
>>> data = np.arange(n)
>>>
>>> timeit('sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m)).tocsc()', **kwds)
2.250748165184632
>>> timeit('np.argsort(idx)', **kwds)
5.783584725111723
If we require argsort to keep groups sorted, the difference is even larger:
>>> timeit('np.argsort(idx, kind="mergesort")', **kwds)
10.507467685034499
If you want a more flexible implementation of groupby that can group using any of numpy's ufuncs:
def groupby_np(X, groups, axis = 0, uf = np.add, out = None, minlength = 0, identity = None):
if minlength < groups.max() + 1:
minlength = groups.max() + 1
if identity is None:
identity = uf.identity
i = list(range(X.ndim))
del i[axis]
i = tuple(i)
n = out is None
if n:
if identity is None: # fallback to loops over 0-index for identity
assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassinged groups"
s = [slice(None)] * X.ndim
for i_ in i:
s[i_] = 0
out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
else:
out = np.full((minlength,), identity, dtype = X.dtype)
uf.at(out, groups, uf.reduce(X, i))
if n:
return out
groupby_np(X, groups)
array([15, 30])
groupby_np(X, groups, uf = np.multiply)
array([ 0, 3024])
groupby_np(X, groups, uf = np.maximum)
array([5, 9])
groupby_np(X, groups, uf = np.minimum)
array([0, 6])
There's probably a faster way than this (both of the operands are making copies right now), but:
np.bincount(np.broadcast_to(groups, X.T.shape).ravel(), X.T.ravel())
array([ 15., 30.])
If you want to extend the answer to a ndarray, and still have a fast computation, you could extend the Daniel's solution :
x_len = 500000
g_len = 200
y_len = 2
X = np.arange(x_len * y_len).reshape(x_len, y_len)
groups = np.random.randint(0, g_len, x_len)
# original
a = np.array([X[groups==i].sum(axis=0) for i in np.unique(groups)])
# alternative
bins = [0] + list(np.bincount(groups, minlength=g_len).cumsum())
Z = np.argsort(groups)
d = np.array([X.take(Z[bins[i]:bins[i+1]],0).sum(axis=0) for i in range(g_len)])
It took about 30 ms (15ms for creating bins + 15ms for summing) instead of 280 ms on the original way in this example.
d.shape
>>> (1000, 2)

Categories

Resources