Vectorized groupby with NumPy

Pandas has a widely-used groupby facility to split up a DataFrame based on a corresponding mapping, from which you can apply a calculation on each subgroup and recombine the results.
Can this be done flexibly in NumPy without a native Python for-loop? With a Python loop, this would look like:
>>> import numpy as np
>>> X = np.arange(10).reshape(5, 2)
>>> groups = np.array([0, 0, 0, 1, 1])
# Split up the elements (rows) of `X` based on their element-wise group
>>> np.array([X[groups==i].sum() for i in np.unique(groups)])
array([15, 30])
Above, 15 is the sum of the first three rows of X, and 30 is the sum of the remaining two.
By "flexibly," I just mean that we aren't focusing on one particular computation such as sum, count, maximum, etc., but rather on passing any computation to the grouped arrays.
If not, is there a faster approach than the above?

How about using a scipy sparse matrix?
import numpy as np
from scipy import sparse
import time
x_len = 500000
g_len = 100
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
# original
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
# using scipy sparse matrix
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
    (
        x_sum,
        (groups, np.arange(len(x_sum)))
    ),
    shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
Result on my PC:
0.15915322303771973
0.012875080108642578
0
More than 10 times faster.
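For intuition, here is the same trick on the toy data from the question (a minimal sketch, separate from the benchmark above): the coo matrix places each row-sum of X into the row given by its group label, so summing the sparse matrix along axis 1 yields the per-group totals.
import numpy as np
from scipy import sparse

X = np.arange(10).reshape(5, 2)
groups = np.array([0, 0, 0, 1, 1])

x_sum = X.sum(axis=1)                       # per-row sums: [1, 5, 9, 13, 17]
m = sparse.coo_matrix((x_sum, (groups, np.arange(len(x_sum)))),
                      shape=(groups.max() + 1, len(x_sum)))
print(np.asarray(m.sum(axis=1)).ravel())    # [15 30], matching the question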
Update!
Let's benchmark the answers of @Paul Panzer and @Daniel F. It is a summation-only benchmark.
import numpy as np
from scipy import sparse
import time
# by @Daniel F
def groupby_np(X, groups, axis=0, uf=np.add, out=None, minlength=0, identity=None):
    if minlength < groups.max() + 1:
        minlength = groups.max() + 1
    if identity is None:
        identity = uf.identity
    i = list(range(X.ndim))
    del i[axis]
    i = tuple(i)
    n = out is None
    if n:
        if identity is None:  # fall back to a loop over the 0-index when the ufunc has no identity
            assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassigned groups"
            s = [slice(None)] * X.ndim
            for i_ in i:
                s[i_] = 0
            out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
        else:
            out = np.full((minlength,), identity, dtype=X.dtype)
    uf.at(out, groups, uf.reduce(X, i))
    if n:
        return out
x_len = 500000
g_len = 200
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
print("original")
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
print("use scipy coo matrix")
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
    (
        x_sum,
        (groups, np.arange(len(x_sum)))
    ),
    shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
print("use scipy csr matrix #Daniel F")
s = time.time()
x_sum = X.sum(axis=1)
c = np.array(sparse.csr_matrix(
(
x_sum,
groups,
np.arange(len(groups)+1)
),
shape=(len(groups), g_len)
).sum(axis=0)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-c)).sum())
print("use bincount #Paul Panzer #Daniel F")
s = time.time()
d = np.bincount(groups, X.sum(axis=1), g_len)
print(time.time() - s)
#compare
print(np.abs((a-d)).sum())
print("use ufunc #Daniel F")
s = time.time()
e = groupby_np(X, groups)
print(time.time() - s)
#compare
print(np.abs((a-e)).sum())
STDOUT
original
0.2882847785949707
use scipy coo matrix
0.012301445007324219
0
use scipy csr matrix @Daniel F
0.01046299934387207
0
use bincount @Paul Panzer @Daniel F
0.007468223571777344
0.0
use ufunc @Daniel F
0.04431319236755371
0
The winner is the bincount solution. But the csr matrix solution is also very interesting.
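For reference, the core of the winning approach is a single bincount call; on the question's toy data it looks like this (a minimal sketch: bincount adds each row's total into the bin selected by that row's group label).
import numpy as np

X = np.arange(10).reshape(5, 2)
groups = np.array([0, 0, 0, 1, 1])

print(np.bincount(groups, weights=X.sum(axis=1)))   # [15. 30.]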

@klim's sparse matrix solution would at first sight appear to be tied to summation. We can, however, use it in the general case by converting between the csr and csc formats:
Let's look at a small example:
>>> m, n = 3, 8
>>> idx = np.random.randint(0, m, (n,))
>>> data = np.arange(n)
>>>
>>> M = sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m))
>>>
>>> idx
array([0, 2, 2, 1, 1, 2, 2, 0])
>>>
>>> M = M.tocsc()
>>>
>>> M.indptr, M.indices
(array([0, 2, 4, 8], dtype=int32), array([0, 7, 3, 4, 1, 2, 5, 6], dtype=int32))
As we can see after conversion the internal representation of the sparse matrix yields the indices grouped and sorted:
>>> groups = np.split(M.indices, M.indptr[1:-1])
>>> groups
[array([0, 7], dtype=int32), array([3, 4], dtype=int32), array([1, 2, 5, 6], dtype=int32)]
>>>
We could have obtained the same using a stable argsort:
>>> np.argsort(idx, kind='mergesort')
array([0, 7, 3, 4, 1, 2, 5, 6])
>>>
But sparse matrices are actually faster, even when we allow argsort to use a faster non-stable algorithm:
>>> m, n = 1000, 100000
>>> idx = np.random.randint(0, m, (n,))
>>> data = np.arange(n)
>>>
>>> timeit('sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m)).tocsc()', **kwds)
2.250748165184632
>>> timeit('np.argsort(idx)', **kwds)
5.783584725111723
If we require argsort to keep groups sorted, the difference is even larger:
>>> timeit('np.argsort(idx, kind="mergesort")', **kwds)
10.507467685034499
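To make the "general case" concrete, here is a sketch (my own illustration of the idea above, reusing the small example's idx) of how the grouped indices obtained from the csc conversion can drive an arbitrary per-group computation; the remaining Python loop runs only over the m groups, not over the n elements.
import numpy as np
from scipy import sparse

m, n = 3, 8
idx = np.array([0, 2, 2, 1, 1, 2, 2, 0])   # idx from the small example above
data = np.arange(n, dtype=float)

M = sparse.csr_matrix((data, idx, np.arange(n + 1)), (n, m)).tocsc()
groups = np.split(M.indices, M.indptr[1:-1])   # per-group element positions

# any per-group computation, e.g. the median of each group's data
medians = np.array([np.median(data[g]) for g in groups])
print(medians)   # [3.5 3.5 3.5] for this particular idx/data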

If you want a more flexible implementation of groupby that can group using any of numpy's ufuncs:
def groupby_np(X, groups, axis=0, uf=np.add, out=None, minlength=0, identity=None):
    if minlength < groups.max() + 1:
        minlength = groups.max() + 1
    if identity is None:
        identity = uf.identity
    i = list(range(X.ndim))
    del i[axis]
    i = tuple(i)
    n = out is None
    if n:
        if identity is None:  # fall back to a loop over the 0-index when the ufunc has no identity
            assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassigned groups"
            s = [slice(None)] * X.ndim
            for i_ in i:
                s[i_] = 0
            out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
        else:
            out = np.full((minlength,), identity, dtype=X.dtype)
    uf.at(out, groups, uf.reduce(X, i))
    if n:
        return out
groupby_np(X, groups)
array([15, 30])
groupby_np(X, groups, uf = np.multiply)
array([ 0, 3024])
groupby_np(X, groups, uf = np.maximum)
array([5, 9])
groupby_np(X, groups, uf = np.minimum)
array([0, 6])

There's probably a faster way than this (both of the operands are making copies right now), but:
np.bincount(np.broadcast_to(groups, X.T.shape).ravel(), X.T.ravel())
array([ 15., 30.])
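One alternative sketch (my own suggestion, not benchmarked here): call bincount once per column and combine. This avoids the explicit broadcast/ravel copies above and also keeps the per-column group sums around if you need them; bincount may still convert each column internally.
import numpy as np

X = np.arange(10).reshape(5, 2)
groups = np.array([0, 0, 0, 1, 1])

# one bincount per column, no broadcast copy of `groups`
per_col = np.stack([np.bincount(groups, weights=X[:, j])
                    for j in range(X.shape[1])], axis=1)
print(per_col)               # per-group, per-column sums: [[ 6.  9.] [14. 16.]]
print(per_col.sum(axis=1))   # [15. 30.], same totals as above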

If you want to extend the answer to an ndarray and still have a fast computation, you could extend Daniel's solution:
x_len = 500000
g_len = 200
y_len = 2
X = np.arange(x_len * y_len).reshape(x_len, y_len)
groups = np.random.randint(0, g_len, x_len)
# original
a = np.array([X[groups==i].sum(axis=0) for i in np.unique(groups)])
# alternative
bins = [0] + list(np.bincount(groups, minlength=g_len).cumsum())
Z = np.argsort(groups)
d = np.array([X.take(Z[bins[i]:bins[i+1]],0).sum(axis=0) for i in range(g_len)])
It took about 30 ms (15 ms for creating the bins + 15 ms for summing) instead of about 280 ms for the original way in this example.
>>> d.shape
(200, 2)
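Another sketch for the ndarray case (my addition), reusing the ufunc.at idea from @Daniel F's answer above. It produces the (g_len, y_len) result directly, although the benchmark earlier suggests ufunc.at is noticeably slower than the bincount/argsort routes:
# assumes X, groups, g_len, y_len as defined above
e = np.zeros((g_len, y_len), dtype=X.dtype)
np.add.at(e, groups, X)   # unbuffered e[groups] += X, row by row; e matches a and d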

Related

Numpy sum of minimums of two arrays like dot product efficiently

I would like to get two arrays' sum of minimums efficiently with numpy. For example:
X=np.array([[1,2,3],[1,2,0]])
Y=np.array([[0,2,0],[1,3,1]])
My result should be:
result = array([[2, 4],[2, 3]])
The calculation for the first cell:
result[0,0] = min(X[0,0],Y[0,0])+ min(X[0,1],Y[0,1])+min(X[0,2],Y[0,2])
In general, the result should be:
res[i,j] = sum(np.minimum(X[i, :], Y[j, :]))
but looking for fastest way.
dot is the equivalent of taking outer products, and summing on the appropriate axis.
The equivalent in your case is:
In [291]: np.minimum(X[:,None,:], Y[None,:,:])
Out[291]:
array([[[0, 2, 0],
        [1, 2, 1]],

       [[0, 2, 0],
        [1, 2, 0]]])
In [292]: np.sum(np.minimum(X[:,None,:], Y[None,:,:]),axis=-1)
Out[292]:
array([[2, 4],
       [2, 3]])
Best I could do:
import numpy as np
def sum_mins(x, y):
    mask = (x - y) < 0
    return np.sum(x*mask + y*np.logical_not(mask))

X = np.array([1,2,3])
Y = np.array([0,2,0])
print(sum_mins(X, Y))
One naive approach close to definition:
result = np.array([[np.sum(np.minimum(v_x, v_y)) for v_y in Y] for v_x in X])
A combination of hpaulj's and my former answer (deleted) that works in case you run out of memory otherwise:
import numpy as np

# maximum number of float32s in memory - determining a max. chunk size
MAX_CHUNK_MEM_SIZE = 1000 * 1024 * 1024 / 4

def _fast_small(x, y):
    """Process a case with small size of x and y."""
    # see answer of @hpaulj
    return np.sum(np.minimum(x[:, None, :], y[None, :, :]), axis=-1)

def fast(x, y):
    """Process a case with potentially large size of x and y."""
    assert len(x.shape) == len(y.shape) == 2
    assert x.shape[1] == y.shape[1]
    # chunks needed so the (len(x_block), len(y_block), dim) intermediate fits in memory
    num_chunks = int(np.ceil(x.shape[0] * y.shape[0] * x.shape[1] / MAX_CHUNK_MEM_SIZE))
    result_blocks = []
    for x_block in np.array_split(x, num_chunks):
        result_blocks_row = []
        for y_block in np.array_split(y, num_chunks):
            result_blocks_row.append(_fast_small(x_block, y_block))
        result_blocks.append(result_blocks_row)
    return np.block(result_blocks)
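A quick sanity check on the question's small example (just a usage sketch; with arrays this small everything lands in a single chunk):
X = np.array([[1, 2, 3], [1, 2, 0]])
Y = np.array([[0, 2, 0], [1, 3, 1]])

print(fast(X, Y))
# [[2 4]
#  [2 3]]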

How to stretch specific items of numpy array with decrement?

Given a boundary value k, is there a vectorized way to replace each number n with consecutive descending numbers from n-1 down to k? For example, if k is 0 then I'd like to replace np.array([3,4,2,2,1,3,1]) with np.array([2,1,0,3,2,1,0,1,0,1,0,0,2,1,0,0]). Every item of the input array is greater than k.
I have tried a combination of np.repeat and np.cumsum but it seems like an evasive solution:
x = np.array([3,4,2,2,1,3,1])
y = np.repeat(x, x)
t = -np.ones(y.shape[0])
t[np.r_[0, np.cumsum(x)[:-1]]] = x-1
np.cumsum(t)
Is there any other way? I expect something like an inverse of np.add.reduceat that is able to broadcast integers into decreasing sequences instead of reducing them.
Here's another way with array-assignment to skip the repeat part -
def func1(a):
    l = a.sum()
    out = np.full(l, -1, dtype=int)
    out[0] = a[0]-1
    idx = a.cumsum()[:-1]
    out[idx] = a[1:]-1
    return out.cumsum()
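A quick check against the expected output from the question (with k fixed at 0; numpy imported as np, as elsewhere in these answers):
x = np.array([3, 4, 2, 2, 1, 3, 1])
print(func1(x))
# [2 1 0 3 2 1 0 1 0 1 0 0 2 1 0 0]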
Benchmarking
# OP's soln
def OP(x):
    y = np.repeat(x, x)
    t = -np.ones(y.shape[0], dtype=int)
    t[np.r_[0, np.cumsum(x)[:-1]]] = x-1
    return np.cumsum(t)
Using the benchit package (a few benchmarking tools packaged together; disclaimer: I am its author) to benchmark the proposed solutions.
import benchit
a = np.array([3,4,2,2,1,3,1])
in_ = [np.resize(a,n) for n in [10, 100, 1000, 10000]]
funcs = [OP, func1]
t = benchit.timings(funcs, in_)
t.plot(logx=True, save='timings.png')
Extend to take k as arg
def func1(a, k):
    l = a.sum() + len(a)*(-k)
    out = np.full(l, -1, dtype=int)
    out[0] = a[0]-1
    idx = (a-k).cumsum()[:-1]
    out[idx] = a[1:]-1-k
    return out.cumsum()
Sample run -
In [120]: a
Out[120]: array([3, 4, 2, 2, 1, 3, 1])
In [121]: func1(a, k=-1)
Out[121]:
array([ 2,  1,  0, -1,  3,  2,  1,  0, -1,  1,  0, -1,  1,  0, -1,  0, -1,
        2,  1,  0, -1,  0, -1])
This is concise and probably OK for efficiency; I don't think apply is vectorized here, so you will be limited mostly by the number of elements in the original array (less so by their values, is my guess):
import pandas as pd
x = np.array([3,4,2,2,1,3,1])
values = pd.Series(x).apply(lambda val: np.arange(val-1,-1,-1)).values
output = np.concatenate(values)

given permuted arrays, find permutation

I have two numpy integer arrays,
import numpy
a = numpy.array([1, 3, 5, 0])
b = numpy.array([3, 5, 0, 1])
which I know are permutations of each other. How can I find the permutation, i.e., the integer array i such that
a[i] == b
? Explicit for loops with comparisons across the entire arrays work, but seem inefficient.
Bonus points if it works for permutations of row-arrays like
import numpy
a = numpy.array([
    [1, 2],
    [3, 7],
    [5, 12],
    [0, 4],
    # ...
    ])
b = numpy.array([
    [3, 7],
    [5, 12],
    [0, 4],
    [1, 2],
    # ...
    ])
Here's one using argsort twice. It seems a few percent faster than @Divakar's:
from simple_benchmark import BenchmarkBuilder, MultiArgument
import numpy as np

B = BenchmarkBuilder()

@B.add_function()
def div(A, B):
    sidx = A.argsort()
    return sidx[np.searchsorted(A, B, sorter=sidx)]

@B.add_function()
def pp(A, B):
    oa, ob = (x.argsort() for x in (A, B))
    o = np.empty_like(oa)
    o[ob] = oa
    return o

@B.add_arguments('array size')
def argument_provider():
    for exp in range(8, 30):
        dim_size = int(1.4**exp)
        a = np.random.permutation(dim_size)
        b = np.random.permutation(dim_size)
        yield dim_size, MultiArgument([a, b])

r = B.run()
r.plot()

import pylab
pylab.savefig('bm.png')
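On the question's small example, both functions recover the same permutation (a usage sketch):
a = np.array([1, 3, 5, 0])
b = np.array([3, 5, 0, 1])

i = pp(a, b)
print(i, np.array_equal(a[i], b))   # [1 2 3 0] True
print(div(a, b))                    # [1 2 3 0]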
Here's one with np.searchsorted on 1D-views for 2D arrays -
# https://stackoverflow.com/a/45313353/ @Divakar
def view1D(a, b):  # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(), b.view(void_dt).ravel()

A,B = view1D(a,b)
sidx = A.argsort()
idx = sidx[np.searchsorted(A,B,sorter=sidx)]
For 1D input arrays, we can directly feed in a and b as A and B respectively.
Alternatively, for positive integers, we can use dimensionality-reduction with grid-mapping, keeping rest of it same as earlier -
s = np.r_[1,a[:,:-1].max(0)+1]
A,B = a.dot(s),b.dot(s)
Or use np.ravel_multi_index to do the mapping on 2D-grid -
shp = a.max(0)+1
A,B = np.ravel_multi_index(a.T,shp),np.ravel_multi_index(b.T,shp)
Another for positive integers with sparse-matrix using the same grid-mapping technique -
from scipy.sparse import csr_matrix
R = np.arange(1,len(a)+1)
c = csr_matrix((R, a.T), shape=a.max(0)+1)
idx = c[b[:,0],b[:,1]].A1-1

Stacking Numpy arrays of different length using padding

a = np.array([1,2,3])
b = np.array([4,5])
l = [a,b]
I want a function stack_padding such that:
assert(stack_padding(l) == np.array([[1,2,3],[4,5,0]]))
Is there a standard way in numpy of achieving this?
EDIT: l could have potentially many more elements
I think itertools.zip_longest with fillvalue=0 can work for you:
import itertools
a = np.array([1,2,3])
b = np.array([4,5])
l = [a,b]
def stack_padding(l):
    return np.column_stack((itertools.zip_longest(*l, fillvalue=0)))
>>> stack_padding(l)
array([[1, 2, 3],
[4, 5, 0]])
With numpy.pad:
a = np.array([1,2,3])
b = np.array([4,5])
l = [a,b]
max_len = max([len(arr) for arr in l])
padded = np.array([np.lib.pad(arr, (0, max_len - len(arr)), 'constant', constant_values=0) for arr in l])
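For the l above this reproduces the padded array from the question:
>>> padded
array([[1, 2, 3],
       [4, 5, 0]])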
If you don't want to use itertools and column_stack, numpy.ndarray.resize will also do the job perfectly. As mentioned by jtweeder, you just need to know the resulting size of each row. The advantage of using resize is that numpy.ndarray is contiguous in memory, and resizing is faster when the rows differ a lot in size. The performance difference between the two approaches is observable.
import numpy as np
import timeit
import itertools

def stack_padding(it):

    def resize(row, size):
        new = np.array(row)
        new.resize(size)
        return new

    # find longest row length
    row_length = max(it, key=len).__len__()
    mat = np.array([resize(row, row_length) for row in it])
    return mat

def stack_padding1(l):
    return np.column_stack((itertools.zip_longest(*l, fillvalue=0)))

if __name__ == "__main__":
    n_rows = 200
    row_lengths = np.random.randint(30, 50, size=n_rows)
    mat = [np.random.randint(0, 100, size=s) for s in row_lengths]

    def test_stack_padding():
        global mat
        stack_padding(mat)

    def test_itertools():
        global mat
        stack_padding1(mat)

    t1 = timeit.timeit(test_stack_padding, number=1000)
    t2 = timeit.timeit(test_itertools, number=1000)
    print('With ndarray.resize: ', t1)
    print('With itertool and vstack: ', t2)
The resize method wins in the above comparison:
>>> With ndarray.resize: 0.30080295499647036
>>> With itertool and vstack: 1.0151802329928614

Numpy: get the lowest N elements of an array X, considering only elements whose index is not an element in another array Y

To get the lowest 10 values of an array X I do something like:
lowest10 = np.argsort(X)[:10]
What is the most efficient way, avoiding loops, to filter the results so that I get the lowest 10 values whose index is not an element of another array Y?
So for example if the array Y is:
[2,20,51]
X[2], X[20] and X[51] shouldn't be taken into consideration to compute the lowest 10.
After some benchmarking here is my humble recommendation:
Swapping out appears to be more or less always faster than masking (even if 99% of X is forbidden). So use something along the lines of
swap = X[Y]
X[Y] = np.inf
Sorting is expensive, so use argpartition and only sort what's necessary, like
lowest10 = np.argpartition(X, 10)[:10]
lowest10 = lowest10[np.argsort(X[lowest10])]
Here are some benchmarks:
import numpy as np
from timeit import timeit

def swap_out():
    global sol
    swap = X[Y]
    X[Y] = np.inf
    sol = np.argpartition(X, K)[:K]
    sol = sol[np.argsort(X[sol])]
    X[Y] = swap

def app1():
    sidx = X.argsort()
    return sidx[~np.in1d(sidx, Y)][:K]

def app2():
    sidx = np.argpartition(X, range(K+Y.size))
    return sidx[~np.in1d(sidx, Y)][:K]

def app3():
    sidx = np.argpartition(X, K+Y.size)
    return sidx[~np.in1d(sidx, Y)][:K]

K = 10     # number of small elements wanted
N = 10000  # size of X
M = 10     # size of Y
S = 10     # number of repeats in benchmark

X = np.random.random((N,))
Y = np.random.choice(N, (M,))

so = timeit(swap_out, number=S)
print(sol)
print(X[sol])
d1 = timeit(app1, number=S)
print(sol)
print(X[sol])
d2 = timeit(app2, number=S)
print(sol)
print(X[sol])
d3 = timeit(app3, number=S)
print(sol)
print(X[sol])
print('pp', f'{so:8.5f}', ' d1(um)', f'{d1:8.5f}', ' d2', f'{d2:8.5f}', ' d3', f'{d3:8.5f}')
# pp 0.00053 d1(um) 0.00731 d2 0.00313 d3 0.00149
Here's one approach -
sidx = X.argsort()
idx_out = sidx[~np.in1d(sidx, Y)][:10]
Sample run -
# Setup inputs
In [141]: X = np.random.choice(range(60), 60)
In [142]: Y = np.array([2,20,51])
# For testing, let's set the Y positions as 0s and
# we want to see them skipped in o/p
In [143]: X[Y] = 0
# Use proposed approach
In [144]: sidx = X.argsort()
In [145]: X[sidx[~np.in1d(sidx, Y)][:10]]
Out[145]: array([ 0, 2, 4, 5, 5, 9, 9, 10, 12, 14])
# Print the first 13 numbers and skip three 0s and
# that should match up with the output from proposed approach
In [146]: np.sort(X)[:13]
Out[146]: array([ 0, 0, 0, 0, 2, 4, 5, 5, 9, 9, 10, 12, 14])
Alternatively, for performance, we might want to use np.argpartition, like so -
sidx = np.argpartition(X,range(10+Y.size))
idx_out = X[sidx[~np.in1d(sidx, Y)][:10]]
This would be beneficial if the length of X is a much larger number than 10.
If you don't care about the order of elements in that list of 10 indices, for further boost, we can simply pass on the scalar length instead of range array to np.argpartition : np.argpartition(X,10+Y.size).
We can optimize np.in1d with searchsorted to have one more approach (listing next).
Listing below all the discussed approaches in this post -
def app1(X, Y, n=10):
    sidx = X.argsort()
    return sidx[~np.in1d(sidx, Y)][:n]

def app2(X, Y, n=10):
    sidx = np.argpartition(X, range(n+Y.size))
    return sidx[~np.in1d(sidx, Y)][:n]

def app3(X, Y, n=10):
    sidx = np.argpartition(X, n+Y.size)
    return sidx[~np.in1d(sidx, Y)][:n]

def app4(X, Y, n=10):
    n_ext = n+Y.size
    sidx = np.argpartition(X, np.arange(n_ext))[:n_ext]
    ssidx = sidx.argsort()
    mask = np.ones(ssidx.size, dtype=bool)
    search_idx = np.searchsorted(sidx, Y, sorter=ssidx)
    search_idx[search_idx==sidx.size] = 0
    idx = ssidx[search_idx]
    mask[idx[sidx[idx] == Y]] = 0
    return sidx[mask][:n]
You can work on a subset of the original array using numpy.delete():
lowest10 = np.argsort(np.delete(X, Y))[:10]
Since delete works by building a new array from the indexes to keep, the extra cost is a single pass over X.
Warning: This solution uses a subset of the original X array (X without the elements indexed by Y), so the end result will be the lowest 10 of that subset, and the returned indices refer to positions in that subset rather than in the original X.
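If you need indices into the original X, one sketch (my addition, not part of the answer above) is to delete from an index array instead and translate back through it:
keep = np.delete(np.arange(X.size), Y)      # indices of X that are allowed
lowest10 = keep[np.argsort(X[keep])[:10]]   # indices into the original X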
