extract several columns in one op from tf.tensor - python

In recent TensorFlow (1.13 or 2.0) is there a way to extract non-contiguous slices from a tensor in one pass? How to do it?
For instance with the following tensor:
1 2 3 4
5 6 7 8
I want to extract columns 1 and 3 in one op to get:
2 4
6 8
However it seems I cannot do it in a single op with slicing.
What's the correct/fastest/most elegant way to do this?

1. Using tf.gather(tensor, columns, axis=1) (TF1.x, TF2):
import tensorflow as tf
tensor = tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32)
columns = [1, 3]
print(tf.gather(tensor, columns, axis=1).numpy())
%timeit -n 10000 tf.gather(tensor, columns, axis=1)
# [[2. 4.]
# [6. 8.]]
82.6 µs ± 5.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
2. With indexing (TF1.x, TF2):
import tensorflow as tf
tensor = tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32)
columns = [1, 3] # <--columns you want to extract
transposed = tf.transpose(tensor)
sliced = [transposed[c] for c in columns]
stacked = tf.transpose(tf.stack(sliced, axis=0))
# print(stacked.numpy()) # <-- TF2, TF1.x-eager
with tf.Session() as sess: # <-- TF1.x
print(sess.run(stacked))
# [[2. 4.]
# [6. 8.]]
Wrapping it to a function and running %timeit in tf.__version__=='2.0.0-alpha0':
154 µs ± 2.61 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Decorating it with #tf.function is more than 2 times faster:
import tensorflow as tf
tensor = tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32)
columns = [1, 3] # <--columns you want to extract
#tf.function
def extract_columns(tensor=tensor, columns=columns):
transposed = tf.transpose(tensor)
sliced = [transposed[c] for c in columns]
stacked = tf.transpose(tf.stack(sliced, axis=0))
return stacked
%timeit -n 10000 extract_columns()
66.8 µs ± 2.03 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
3. One-liner for eager execution (TF2, TF1.x-eager):
import tensorflow as tf
tensor = tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32)
columns = [1, 3] # <--columns you want to extract
res = tf.transpose(tf.stack([t for i, t in enumerate(tf.transpose(tensor))
if i in columns], 0))
print(res.numpy())
# [[2. 4.]
# [6. 8.]]
%timeit in tf.__version__=='2.0.0-alpha0':
242 µs ± 2.97 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
4. Use tf.one_hot() to specify rows/columns and then tf.boolean_mask() to extract these rows/columns (TF1.x, TF2):
import tensorflow as tf
tensor = tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32)
columns = [1, 3] # <--columns you want to extract
mask = tf.one_hot(columns, tensor.get_shape().as_list()[-1])
mask = tf.reduce_sum(mask, axis=0)
res = tf.transpose(tf.boolean_mask(tf.transpose(tensor), mask))
# print(res.numpy()) # <-- TF2, TF1.x-eager
with tf.Session() as sess: # TF1.x
print(sess.run(res))
# [[2. 4.]
# [6. 8.]]
%timeit in tf.__version__=='2.0.0-alpha0':
494 µs ± 4.01 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

You can get all odd numbered columns with a combination of reshapes and a slice:
N = 4
M = 10
input = tf.constant(np.random.rand(M, N))
slice_odd = tf.reshape(tf.reshape(input, (-1, 2))[:,1], (-1, int(N/2)))

Related

Roll first column by 1, second column by 2, etc

I have an array in numpy. I want to roll the first column by 1, second column by 2, etc.
Here is an example.
>>> x = np.reshape(np.arange(15), (5, 3))
>>> x
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
What I want to do:
>>> y = roll(x)
>>> y
array([[12, 10, 8],
[ 0, 13, 11],
[ 3, 1, 14],
[ 6, 4, 2],
[ 9, 7, 5]])
What is the best way to do it?
The real array will be very big. I'm using cupy, the GPU version of numpy. I will prefer solution fastest on GPU, but of course, any idea is welcomed.
You could use advanced indexing:
import numpy as np
x = np.reshape(np.arange(15), (5, 3))
h, w = x.shape
rows, cols = np.arange(h), np.arange(w)
offsets = cols + 1
shifted = np.subtract.outer(rows, offsets) % h
y = x[shifted, cols]
y:
array([[12, 10, 8],
[ 0, 13, 11],
[ 3, 1, 14],
[ 6, 4, 2],
[ 9, 7, 5]])
I implemented a naive solution (roll_for) and compares it to #Chrysophylaxs 's solution (roll_indexing).
Conclusion: roll_indexing is faster for small arrays, but the difference shrinks when the array goes bigger, and is eventually slower than roll_for for very large arrays.
Implementations:
import numpy as np
def roll_for(x, shifts=None, axis=-1):
if shifts is None:
shifts = np.arange(1, x.shape[axis] + 1) # OP requirement
xt = x.swapaxes(axis, 0) # https://stackoverflow.com/a/31094758/13636407
yt = np.empty_like(xt)
for idx, shift in enumerate(shifts):
yt[idx] = np.roll(xt[idx], shift=shift)
return yt.swapaxes(0, axis)
def roll_indexing(x):
h, w = x.shape
rows, cols = np.arange(h), np.arange(w)
offsets = cols + 1
shifted = np.subtract.outer(rows, offsets) % h # fix
return x[shifted, cols]
Tests:
M, N = 5, 3
x = np.arange(M * N).reshape(M, N)
expected = np.array([[12, 10, 8], [0, 13, 11], [3, 1, 14], [6, 4, 2], [9, 7, 5]])
assert np.array_equal(expected, roll_for(x))
assert np.array_equal(expected, roll_indexing(x))
M, N = 100, 200
# roll_indexing did'nt work when M < N before fix
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
Benchmark:
M, N = 100, 100
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
%timeit roll_for(x) # 859 µs ± 2.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit roll_indexing(x) # 81 µs ± 255 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
M, N = 1_000, 1_000
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
%timeit roll_for(x) # 12.7 ms ± 56.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit roll_indexing(x) # 12.4 ms ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
M, N = 10_000, 10_000
x = np.arange(M * N).reshape(M, N)
assert np.array_equal(roll_for(x), roll_indexing(x))
%timeit roll_for(x) # 1.3 s ± 6.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit roll_indexing(x) # 1.61 s ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

How to find cosine similarity of one vector vs matrix

I have a TF-IDF matrix of shape (149,1001). What is want is to compute the cosine similarity of last columns, with all columns
Here is what I did
from numpy import dot
from numpy.linalg import norm
for i in range(mat.shape[1]-1):
cos_sim = dot(mat[:,i], mat[:,-1])/(norm(mat[:,i])*norm(mat[:,-1]))
cos_sim
But this loop is making it slow. So, is there any efficient way? I want to do with numpy only
Leverage 2D vectorized matrix-multiplication
Here's one with NumPy using matrix-multiplication on 2D data -
p1 = mat[:,-1].dot(mat[:,:-1])
p2 = norm(mat[:,:-1],axis=0)*norm(mat[:,-1])
out1 = p1/p2
Explanation : p1 is the vectorized equivalent of looping of dot(mat[:,i], mat[:,-1]). p2 is of (norm(mat[:,i])*norm(mat[:,-1])).
Sample run for verification -
In [57]: np.random.seed(0)
...: mat = np.random.rand(149,1001)
In [58]: out = np.empty(mat.shape[1]-1)
...: for i in range(mat.shape[1]-1):
...: out[i] = dot(mat[:,i], mat[:,-1])/(norm(mat[:,i])*norm(mat[:,-1]))
In [59]: p1 = mat[:,-1].dot(mat[:,:-1])
...: p2 = norm(mat[:,:-1],axis=0)*norm(mat[:,-1])
...: out1 = p1/p2
In [60]: np.allclose(out, out1)
Out[60]: True
Timings -
In [61]: %%timeit
...: out = np.empty(mat.shape[1]-1)
...: for i in range(mat.shape[1]-1):
...: out[i] = dot(mat[:,i], mat[:,-1])/(norm(mat[:,i])*norm(mat[:,-1]))
18.5 ms ± 977 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [62]: %%timeit
...: p1 = mat[:,-1].dot(mat[:,:-1])
...: p2 = norm(mat[:,:-1],axis=0)*norm(mat[:,-1])
...: out1 = p1/p2
939 µs ± 29.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# #yatu's soln
In [89]: a = mat
In [90]: %timeit cosine_similarity(a[None,:,-1] , a.T[:-1])
2.47 ms ± 461 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Further optimize on norm with einsum
Alternatively, we could compute p2 with np.einsum.
So, norm(mat[:,:-1],axis=0) could be replaced by :
np.sqrt(np.einsum('ij,ij->j',mat[:,:-1],mat[:,:-1]))
Hence, giving us a modified p2 :
p2 = np.sqrt(np.einsum('ij,ij->j',mat[:,:-1],mat[:,:-1]))*norm(mat[:,-1])
Timings on same setup as earlier -
In [82]: %%timeit
...: p1 = mat[:,-1].dot(mat[:,:-1])
...: p2 = np.sqrt(np.einsum('ij,ij->j',mat[:,:-1],mat[:,:-1]))*norm(mat[:,-1])
...: out1 = p1/p2
607 µs ± 132 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
30x+ speedup over loopy one!
There's an sklearn function to compute the cosine similarity between vectors, cosine_similarity. Here's a use case with an example array:
a = np.random.randint(0,10,(5,5))
print(a)
array([[5, 2, 0, 4, 1],
[4, 2, 8, 2, 4],
[9, 7, 4, 9, 7],
[4, 6, 0, 1, 3],
[1, 1, 2, 5, 0]])
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(a[None,:,-1] , a.T[:-1])
# array([[0.94022805, 0.91705665, 0.75592895, 0.79921221, 1. ]])
Where a[None,-1] is the last column in a, reshaped so that both matrices have equally shaped Mat.shape[1], which is a requirement of the function:
a[None,:,-1]
# array([[1, 4, 7, 3, 0]])
And by transposing, the result will be the cosine_similarity with all other columns.
Check with the solution from the question:
from numpy import dot
from numpy.linalg import norm
cos_sim = []
for i in range(a.shape[1]-1):
cos_sim.append(dot(a[:,i], a[:,-1])/(norm(a[:,i])*norm(a[:,-1])))
np.allclose(cos_sim, cosine_similarity(a[None,:,-1] , a.T[:-1]))
# True

Creating a tumbling windows in python

Just wondering if there is a way to construct a tumbling window in python. So for example if I have list/ndarray , listA = [3,2,5,9,4,6,3,8,7,9]. Then how could I find the maximum of the first 3 items (3,2,5) -> 5, and then the next 3 items (9,4,6) -> 9 and so on... Sort of like breaking it up to sections and finding the max. So the final result would be list [5,9,8,9]
Approach #1: One-liner for windowed-max using np.maximum.reduceat -
In [118]: np.maximum.reduceat(listA,np.arange(0,len(listA),3))
Out[118]: array([5, 9, 8, 9])
Becomes more compact with np.r_ -
np.maximum.reduceat(listA,np.r_[:len(listA):3])
Approach #2: Generic ufunc way
Here's a function for generic ufuncs and that window length as a parameter -
def windowed_ufunc(a, ufunc, W):
a = np.asarray(a)
n = len(a)
L = W*(n//W)
out = ufunc(a[:L].reshape(-1,W),axis=1)
if n>L:
out = np.hstack((out, ufunc(a[L:])))
return out
Sample run -
In [81]: a = [3,2,5,9,4,6,3,8,7,9]
In [82]: windowed_ufunc(a, ufunc=np.max, W=3)
Out[82]: array([5, 9, 8, 9])
On other ufuncs -
In [83]: windowed_ufunc(a, ufunc=np.min, W=3)
Out[83]: array([2, 4, 3, 9])
In [84]: windowed_ufunc(a, ufunc=np.sum, W=3)
Out[84]: array([10, 19, 18, 9])
In [85]: windowed_ufunc(a, ufunc=np.mean, W=3)
Out[85]: array([3.33333333, 6.33333333, 6. , 9. ])
Benchmarking
Timings on NumPy solutions on array data with sample data scaled up by 10000x -
In [159]: a = [3,2,5,9,4,6,3,8,7,9]
In [160]: a = np.tile(a, 10000)
# #yatu's soln
In [162]: %timeit moving_maxima(a, w=3)
435 µs ± 8.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# From this post - app#1
In [167]: %timeit np.maximum.reduceat(a,np.arange(0,len(a),3))
353 µs ± 2.55 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# From this post - app#2
In [165]: %timeit windowed_ufunc(a, ufunc=np.max, W=3)
379 µs ± 6.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
If you want a one-liner, you can use list comprehension:
listA = [3,2,5,9,4,6,3,8,7,9]
listB=[max(listA[i:i+3]) for i in range(0,len(listA),3)]
print (listB)
it returns:
[5, 9, 8, 9]
Of course the codes can be written more dynamically: if you want a different window size, just change 3 to any integer.
Using numpy, you can extend the list with zeroes so its length is divisible by the window size, and reshape and compute the maxalong the second axis:
def moving_maxima(a, w):
mod = len(a)%w
d = w if mod else mod
x = np.r_[a, [0]*(d-mod)]
return x.reshape(-1,w).max(1)
Some examples:
moving_maxima(listA,2)
# array([3., 9., 6., 8., 9.])
moving_maxima(listA,3)
#array([5, 9, 8, 9])
moving_maxima(listA,4)
#array([9, 8, 9])

Boolean indexing array through array of boolean indexes without loop

I want to index an array with a boolean mask through multiple boolean arrays without a loop.
This is what I want to achieve but without a loop and only with numpy.
import numpy as np
a = np.array([[0, 1],[2, 3]])
b = np.array([[[1, 0], [1, 0]], [[0, 0], [1, 1]]], dtype=bool)
r = []
for x in b:
print(a[x])
r.extend(a[x])
# => array([0, 2])
# => array([2, 3])
print(r)
# => [0, 2, 2, 3]
# what I would like to do is something like this
r = some_fancy_indexing_magic_with_b_and_a
print(r)
# => [0, 2, 2, 3]
Approach #1
Simply broadcast a to b's shape with np.broadcast_to and then mask it with b -
In [15]: np.broadcast_to(a,b.shape)[b]
Out[15]: array([0, 2, 2, 3])
Approach #2
Another would be getting all the indices and mod those by the size of a, which would also be the size of each 2D block in b and then indexing into flattened a -
a.ravel()[np.flatnonzero(b)%a.size]
Approach #3
On the same lines as App#2, but keeping the 2D format and using non-zero indices along the last two axes of b -
_,r,c = np.nonzero(b)
out = a[r,c]
Timings on large arrays (given sample shapes scaled up by 100x) -
In [50]: np.random.seed(0)
...: a = np.random.rand(200,200)
...: b = np.random.rand(200,200,200)>0.5
In [51]: %timeit np.broadcast_to(a,b.shape)[b]
45.5 ms ± 381 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [52]: %timeit a.ravel()[np.flatnonzero(b)%a.size]
94.6 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [53]: %%timeit
...: _,r,c = np.nonzero(b)
...: out = a[r,c]
128 ms ± 1.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Python Numpy appending multiple lists from objects

I am calling an object several times that is returning a numpy list:
for x in range(0,100):
d = simulation3()
d = [0, 1, 2, 3]
d = [4, 5, 6, 7]
..and many more
I want to take each list and append it to a 2D array.
final_array = [[0, 1, 2, 3],[4, 5, 6, 7]...and so forth]
I tried creating an empty array (final_array = np.zeros(4,4)) and appending it but the values are appending after the 4X4 matrix is created.
Can anyone help me with this? thank you!
You can use np.fromiter to create an array from an iterable. Since, by default, this function only works with scalars, you can use itertools.chain to help:
np.random.seed(0)
from itertools import chain
def simulation3():
return np.random.randint(0, 10, 4)
n = 5
d = np.fromiter(chain.from_iterable(simulation3() for _ in range(5)), dtype='i')
d.shape = 5, 4
print(d)
array([[5, 0, 3, 3],
[7, 9, 3, 5],
[2, 4, 7, 6],
[8, 8, 1, 6],
[7, 7, 8, 1]], dtype=int32)
But this is relatively inefficient. NumPy performs best with fixed size arrays. If you know the size of your array in advance, you can define an empty array and update rows sequentially. See the alternatives described by #norok2.
there are multiple way to do it in numpy , the easiest way is to use vstack like this :
for Ex :
#you have these array you want to concat
d1 = [0, 1, 2, 3]
d2 = [4, 5, 6, 7]
d3 = [4, 5, 6, 7]
#initialize your variable with zero raw
X = np.zeros((0,4))
#then each time you call your function use np.vstack like this :
X = np.vstack((np.array(d1),X))
X = np.vstack((np.array(d2),X))
X = np.vstack((np.array(d2),X))
# and finally you have your array like below
#array([[4., 5., 6., 7.],
# [4., 5., 6., 7.],
# [0., 1., 2., 3.]])
The optimal solution depends on the numbers / sizes you are dealing with.
My favorite solution (which only works if you already know the size of the final result) is to initialize the array which will contain your results and then fill each you could initialize your result and then fill it using views.
This the most memory efficient solution.
If you do not know the size of the final result, then you are better off by generating a list of lists, which can be converted (or stacked) as a NumPy array at the end of the process.
Here are some examples, where gen_1d_list() is used to generate some random numbers to mimic the result of simulate3() (meaning that in the following code, you should replace gen_1d_list(n, dtype) with simulate3()):
stacking1() implements the filling using views
stacking2() implements the list generation and converting to NumPy array
stacking3() implements the list generation and stacking to NumPy array
stacking4() implements the dynamic modification of a NumPy array using vstack() as proposed earlier.
import numpy as np
def gen_1d_list(n, dtype=int):
return list(np.random.randint(1, 100, n, dtype))
def stacking1(n, m, dtype=int):
arr = np.empty((n, m), dtype=dtype)
for i in range(n):
arr[i] = gen_1d_list(m, dtype)
return arr
def stacking2(n, m, dtype=int):
items = [gen_1d_list(m, dtype) for i in range(n)]
arr = np.array(items)
return arr
def stacking3(n, m, dtype=int):
items = [gen_1d_list(m, dtype) for i in range(n)]
arr = np.stack(items, dtype)
return arr
def stacking4(n, m, dtype=int):
arr = np.zeros((0, m), dtype=dtype)
for i in range(n):
arr = np.vstack((gen_1d_list(m, dtype), arr))
return arr
Time-wise, stacking1() and stacking2() are more or less equally fast, while stacking3() and stacking4() are slower (and, in proportion, much slower for small size inputs).
Some numbers, for small size inputs:
n, m = 4, 10
%timeit stacking1(n, m)
# 15.7 µs ± 182 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit stacking2(n, m)
# 14.2 µs ± 141 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit stacking3(n, m)
# 22.7 µs ± 282 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit stacking4(n, m)
# 31.8 µs ± 270 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
and for larger size inputs:
n, m = 4, 1000000
%timeit stacking1(n, m)
# 344 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit stacking2(n, m)
# 350 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit stacking3(n, m)
# 370 ms ± 2.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit stacking4(n, m)
# 369 ms ± 3.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Categories

Resources