Find consecutive sequences based on Boolean array - python

I'm trying to extract sequences from an array b, using a Boolean array a as the index (len(a) >= len(b), but (a==True).sum() == len(b), i.e. there are exactly as many True values in a as there are elements in b). Each sequence in the result should be represented by the start and end index of a run of consecutive True values in a.
For instance, for the following arrays of a and b
a = np.asarray([True, True, False, False, False, True, True, True, False])
b = [1, 2, 3, 4, 5]
the result should be [((0, 1), [1, 2]), ((5, 7), [3, 4, 5])], i.e. one element per True-sequence. Each element should contain the start and end index from a and the values from b these relate to.
So for the above:
[
    ((0, 1), [1, 2]),   # first True sequence: starts at index 0 (in a), ends at index 1, mapping to the values [1, 2] in b
    ((5, 7), [3, 4, 5]) # second True sequence: starts at index 5, ends at index 7, with values [3, 4, 5] in b
]
How can this be done efficiently in numpy?

Here's one NumPy-based approach inspired by this post -
def func1(a, b):
    # "Enclose" mask with sentinels to catch shifts later on
    mask = np.r_[False, a, False]
    # Get the shifting indices
    idx = np.flatnonzero(mask[1:] != mask[:-1])
    s0, s1 = idx[::2], idx[1::2]
    idx_b = np.r_[0, (s1 - s0).cumsum()]
    out = []
    for (i, j, k, l) in zip(s0, s1 - 1, idx_b[:-1], idx_b[1:]):
        out.append(((i, j), b[k:l]))
    return out
Sample run -
In [104]: a
Out[104]: array([ True, True, False, False, False, True, True, True, False])
In [105]: b
Out[105]: [1, 2, 3, 4, 5]
In [106]: func1(a,b)
Out[106]: [((0, 1), [1, 2]), ((5, 7), [3, 4, 5])]
Timings -
In [156]: # Using given sample data and tiling it 1000x
...: a = np.asarray([True, True, False, False, False, True, True, True, False])
...: b = [1, 2, 3, 4, 5]
...: a = np.tile(a,1000)
...: b = np.tile(b,1000)
# @Chris's soln
In [157]: %%timeit
     ...: res = []
     ...: gen = (i for i in b)
     ...: for k, g in itertools.groupby(enumerate(a), lambda x:x[1]):
     ...:     if k:
     ...:         ind, bools = list(zip(*g))
     ...:         res.append((ind[0::len(ind)-1], list(itertools.islice(gen, len(bools)))))
100 loops, best of 3: 13.8 ms per loop
In [158]: %timeit func1(a,b)
1000 loops, best of 3: 1.29 ms per loop

Using itertools.groupby and itertools.islice:
import itertools
res = []
gen = (i for i in b)
for k, g in itertools.groupby(enumerate(a), lambda x:x[1]):
    if k:
        ind, bools = list(zip(*g))
        res.append((ind[0::len(ind)-1], list(itertools.islice(gen, len(bools)))))
Output
[((0, 1), [1, 2]), ((5, 7), [3, 4, 5])]
Insights:
itertools.groupby groups consecutive Trues and Falses together.
list[0::len(list)-1] returns the first and last element of a list.
Since b always has exactly as many elements as there are Trues in a, make b a generator and grab as many elements from it as there are elements in each True group.
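To see what groupby produces here, a minimal illustration using the sample a from the question:
import itertools

a = [True, True, False, False, False, True, True, True, False]
# groupby clusters consecutive equal keys: k is the Boolean value,
# g yields the (index, value) pairs belonging to that run
for k, g in itertools.groupby(enumerate(a), lambda x: x[1]):
    print(k, list(g))
# True [(0, True), (1, True)]
# False [(2, False), (3, False), (4, False)]
# True [(5, True), (6, True), (7, True)]
# False [(8, False)]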
Time taken:
def itertool_version():
    res = []
    gen = (i for i in b)
    for k, g in itertools.groupby(enumerate(a), lambda x:x[1]):
        if k:
            ind, bools = list(zip(*g))
            res.append((ind[0::len(ind)-1], list(itertools.islice(gen, len(bools)))))
    return res
%timeit itertool_version()
7.11 µs ± 313 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

I don't know about a solution using numpy, but maybe the following for-loop solution will help you (or others) find a different, more efficient solution:
import numpy as np
a = np.asarray([True, True, False, False, False, True, True, True, False])
b = []
temp_list = []
count = 0
for val in a:
    if val:
        count += 1
        if len(temp_list) == 0:   # Only add the first 'True' value in a sequence
            temp_list.append(count)
    # Only reached if val is not True: close the sequence if one is open
    elif len(temp_list) > 0:
        temp_list.append(count)   # Add the last 'True' value in the sequence
        b.append(temp_list)
        temp_list = []
print(b)
>>> [[1, 2], [3, 5]]

Here's my two cents. Hope it helps. [EDITED]
# Get Data
a = np.asarray([True, True, False, False, False, True, True, True, False])
b = [1, 2, 3, 4, 5]
# Write the values of b onto the True positions of a
ac = a.astype(float)
ac[ac == 1] = b
# Select edges
ac[(np.roll(ac, 1) != 0) & (np.roll(ac, -1) != 0)] = 0 # Clear out intermediates
indices = ac[ac != 0] # Select only edges
indices.reshape(-1, 2) # group in pairs
Output
>> [[1, 2], [3, 5]]

This solution uses numpy's where():
result = []
f = np.where(a)[0]
m = 1
for j in list(create(f)):
    lo = j[1] - j[0] + 1
    result.append((j, [*range(m, m + lo)]))
    m += lo
print(result)
#OUTPUT: [((0, 1), [1, 2]), ((5, 7), [3, 4, 5])]
And here is a method to split an array [0 1 5 6 7] --> [(0, 1), (5, 7)]:
def create(k):
    le = len(k)
    i = 0
    while i < le:
        left = k[i]
        while i < le - 1 and k[i] + 1 == k[i + 1]:
            i += 1
        right = k[i]
        # Yield the run as (start, end); a single-element run has start == end,
        # which keeps the two-element tuples the caller expects
        yield (left, right)
        i += 1
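For reference, the same (start, end) pairs can be computed without the explicit while loops; a minimal sketch (create_vectorized is just an illustrative name) that splits the sorted index array wherever the gap between neighbours exceeds 1:
import numpy as np

def create_vectorized(k):
    # Split sorted indices at every gap > 1; each piece is one consecutive run
    k = np.asarray(k)
    if k.size == 0:
        return []
    breaks = np.where(np.diff(k) != 1)[0] + 1
    return [(run[0], run[-1]) for run in np.split(k, breaks)]

print(create_vectorized(np.array([0, 1, 5, 6, 7])))  # [(0, 1), (5, 7)]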

Related

Get groups of consecutive elements of a NumPy array based on multiple conditions

I have 2 NumPy arrays as follows:
import numpy as np
a = np.array([1, 4, 2, 6, 4, 4, 6, 2, 7, 6, 2, 8, 9, 3, 6, 3, 4, 4, 5, 8])
b = np.array([2, 8, 3, 9, 9, 9, 7, 5, 4, 8, 6, 5, 4, 4, 7, 2, 1, 1, 9, 9])
and 2 constant numbers:
c = 6
d = 3
Based on a previous question, I can extract an array each time the elements in a are less than c, 2 or more times consecutively:
array = np.append(a, -np.inf)  # padding so we don't lose last element
mask = array >= c  # values to be removed
split_indices = np.where(mask)[0]
for subarray in np.split(array, split_indices + 1):
    if len(subarray) > 2:
        print(subarray[:-1])
which outputs:
[1. 4. 2.]
[4. 4.]
[3. 4. 4. 5.]
Now, I would like to change my condition for a multiple condition where, 2 or more times consecutively:
elements in a are less than c,
AND
elements in b are less than d
Using the following code:
mask = ((a< c) & (b< d))
I know that my conditions (2 times or more consecutively) are met just once, at indices 15, 16 and 17.
Now I would like to extract the values of a corresponding to those indices where my conditions are met.
Based on the link answer, I tried:
a1 = np.append(a, -np.inf)
a2 = np.append(b, -np.inf)  # padding so we don't lose last element
mask = ((a1 < c) & (a2 < d))  # values to be removed
split_indices = np.where(mask)[0]
for subarray in np.split(a, split_indices + 1):
    if len(subarray) > 2:
        print(subarray[:-1])
Which, surprisingly, returns an array where my conditions are not met...
[4 2 6 4 4 6 2 7 6 2 8 9 3 6]
I also tried np.extract as follows:
np.extract((len(list(g))>=2 for i, g in ((a < c) & (b < d)) if i), a)
which returns a value of 1 and not values from the array a...
The desired output array should be the one for indices 15, 16, 17, corresponding to the values [3 4 4] in array a.
Could someone point me to the python tools I could use in order to extract the array fulfilling my multiple conditions?
NOTE: this is a minimal example of my problem, in my "real life" I need to find arrays that meet my conditions 14 times or more consecutively!
Note that in your previous question when you looked for the elements in array that are less than the threshold, your mask was defined not as mask = array < threshold but as an inverse of it: mask = array >= threshold. This is because it was used later to get elements that would be removed.
So, in your new example, you also have to get the inverse of your mask. Instead of mask = (a1 < c) & (a2 < d) you need mask = ~((a1 < c) & (a2 < d)):
a1 = np.append(a, -np.inf)
a2 = np.append(b, -np.inf)
mask = ~((a1 < c) & (a2 < d))
split_indices = np.where(mask)[0]
for subarray in np.split(a, split_indices + 1):
    if len(subarray) > 2:
        print(subarray[:-1])
gives:
[3 4 4]
which are the 15th-17th elements of a.
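The whole pattern (pad with a sentinel, invert the condition, split, filter by length) can also be wrapped in a small helper. A sketch under the same assumptions (runs_where is a hypothetical name; per the NOTE in the question, min_len could be raised to 14):
import numpy as np

def runs_where(values, cond, min_len=2):
    # Return the runs of `values` where `cond` holds `min_len` or more times in a row
    padded = np.append(values, -np.inf)  # sentinel so a run touching the end still terminates
    mask = ~np.append(cond, False)       # split wherever the condition fails
    split_indices = np.where(mask)[0]
    # each kept subarray ends with one failing element, which [:-1] strips
    return [sub[:-1] for sub in np.split(padded, split_indices + 1)
            if len(sub) > min_len]

runs_where(a, (a < c) & (b < d))  # [array([3., 4., 4.])] (floats because of the -inf sentinel)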
Following your conditions, the desired output after applying the 2 conditions is [3,4,4] from a or [2,1,1] from b, right?
Try:
a = [1, 4, 2, 6, 4, 4, 6, 2, 7, 6, 2, 8, 9, 3, 6, 3, 4, 4, 5, 8]
b = [2, 8, 3, 9, 9, 9, 7, 5, 4, 8, 6, 5, 4, 4, 7, 2, 1, 1, 9, 9]
c = 6
d = 3
condition_met = []
a_extract = []
b_extract = []
for i in range(0, len(a)):
    if a[i] < c and b[i] < d:
        condition_met.append(True)
    else:
        condition_met.append(False)
Printing the condition_met list gives [True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, False, False].
Using this, we now check for your conditions:
for i in range(0, len(condition_met)):
    if i == 0 and condition_met[i] == True and condition_met[i+1] == True:
        a_extract.append(a[i])
        b_extract.append(b[i])
    elif (condition_met[i] == True and condition_met[i+1] == True and i != len(condition_met) - 1 and i > 0) \
         or (condition_met[i] == True and condition_met[i-1] == True and i != len(condition_met) - 1 and i > 0):
        a_extract.append(a[i])
        b_extract.append(b[i])
    elif condition_met[i] == True and condition_met[i-1] == True and i == len(condition_met) - 1:
        a_extract.append(a[i])
        b_extract.append(b[i])
You'll get [3,4,4] for your a_extract list and [2,1,1] for your b_extract list.
Is this what you need?
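The same neighbour test can be written more compactly with NumPy; a sketch that covers only the 2-or-more case the loop above handles:
import numpy as np

cm = np.array(condition_met)
padded = np.concatenate(([False], cm, [False]))
keep = cm & (padded[:-2] | padded[2:])  # True values with at least one True neighbour
a_extract = np.array(a)[keep]           # array([3, 4, 4])
b_extract = np.array(b)[keep]           # array([2, 1, 1])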
You can create a mask using scikit-image like this:
import numpy as np
import skimage
N = 2
mask = ((a < c) & (b < d))
mask2 = np.zeros_like(mask)
tmp = skimage.util.view_as_windows(mask, N).all(axis=1)
mask2[N - 1:-N + 1] = skimage.util.view_as_windows(tmp, N).any(axis=1)
mask2
# array([False, False, False, False, False, False, False, False, False,
# False, False, False, False, False, False, True, True, True,
# False, False])
and get the indices and values using
np.where(mask2)[0] # array([15, 16, 17])
a[mask2] # array([3, 4, 4])
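The same windowed test can also be written in plain NumPy, assuming NumPy >= 1.20 for sliding_window_view (a sketch, not part of the answer above):
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

N = 2
mask = (a < c) & (b < d)
starts = sliding_window_view(mask, N).all(axis=1)  # True where a window of N Trues begins
mask2 = np.zeros_like(mask)
for offset in range(N):  # mark every position covered by such a window
    mask2[offset:offset + starts.size] |= starts
a[mask2]  # array([3, 4, 4])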

Efficient numpy subarrays extraction from a mask

I am searching for a pythonic way to extract multiple subarrays from a given array using a mask, as shown in the example:
a = np.array([10, 5, 3, 2, 1])
m = np.array([True, True, False, True, True])
The output will be a collection of arrays like the following, where only the contiguous "regions" of True values (True values next to each other) of the mask m represent the indices generating a subarray.
L[0] = np.array([10, 5])
L[1] = np.array([2, 1])
Here's one approach -
def separate_regions(a, m):
    m0 = np.concatenate(( [False], m, [False] ))
    idx = np.flatnonzero(m0[1:] != m0[:-1])
    return [a[idx[i]:idx[i+1]] for i in range(0, len(idx), 2)]
Sample run -
In [41]: a = np.array([10, 5, 3, 2, 1])
...: m = np.array([True, True, False, True, True])
...:
In [42]: separate_regions(a, m)
Out[42]: [array([10, 5]), array([2, 1])]
Runtime test
Other approach(es) -
# @kazemakase's soln
def zip_split(a, m):
    d = np.diff(m)
    cuts = np.flatnonzero(d) + 1
    asplit = np.split(a, cuts)
    msplit = np.split(m, cuts)
    L = [aseg for aseg, mseg in zip(asplit, msplit) if np.all(mseg)]
    return L
Timings -
In [49]: a = np.random.randint(0,9,(100000))
In [50]: m = np.random.rand(100000)>0.2
# @kazemakase's solution
In [51]: %timeit zip_split(a,m)
10 loops, best of 3: 114 ms per loop
# @Daniel Forsman's solution
In [52]: %timeit splitByBool(a,m)
10 loops, best of 3: 25.1 ms per loop
# Proposed in this post
In [53]: %timeit separate_regions(a, m)
100 loops, best of 3: 5.01 ms per loop
Increasing the average length of islands -
In [58]: a = np.random.randint(0,9,(100000))
In [59]: m = np.random.rand(100000)>0.1
In [60]: %timeit zip_split(a,m)
10 loops, best of 3: 64.3 ms per loop
In [61]: %timeit splitByBool(a,m)
100 loops, best of 3: 14 ms per loop
In [62]: %timeit separate_regions(a, m)
100 loops, best of 3: 2.85 ms per loop
def splitByBool(a, m):
    if m[0]:
        return np.split(a, np.nonzero(np.diff(m))[0] + 1)[::2]
    else:
        return np.split(a, np.nonzero(np.diff(m))[0] + 1)[1::2]
This will return a list of arrays, split into chunks of True in m
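For the a and m from the question, a quick check:
splitByBool(a, m)  # [array([10,  5]), array([2, 1])]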
Sounds like a natural application for np.split.
You first have to figure out where to cut the array, which is where the mask changes between True and False. Next discard all elements where the mask is False.
a = np.array([10, 5, 3, 2, 1])
m = np.array([True, True, False, True, True])
d = np.diff(m)
cuts = np.flatnonzero(d) + 1
asplit = np.split(a, cuts)
msplit = np.split(m, cuts)
L = [aseg for aseg, mseg in zip(asplit, msplit) if np.all(mseg)]
print(L[0]) # [10 5]
print(L[1]) # [2 1]

python: mean of a variable-length 2D matrix

Consider the following variable length 2D array
[
[1, 2, 3],
[4, 5],
[6, 7, 8, 9]
]
How can I find the mean of the values along each column?
I want something like [(1+4+6)/3,(2+5+7)/3, (3+8)/2, 9/1]
So the end result would be [3.667, 4.667, 5.5, 9]
Is this possible using numpy?
I tried np.mean(x, axis=0), but numpy expects arrays of the same dimensions.
Right now, I am popping the elements of each column and finding the mean. Is there a better way to achieve the result?
You could use pandas:
import pandas as pd
a = [[1, 2, 3],
[4, 5],
[6, 7, 8, 9]]
df = pd.DataFrame(a)
# 0 1 2 3
# 0 1 2 3 NaN
# 1 4 5 NaN NaN
# 2 6 7 8 9
df.mean()
# 0 3.666667
# 1 4.666667
# 2 5.500000
# 3 9.000000
# dtype: float64
Here is another solution that only uses numpy:
import numpy as np
nrows = len(a)
ncols = max(len(row) for row in a)
arr = np.zeros((nrows, ncols))
arr.fill(np.nan)
for jrow, row in enumerate(a):
    for jcol, col in enumerate(row):
        arr[jrow, jcol] = col
print np.nanmean(arr, axis=0)
# array([ 3.66666667, 4.66666667, 5.5 , 9. ])
A very simple alternative approach using itertools.izip_longest():
>>> from itertools import izip_longest
>>> mean_list = []
>>> for sub_list in izip_longest(*my_list):
...     filtered_list = filter(None, sub_list)
...     mean_list.append(sum(filtered_list)/(len(filtered_list)*1.0))
...
>>> mean_list
[3.6666666666666665, 4.666666666666667, 5.5, 9.0]
where my_list equals:
[
[1, 2, 3],
[4, 5],
[6, 7, 8, 9]
]
Listed in this post is an almost vectorized approach using NumPy. We try to assign each element in the input list an ID based on its position within its sublist. These IDs can then be fed to np.bincount, which performs ID-based summations. Finally, we divide the summations by the lengths of each ID group to get the final average values.
Thus, we would have an implementation like so -
def variable_mean(a):
    vals = np.concatenate(a)
    lens = np.array(list(map(len, a)))
    id_arr = np.ones(vals.size, dtype=int)
    id_arr[0] = 0
    id_arr[lens.cumsum()[:-1]] = -lens[:-1] + 1
    IDs = id_arr.cumsum()
    return np.bincount(IDs, vals) / np.bincount(IDs)
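Applied to the example from the question, a quick check:
a = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
variable_mean(a)  # array([3.66666667, 4.66666667, 5.5, 9.])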
Runtime test -
In [298]: # Setup input
...: N = 1000 # number of elems in input list
...: minL = 3 # min len of an element (list) in input list
...: maxL = 10 # max len of an element (list) in input list
...: a = [list(np.random.randint(0,9,(i))) \
...: for i in np.random.randint(minL,maxL,(N))]
...:
In [299]: %timeit pd.DataFrame(a).mean() # @Julien Spronck's pandas soln
100 loops, best of 3: 3.33 ms per loop
In [300]: %timeit variable_mean(a)
100 loops, best of 3: 2.36 ms per loop
In [301]: # Setup input
...: N = 1000 # number of elems in input list
...: minL = 3 # min len of an element (list) in input list
...: maxL = 100 # max len of an element (list) in input list
...: a = [list(np.random.randint(0,9,(i))) \
...: for i in np.random.randint(minL,maxL,(N))]
...:
In [302]: %timeit pd.DataFrame(a).mean() # @Julien Spronck's pandas soln
10 loops, best of 3: 27.1 ms per loop
In [303]: %timeit variable_mean(a)
100 loops, best of 3: 9.58 ms per loop
If you want to do it manually, here is what I would do:
Figure out the max array length:
max_length = 0
for array in arrays:
    if len(array) > max_length:
        max_length = len(array)
Pad all arrays to that length with None:
for array in arrays:
    while len(array) < max_length:
        array.append(None)
Zip will group the columns
columns = zip(*arrays)
columns == [(1, 4, 6), (2, 5, 7), (3, None, 8), (None, None, 9)]
Calculate the average as you would for any list:
for col in columns:
    count = 0
    sum = 0.0
    for num in col:
        if num is not None:
            count += 1
            sum += float(num)
    print "%s: Avg %s" % (col, sum/count)
Or as a list comprehension after padding the arrays:
[sum(filter(None, col))/float(len(filter(None, col))) for col in zip(*arrays)]
Output:
(1, 4, 6): Avg 3.66666666667
(2, 5, 7): Avg 4.66666666667
(3, None, 8): Avg 5.5
(None, None, 9): Avg 9.0
In Py3, zip_longest takes a fillvalue parameter:
In [1208]: ll=[
...: [1, 2, 3],
...: [4, 5],
...: [6, 7, 8, 9]
...: ]
In [1209]: list(itertools.zip_longest(*ll, fillvalue=np.nan))
Out[1209]: [(1, 4, 6), (2, 5, 7), (3, nan, 8), (nan, nan, 9)]
By filling with nan, I can use np.nanmean to take the mean ignoring the nan. nanmean turns its input (here _ from the previous line) into an array:
In [1210]: np.nanmean(_, axis=1)
Out[1210]: array([ 3.66666667, 4.66666667, 5.5 , 9. ])

remove items with low frequency

Let's consider the array of length n:
y=np.array([1,1,1,1,2,2,2,3,3,3,3,3,2,2,2,2,1,4,1,1,1])
and the matrix X of size n x m.
I want to remove items of y and rows of X, for which the corresponding value of y has low frequency.
I figured out this would give me the values of y which should be removed:
>>> items, count = np.unique(y, return_counts=True)
>>> to_remove = items[count < 3] # array([4])
and this would remove the items:
>>> X=X[y != to_remove,:]
>>> y=y[y != to_remove]
array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1])
While the code above works when there is only one label to remove, it fails when there are multiple values of y with low frequency (i.e. y=np.array([1,1,1,1,2,2,2,3,3,3,3,3,2,2,2,2,1,4,1,1,1,5,5,1,1]) would cause to_remove to be array([4, 5])):
>>> y[y != to_remove,:]
Traceback (most recent call last):
File "<input>", line 1, in <module>
IndexError: too many indices for array
How to fix this in a concise way?
You can use an additional output parameter return_inverse in np.unique like so -
def unique_where(y):
    _, idx, count = np.unique(y, return_inverse=True, return_counts=True)
    return y[np.in1d(idx, np.where(count >= 3)[0])]

def unique_arange(y):
    _, idx, count = np.unique(y, return_inverse=True, return_counts=True)
    return y[np.in1d(idx, np.arange(count.size)[count >= 3])]
You can use np.bincount, which is supposedly pretty efficient at counting and might suit better here, assuming y contains non-negative numbers, like so -
def bincount_where(y):
    counts = np.bincount(y)
    return y[np.in1d(y, np.where(counts >= 3)[0])]

def bincount_arange(y):
    counts = np.bincount(y)
    return y[np.in1d(y, np.arange(y.max() + 1)[counts >= 3])]  # arange must cover all y.max()+1 bins
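For the sample y from the question, a quick sanity check:
y = np.array([1,1,1,1,2,2,2,3,3,3,3,3,2,2,2,2,1,4,1,1,1])
bincount_where(y)
# array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1])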
Runtime tests -
This section times the approaches listed above, along with the one from @Ashwini Chaudhary's solution -
In [85]: y = np.random.randint(0,100000,50000)
In [90]: def unique_items_indexed(y): # @Ashwini Chaudhary's solution
    ...:     items, count = np.unique(y, return_counts=True)
    ...:     return y[np.in1d(y, items[count >= 3])]
    ...:
In [115]: %timeit unique_items_indexed(y)
10 loops, best of 3: 19.8 ms per loop
In [116]: %timeit unique_where(y)
10 loops, best of 3: 26.9 ms per loop
In [117]: %timeit unique_arange(y)
10 loops, best of 3: 26.5 ms per loop
In [118]: %timeit bincount_where(y)
100 loops, best of 3: 16.7 ms per loop
In [119]: %timeit bincount_arange(y)
100 loops, best of 3: 16.5 ms per loop
You're looking for numpy.in1d:
>>> y = np.array([1,1,1,1,2,2,2,3,3,3,3,3,2,2,2,2,1,4,1,1,1,5,5,1,1])
>>> items, count = np.unique(y, return_counts=True)
>>> to_remove = items[count < 3]
>>> y[~np.in1d(y, to_remove)]
array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])
If you have more than one value in to_remove, the operation is ill-defined:
>>> to_remove
array([4, 5])
>>> y != to_remove
True
Use np.in1d instead:
>>> ~np.in1d(y, to_remove)
array([ True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, False,
True, True, True, False, False, True, True], dtype=bool)
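As an aside, newer NumPy versions recommend np.isin for this test; for 1D inputs it behaves exactly like in1d:
>>> y[~np.isin(y, to_remove)]
array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])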

Python version of ismember with 'rows' and index

A similar question has been asked, but none of the answers quite do what I need - some allow multidimensional searches (aka the 'rows' option in matlab) but don't return the index. Some return the index but don't allow rows. My arrays are very large (1M x 2) and I have been successful in making a loop that works, but obviously that is very slow. In matlab, the built-in ismember function takes about 10 seconds.
Here is what I am looking for:
a=np.array([[4, 6],[2, 6],[5, 2]])
b=np.array([[1, 7],[1, 8],[2, 6],[2, 1],[2, 4],[4, 6],[4, 7],[5, 9],[5, 2],[5, 1]])
The exact matlab function that does the trick is:
[~,index] = ismember(a,b,'rows')
where
index = [6, 3, 9]
import numpy as np

def asvoid(arr):
    """
    View the array as dtype np.void (bytes).
    This views the last axis of ND-arrays as bytes so you can perform comparisons on
    the entire row.
    http://stackoverflow.com/a/16840350/190597 (Jaime, 2013-05)
    Warning: When using asvoid for comparison, note that float zeros may compare UNEQUALLY:
    >>> asvoid([-0.]) == asvoid([0.])
    array([False], dtype=bool)
    """
    arr = np.ascontiguousarray(arr)
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * arr.shape[-1])))

def in1d_index(a, b):
    voida, voidb = map(asvoid, (a, b))
    return np.where(np.in1d(voidb, voida))[0]
a = np.array([[4, 6],[2, 6],[5, 2]])
b = np.array([[1, 7],[1, 8],[2, 6],[2, 1],[2, 4],[4, 6],[4, 7],[5, 9],[5, 2],[5, 1]])
print(in1d_index(a, b))
prints
[2 5 8]
This would be equivalent to Matlab's [3, 6, 9], since Python uses 0-based indexing.
Some caveats:
The indices are returned in increasing order. They do not correspond to the location of the items of a in b.
asvoid will work for integer dtypes, but be careful if using asvoid on float dtypes, since asvoid([-0.]) == asvoid([0.]) returns array([False]).
asvoid works best on contiguous arrays. If the arrays are not contiguous, the data will be copied to a contiguous array, which will slow down the performance.
Despite the caveats, one might choose to use in1d_index anyway for the sake of speed:
def ismember_rows(a, b):
    # http://stackoverflow.com/a/22705773/190597 (ashg)
    return np.nonzero(np.all(b == a[:,np.newaxis], axis=2))[1]
In [41]: a2 = np.tile(a,(2000,1))
In [42]: b2 = np.tile(b,(2000,1))
In [46]: %timeit in1d_index(a2, b2)
100 loops, best of 3: 8.49 ms per loop
In [47]: %timeit ismember_rows(a2, b2)
1 loops, best of 3: 5.55 s per loop
So in1d_index is ~650x faster (for arrays of length in the low thousands), but again note the comparison is not exactly apples-to-apples since in1d_index returns the indices in increasing order, while ismember_rows returns the indices in the order rows of a show up in b.
import numpy as np
def ismember_rows(a, b):
    '''Equivalent of 'ismember' from Matlab
    a.shape = (nRows_a, nCol)
    b.shape = (nRows_b, nCol)
    return the idx where b[idx] == a
    '''
    return np.nonzero(np.all(b == a[:,np.newaxis], axis=2))[1]
a = np.array([[4, 6],[2, 6],[5, 2]])
b = np.array([[1, 7],[1, 8],[2, 6],[2, 1],[2, 4],[4, 6],[4, 7],[5, 9],[5, 2],[5, 1]])
idx = ismember_rows(a, b)
print idx
print np.all(b[idx] == a)
print
array([5, 2, 8])
True
I used broadcasting.
--------------------------[update]------------------------------
def ismember(a, b):
    return np.flatnonzero(np.in1d(b[:,0], a[:,0]) & np.in1d(b[:,1], a[:,1]))
a = np.array([[4, 6],[2, 6],[5, 2]])
b = np.array([[1, 7],[1, 8],[2, 6],[2, 1],[2, 4],[4, 6],[4, 7],[5, 9],[5, 2],[5, 1]])
a2 = np.tile(a,(2000,1))
b2 = np.tile(b,(2000,1))
%timeit in1d_index(a2, b2)
# 100 loops, best of 3: 8.74 ms per loop
%timeit ismember(a2, b2)
# 100 loops, best of 3: 8.5 ms per loop
np.all(in1d_index(a2, b2) == ismember(a2, b2))
# True
As unutbu said, the indices are returned in increasing order.
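One caveat worth noting: because the two columns are tested independently, this version can report false positives when a row of b mixes column values taken from different rows of a; a minimal counterexample:
a = np.array([[4, 6], [5, 2]])
b = np.array([[5, 6]])
ismember(a, b)  # array([0]) -- [5, 6] is flagged although it is not a row of a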
The function first turns multiple columns of elements into a single-column array; then numpy.in1d can be used to find the desired answer. Please try the following code:
import numpy as np
def ismemberRow(A, B):
    '''
    This function finds which rows in A can also be found in B.
    It first turns multiple columns of elements into a single-column array; then numpy.in1d can be used.
    Input: m x n numpy array (A), and p x q array (B)
    Output: numpy array of length m, storing True for each row that can be found in both A and B
    '''
    sa = np.chararray((A.shape[0], 1))
    sa[:] = '-'
    sb = np.chararray((B.shape[0], 1))
    sb[:] = '-'
    ba = (A).astype(np.str)
    sa2 = np.expand_dims(ba[:,0], axis=1) + sa + np.expand_dims(ba[:,1], axis=1)
    na = A.shape[1] - 2
    for i in range(0, na):
        sa2 = sa2 + sa + np.expand_dims(ba[:,i+2], axis=1)
    bb = (B).astype(np.str)
    sb2 = np.expand_dims(bb[:,0], axis=1) + sb + np.expand_dims(bb[:,1], axis=1)
    nb = B.shape[1] - 2
    for i in range(0, nb):
        sb2 = sb2 + sb + np.expand_dims(bb[:,i+2], axis=1)
    return np.in1d(sa2, sb2)
A = np.array([[1, 3, 4],[2, 4, 3],[7, 4, 3],[1, 1, 1],[1, 3, 4],[5, 3, 4],[1, 1, 1],[2, 4, 3]])
B = np.array([[1, 3, 4],[1, 1, 1]])
d = ismemberRow(A,B)
print A[np.where(d)[0],:]
#results:
#[[1 3 4]
# [1 1 1]
# [1 3 4]
# [1 1 1]]
Here's a function based on libigl's igl::ismember_rows which closely mimics the behavior of Matlab's ismember(A,B,'rows'):
def ismember_rows(A, B, return_index=False):
    """
    Return whether each row in A occurs as a row in B

    Parameters
    ----------
    A : #A by dim array
    B : #B by dim array
    return_index : {True,False}, optional.

    Returns
    -------
    IA : #A 1D array, IA[i] == True if and only if
        there exists j = LOCB[i] such that B[j,:] == A[i,:]
    LOCB : #A 1D array of indices. LOCB[i] == -1 if IA[i] == False,
        only returned if return_index=True
    """
    IA = np.full(A.shape[0], False)
    LOCB = np.full(A.shape[0], -1)
    if len(A) == 0: return (IA, LOCB) if return_index else IA
    if len(B) == 0: return (IA, LOCB) if return_index else IA
    # Get rid of any duplicates
    uA, uIuA = np.unique(A, axis=0, return_inverse=True)
    uB, uIB = np.unique(B, axis=0, return_index=True)
    # Sort both
    sIA = np.lexsort(uA.T[::-1])
    sA = uA[sIA,:]
    sIB = np.lexsort(uB.T[::-1])
    sB = uB[sIB,:]
    uF = np.full(sA.shape[0], False)
    uLOCB = np.full(sA.shape[0], -1)
    def row_greater_than(a, b):
        for c in range(sA.shape[1]):
            if(sA[a,c] > sB[b,c]): return True
            if(sA[a,c] < sB[b,c]): return False
        return False
    # loop over sA
    bi = 0
    past = False
    for a in range(sA.shape[0]):
        while not past and row_greater_than(a, bi):
            bi += 1
            past = bi >= sB.shape[0]
        if not past and np.all(sA[a,:] == sB[bi,:]):
            uF[sIA[a]] = True
            uLOCB[sIA[a]] = uIB[sIB[bi]]
    for a in range(A.shape[0]):
        IA[a] = uF[uIuA[a]]
        LOCB[a] = uLOCB[uIuA[a]]
    return (IA, LOCB) if return_index else IA
For example,
a=np.array([[4, 6],[6,6],[2, 6],[5, 2]])
b=np.array([[1, 7],[1, 8],[2, 6],[2, 1],[2, 4],[4, 6],[4, 7],[5, 9],[5, 2],[5, 1]])
(flag,index) = ismember_rows(a,b,return_index=True)
produces
>>> flag
array([ True, False, True, True])
>>> index
array([ 5, -1, 2, 8])
Update: Here's a faster version that makes better use of numpy.unique based on array_correspondence in gpytoolbox.
def ismember_rows(A, B, return_index=False):
    """
    Return whether each row in A occurs as a row in B

    Parameters
    ----------
    A : #A by dim array
    B : #B by dim array
    return_index : {True,False}, optional.

    Returns
    -------
    IA : #A 1D array, IA[i] == True if and only if
        there exists j = LOCB[i] such that B[j,:] == A[i,:]
    LOCB : #A 1D array of indices. LOCB[i] == -1 if IA[i] == False,
        only returned if return_index=True
    """
    if len(A) == 0 or len(B) == 0:
        IA = np.full(A.shape[0], False)
        LOCB = np.full(A.shape[0], -1)
        return (IA, LOCB) if return_index else IA
    uB, mapB = np.unique(B, axis=0, return_index=True)
    uU, idx, inv = np.unique(np.vstack((uB, A)), axis=0, return_index=True, return_inverse=True)
    imap = idx[inv[uB.shape[0]:]]
    imap[imap >= uB.shape[0]] = -1
    LOCB = np.where(imap < 0, -1, mapB[imap])
    IA = LOCB >= 0
    return (IA, LOCB) if return_index else IA
Seems to be a bit faster on my laptop.
