Efficient probability tree branching in Python

As an example, I have an array of branches and probabilities that looks like this:
paths = np.array([
    [1, 0, 1.0],
    [2, 0, 0.4],
    [2, 1, 0.6],
    [3, 1, 1.0],
    [5, 1, 0.25],
    [5, 2, 0.5],
    [5, 4, 0.25],
    [6, 0, 0.7],
    [6, 5, 0.2],
    [6, 2, 0.1]])
The columns are upper node, lower node, probability.
Here's a visual of the nodes:
         6
       / | \
      5  0  2
    / | \  / \
   1  2  4 0  1
   | / \      |
   0 0  1     0
        |
        0
I want to be able to pick a starting node and output an array of the branches and cumulative probabilities, including all the duplicate branches. For example:
start_node = 5 should return
array([[5, 1, 0.25],
       [5, 2, 0.5 ],
       [5, 4, 0.25],
       [1, 0, 0.25],
       [2, 0, 0.2 ],
       [2, 1, 0.3 ],
       [1, 0, 0.3 ]])
Notice the [1, 0, x] branch is included twice, as it's fed by both the [5, 1, 0.25] branch (giving 0.25 * 1.0 = 0.25) and the [2, 1, 0.3] branch (0.5 * 0.6 = 0.3 to reach node 1, then 0.3 * 1.0 = 0.3).
Here's some code I got working but it's far too slow for my application (millions of branches):
def branch(start_node, paths):
    output = paths[paths[:, 0] == start_node]
    next_nodes = output
    while True:
        can_go_lower = np.isin(next_nodes[:, 1], paths[:, 0])
        if not np.any(can_go_lower):
            break
        next_nodes_checked = next_nodes[can_go_lower]
        next_nodes = np.empty([0, 3])
        for nodes in next_nodes_checked:
            to_append = paths[paths[:, 0] == nodes[1]]
            to_append[:, 2] *= nodes[2]
            next_nodes = np.append(next_nodes, to_append, axis=0)
        output = np.append(output, next_nodes, axis=0)
    return output
The branches always go from a higher node to a lower one, so getting caught in cycles isn't a concern. Vectorizing the for loop and avoiding the appends would be the best optimization, I think.

Instead of keeping the graph in a NumPy array, let's store it in a dict keyed by upper node:
tree = {k: paths[paths[:, 0] == k] for k in np.unique(paths[:, 0])}
Also make a set of the non-leaf nodes:
non_leaf_nodes = set(np.unique(paths[:, 0]))
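With the example paths above, each non-leaf node maps to the rows leaving it; for instance (note the keys are floats, since paths is a float array):
>>> tree[5]
array([[5.  , 1.  , 0.25],
       [5.  , 2.  , 0.5 ],
       [5.  , 4.  , 0.25]])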
Now to find the branch and cumulative probability:
def branch(start_node, tree, non_leaf_nodes):
    # each row is (upper_node, lower_node, cumulative_probability)
    curr_nodes = [[start_node, start_node, 1.0]]
    output = []
    while True:
        next_nodes = []
        for _, node, prob in curr_nodes:
            if node not in non_leaf_nodes:
                continue
            subtree = tree[node]
            to_append = subtree.copy()
            to_append[:, 2] *= prob  # scale children by the parent's cumulative probability
            to_append = to_append.tolist()
            output += to_append
            next_nodes += to_append
        curr_nodes = next_nodes
        if len(curr_nodes) == 0:
            break
    return np.array(output)
Output:
>>> branch(5, tree, non_leaf_nodes)
array([[5.  , 1.  , 0.25],
       [5.  , 2.  , 0.5 ],
       [5.  , 4.  , 0.25],
       [1.  , 0.  , 0.25],
       [2.  , 0.  , 0.2 ],
       [2.  , 1.  , 0.3 ],
       [1.  , 0.  , 0.3 ]])
I expect this to work faster, since each node's children are looked up in a dict instead of re-scanning the whole array. Let me know.
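For a pure-NumPy alternative that vectorizes the per-row loop, here's a minimal sketch (branch_vectorized is a hypothetical name; it assumes, as the question states, that edges always run from higher to lower nodes). It sorts the rows once so each node's children occupy a contiguous slice, then expands the whole frontier level by level with searchsorted/repeat:

import numpy as np

def branch_vectorized(start_node, paths):
    # Sort rows by upper node so each node's children occupy a contiguous slice.
    sorted_paths = paths[np.argsort(paths[:, 0], kind="stable")]
    uppers = sorted_paths[:, 0]

    frontier = sorted_paths[uppers == start_node].copy()
    chunks = []
    while frontier.size:
        chunks.append(frontier)
        # Binary-search the children slice of every frontier row at once.
        left = np.searchsorted(uppers, frontier[:, 1], side="left")
        right = np.searchsorted(uppers, frontier[:, 1], side="right")
        counts = right - left
        has_children = counts > 0
        left, counts = left[has_children], counts[has_children]
        if counts.size == 0:
            break
        # Gather index: for row j, the run left[j], left[j]+1, ..., left[j]+counts[j]-1.
        offsets = np.arange(counts.sum()) - np.repeat(np.cumsum(counts) - counts, counts)
        children = sorted_paths[np.repeat(left, counts) + offsets].copy()
        # Multiply each child's probability by its parent's cumulative probability.
        children[:, 2] *= np.repeat(frontier[has_children, 2], counts)
        frontier = children
    return np.concatenate(chunks, axis=0) if chunks else np.empty((0, 3))

On the example paths above, branch_vectorized(5, paths) reproduces the expected seven rows in the same order.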

Normalize the rows of a numpy array based on a custom function

I have a numpy array. I want to normalize each row based on this formula:
x_norm = (x - x_min) / (x_max - x_min)
where x_min is the minimum of each row and x_max is the maximum of each row. Here is a simple example:
a = np.array([[0, 1,  2],
              [2, 4,  7],
              [6, 10, 5]])
and desired output:
a = np.array([[0,   0.5, 1],
              [0,   0.4, 1],
              [0.2, 1,   0]])
Thank you
IIUC, you can use raw numpy operations:
x = np.array([[0, 1,  2],
              [2, 4,  7],
              [6, 10, 5]])

x_norm = ((x.T - x.min(1)) / (x.max(1) - x.min(1))).T
# OR
x_norm = (x - x.min(1)[:, None]) / (x.max(1) - x.min(1))[:, None]
output:
array([[0. , 0.5, 1. ],
       [0. , 0.4, 1. ],
       [0.2, 1. , 0. ]])
NB: if efficiency matters, save the result of x.min(1) in a variable, as it is used twice.
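For example, a minimal variant of the second line above that computes the row minima only once:

mn = x.min(1)[:, None]  # row minima, kept as a column for broadcasting
x_norm = (x - mn) / (x.max(1)[:, None] - mn)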
You could use np.apply_along_axis:
a = np.array([[0, 1,  2],
              [2, 4,  7],
              [6, 10, 5]])

def scaler(x):
    return (x - x.min()) / (x.max() - x.min())

np.apply_along_axis(scaler, axis=1, arr=a)
Output:
array([[0. , 0.5, 1. ],
       [0. , 0.4, 1. ],
       [0.2, 1. , 0. ]])

How can I compute the lengths of the edges associated with each point?

I built a Delaunay triangulation in Python. I have 8 points (black in the plot produced below), which generate 14 edges (gray).
How can I compute the lengths of the edges associated with each point?
The matrix I want holds the lengths of the edges connected to each point, such as
[[P1, E1_length, E2_length, ...], [P2, E6_length, E7_length, ...], ...]
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import Delaunay

points = np.array([[0, 0], [0, 1.1], [1, 0], [1, 1],
                   [1.5, 0.6], [1.2, 0.5], [1.7, 0.9], [1.1, 0.1]])
tri = Delaunay(points)

plt.triplot(points[:, 0], points[:, 1], tri.simplices.copy(), color='0.7')
plt.plot(points[:, 0], points[:, 1], 'o', color='0.3')
plt.show()
New answer
Here's an approach which will give you a dictionary of points and edge lengths associated with each point:
simplices = points[tri.simplices]
edge_lengths = {}
for point in points:
    key = tuple(point)
    vertex_edges = edge_lengths.get(key, [])
    # mask of simplices that contain this point
    adjacency_mask = np.isin(simplices, point).all(axis=2).any(axis=1)
    for simplex in simplices[adjacency_mask]:
        # mask out the point itself, keeping its neighbours in the simplex
        self_mask = np.isin(simplex, point).all(axis=1)
        for other in simplex[~self_mask]:
            dist = np.linalg.norm(point - other)
            if dist not in vertex_edges:  # dedupe by length value
                vertex_edges.append(dist)
    edge_lengths[key] = vertex_edges
Output:
{(0.0, 0.0): [1.4142135623730951, 1.1, 1.3, 1.0],
(0.0, 1.1): [1.004987562112089, 1.3416407864998738, 1.4866068747318506],
(1.0, 0.0): [1.4866068747318506, 0.5385164807134504, 0.7810249675906654, 1.140175425099138, 0.14142135623730956],
(1.0, 1.0): [1.004987562112089, 1.4142135623730951, 0.5385164807134504, 0.6403124237432849, 0.7071067811865475],
(1.5, 0.6): [0.6403124237432849, 0.36055512754639896, 0.31622776601683794, 0.6403124237432848],
(1.2, 0.5): [0.5385164807134504, 1.3, 0.31622776601683794, 0.41231056256176607],
(1.7, 0.9): [0.7071067811865475, 0.36055512754639896],
(1.1, 0.1): [0.14142135623730956, 0.41231056256176607, 0.6403124237432848]}
Old answer before requirements changed
The Delaunay object has a simplices attribute giving, for each simplex, the indices of the points that form it. Using scipy.spatial.distance.pdist() and advanced indexing, you can get all the edge lengths like so:
>>> from scipy.spatial.distance import pdist
>>> edge_lengths = np.array([pdist(x) for x in points[tri.simplices]])
>>> edge_lengths
array([[1.00498756, 1.41421356, 1.1       ],
       [0.53851648, 1.3       , 1.41421356],
       [0.53851648, 1.        , 1.3       ],
       [0.64031242, 0.70710678, 0.36055513],
       [0.64031242, 0.31622777, 0.53851648],
       [0.14142136, 0.53851648, 0.41231056],
       [0.64031242, 0.41231056, 0.31622777]])
Note however, that edge lengths are duplicated here, since every simplex shares at least one edge with another simplex.
Step-by-step
The tri.simplices attribute gives the indices in points for each vertex in each simplex in the Delaunay object:
>>> tri.simplices
array([[2, 6, 5],
       [7, 2, 5],
       [0, 7, 5],
       [2, 1, 4],
       [1, 2, 7],
       [0, 3, 7],
       [3, 1, 7]], dtype=int32)
Using advanced indexing, we can get all the points which make up the simplices:
>>> points[tri.simplices]
array([[[1. , 1. ],
        [0. , 1.1],
        [0. , 0. ]],

       [[1.2, 0.5],
        [1. , 1. ],
        [0. , 0. ]],

       [[1. , 0. ],
        [1.2, 0.5],
        [0. , 0. ]],

       [[1. , 1. ],
        [1.5, 0.6],
        [1.7, 0.9]],

       [[1.5, 0.6],
        [1. , 1. ],
        [1.2, 0.5]],

       [[1. , 0. ],
        [1.1, 0.1],
        [1.2, 0.5]],

       [[1.1, 0.1],
        [1.5, 0.6],
        [1.2, 0.5]]])
Finally, each subarray here represents a simplex and the three points that form it. Using scipy.spatial.distance.pdist(), we can get the pairwise distances within each simplex by iterating over them:
>>> np.array([pdist(x) for x in points[tri.simplices]])
array([[1.00498756, 1.41421356, 1.1       ],
       [0.53851648, 1.3       , 1.41421356],
       [0.53851648, 1.        , 1.3       ],
       [0.64031242, 0.70710678, 0.36055513],
       [0.64031242, 0.31622777, 0.53851648],
       [0.14142136, 0.53851648, 0.41231056],
       [0.64031242, 0.41231056, 0.31622777]])
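As noted above, these per-simplex lengths list every shared edge twice. If instead you want each undirected edge exactly once, a minimal sketch is to collect the index pairs in a set before measuring (edges and unique_lengths are names chosen here for illustration, not part of the scipy API):

edges = set()
for simplex in tri.simplices:
    for i in range(3):
        a, b = sorted((simplex[i], simplex[(i + 1) % 3]))
        edges.add((a, b))  # sorted pair, so each undirected edge appears once

unique_lengths = {e: np.linalg.norm(points[e[0]] - points[e[1]]) for e in edges}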

Selectively set values in numpy array (or set on condition)

a = np.array([[0, 2, 0, 0], [0, 1, 3, 0], [0, 0, 10, 11], [0, 0, 1, 7]])
array([[ 0,  2,  0,  0],
       [ 0,  1,  3,  0],
       [ 0,  0, 10, 11],
       [ 0,  0,  1,  7]])
There are zero entries in each row. I need to assign a value to each of these zero entries, where the value is calculated as follows:
V = 0.1 * Si / Ni
where Si is the sum of row i and Ni is the number of zero entries in row i.
I can calculate Si and Ni fairly easily:
S = np.sum(a, axis=1)
array([ 2,  4, 21,  8])
N = np.count_nonzero(a == 0, axis=1)
array([3, 2, 2, 2])
Now V is calculated as:
V = 0.1 * S / N
array([0.06666667, 0.2       , 1.05      , 0.4       ])
But how do I assign V[i] to the zero entries in the i-th row? I'm expecting to get the following array a:
array([[0.06666667, 2, 0.06666667, 0.06666667],
       [0.2, 1, 3, 0.2],
       [1.05, 1.05, 10, 11],
       [0.4, 0.4, 1, 7]])
I need some kind of selective broadcasting operation or assignment?
Use np.where:
np.where(a == 0, V.reshape(-1, 1), a)
array([[ 0.06666667,  2.        ,  0.06666667,  0.06666667],
       [ 0.2       ,  1.        ,  3.        ,  0.2       ],
       [ 1.05      ,  1.05      , 10.        , 11.        ],
       [ 0.4       ,  0.4       ,  1.        ,  7.        ]])
Here's a way using np.where:
z = a == 0
np.where(z, (0.1 * a.sum(1) / z.sum(1))[:, None], a)
array([[ 0.06666667,  2.        ,  0.06666667,  0.06666667],
       [ 0.2       ,  1.        ,  3.        ,  0.2       ],
       [ 1.05      ,  1.05      , 10.        , 11.        ],
       [ 0.4       ,  0.4       ,  1.        ,  7.        ]])
Maybe using a mask:
for i in range(V.size):
    print((a[i, :] == 0) * V[i] + a[i, :])
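If you'd rather modify a in place than build a new array, a minimal sketch using boolean-mask assignment (note a has to be cast to float first, since it was created from ints):

a = a.astype(float)  # zero entries will receive float values
mask = a == 0
a[mask] = np.broadcast_to(V[:, None], a.shape)[mask]  # each zero gets its row's V[i]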

Tensorflow: Masking an array based on duplicated elements of another array

I have an array, x = [2, 3, 4, 3, 2], which contains the states of a model, and another array which gives the corresponding probabilities of these states, prob = [.2, .1, .4, .1, .2]. Some states are duplicated, and I need to sum their corresponding probabilities, so my desired outputs are unique_elems = [2, 3, 4] and reduced_prob = [.2+.2, .1+.1, .4]. Here is my approach:
x = tf.constant([2, 3, 4, 3, 2])
prob = tf.constant([.2, .1, .4, .1, .2])
unique_elems, _ = tf.unique(x) # [2, 3, 4]
unique_elems = tf.expand_dims(unique_elems, axis=1) # [[2], [3], [4]]
tiled_prob = tf.tile(tf.expand_dims(prob, axis=0), [3, 1])
# [[0.2, 0.1, 0.4, 0.1, 0.2],
# [0.2, 0.1, 0.4, 0.1, 0.2],
# [0.2, 0.1, 0.4, 0.1, 0.2]]
equal = tf.equal(x, unique_elems)
# [[ True, False, False, False, True],
# [False, True, False, True, False],
# [False, False, True, False, False]]
reduced_prob = tf.multiply(tiled_prob, tf.cast(equal, tf.float32))
# [[0.2, 0. , 0. , 0. , 0.2],
# [0. , 0.1, 0. , 0.1, 0. ],
# [0. , 0. , 0.4, 0. , 0. ]]
reduced_prob = tf.reduce_sum(reduced_prob, axis=1)
# [0.4, 0.2, 0.4]
But I am wondering whether there is a more efficient way to do this. In particular, I am using the tile operation, which I think is not very efficient for large arrays.
It can be done in two lines with tf.unsorted_segment_sum:
unique_elems, idx = tf.unique(x) # [2, 3, 4]
reduced_prob = tf.unsorted_segment_sum(prob, idx, tf.size(unique_elems))
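For reference, a self-contained version (under TensorFlow 2.x, where the op is exposed as tf.math.unsorted_segment_sum):

import tensorflow as tf

x = tf.constant([2, 3, 4, 3, 2])
prob = tf.constant([.2, .1, .4, .1, .2])

# idx maps each element of x to its position in unique_elems: [0, 1, 2, 1, 0]
unique_elems, idx = tf.unique(x)
reduced_prob = tf.math.unsorted_segment_sum(prob, idx, tf.size(unique_elems))
print(reduced_prob)  # tf.Tensor([0.4 0.2 0.4], shape=(3,), dtype=float32)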

Assigning a list element with the sum of the other elements

I have a 2D matrix which can be any size but is always square. I want to loop through the matrix and, for each diagonal element (x in the example), assign the value 1 minus the sum of all the other values in the row, e.g.
Mtx = [[ x, .2,  0, .2, .2],
       [ 0,  x, .4, .2, .2],
       [.2, .2,  x,  0,  0],
       [ 0,  0, .2,  x, .2],
       [ 0,  0,  0,  0,  x]]
for i in enumerate(Mtx):
    for j in enumerate(Mtx):
        if Mtx[i][j] == 'x':
            Mtx[i][j] = 1 - (sum of all other [j]'s in the row)
I can't figure out how to get the sum of the other [j]'s in each row.
for i, row in enumerate(Mtx):  # same thing as `for i in range(len(Mtx)):`
    Mtx[i][i] = 0
    Mtx[i][i] = 1 - sum(Mtx[i])
    ## could also use (if it makes more sense to you):
    # row[i] = 0
    # Mtx[i][i] = 1 - sum(row)
You could do this as such:
from copy import copy

for i, row in enumerate(Mtx):
    row_copy = copy(row)
    row_copy.pop(i)
    row[i] = 1 - sum(row_copy)
mtx = [[ 0, .2,  0, .2, .2],
       [ 0,  0, .4, .2, .2],
       [.2, .2,  0,  0,  0],
       [ 0,  0, .2,  0, .2],
       [ 0,  0,  0,  0,  0]]

for i in range(len(mtx)):
    summ = sum(mtx[i])
    mtx[i][i] = round(1 - summ, 2)  # use round to get 0.4 instead of 0.39999999999999999
print(mtx)
output:
[[0.4, 0.2, 0, 0.2, 0.2], [0, 0.2, 0.4, 0.2, 0.2], [0.2, 0.2, 0.6, 0, 0], [0, 0, 0.2, 0.6, 0.2], [0, 0, 0, 0, 1.0]]
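If the matrix is a NumPy array rather than nested lists, the same idea vectorizes with np.fill_diagonal; a minimal sketch, assuming the diagonal placeholders start at 0 as in the mtx example above:

import numpy as np

M = np.array(mtx, dtype=float)
np.fill_diagonal(M, 0)                  # ensure the diagonal doesn't affect the row sums
np.fill_diagonal(M, 1 - M.sum(axis=1))  # each diagonal entry = 1 - sum of the rest of its row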
