Convert multidimensional array of indices clusters to a 1D categorical array - python

I have a function which returns a multidimensional array of k clusters. My algorith works for the most part, but I need it to return a categorical array instead of a multidimensional array. Here is my code:
import numpy as np
import pandas as pd
import random
from bokeh.sampledata.iris import flowers
from typing import List, Tuple
def get_closest(data_point: np.ndarray, centroids: np.ndarray):
"""
Takes a data_point and a nd.array of multiple centroids and returns the index of the centroid closest to data_point
by computing the euclidean distance for each centroid and picking the closest.
"""
N = centroids.shape[0]
dist = np.empty(N)
for i, c in enumerate(centroids):
dist[i] = np.linalg.norm(c - data_point)
index_min = np.argmin(dist)
return index_min
# Use these centroids in the first iteration of you algorithm if "Random Centroids" is set to False in the Dashboard
DEFAULT_CENTROIDS = np.array([[5.664705882352942, 3.0352941176470587, 3.3352941176470585, 1.0176470588235293],
[5.446153846153847, 3.2538461538461543, 2.9538461538461536, 0.8846153846153846],
[5.906666666666667, 2.933333333333333, 4.1000000000000005, 1.3866666666666667],
[5.992307692307692, 3.0230769230769234, 4.076923076923077, 1.3461538461538463],
[5.747619047619048, 3.0714285714285716, 3.6238095238095243, 1.1380952380952383],
[6.161538461538462, 3.030769230769231, 4.484615384615385, 1.5307692307692309],
[6.294117647058823, 2.9764705882352938, 4.494117647058823, 1.4],
[5.853846153846154, 3.215384615384615, 3.730769230769231, 1.2076923076923078],
[5.52857142857143, 3.142857142857143, 3.107142857142857, 1.007142857142857],
[5.828571428571429, 2.9357142857142855, 3.664285714285714, 1.1]])
def k_means(data_np: np.ndarray, k:int=3, n_iter:int=500, random_initialization=False) -> Tuple[np.ndarray, int]:
"""
:param data: your data, a numpy array with shape (n_entries, n_features)
:param k: The number of clusters to compute
:param n_iter: The maximal numnber of iterations
:param random_initialization: If False, DEFAULT_CENTROIDS are used as the centroids of the first iteration.
:return: A tuple (cluster_indices: A numpy array of cluster_indices,
n_iterations: the number of iterations it took until the algorithm terminated)
"""
# Initialize the algorithm by assigning random cluster labels to each entry in your dataset
k=k+1
centroids = data_np[random.sample(range(len(data_np)), k)]
labels = np.array([np.argmin([(el - c) ** 2 for c in centroids]) for el in data_np])
clustering = []
for k in range(k):
clustering.append(data_np[labels == k])
# Implement K-Means with a while loop, which terminates either if the centroids don't move anymore, or
# if the number of iterations exceeds n_iter
counter = 0
while counter < n_iter:
# Compute the new centroids, if random_initialization is false use DEFAULT_CENTROIDS in the first iteration
# if you use DEFAULT_CENTROIDS, make sure to only pick the k first entries from them.
if random_initialization is False and counter == 0:
centroids = DEFAULT_CENTROIDS[random.sample(range(len(DEFAULT_CENTROIDS)), k)]
# Update the cluster labels using get_closest
labels = np.array([get_closest(el, centroids) for el in data_np])
clustering = []
for i in range(k):
clustering.append(np.where(labels == i)[0])
counter += 1
new_centroids = np.zeros_like(centroids)
for i in range(k):
if len(clustering[i]) > 0:
new_centroids[i] = data_np[clustering[i]].mean(axis=0)
else:
new_centroids[i] = centroids[i]
# if the centroids didn't move, exit the while loop
if clustering is not None and (centroids == new_centroids).sum() == 0:
break
else:
centroids = new_centroids
pass
# return the final cluster labels and the number of iterations it took
return clustering, counter
# read and store the dataset
data: pd.DataFrame = flowers.copy(deep=True)
data = data.drop(['species'], axis=1)
data_np = np.asarray(data)
clustering, counter = k_means(data_np,4,500,False)
So clustering looks like so
clustering
[array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 57,
98], dtype=int64),
array([60, 93], dtype=int64),
array([ 50, 51, 52, 53, 54, 55, 56, 58, 61, 62, 63, 65, 66,
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 86, 87, 89, 90, 91, 92, 94, 95, 96,
97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
dtype=int64),
array([59, 64, 84, 85, 88], dtype=int64)]
However, what I'm looking for is an array like
clustering
array([1, 3, 2, ..., 4, 1, 4], dtype=int64)]
Also, the while loop is always terminating after 1 iteration which shouldn't be the case.
counter
1
EDIT1:
The code continues as follows.
def callback(attr, old, new):
# recompute the clustering and update the colors of the data points based on the result
k = slider_k.valued_throttled
init = select_init.value
clustering_new, counter_new = k_means(data_np,k,500,init)
pass
# Create the dashboard
# 1. A Select widget to choose between random initialization or using the DEFAULT_CENTROIDS on top
select_init = Select(title='Random Centroids',value='False',options=['True','False'])
# 2. A Slider to choose a k between 2 and 10 (k being the number of clusters)
slider_k = Slider(start=2,end=10,value=3,step=1,title='k')
# 4. Connect both widgets to the callback
select_init.on_change('value',callback)
slider_k.on_change('value_throttled',callback)
# 3. A ColumnDataSource to hold the data and the color of each point you need
source = ColumnDataSource(dict(petal_length=data['petal_length'],sepal_length=data['sepal_length'],petal_width=data['petal_width'],clustering=clustering))
# 4. Two plots displaying the dataset based on the following table, have a look at the images
# in the handout if this confuses you.
#
# Axis/Plot Plot1 Plot2
# X Petal length Petal width
# Y Sepal length Petal length
#
# Use a categorical color mapping, such as Spectral10, have a look at this section of the bokeh docs:
# https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#filling
plot1 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal length and sepal length')
plot1.yaxis.axis_label = 'Sepal length'
plot1.xaxis.axis_label = 'Petal length'
scatter1 = plot1.scatter(x='petal_length',y='sepal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))
plot2 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal width and petal length')
plot2.yaxis.axis_label = 'Petal length'
plot2.xaxis.axis_label = 'Petal width'
scatter2 = plot2.scatter(x='petal_width',y='petal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))
# 5. A Div displaying the currently number of iterations it took the algorithm to update the plot.
div = Div(text='Number of iterations: ')
Thus the end result should look like so

I'm not sure I understand what you need.
If clustering contains a list of arrays where each array represent a cluster and the ith array contains the indices of the samples that belong to the ith cluster and what you need is to convert this to a single vector of size number_of_samples that represent the cluster each sample belongs to you can do it like this:
def to_classes(clustering):
# Get number of samples (you can pass it directly to the function)
num_samples = sum(x.shape[0] for x in clustering)
indices = np.empty((num_samples,)) # An empty array with correct size
for ith, cluster in enumerate(clustering):
# use cluster indices to assign to correct the cluster index
indices[cluster] = ith
return indices
The loops exists after a single iteration because the break condition is wrong, I think what you want is actually
# note the !=
if clustering is not None and (centroids != new_centroids).sum() == 0:
break

Related

Create a list of multiples of a number

Problem:
List of Multiples
Create a Python 3 function that takes two numbers (value, length) as arguments and returns a list of multiples of value until the size of the list reaches length.
Examples
list_of_multiples(value=7, length=5) ➞ [7, 14, 21, 28, 35]
list_of_multiples(value=12, length=10) ➞ [12, 24, 36, 48, 60, 72, 84, 96, 108, 120]
list_of_multiples(value=17, length=6) ➞ [17, 34, 51, 68, 85, 102]
def multiples (value,length):
"""
Value is number to be multiply
length is maximum number of iteration up to
which multiple required.
"""
for i in range(length):
out=i
return i
Most Pythonic Way
def multiples(value, length):
return [*range(value, length*value+1, value)]
print(multiples(7, 5))
# [7, 14, 21, 28, 35]
print(multiples(12, 10))
# [12, 24, 36, 48, 60, 72, 84, 96, 108, 120]
print(multiples(17, 6))
# [17, 34, 51, 68, 85, 102]
Pythonic way:
def multiples(value, length):
return [value * i for i in range(1, length + 1)]
print(multiples(7, 5))
# [7, 14, 21, 28, 35]
print(multiples(12, 10))
# [12, 24, 36, 48, 60, 72, 84, 96, 108, 120]
print(multiples(17, 6))
# [17, 34, 51, 68, 85, 102]
def multiples(value, length):
list_multiples = []
i = 0
while i < length:
list_multiples.append(value*(i+1))
i+=1
return list_multiples
The easy / not in-line way would be :
def multiples(value, length):
l = []
for i in range(1, length+1):
l.append(value*i)
return l
The best answer for small values of length (< 100) is given by Nite Block.
However, in case length becomes bigger, using numpy is significantly faster than python loops:
numpy.arange(1, length+1) * value
With a length of 1000, python loops take almost 4 times longer than numpy. See code below:
import timeit
testcode_numpy = '''
import numpy
def multiples_numpy(value, length):
return numpy.arange(1, length+1) * value
multiples_numpy(5, 1000)
'''
testcode = '''
def multiples(value, length):
return [*range(value, length*value+1, value)]
multiples(5, 1000)
'''
print(timeit.timeit(testcode_numpy))
print(timeit.timeit(testcode))
# Result:
# without numpy: 9.7 s
# with numpy: 2.4 s

Creating a limit to find the sum of array values to a set number Numpy Python

I want to make a function where the sum of the Arrays and Arrays2 array is equivalent to val. The function should modify the Arrays and Arrays2 values so that the last index will output the sum of all values in the array to be val. How will be able to get the Expected Output?
import numpy as np
Arrays = np.array([50, 30, 25, 87, 44, 68, 45])
Arrays2 = np.array([320])
val = 300
Expected output:
[50, 30, 25, 87, 44, 64]
[300]
something like this?
import numpy as np
Arrays = np.array([50, 30, 25, 87, 44, 68, 45])
Arrays2 = np.array([320])
val = 300
def thisRareFunction(arr):
outArrays = []
acum = 0
for x in arr:
acum += x
if acum <=val:
outArrays.append(x)
else:
outArrays.append(x -(acum-val))
break
return outArrays
print(thisRareFunction(Arrays))
print(thisRareFunction(Arrays2))

Largest strongly connected components of a directed graph

I am working on an Networkx .MultiDiGraph() object built from a total of 82927 directed email data. At current stage, I am trying to get the largest strongly connected components from the .MultiDiGraph() object and its corresponding subgraph.
The text data can be accessed here.
Here's my working code:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
email_df = pd.read_csv('email_network.txt', delimiter = '->')
edge_groups = email_df.groupby(["#Sender", "Recipient"], as_index=False).count().rename(columns={"time":"weight"})
email = nx.from_pandas_dataframe(edge_groups, '#Sender', 'Recipient', edge_attr = 'weight')
G = nx.MultiDiGraph()
G.add_edges_from(email.edges(data=True))
# G is a .MultiDiGraph object
# using .strongly_connected_components() to get the part of G that has the most nodes
# using list comprehension
number_of_nodes = [len(n) for n in sorted(nx.strongly_connected_components(G))]
number_of_nodes
# 'number_of_nodes' return a list of [1, 1, 1,...,1] of length 167 (which is the exact number of nodes in the network)
# using the recommended method in networkx documentation
largest = max(nx.strongly_connected_components(G), key=len)
largest
# 'largest' returns {92}, not sure what this means...
As I noted in the above code block, the list comprehension method returns a list of [1, 1, 1,..., 1] of length 167 (which is the total number of nodes in my data), while the max(nx.strongly_connected_components(G), key=len) returned {92}, I am not sure what this means.
It looks like there's something wrong with my code and I might have missed several key steps in processing the data. Could anyone care to take a look at and enlighten me on this?
Thank you.
Note: Revised code (kudos to Eric and Joel)
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
email_df = pd.read_csv('email_network.txt', delimiter = ' ')
edge_groups = email_df.groupby(["#Sender", "Recipient"], as_index=False).count().rename(columns={"time":"weight"})
# per #Joel's comment, adding 'create_using = nx.DiGraph()'
email = nx.from_pandas_dataframe(edge_groups, '#Sender', 'Recipient', edge_attr = 'weight', create_using = nx.DiGraph())
# adding this 'directed' edge list to .MultiDiGraph() object
G = nx.MultiDiGraph()
G.add_edges_from(email.edges(data=True))
We now examine the largest strongly connected component (in terms of the number of nodes) in this network.
In [1]: largest = max(nx.strongly_connected_components(G), key=len)
In [2]: len(largest)
Out [2]: 126
The largest strongly connected component consists of 126 nodes.
[Updates]
Upon further trial and error, I found that one needs to use create_using = .MultiDiGraph() (instead of .DiGraph()) when loading data onto networkx, otherwise, even if you get correct number of nodes for your MultiDiGraph and its weakly/strongly connected subgraphs, you might still get the number of edges wrong! This will reflect in you .strongly_connected_subgraphs() outputs.
For my case here, I will recommend others to use this one-liner
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
G = nx.read_edgelist(path="email_network.txt", data=[('time', int)], create_using=nx.MultiDiGraph(), nodetype=str)
And we can implement .strongly_connected_components(G) and strongly_connected_subgraphs to verify.
If you use the networkx output G from the first code block, max(nx.strongly_connected_components(G), key=len) will give an output with 126 nodes and 52xx something edges, but if you apply the one-liner I listed above, you will get:
In [1]: largest = max(nx.strongly_connected_components(G), key=len)
In [2]: G_sc = max(nx.strongly_connected_subgraphs(G), key=len)
In [3]: nx.number_of_nodes(G_sc)
Out [3]: 126
In [4]: nx.number_of_nodes(G_sc)
Out [4]: 82130
You will get the same number of nodes with both methods but different number of edges owing to different counting mechanisms associated with different networkx graph classes.
The underlying cause of your error is that nx.from_pandas_dataframe defaults to creating an undirected graph. So email is an undirected graph. When you then create the directed graph, each edge appears in only one direction.
To fix it use nx.from_pandas_dataframe with the argument create_using = DiGraph
older comments related to the output you were getting
All your strongly connected components have a single node.
When you do max(nx.strongly_connected_components(G), key=len) it finds the set of nodes which has the longest length and returns it. In your case, they all have length 1, so it returns one of them (I believe whichever networkx happened to put into nx.strongly_connected_components(G) first). But it's returning the set, not the length. So {92} is the set of nodes it is returning.
It happens that {92} was chosen to be the "longest" length 1 component in nx.strongly_connected_components(G) by the tiebreaker.
Example:
max([{1}, {3}, {5}], key = len)
> {1}
[1, 1, 1,...,1] of length 167 (which is the exact number of nodes in the network)
This means that there's basically no strongly connected component in your graph (except for lone vertices, that is).
If you sort those components by length, you get a randon component of one single vertex since the components all have the same length (1). In your example, {92}, which could have been any other vertex.
The import looks correct and there's really no strongly connected component, it means that nobody ever replied to any email.
To check if the problem doesn't come from pandas, MultiDiGraph or your import, I wrote:
G = nx.DiGraph()
with open('email_network.txt') as f:
for line in f:
n1, n2, time = line.split()
if n1.isdigit():
G.add_edge(int(n1),int(n2))
It didn't change the result.
Just adding an edge with G.add_edge(2,1) creates a large strongly connected component, though:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 126, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 117, 118, 119, 120, 121, 122, 123, 124, 128, 129, 134, 149, 151}

Find index of min value in a matrix

I've a 2-Dim array containing the residual sum of squares of a given fit (unimportant here).
RSS[i,j] = np.sum((spectrum_theo - sp_exp_int) ** 2)
I would like to find the matrix element with the minimum value AND its position (i,j) in the matrix. Find the minimum element is OK:
RSS_min = RSS[RSS != 0].min()
but for the index, I've tried:
ij_min = np.where(RSS == RSS_min)
which gives me:
ij_min = (array([3]), array([20]))
I would like to obtain instead:
ij_min = (3,20)
If I try :
ij_min = RSS.argmin()
I obtain:
ij_min = 0,
which is a wrong result.
Does it exist a function, in Scipy or elsewhere, that can do it? I've searched on the web, but I've found answers leading only with 1-Dim arrays, not 2- or N-Dim.
Thanks!
The easiest fix based on what you have right now would just be to extract the elements from the array as a final step:
# ij_min = (array([3]), array([20]))
ij_min = np.where(RSS == RSS_min)
ij_min = tuple([i.item() for i in ij_min])
Does this work for you
import numpy as np
array = np.random.rand((1000)).reshape(10,10,10)
print np.array(np.where(array == array.min())).flatten()
in the case of multiple minimums you could try something like
import numpy as np
array = np.array([[1,1,2,3],[1,1,4,5]])
print zip(*np.where(array == array.min()))
You can combine argmin with unravel_index.
For example, here's an array RSS:
In [123]: np.random.seed(123456)
In [124]: RSS = np.random.randint(0, 99, size=(5, 8))
In [125]: RSS
Out[125]:
array([[65, 49, 56, 43, 43, 91, 32, 87],
[36, 8, 74, 10, 12, 75, 20, 47],
[50, 86, 34, 14, 70, 42, 66, 47],
[68, 94, 45, 87, 84, 84, 45, 69],
[87, 36, 75, 35, 93, 39, 16, 60]])
Use argmin (which returns an integer that is the index in the flattened array), and then pass that to unravel_index along with the shape of RSS to convert the index of the flattened array into the indices of the 2D array:
In [126]: ij_min = np.unravel_index(RSS.argmin(), RSS.shape)
In [127]: ij_min
Out[127]: (1, 1)
ij_min itself can be used as an index into RSS to get the minimum value:
In [128]: RSS_min = RSS[ij_min]
In [129]: RSS_min
Out[129]: 8

Probabilistic selection from a numpy array

Does Numpy have any built in functions to randomly select values from a 1D numpy array with a higher weighting given to values at the end of the array? Is there an easier way to do this than defining a skewed distribution and sampling from it to get array indices?
You can give a weight to np.choice, as shown:
a = np.random.random(100) # an array to draw from
n = 10 # number of values to draw
i = np.arange(a.size) # an array of the index value for weighting
w = np.exp(i/10.) # higher weights for larger index values
w /= w.sum() # weight must be normalized
Now, access your values with:
np.random.choice(a, size=n, p=w)
Clearly you can change your weight array however you want, I did an exponential decay from the end with decay length 10; increase that decay length for a wider selection:
for np.exp(i/50.):
In [38]: np.random.choice(a, size=n, p=w)
Out[38]: array([37, 53, 45, 22, 88, 69, 56, 86, 96, 24])
for np.exp(i):
In [41]: np.random.choice(a, size=n, p=w)
Out[41]: array([99, 99, 98, 99, 99, 99, 99, 97, 99, 98])
If you only want to be able to get each value once, be sure to set replace=False, otherwise you can get the same value several times (especially if it's highly weighted, as in the second example above). See this example:
In [33]: np.random.choice(a, size=n, replace=False, p=w)
Out[33]: array([99, 84, 86, 91, 87, 81, 96, 89, 97, 95])
In [34]: np.random.choice(a, size=n, replace=True, p=w)
Out[34]: array([94, 98, 99, 98, 97, 99, 91, 96, 97, 93])
My original answer was:
If the form of the distribution doesn't really matter, you could do something like a poisson distribution of indices:
idx = np.random.poisson(size=10)
Your sample:
a[-idx-1]

Categories

Resources