Python - Apply a function over a labeled multidimensional array - python

I have a numpy array that is labelled using scipy connected component labelling.
import numpy
from scipy import ndimage
# Build an 8x8 integer grid containing two separate 3x2 patches of ones,
# then label the connected components (one label per patch).
a = numpy.zeros((8, 8), dtype=int)
a[1,1] = a[1,2] = a[2,1] = a[2,2] = a[3,1] = a[3,2] = 1
a[5,5] = a[5,6] = a[6,5] = a[6,6] = a[7,5] = a[7,6] = 1
lbl, numpatches = ndimage.label(a)
I want to apply a custom function (calculation of a specific value) over all labels within the labelled array.
Similar to, for instance, the ndimage algebra functions, e.g. ndimage.sum(a, lbl, range(1, numpatches+1)):
( Which in this case returns me the number of values for each label [6,6]. )
Is there a way to do this?

You can pass an arbitrary function to ndimage.labeled_comprehension, which is roughly equivalent to
[func(a[lbl == i]) for i in index]
Here is the labeled_comprehension-equivalent of ndimage.sum(a,lbl,range(1,numpatches+1)):
import numpy as np
from scipy import ndimage
# Build an 8x8 integer grid containing two separate 3x2 patches of ones,
# then label the connected components (one label per patch).
a = np.zeros((8, 8), dtype=int)
a[1,1] = a[1,2] = a[2,1] = a[2,2] = a[3,1] = a[3,2] = 1
a[5,5] = a[5,6] = a[6,5] = a[6,6] = a[7,5] = a[7,6] = 1
lbl, numpatches = ndimage.label(a)
def func(x):
    """Return the sum of the values belonging to one label.

    ``x`` is the 1-D array of input values that labeled_comprehension
    passes in for a single label.
    """
    return x.sum()
# Apply `func` to the values of `a` under each label 1..numpatches and print
# the per-label results.
# NOTE(review): with out_dtype='float' the printed values are floats, so the
# exact formatting may differ from the comment below -- confirm.
print(ndimage.labeled_comprehension(a, lbl, index=range(1, numpatches+1),
func=func, out_dtype='float', default=None))
# [6 6]


Find graphs intersection python

Does anyone have any idea of how I can find the intersection of these two graphs? (image below)
energ_ac, price_compvend and energ_ac1, price_compven1 are set of x,y values.
Please note the following code which gets the values from a database and then plots the two graphs:
I can only get the intersection manually, and I want to get it automatically
import matplotlib.pyplot as plt
import pyodbc
import pandas as pd
import numpy as np
import string as str
import sys
# Connection to the SQL Server database (trusted connection).
conn = pyodbc.connect(Trusted_Connection='yes', driver='{SQL Server}', server='srv03',
                      database='mercadoOMIE_curvas')

# --- First curve: rows where column "4" is 'C' -------------------------------
SQL_Query = pd.read_sql_query("""SELECT * FROM curva_pbc_uof_2020_1_12 WHERE ("4" = 'C' AND "0" = '1' AND "7" = 'O')""", conn)
df = pd.DataFrame(SQL_Query, columns=['0','1','2','3','4','5','6','7','8'])
# Columns '5' and '6' arrive as text; '.' is stripped and ',' becomes the
# decimal point (presumably '.' is a thousands separator -- confirm).
# regex=False is essential: as a regular expression '.' matches EVERY
# character, so regex=True would erase the whole value.
df['5'] = df['5'].str.replace('.', '', regex=False)
df['6'] = df['6'].str.replace('.', '', regex=False)
df['5'] = pd.to_numeric(df['5'].str.replace(',', '.', regex=False), errors='coerce')
df['6'] = pd.to_numeric(df['6'].str.replace(',', '.', regex=False), errors='coerce')
# Running total of column '5' (x values of the curve); also safe when no rows
# are returned.
energ_ac = np.cumsum(df['5'].to_numpy())
price_compvend = df['6'].to_numpy()

# --- Second curve: rows where column "4" is 'V' ------------------------------
SQL_Query1 = pd.read_sql_query("""SELECT * FROM curva_pbc_uof_2020_1_12 WHERE ("4" = 'V' AND "0" = '1' AND "7" = 'O')""", conn)
df1 = pd.DataFrame(SQL_Query1, columns=['0','1','2','3','4','5','6','7','8'])
df1['5'] = df1['5'].str.replace('.', '', regex=False)
df1['6'] = df1['6'].str.replace('.', '', regex=False)
df1['5'] = pd.to_numeric(df1['5'].str.replace(',', '.', regex=False), errors='coerce')
df1['6'] = pd.to_numeric(df1['6'].str.replace(',', '.', regex=False), errors='coerce')
energ_ac1 = np.cumsum(df1['5'].to_numpy())
price_compvend1 = df1['6'].to_numpy()
The solution is this link: np.array intersection // AttributeError: 'module' object has no attribute 'PiecewisePolynomial'
import scipy.interpolate as interpolate
import scipy.optimize as optimize
import numpy as np
x1 = np.array([1.4,2.1,3,5.9,8,9,23])
y1 = np.array([2.3,3.1,1,3.9,8,9,11])
x2 = np.array([1,2,3,4,6,8,9])
y2 = np.array([4,12,7,1,6.3,8.5,12])
# linear interpolators
opts = {'fill_value': 'extrapolate'}
f1 = interpolate.interp1d(x1,y1,**opts)
f2 = interpolate.interp1d(x2,y2,**opts)
# possible range for an intersection
xmin = np.min((x1,x2))
xmax = np.max((x1,x2))
# candidate starting points: every distinct x sample of either curve
xuniq = np.unique((x1,x2))
xvals = xuniq[(xmin<=xuniq) & (xuniq<=xmax)]
# note that it's bad practice to compare floats exactly
# but worst case here is a bit of redundance, no harm
# for each combined interval there can be at most 1 intersection,
# so looping over xvals should hopefully be enough
# one can always err on the safe side and loop over a `np.linspace`
intersects = []
for xval in xvals:
    # root of f1 - f2, starting the solver at this sample point
    x0, = optimize.fsolve(lambda x: f1(x)-f2(x), xval)
    # keep the root only if it lies in range, really is a crossing,
    # and has not been found already from another starting point
    if (xmin<=x0<=xmax
            and np.isclose(f1(x0),f2(x0))
            and not any(np.isclose(x0,intersects))):
        intersects.append(x0)
Use the script below:
# Element-wise gap between the two price vectors, and the position(s) where
# that gap is smallest.
# NOTE(review): this compares the curves index by index, so it assumes both
# vectors have the same length and are sampled at matching x values -- confirm.
diff_vector = abs(price_compvend - price_compvend1)
min_index = np.where(diff_vector == np.min(diff_vector))
# NOTE(review): the second format argument was missing from the snippet;
# the price at the same index is assumed here -- confirm.
print('Intersection point is ({},{})'.format(energ_ac[min_index],
                                             price_compvend[min_index]))
You could use set.intersection() method to get intersection points of the graphs.
# Collect each curve's samples as a set of (x, y) tuples; points present in
# both sets (exact equality of both coordinates) are the shared points.
graph_points1 = {(x, y) for x, y in zip(energ_ac, price_compvend)}
graph_points2 = {(x, y) for x, y in zip(energ_ac1, price_compvend1)}
intersection_points = graph_points1 & graph_points2

python kmedoids - calculating new medoid centers more efficiently

I'm following an excellent medium article: to implement kmedoids from scratch. There is a place in the code where each pixel's distance to the medoid centers is calculated and it is VERY slow. It has numpy.linalg.norm inside a loop. Is there a way to optimize this with numpy.linalg.norm or with numpy broadcasting or scipy.spatial.distance.cdist and np.argmin to do the same thing?
###helper function here###
def compute_d_p(X, medoids, p):
    """Return the p-th power of the L-p distance from each row of X to each medoid.

    Parameters:
        X: 2-D array, one point per row.
        medoids: 2-D array with one medoid per row, or a 1-D array holding
            a single medoid.
        p: order of the norm (passed to ``np.linalg.norm`` as ``ord``).

    Returns:
        Array of shape ``(len(X), number_of_medoids)`` where entry ``[i, j]``
        is ``norm(X[i] - medoids[j], ord=p) ** p``.
    """
    m = len(X)
    # If a 1-D array is provided,
    # it will be reshaped to a single row 2-D array
    if medoids.ndim == 1:
        medoids = medoids.reshape((1, len(medoids)))
    k = len(medoids)

    S = np.empty((m, k))
    for i in range(m):
        # distance from point i to every medoid at once
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p
    return S
this is where the slowdown occurs
# Excerpt of the medoid update: try every point of the cluster as the new
# medoid and keep the one with the smallest total dissimilarity.
# compute_d_p is called once per candidate, which is what makes this slow.
for datap in cluster_points:
    new_dissimilarity = np.sum(compute_d_p(X, datap, p))
    if new_dissimilarity < avg_dissimilarity:
        avg_dissimilarity = new_dissimilarity
        out_medoids[i] = datap
Full code below. All credits to the article author.
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
# Dataset
iris = datasets.load_iris()
# NOTE(review): the first DataFrame argument and the value of `labels` were
# missing from the snippet; the standard iris fields are assumed -- confirm.
data = pd.DataFrame(iris.data, columns = iris.feature_names)
target = iris.target_names
labels = iris.target
# Rescale every feature with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
#PCA Transformation
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
PCAdf = pd.DataFrame(data = principalComponents , columns = ['principal component 1', 'principal component 2','principal component 3'])
# Points to cluster: one row per sample, one column per principal component
datapoints = PCAdf.values
m, f = datapoints.shape
k = 3
def init_medoids(X, k):
    """Pick k distinct rows of X at random to serve as initial medoids.

    Parameters:
        X: 2-D array, one point per row.
        k: number of medoids to draw (at most ``len(X)``).

    Returns:
        Array of shape ``(k, X.shape[1])`` holding the selected rows.
    """
    from numpy.random import choice

    # replace=False guarantees that no row is selected twice
    samples = choice(len(X), size=k, replace=False)
    return X[samples, :]
# Draw three random starting medoids from the data points.
medoids_initial = init_medoids(datapoints, 3)
def compute_d_p(X, medoids, p):
    """Return the p-th power of the L-p distance from each row of X to each medoid.

    Parameters:
        X: 2-D array, one point per row.
        medoids: 2-D array with one medoid per row, or a 1-D array holding
            a single medoid.
        p: order of the norm (passed to ``np.linalg.norm`` as ``ord``).

    Returns:
        Array of shape ``(len(X), number_of_medoids)`` where entry ``[i, j]``
        is ``norm(X[i] - medoids[j], ord=p) ** p``.
    """
    m = len(X)
    # If a 1-D array is provided,
    # it will be reshaped to a single row 2-D array
    if medoids.ndim == 1:
        medoids = medoids.reshape((1, len(medoids)))
    k = len(medoids)

    S = np.empty((m, k))
    for i in range(m):
        # distance from point i to every medoid at once
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p
    return S
# Dissimilarity (p = 2) of every data point to each initial medoid.
S = compute_d_p(datapoints, medoids_initial, 2)
def assign_labels(S):
    """Return, for each row of S, the column index of its smallest entry.

    With S being a point-by-medoid dissimilarity matrix this is the index
    of the closest medoid for every point.
    """
    return np.argmin(S, axis=1)
# Index of the closest initial medoid for every data point.
labels = assign_labels(S)
def update_medoids(X, medoids, p):
    """Return improved medoids: for each cluster, the member with the lowest total dissimilarity.

    Parameters:
        X: 2-D array, one point per row.
        medoids: 2-D array, one current medoid per row. Not modified.
        p: order of the norm forwarded to ``compute_d_p``.

    Returns:
        A new array shaped like ``medoids`` with every improvable medoid
        replaced by the best candidate found in its cluster.
    """
    S = compute_d_p(X, medoids, p)
    labels = assign_labels(S)

    # Work on a copy so the caller's array is not changed in place.
    out_medoids = medoids.copy()
    for i in set(labels):
        # NOTE(review): the dissimilarity is summed over ALL points of X, not
        # just the members of cluster i -- kept as in the original; confirm.
        avg_dissimilarity = np.sum(compute_d_p(X, medoids[i], p))
        cluster_points = X[labels == i]
        for datap in cluster_points:
            new_dissimilarity = np.sum(compute_d_p(X, datap, p))
            if new_dissimilarity < avg_dissimilarity:
                avg_dissimilarity = new_dissimilarity
                out_medoids[i] = datap
    return out_medoids
def has_converged(old_medoids, medoids):
    """Return True when both medoid collections contain the same rows.

    Rows are compared as tuples inside sets, so row order and duplicates
    are ignored.
    """
    return {tuple(x) for x in old_medoids} == {tuple(x) for x in medoids}
#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    """Cluster the rows of X around k medoids.

    Parameters:
        X: 2-D array, one point per row.
        k: number of medoids; only used when ``starting_medoids`` is None.
        p: order of the norm forwarded to ``compute_d_p``.
        starting_medoids: optional 2-D array of initial medoids. When
            omitted, k rows of X are drawn at random.
        max_steps: maximum number of update iterations.

    Returns:
        Tuple ``(medoids, labels)``. ``labels`` holds, for every row of X,
        the index of its closest medoid as computed in the last iteration.
    """
    # Without this else branch the random initialisation was always
    # overwritten by starting_medoids (None by default).
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        medoids = starting_medoids

    converged = False
    labels = np.zeros(len(X))
    i = 1
    while (not converged) and (i <= max_steps):
        old_medoids = medoids.copy()
        S = compute_d_p(X, medoids, p)
        labels = assign_labels(S)
        medoids = update_medoids(X, medoids, p)
        converged = has_converged(old_medoids, medoids)
        i += 1
    return (medoids, labels)
# Run the clustering with k = 3 and p = 2, then store each sample's cluster
# index in a new 'clusters' column of `data`.
results = kmedoids(datapoints, 3, 2)
final_medoids = results[0]
data['clusters'] = results[1]
There's a good chance numpy's broadcasting capabilities will help. Getting broadcasting to work in 3+ dimensions is a bit tricky, and I usually have to resort to a bit of trial and error to get the details right.
The use of linalg.norm here compounds things further, because my version of the code won't give identical results to linalg.norm for all inputs. But I believe it will give identical results for all relevant inputs in this case.
I've added some comments to the code to explain the thinking behind certain details.
def compute_d_p_broadcasted(X, medoids, p):
    """Loop-free variant of compute_d_p built on numpy broadcasting.

    Parameters:
        X: 2-D array, one point per row.
        medoids: 2-D array with one medoid per row, or a 1-D array holding
            a single medoid.
        p: exponent applied to the absolute coordinate differences.

    Returns:
        Array of shape ``(len(X), number_of_medoids)`` where entry ``[i, j]``
        is ``sum(abs(X[i] - medoids[j]) ** p)``.
    """
    # If a 1-D array is provided,
    # it will be reshaped to a single row 2-D array
    if len(medoids.shape) == 1:
        medoids = medoids.reshape((1, len(medoids)))

    # In general, broadcasting n-dim arrays requires that the last
    # dim of the first array be a singleton dimension, and that the
    # first dim of the second array be a singleton dimension. We can
    # quickly accomplish that by slicing with `None` in the appropriate
    # places. (`np.newaxis` is a slightly more self-documenting way
    # of spelling `None`, but I rarely bother.)

    # In this case, the shapes of the other two dimensions also
    # have to align in the same way you'd expect for a dot product.
    # So we pass `medoids.T`.
    diff = np.abs(X[:, :, None] - medoids.T[None, :, :])

    # The last tricky bit is to figure out which axis to sum. Right
    # now, the array is a 3-dimensional array, with the first
    # dimension corresponding to the rows of `X` and the last
    # dimension corresponding to the columns of `medoids.T`.
    # The middle dimension corresponds to the underlying dimensionality
    # of the space; that's what we want to sum for a sum of squares.
    # (Or sum of cubes for L3 norm, etc.)
    return (diff ** p).sum(axis=1)
def compute_d_p(X, medoids, p):
    """Return the p-th power of the L-p distance from each row of X to each medoid.

    Reference (loop-based) implementation.

    Parameters:
        X: 2-D array, one point per row.
        medoids: 2-D array with one medoid per row, or a 1-D array holding
            a single medoid.
        p: order of the norm (passed to ``np.linalg.norm`` as ``ord``).

    Returns:
        Array of shape ``(len(X), number_of_medoids)`` where entry ``[i, j]``
        is ``norm(X[i] - medoids[j], ord=p) ** p``.
    """
    m = len(X)
    # If a 1-D array is provided,
    # it will be reshaped to a single row 2-D array
    if medoids.ndim == 1:
        medoids = medoids.reshape((1, len(medoids)))
    k = len(medoids)

    S = np.empty((m, k))
    for i in range(m):
        # distance from point i to every medoid at once
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p
    return S
# A couple of simple tests:
# Four 3-D points holding the values 1..12 (as floats); rows 0 and 2 act as
# the medoids.
X = np.arange(1.0, 13.0).reshape(4, 3)
medoids = X[[0, 2]]
# Loop-based and broadcast versions should agree for p = 2 ...
np.allclose(compute_d_p(X, medoids, 2), compute_d_p_broadcasted(X, medoids, 2))
# Returns True
# ... and for p = 3.
np.allclose(compute_d_p(X, medoids, 3), compute_d_p_broadcasted(X, medoids, 3))
# Returns True
Of course, these tests don't tell whether this actually gives a significant speedup. You'll have to check that yourself for the relevant use-case. But I suspect it will at least help.

Get all component stats of multiple arrays labeled by one of them

I already asked a similar question which got answered but now this is more in detail:
I need a really fast way to get all important component stats of two arrays, where one array is labeled by opencv2 and gives the component areas for both arrays. The stats for all components masked on the two arrays should then be saved to a dictionary. My approach works but it is much too slow. Is there something to avoid the loop, or a better approach than ndimage.labeled_comprehension?
from scipy import ndimage
import numpy as np
import cv2
def calculateMeanMaxMin(val):
    """Return ``[mean, max, min]`` of ``val`` as a 3-element numpy array."""
    return np.array([np.mean(val), np.max(val), np.min(val)])
# Thresholds array2, labels its connected components with OpenCV, computes
# mean/max/min of array1 and array2 per label, then loops over the labels to
# build a stats dictionary for each component.
# NOTE(review): this snippet is incomplete as shown -- the body has lost its
# indentation, the `allstats` dict literal is never closed, `side1`/`side2`
# and `allComponentStats` are never assigned, and an `else:` appears to be
# missing before the final elongation assignment. Restore from the original
# post before use.
# NOTE(review): labeled_comprehension is asked for labels 1..numLabels while
# the loop below visits 1..numLabels-1 -- confirm which range is intended.
def getTheStatsForComponents(array1,array2):
ret, thresholded= cv2.threshold(array2, 120, 255, cv2.THRESH_BINARY)
thresholded= thresholded.astype(np.uint8)
numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(thresholded, 8, cv2.CV_8UC1)
meanmaxminArray2 = ndimage.labeled_comprehension(array2, labels, np.arange(1, numLabels+1), calculateMeanMaxMin, np.ndarray, 0)
meanmaxminArray1 = ndimage.labeled_comprehension(array1, labels, np.arange(1, numLabels+1), calculateMeanMaxMin, np.ndarray, 0)
for position, label in enumerate(range(1, numLabels)):
currentLabel = np.uint8(labels== label)
contour, _ = cv2.findContours(currentLabel, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
componentStat = stats[label]
allstats = {'position':centroids[label,:],'area':componentStat[4],'height':componentStat[3],
if side1 >= side2 and side1 > 0:
allstats['elongation'] = np.float32(side2 / side1)
elif side2 > side1 and side2 > 0:
allstats['elongation'] = np.float32(side1 / side2)
allstats['elongation'] = np.float32(0)
return allComponentStats
The two arrays are 2d arrays:
# Two random 512x512 uint8 test images with values drawn from 0..254.
array1= np.random.choice(255,(512,512)).astype(np.uint8)
array2= np.random.choice(255,(512,512)).astype(np.uint8)
Small example of two arrays and the labelArray with two components (1 and 2, and background 0). Calculate the min, max and mean with ndimage.labeled_comprehension.
from scipy import ndimage
import numpy as np
# 3x4 label image: 0 is background, 1 and 2 are the two components.
labelArray = np.array([[0,1,1,1],[2,2,1,1],[2,2,0,1]])
# Two data arrays with the same shape as the label image.
data = np.array([[0.1,0.2,0.99,0.2],[0.34,0.43,0.87,0.33],[0.22,0.53,0.1,0.456]])
data2 = np.array([[0.1,0.2,0.99,0.2],[0.1,0.2,0.99,0.2],[0.1,0.2,0.99,0.2]])
numLabels = 2
# Per-label minimum of each data array for labels 1..numLabels.
# NOTE(review): np.ndarray is passed as out_dtype here; a scalar dtype such
# as float looks intended -- confirm.
minimumDataForAllLabels = ndimage.labeled_comprehension(data, labelArray, np.arange(1, numLabels+1), np.min, np.ndarray, 0)
minimumData2ForallLabels = ndimage.labeled_comprehension(data2, labelArray, np.arange(1, numLabels+1), np.min, np.ndarray, 0)
[0.2 0.22] ##minimum of component 1 and 2 from data
[0.2 0.1] ##minimum of component 1 and 2 from data2
[0.1 0.2 0.22] ##minimum output of bin_and_do_simple_stats from data
labeled_comprehension is definitely slow.
At least the simple stats can be done much faster based on the linked post. For simplicity I'm only doing one data array, but as the procedure returns sort indices it can be easily extended to multiple arrays:
import numpy as np
from scipy import sparse

# The compiled pythran helper is optional. Record whether it could be
# imported so the code below can fall back to the scipy.sparse version.
try:
    from stb_pthr import sort_to_bins as _stb_pthr
    HAVE_PYTHRAN = True
except ImportError:
    HAVE_PYTHRAN = False
# fallback if pythran not available
def sort_to_bins_sparse(idx, data, mx=-1):
    """Group ``data`` by bin index using a CSR -> CSC conversion.

    Parameters:
        idx: 1-D integer array; ``idx[i]`` is the bin of ``data[i]``.
        data: 1-D array of values, same length as ``idx``.
        mx: number of bins; -1 means ``idx.max() + 1``.

    Returns:
        Tuple ``(sorted_data, indices, indptr)``: the values reordered so
        that each bin is contiguous, the original position of every
        reordered value, and the bin boundaries (bin ``b`` occupies
        ``sorted_data[indptr[b]:indptr[b + 1]]``).
    """
    if mx == -1:
        mx = idx.max() + 1
    # One row per element with a single entry in column idx[i]; converting
    # to CSC then lists the entries column by column, i.e. bin by bin.
    aux = sparse.csr_matrix((data, idx, np.arange(len(idx)+1)), (len(idx), mx)).tocsc()
    return aux.data, aux.indices, aux.indptr
def sort_to_bins_pythran(idx, data, mx=-1):
    """Group ``data`` by bin index using the compiled ``_stb_pthr`` helper.

    Same parameters and return value as ``sort_to_bins_sparse``:
    ``(sorted_data, indices, indptr)``.
    """
    indices, indptr = _stb_pthr(idx, mx)
    return data[indices], indices, indptr
# pick best available: the pythran version when HAVE_PYTHRAN is true,
# otherwise the scipy.sparse fallback
sort_to_bins = sort_to_bins_pythran if HAVE_PYTHRAN else sort_to_bins_sparse
# example data: 100000 values spread over 10 bins
idx = np.random.randint(0,10,(100000))
data = np.random.random(100000)
# if possible compare the two methods; the pythran variant can only be
# called when the compiled module is available
dsp,isp,psp = sort_to_bins_sparse(idx,data)
if HAVE_PYTHRAN:
    dph,iph,pph = sort_to_bins_pythran(idx,data)
    assert (dsp==dph).all()
    assert (isp==iph).all()
    assert (psp==pph).all()
# example how to do simple vectorized calculations
def simple_stats(data, iptr):
    """Return per-bin minimum and mean of bin-sorted data.

    Parameters:
        data: 1-D array whose values are already grouped bin by bin.
        iptr: bin boundaries; bin ``b`` is ``data[iptr[b]:iptr[b + 1]]``.

    Returns:
        Tuple ``(minima, means)`` with one entry per bin.
    """
    # NOTE(review): an empty bin (two equal consecutive boundaries) is not
    # handled: its size from np.diff is 0, so the mean divides by zero.
    starts = iptr[:-1]
    minima = np.minimum.reduceat(data, starts)
    means = np.add.reduceat(data, starts) / np.diff(iptr)
    return minima, means
def bin_and_do_simple_stats(idx, data, mx=-1):
    """Group ``data`` by ``idx`` and return ``(minima, means)`` per bin.

    ``mx`` is forwarded to ``sort_to_bins`` (number of bins, -1 to infer).
    """
    sorted_data, _indices, indptr = sort_to_bins(idx, data, mx)
    return simple_stats(sorted_data, indptr)
# Show the per-bin minima and means for the example data.
print("minima: {}\n mean values: {}".format(*bin_and_do_simple_stats(idx,data)))
If you have pythran (not required but a bit faster), compile this as stb_pthr.py:
import numpy as np
#pythran export sort_to_bins(int[:], int)
def sort_to_bins(idx, mx):
    """Counting sort of positions by bin index.

    Parameters:
        idx: 1-D integer array of bin indices.
        mx: number of bins; -1 means ``idx.max() + 1``.

    Returns:
        Tuple ``(res, indptr)``: ``res`` lists the positions of ``idx``
        grouped bin by bin (original order kept inside a bin), and bin ``b``
        occupies ``res[indptr[b]:indptr[b + 1]]``.
    """
    if mx == -1:
        mx = idx.max() + 1
    # Counts are stored shifted by two slots so that, after the prefix sum,
    # cnts[b + 1] is the write position of bin b and ends up as its end.
    cnts = np.zeros(mx + 2, int)
    for i in range(idx.size):
        cnts[idx[i]+2] += 1
    for i in range(2, cnts.size):
        cnts[i] += cnts[i-1]
    res = np.empty_like(idx)
    for i in range(idx.size):
        res[cnts[idx[i]+1]] = i
        cnts[idx[i]+1] += 1
    return res, cnts[:-1]

Using numpy vectorize

I'm trying to do some bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that's faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
### Create some simulated data ###
num_leg = 50
num_bills = 20
# Bill parameters: one row per bill
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
# Legislator parameter: one row per legislator
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
# Latent mean a + b * x' for every bill/legislator pair: (num_bills, num_leg).
# NOTE(review): the product was garbled in the snippet; np.dot(b, x.T) is
# the form that matches the shape of epsilon below -- confirm.
ystar_base = a + np.dot(b, x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
# Observed binary outcome: 1 where the latent value is positive
y = 1*(ystar >0)
### Initialize some stuff I need ###
# Starting values for the sampler.
# NOTE(review): np.matrix(zip(...)) relies on Python 2, where zip returns a
# list; on Python 3 wrap it in list(...) -- confirm target version.
# NOTE(review): the MCMC loop further down reads `x_ones`, but only `_ones`
# is defined here -- the name looks truncated; confirm.
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(zip(avec,bvec))]
xvec = [0]*num_leg # these are legislator parameters
_ones = np.ones(num_leg)
def init_y(mat):  # initialize a latent y matrix
    """Draw a starting latent value whose sign matches the observed outcome.

    Returns a truncated standard normal draw from [0, 10000] when ``mat``
    equals 1 and from [-10000, 0] otherwise.
    """
    if mat == 1:
        return truncnorm.rvs(0, 10000)
    else:
        return truncnorm.rvs(-10000, 0)
# Apply init_y element-wise to the observed outcomes to get the starting
# latent matrix.
vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow, ynow):  # This is the function I'd like to vectorize
    """Draw regression coefficients from N((x'x)^-1 x'y, (x'x)^-1).

    Parameters:
        xnow: design matrix (array or list of rows), one row per observation.
        ynow: responses as a list, 1-D array or row matrix; it is
            transposed before use.

    Returns:
        1-D array holding one multivariate-normal draw of the coefficients.
    """
    if isinstance(xnow, list):
        xnow = np.array(xnow)
    if isinstance(ynow, list):
        ynow = np.array(ynow)
    ynow = ynow.T  # reshape(ynow.shape[0],1)
    # NOTE(review): the three np.dot calls were garbled in the snippet and
    # are rebuilt from the "(x'x)inv x'y" comment -- confirm.
    sigma = np.linalg.inv(np.dot(xnow.T, xnow))  ###This is the line that produces an error###
    xy = np.dot(xnow.T, ynow)
    mu = np.dot(sigma, xy)  # this is just (x'x)inv x'y
    return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
# np.vectorize calls sample_params once per scalar element of its inputs.
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow):  # getting the updated mean to draw the latent ys
    """Return ``xnow . bnow'`` as an np.matrix.

    Both arguments may be arrays/matrices or plain lists (lists are
    converted to arrays first).
    """
    if isinstance(xnow, list):
        xnow = np.array(xnow)
    if isinstance(bnow, list):
        bnow = np.array(bnow)
    # NOTE(review): the product was garbled in the snippet;
    # np.dot(xnow, bnow.T) is assumed -- confirm.
    mu = np.dot(xnow, bnow.T)
    mu = np.matrix(mu)
    return mu
def sample_y(mu, ynow):  # generate latent y matrix
    """Draw one truncated-normal value with bounds shifted by ``mu``.

    The draw lies in ``[0 - mu, 10000 - mu]`` when ``ynow`` equals 1 and in
    ``[-10000 - mu, 0 - mu]`` otherwise.
    """
    # Without the else branch the bounds for ynow == 1 were always
    # overwritten by the second assignment.
    if ynow == 1:
        a, b = (0 - mu), (10000 - mu)
    else:
        a, b = (-10000 - mu), (0 - mu)
    # NOTE(review): no loc=mu is passed, so the result is the deviation from
    # mu rather than the latent value itself -- confirm this is intended.
    return truncnorm.rvs(a, b)
# Element-wise version of sample_y for whole matrices of means and outcomes.
vector_sample = np.vectorize(sample_y) # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows(bills)
# Main sampler: burn + runs iterations, each one sampling bill parameters row
# by row and then redrawing the latent y matrix.
# NOTE(review): this snippet is incomplete as shown -- indentation is lost,
# `newbeta` is never stored in `this_beta`, `betavec` is never extended, and
# `x_ones` is not defined anywhere visible (`_ones` is). The use of `zip` as
# a list assumes Python 2. Restore from the original post before use.
for i in range(burn+runs):
this_beta = []
this_x = []
this_y = []
for j in range(num_bills): #I'd like to get rid of this loop
ex = zip(x_ones, x)
newbeta = sample_params(ex, latent_y[j])
#ex = np.array(zip(x_ones, x))
#this_beta = vecparams(ex, latent_y[:,]) # and call the vectorized function here
#Note, I can vectorize the latent outputs easily enough here
mean = get_mu(ex, betavec[-1])
latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
# Average the post-burn-in draws and correlate them with the true parameters.
# list(zip(...)) keeps the result indexable on Python 3 as well as Python 2.
test_beta = [list(zip(*(z))) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis = 0)/float(runs)
bmean = test_b.sum(axis = 0)/float(runs)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('a mean')
print(np.corrcoef([amean, np.array(a)]))
print('b mean')
print(np.corrcoef([bmean, np.array(b)]))
If I comment out the loop and use the commented out lines just above, I get the following error at the line I indicated earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional

Manipulating a large binary image array with numpy and cv2

My code is the following:
import cv2; import numpy as np
class MyClass:
    """Load an image and split its pixel coordinates into black and white points."""

    def __init__(self, imagefile):
        """Read ``imagefile`` and fill ``self.bPoints`` / ``self.wPoints``.

        Both attributes end up as numpy arrays of (y, x) coordinates.
        """
        self.image = cv2.imread(imagefile)
        #image details
        self.h,self.w = self.image.shape[:2]
        # NOTE(review): self.th2 is read below but never assigned in the
        # visible code; presumably a thresholding step was lost from the
        # snippet -- restore it before use.
        #self.bPoints, self.wPoints = np.array([[0,0]]),np.array([[0,0]])
        self.bPoints, self.wPoints = [],[]
        #CAUTION! Points are of the form (y,x)
        # Point filtering: pixels equal to 0 are black, everything else white
        for i in range(self.h):
            for j in range(self.w):
                if self.th2.item(i,j) == 0:
                    #self.bPoints = np.append([[i,j]], self.bPoints, axis=0)
                    self.bPoints.append((i,j))
                else:
                    self.wPoints.append((i,j))
                    #self.wPoints = np.append([[i,j]], self.wPoints, axis=0)
        #self.bPoints = self.bPoints[:len(self.bPoints) - 1]
        #self.wPoints = self.wPoints[:len(self.wPoints) - 1]
        self.bPoints, self.wPoints = np.array(self.bPoints), np.array(self.wPoints)
I want to find and separate the white from the black points. I have commented the lines that show a possible (but very-very slow) solution via numpy. Can you recommend me a better and faster solution? I will appreciate it if you do so!
I'm assuming self.th2 is a numpy array. This might take some adjustment if that is not the case. Basically, this uses the np.where function to determine all the indices which are 0 or 255.
import cv2; import numpy as np
class MyClass:
    """Load an image and split its pixel coordinates into black and white points."""

    def __init__(self, imagefile):
        """Read ``imagefile`` and fill ``self.bPoints`` / ``self.wPoints``.

        Both attributes end up as integer arrays of shape (N, 2) whose rows
        are (y, x) coordinates.
        """
        self.image = cv2.imread(imagefile)
        #image details
        self.h,self.w = self.image.shape[:2]
        # NOTE(review): self.th2 is read below but never assigned in the
        # visible code; presumably a thresholding step was lost from the
        # snippet -- restore it before use.
        #CAUTION! Points are of the form (y,x)
        # use the np.where method instead of a double loop over every pixel.
        # make sure self.th2 is a numpy array
        indx = np.where(self.th2==0)
        # stack the row and column indices side by side: one (y, x) per row
        self.bPoints = np.column_stack((indx[0], indx[1]))
        indx = np.where(self.th2==255)
        self.wPoints = np.column_stack((indx[0], indx[1]))

