I'm interested in image convolution. Here is my code to perform convolutions with a 3x3 kernel. I'm looking for any ideas on how to make it run faster.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import numpy as np
img = mpimg.imread('benfrank.png')
imgCopy = img.copy()
Width = 1200
Height = 1464
x1 = 0
y1 = 0
cWidth = 3
cHeight = 3
convul = np.array([[0,0,-5],
[0,1,0],
[-5,0,0]])
summ = convul[2,2]+convul[2,1]+convul[2,0]+convul[1,2]+convul[1,1]+convul[1,0]+convul[0,2]+convul[0,1]+convul[0,0]
def convulute3x3(x,y):
global convul
global img,imgCopy, Width, Height, summ
i = x
j = y
if(i < 1 or i > Width-2 ):
return
elif(j < 1 or j > Height-2 ):
return
for c in range(3):
n11 = img[j-1,i-1,c]*convul[0,0]
n22 = img[j-1,i,c]*convul[1,0]
n33 = img[j-1,i+1,c]*convul[2,0]
n44= img[j,i-1,c]*convul[0,1]
n55 = img[j,i,c]*convul[1,1]
n66 = img[j,i+1,c]*convul[2,1]
n77 = img[j+1,i-1,c]*convul[0,2]
n88 = img[j+1,i,c]*convul[1,2]
n99 = img[j+1,i+1,c]*convul[2,2]
color = (n11+n22+n33+n44+n55+n66+n77+n88+n99)/summ
imgCopy[j,i,c] = color
for x in img:
x1=0
for y in x:
convulute3x3(x1,y1)
x1 = x1+1
y1 = y1+1
plt.imshow(imgCopy)
plt.show()
As #Reti43 has mentioned in the comments, there already exists libraries to do so, but I suspect you just want to play around with some home made implementations.
I too have been interested in how to implement convolutions manually in Python. Python loops are terribly slow, and if you care about speed you should stay away from pure python loops and instead stick to more vectorized methods.
The best I have so far is to use numpy.lib.stride_tricks.as_strided, which allows you to get very customized views of numpy arrays. I use as_strided to get a sliding window view of the image, then I use np.tensordot to do a "more general matrix multiplication" (docs) with the kernel. Furthermore, numpy 1.20 (iirc) has numpy.lib.stride_tricks.sliding_window_view, which is a little less general version of my code below (as of this date), as it cannot do custom strides.
import numpy as np
from numpy.lib.stride_tricks import as_strided
def get_sliding_window(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
imgChannels, imgRows, imgCols = x.shape
_, kernelRows, kernelCols = k.shape
u = np.array(x.itemsize) # Used to scale stride size, as_astrided wants stride sizes in bits
return as_strided(x,
shape=((imgRows-kernelRows)//rowstride+1, (imgCols-kernelCols)//colstride+1, imgChannels, kernelRows, kernelCols),
strides=u*(imgCols*rowstride, colstride, imgRows*imgCols, imgCols, 1)
)
def conv2d(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
"""
Performs 2d convolution on images with arbitrary number of channels where you can
specify the strides as well.
x: np.ndarray, image array of shape (C x N x M), where C is number of channels
k: np.ndarray, convolution kernel of shape (C x P x Q), where C is number of channels
rowstride: int, "vertical" step size
colstride: int, "horizontal" step size
"""
sliding_window_view = get_sliding_window(x, k, rowstride, colstride)
return np.tensordot(sliding_window_view, k, axes=3)
x = np.array([
[[1,1,1,1],
[1,1,1,1],
[2,2,2,2],
[2,2,2,2]],
[[1,1,2,2],
[1,1,2,2],
[4,4,8,8],
[4,4,8,8]]
])
k = np.array([
[[1,1],
[1,1]],
[[1,1],
[1,1]]
]) / 8
print(conv2d(x,k,1,1))
#[[1. 1.25 1.5 ]
# [2. 2.625 3.25 ]
# [3. 4. 5. ]]
print(conv2d(x,k,2,2))
#[[1. 1.5]
# [3. 5. ]]
Bonus
I implemented an ascii visualization thing to sanity check that sliding windows is correct:
import time
def conv2d_asciiviz(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
x = x.copy().astype(object)
sliding_window_view = get_sliding_window(x, k, rowstride, colstride)
highlighter = np.vectorize(lambda x: f"\x1b[33m{x}\x1b[0m")
r = np.full(sliding_window_view.shape[:2], np.nan)
with np.printoptions(nanstr="", formatter={"all":lambda x: str(x)}):
for i, row in enumerate(sliding_window_view):
for j, window in enumerate(row):
temp = window.copy()
r[i,j] = np.tensordot(window, k, axes=3)
window[...] = highlighter(window)
print(f"\x1b[JChannels:\n{x}\n\nResult:\n{str(r)}\x1b[{x.shape[0]*x.shape[1]+len(r)+4}A")
window[...] = temp
time.sleep(0.69)
print(f"\x1b[{x.shape[0]*x.shape[1]+len(r)+4}B")
return r
print("Output:\n",conv2d(x,k,1,1))
Related
I am trying to speed up the below operation by doing some sort of matrix/vector-multiplication, can anyone see a nice quick solution?
It should also work for a special case where a tensor has shape 0 (torch.Size([])) but i am not able to initialize such a tensor.
See the image below for the type of tensor i am referring to:
tensor to add to test
def adstock_geometric(x: torch.Tensor, theta: float):
x_decayed = torch.zeros_like(x)
x_decayed[0] = x[0]
for xi in range(1, len(x_decayed)):
x_decayed[xi] = x[xi] + theta * x_decayed[xi - 1]
return x_decayed
def adstock_multiple_samples(x: torch.Tensor, theta: torch.Tensor):
listtheta = theta.tolist()
if isinstance(listtheta, float):
return adstock_geometric(x=x,
theta=theta)
x_decayed = torch.zeros((100, 112, 1))
for idx, theta_ in enumerate(listtheta):
x_decayed_one_entry = adstock_geometric(x=x,
theta=theta_)
x_decayed[idx] = x_decayed_one_entry
return x_decayed
if __name__ == '__main__':
ones = torch.tensor([1])
hundreds = torch.tensor([idx for idx in range(100)])
x = torch.tensor([[idx] for idx in range(112)])
ones = adstock_multiple_samples(x=x,
theta=ones)
hundreds = adstock_multiple_samples(x=x,
theta=hundreds)
print(ones)
print(hundreds)
I came up with the following, which is 40 times faster on your example:
import torch
def adstock_multiple_samples(x: torch.Tensor, theta: torch.Tensor):
arange = torch.arange(len(x))
powers = (arange[:, None] - arange).clip(0)
return ((theta[:, None, None] ** powers[None, :, :]).tril() * x).sum(-1)
It behaves as expected:
>>> x = torch.arange(112)
>>> theta = torch.arange(100)
>>> adstock_multiple_samples(x, theta)
... # the same output
Note that I considered that x was a 1D-tensor, as for your example the second dimension was not needed.
It also works with theta = torch.empty((0,)), and it returns an empty tensor.
I'm following an excellent medium article: https://towardsdatascience.com/k-medoids-clustering-on-iris-data-set-1931bf781e05 to implement kmedoids from scratch. There is a place in the code where each pixel's distance to the medoid centers is calculated and it is VERY slow. It has numpy.linalg.norm inside a loop. Is there a way to optimize this with numpy.linalg.norm or with numpy broadcasting or scipy.spatial.distance.cdist and np.argmin to do the same thing?
###helper function here###
def compute_d_p(X, medoids, p):
m = len(X)
medoids_shape = medoids.shape
# If a 1-D array is provided,
# it will be reshaped to a single row 2-D array
if len(medoids_shape) == 1:
medoids = medoids.reshape((1,len(medoids)))
k = len(medoids)
S = np.empty((m, k))
for i in range(m):
d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
S[i, :] = d_i**p
return S
this is where the slowdown occurs
for datap in cluster_points:
new_medoid = datap
new_dissimilarity= np.sum(compute_d_p(X, datap, p))
if new_dissimilarity < avg_dissimilarity :
avg_dissimilarity = new_dissimilarity
out_medoids[i] = datap
Full code below. All credits to the article author.
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
# Dataset
iris = datasets.load_iris()
data = pd.DataFrame(iris.data,columns = iris.feature_names)
target = iris.target_names
labels = iris.target
#Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
#PCA Transformation
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
PCAdf = pd.DataFrame(data = principalComponents , columns = ['principal component 1', 'principal component 2','principal component 3'])
datapoints = PCAdf.values
m, f = datapoints.shape
k = 3
def init_medoids(X, k):
from numpy.random import choice
from numpy.random import seed
seed(1)
samples = choice(len(X), size=k, replace=False)
return X[samples, :]
medoids_initial = init_medoids(datapoints, 3)
def compute_d_p(X, medoids, p):
m = len(X)
medoids_shape = medoids.shape
# If a 1-D array is provided,
# it will be reshaped to a single row 2-D array
if len(medoids_shape) == 1:
medoids = medoids.reshape((1,len(medoids)))
k = len(medoids)
S = np.empty((m, k))
for i in range(m):
d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
S[i, :] = d_i**p
return S
S = compute_d_p(datapoints, medoids_initial, 2)
def assign_labels(S):
return np.argmin(S, axis=1)
labels = assign_labels(S)
def update_medoids(X, medoids, p):
S = compute_d_p(points, medoids, p)
labels = assign_labels(S)
out_medoids = medoids
for i in set(labels):
avg_dissimilarity = np.sum(compute_d_p(points, medoids[i], p))
cluster_points = points[labels == i]
for datap in cluster_points:
new_medoid = datap
new_dissimilarity= np.sum(compute_d_p(points, datap, p))
if new_dissimilarity < avg_dissimilarity :
avg_dissimilarity = new_dissimilarity
out_medoids[i] = datap
return out_medoids
def has_converged(old_medoids, medoids):
return set([tuple(x) for x in old_medoids]) == set([tuple(x) for x in medoids])
#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
if starting_medoids is None:
medoids = init_medoids(X, k)
else:
medoids = starting_medoids
converged = False
labels = np.zeros(len(X))
i = 1
while (not converged) and (i <= max_steps):
old_medoids = medoids.copy()
S = compute_d_p(X, medoids, p)
labels = assign_labels(S)
medoids = update_medoids(X, medoids, p)
converged = has_converged(old_medoids, medoids)
i += 1
return (medoids,labels)
results = kmedoids(datapoints, 3, 2)
final_medoids = results[0]
data['clusters'] = results[1]
There's a good chance numpy's broadcasting capabilities will help. Getting broadcasting to work in 3+ dimensions is a bit tricky, and I usually have to resort to a bit of trial and error to get the details right.
The use of linalg.norm here compounds things further, because my version of the code won't give identical results to linalg.norm for all inputs. But I believe it will give identical results for all relevant inputs in this case.
I've added some comments to the code to explain the thinking behind certain details.
def compute_d_p_broadcasted(X, medoids, p):
# If a 1-D array is provided,
# it will be reshaped to a single row 2-D array
if len(medoids.shape) == 1:
medoids = medoids.reshape((1,len(medoids)))
# In general, broadcasting n-dim arrays requires that the last
# dim of the first array be a singleton dimension, and that the
# first dim of the second array be a singleton dimension. We can
# quickly accomplish that by slicing with `None` in the appropriate
# places. (`np.newaxis` is a slightly more self-documenting way
# of spelling `None`, but I rarely bother.)
# In this case, the shapes of the other two dimensions also
# have to align in the same way you'd expect for a dot product.
# So we pass `medoids.T`.
diff = np.abs(X[:, :, None] - medoids.T[None, :, :])
# The last tricky bit is to figure out which axis to sum. Right
# now, the array is a 3-dimensional array, with the first
# dimension corresponding to the rows of `X` and the last
# dimension corresponding to the columns of `medoids.T`.
# The middle dimension corresponds to the underlying dimensionality
# of the space; that's what we want to sum for a sum of squares.
# (Or sum of cubes for L3 norm, etc.)
return (diff ** p).sum(axis=1)
def compute_d_p(X, medoids, p):
m = len(X)
medoids_shape = medoids.shape
# If a 1-D array is provided,
# it will be reshaped to a single row 2-D array
if len(medoids_shape) == 1:
medoids = medoids.reshape((1,len(medoids)))
k = len(medoids)
S = np.empty((m, k))
for i in range(m):
d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
S[i, :] = d_i**p
return S
# A couple of simple tests:
X = np.array([[ 1.0, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
medoids = X[[0, 2], :]
np.allclose(compute_d_p(X, medoids, 2),
compute_d_p_broadcasted(X, medoids, 2))
# Returns True
np.allclose(compute_d_p(X, medoids, 3),
compute_d_p_broadcasted(X, medoids, 3))
# Returns True
Of course, these tests don't tell whether this actually gives a significant speedup. You'll have to check that yourself for the relevant use-case. But I suspect it will at least help.
I already asked a similar question which got answered but now this is more in detail:
I need a really fast way to get all important component stats of two arrays, where one array is labeled by opencv2 and gives the component areas for both arrays. The stats for all components masked on the two arrays should then saved to a dictionary. My approach works but it is much too slow. Is there something to avoid the loop or a better approach then the ndimage.öabeled_comprehension?
from scipy import ndimage
import numpy as np
import cv2
def calculateMeanMaxMin(val):
return np.array([np.mean(val),np.max(val),np.min(val)])
def getTheStatsForComponents(array1,array2):
ret, thresholded= cv2.threshold(array2, 120, 255, cv2.THRESH_BINARY)
thresholded= thresholded.astype(np.uint8)
numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(thresholded, 8, cv2.CV_8UC1)
allComponentStats=[]
meanmaxminArray2 = ndimage.labeled_comprehension(array2, labels, np.arange(1, numLabels+1), calculateMeanMaxMin, np.ndarray, 0)
meanmaxminArray1 = ndimage.labeled_comprehension(array1, labels, np.arange(1, numLabels+1), calculateMeanMaxMin, np.ndarray, 0)
for position, label in enumerate(range(1, numLabels)):
currentLabel = np.uint8(labels== label)
contour, _ = cv2.findContours(currentLabel, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
(side1,side2)=cv2.minAreaRect(contour[0])[1]
componentStat = stats[label]
allstats = {'position':centroids[label,:],'area':componentStat[4],'height':componentStat[3],
'width':componentStat[2],'meanArray1':meanmaxminArray1[position][0],'maxArray1':meanmaxminArray1[position][1],
'minArray1':meanmaxminArray1[position][2],'meanArray2':meanmaxminArray2[position][0],'maxArray2':meanmaxminArray2[position][1],
'minArray2':meanmaxminArray2[position][2]}
if side1 >= side2 and side1 > 0:
allstats['elongation'] = np.float32(side2 / side1)
elif side2 > side1 and side2 > 0:
allstats['elongation'] = np.float32(side1 / side2)
else:
allstats['elongation'] = np.float32(0)
allComponentStats.append(allstats)
return allComponentStats
EDIT
The two arrays are 2d arrays:
array1= np.random.choice(255,(512,512)).astype(np.uint8)
array2= np.random.choice(255,(512,512)).astype(np.uint8)
EDIT2
small example of two arrays and the labelArray with two components(1 and 2, and background 0). Calculate the min,max mean with ndimage.labeled_comprhension.
from scipy import ndimage
import numpy as np
labelArray = np.array([[0,1,1,1],[2,2,1,1],[2,2,0,1]])
data = np.array([[0.1,0.2,0.99,0.2],[0.34,0.43,0.87,0.33],[0.22,0.53,0.1,0.456]])
data2 = np.array([[0.1,0.2,0.99,0.2],[0.1,0.2,0.99,0.2],[0.1,0.2,0.99,0.2]])
numLabels = 2
minimumDataForAllLabels = ndimage.labeled_comprehension(data, labelArray, np.arange(1, numLabels+1), np.min, np.ndarray, 0)
minimumData2ForallLabels = ndimage.labeled_comprehension(data2, labelArray, np.arange(1, numLabels+1), np.min, np.ndarray, 0)
print(minimumDataForAllLabels)
print(minimumData2ForallLabels)
print(bin_and_do_simple_stats(labelArray.flatten(),data.flatten()))
Output:
[0.2 0.22] ##minimum of component 1 and 2 from data
[0.2 0.1] ##minimum of component 1 and 2 from data2
[0.1 0.2 0.22] ##minimum output of bin_and_do_simple_stats from data
labeled_comprehension is definitely slow.
At least the simple stats can be done much faster based on the linked post. For simplicity I'm only doing one data array, but as the procedure returns sort indices it can be easily extended to multiple arrays:
import numpy as np
from scipy import sparse
try:
from stb_pthr import sort_to_bins as _stb_pthr
HAVE_PYTHRAN = True
except:
HAVE_PYTHRAN = False
# fallback if pythran not available
def sort_to_bins_sparse(idx, data, mx=-1):
if mx==-1:
mx = idx.max() + 1
aux = sparse.csr_matrix((data, idx, np.arange(len(idx)+1)), (len(idx), mx)).tocsc()
return aux.data, aux.indices, aux.indptr
def sort_to_bins_pythran(idx, data, mx=-1):
indices, indptr = _stb_pthr(idx, mx)
return data[indices], indices, indptr
# pick best available
sort_to_bins = sort_to_bins_pythran if HAVE_PYTHRAN else sort_to_bins_sparse
# example data
idx = np.random.randint(0,10,(100000))
data = np.random.random(100000)
# if possible compare the two methods
if HAVE_PYTHRAN:
dsp,isp,psp = sort_to_bins_sparse(idx,data)
dph,iph,pph = sort_to_bins_pythran(idx,data)
assert (dsp==dph).all()
assert (isp==iph).all()
assert (psp==pph).all()
# example how to do simple vectorized calculations
def simple_stats(data,iptr):
min = np.minimum.reduceat(data,iptr[:-1])
mean = np.add.reduceat(data,iptr[:-1]) / np.diff(iptr)
return min, mean
def bin_and_do_simple_stats(idx,data,mx=-1):
data,indices,indptr = sort_to_bins(idx,data,mx)
return simple_stats(data,indptr)
print("minima: {}\n mean values: {}".format(*bin_and_do_simple_stats(idx,data)))
If you have pythran (not required but a bit faster), compile this as <stb_pthr.py>:
import numpy as np
#pythran export sort_to_bins(int[:], int)
def sort_to_bins(idx, mx):
if mx==-1:
mx = idx.max() + 1
cnts = np.zeros(mx + 2, int)
for i in range(idx.size):
cnts[idx[i]+2] += 1
for i in range(2, cnts.size):
cnts[i] += cnts[i-1]
res = np.empty_like(idx)
for i in range(idx.size):
res[cnts[idx[i]+1]] = i
cnts[idx[i]+1] += 1
return res, cnts[:-1]
need to read an image as an array and for each pixel select 7*7 neighbor pixels then reshape it and put as a first row of training set:
import numpy as np
from scipy import misc
face1=misc.imread('face1.jpg')
face1 dimensions are (288, 352, 3) , need to find 7*7 neighbor pixels for every pixel , so 49*3 color then reshape it as a (1,147) array and stack it into an array for all pixels , i took the following approach:
X_training=np.zeros([1,147] ,dtype=np.uint8)
for i in range(3, face1.shape[0]-3):
for j in range(3, face1.shape[1]-3):
block=face1[i-3:i+4,j-3:j+4]
pxl=np.reshape(block,(1,147))
X_training=np.vstack((pxl,X_training))
resulting X_training shape is (97572, 147)
and as last row contains all zeros then:
a = len(X_training)-1
X_training = X_training[:a]
above code works well for one picture but with Wall time: 5min 19s i have 2000 images, so it will take ages to do it for all the images. I am looking for a faster way to iterate over every pixel and do the above task.
Edit:
this is what i mean by neighbor pixels , for every pixel face1[i-3 : i+4 ,j-3:j+4]
An efficient way is to use stride_tricks to create a 2d rolling window over the image, then flatten it out:
import numpy as np
face1 = np.arange(288*352*3).reshape(288, 352, 3) # toy data
n = 7 # neighborhood size
h, w, d = face1.shape
s = face1.strides
tmp = np.lib.stride_tricks.as_strided(face1, strides=s[:2] + s,
shape=(h - n + 1, w - n + 1, n, n, d))
X_training = tmp.reshape(-1, n**2 * d)
X_training = X_training[::-1] # to get the rows into same order as in the question
tmp is a 5D view into the image, where tmp[x, y, :, :, c] is equivalent to the neigborhood face1[x:x+n, y:y+n, c] in color channel c.
The following is < 1s on my laptop:
import scipy as sp
im = sp.rand(300, 300, 3)
size = 3
ij = sp.meshgrid(range(size, im.shape[0]-size), range(size, im.shape[1]-size))
i = ij[0].T.flatten()
j = ij[1].T.flatten()
N = len(i)
L = (2*size + 1)**2
X_training = sp.empty(shape=[N, 3*L])
for pixel in range(N):
si = (slice(i[pixel]-size, i[pixel]+size+1))
sj = (slice(j[pixel]-size, j[pixel]+size+1))
X_training[pixel, :] = im[si, sj, :].flatten()
X_training = X_training[-1::-1, :]
I'm always a bit sad when I can't think of one-line vectorized version, but at least it's faster for you.
Using scikit-image:
import numpy as np
from skimage import util
image = np.random.random((288, 352, 3))
windows = util.view_as_windows(image, (7, 7, 3))
out = windows.reshape(-1, 7 * 7 * 3)
I am trying to apply graph theory methods to an image processing problem. I want to generate an adjacency matrix from an array containing the points I want to graph. I want to generate a complete graph of the points in the array. If I have N points in the array that I need to graph, I will need an NxN matrix. The weights should be the distances between the points, so this is the code that I have:
''' vertexarray is an array where the points that are to be
included in the complete graph are True and all others False.'''
import numpy as np
def array_to_complete_graph(vertexarray):
vertcoords = np.transpose(np.where(vertexarray == True))
cg_array = np.eye(len(vertcoords))
for idx, vals in enumerate(vertcoords):
x_val_1, y_val_1 = vals
for jdx, wals in enumerate(vertcoords):
x_diff = wals[0] - vals[0]
y_diff = wals[1] - vals[1]
cg_array[idx,jdx] = np.sqrt(x_diff**2 + y_diff**2)
return cg_array
This works, of course, but my question is: can this same array be generated without the nested for loops?
Use the function scipy.spatial.distance.cdist():
import numpy as np
def array_to_complete_graph(vertexarray):
vertcoords = np.transpose(np.where(vertexarray == True))
cg_array = np.eye(len(vertcoords))
for idx, vals in enumerate(vertcoords):
x_val_1, y_val_1 = vals
for jdx, wals in enumerate(vertcoords):
x_diff = wals[0] - vals[0]
y_diff = wals[1] - vals[1]
cg_array[idx,jdx] = np.sqrt(x_diff**2 + y_diff**2)
return cg_array
arr = np.random.rand(10, 20) > 0.75
from scipy.spatial.distance import cdist
y, x = np.where(arr)
p = np.c_[x, y]
dist = cdist(p, p)
np.allclose(array_to_complete_graph(arr), dist)