Vectorizing scipy norm.pdf - python

def predictDigit(img):
    """Return the most likely digit (0-9) for ``img`` under Gaussian Naive Bayes.

    Reads the module-level tables ``pix_means[digit]`` and ``pix_std[digit]``
    (per-pixel mean / standard deviation) and ``digit_prob[digit]`` (class
    prior). ``img`` must have the same shape as those per-digit tables.

    For each digit, the score is the sum of per-pixel Gaussian log-densities
    plus the log prior. Pixels whose standard deviation is 0 are skipped.
    """
    pixels = np.asarray(img, dtype=float)
    prob = np.empty(10)
    for digit in range(10):
        std = np.asarray(pix_std[digit], dtype=float)
        mean = np.asarray(pix_means[digit], dtype=float)
        # A zero std makes the Gaussian degenerate; ignore those pixels.
        usable = std != 0
        # logpdf evaluates all pixels in one call and stays finite where
        # log(pdf(...)) would underflow to -inf.
        log_likelihood = norm.logpdf(pixels[usable], mean[usable], std[usable]).sum()
        prob[digit] = log_likelihood + np.log(digit_prob[digit])
    return np.argmax(prob)
I wrote this function to use it for implementing a Naive Bayes Classifier for digit classification. The idea is to go through all pixels of an input image and add the np.log(norm.pdf(img[pix], mean, std)) to the prob and return the argmax of it at the end to label the digit of the input image.
However, this takes too long. I successfully vectorized getting the mean and std using:
# Per-pixel statistics over all training images of one digit (axis 0 runs over the images).
pix_means[digit] = np.mean(image_cluster[digit], axis = 0)
pix_std[digit] = np.std(image_cluster[digit], axis = 0)
But, I am not sure if vectorization is possible with norm.pdf.
Please help.
EDIT
Digit Prob
# Tally how many times each label occurs in the training labels.
digit_count = {}
for digit in y_train:
    digit_count[digit] = digit_count.get(digit, 0) + 1

# Class prior: relative frequency of each digit 0-9 in y_train.
# (A digit that never occurs raises KeyError.)
digit_prob = {digit: digit_count[digit] / len(y_train) for digit in range(10)}
image_cluster
# Group the training images by their label.
image_cluster = {}
for image, digit in zip(x_train, y_train):
    image_cluster.setdefault(digit, []).append(image)

# get mean and sd: per-pixel statistics across all images of each digit
pix_means = {digit: np.mean(image_cluster[digit], axis=0) for digit in range(10)}
pix_std = {digit: np.std(image_cluster[digit], axis=0) for digit in range(10)}

norm.pdf is vectorized right out of the box! The SciPy tutorial shows this for cdf, and pdf behaves the same way:
To compute the cdf at a number of points, we can pass a list or a numpy array.
norm.cdf([-1., 0, 1])
array([ 0.15865525, 0.5, 0.84134475])
import numpy as np
norm.cdf(np.array([-1., 0, 1]))
array([ 0.15865525, 0.5, 0.84134475])
Thus, the basic methods, such as pdf, cdf, and so on, are vectorized.
https://docs.scipy.org/doc/scipy/reference/tutorial/stats.html

Related

Recognition of a plateau with a slope close to zero

I am writing code to remove plateau outliers from time series data. I proceeded after receiving advice to use np.diff, but there was a problem that it could not be recognized if it was not the same value.
def find_plateaus(F, min_length=200, tolerance=0.75, smoothing=15):
    """Find index ranges where the smoothed second derivative of ``F`` is stable.

    Parameters
    ----------
    F : 1-D array-like
        The series to analyse. It is converted to float so that the moving
        averages are not computed in integer arithmetic.
    min_length : int
        Only ranges strictly longer than this many samples are returned.
    tolerance : float in [0, 1]
        Quantile of ``|d2F|`` used as the "close to zero" threshold.
    smoothing : int
        Window size of the moving-average filter applied at every step.

    Returns
    -------
    numpy.ndarray of shape (k, 2)
        One ``[start, end)`` row per detected range, expressed as indices
        into ``np.diff`` of the threshold mask.
    """
    import numpy as np
    # scipy.ndimage.filters is a deprecated namespace; import from scipy.ndimage.
    from scipy.ndimage import uniform_filter1d

    F = np.asarray(F, dtype=float)

    # calculate smooth gradients
    smoothF = uniform_filter1d(F, size=smoothing)
    dF = uniform_filter1d(np.gradient(smoothF), size=smoothing)
    d2F = uniform_filter1d(np.gradient(dF), size=smoothing)

    def zero_runs(x):
        """Return [start, end) index pairs of consecutive runs where x == 0."""
        # Pad with a non-zero marker on both sides so runs touching the
        # edges still produce a start and an end boundary.
        iszero = np.concatenate(([0], np.equal(x, 0).view(np.int8), [0]))
        absdiff = np.abs(np.diff(iszero))
        ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
        return ranges

    # Find ranges where second derivative is zero
    # Values under eps are assumed to be zero.
    eps = np.quantile(abs(d2F), tolerance)
    smalld2F = (abs(d2F) <= eps)

    # Find repetitions in the mask "smalld2F". np.diff of a boolean mask is
    # False wherever two neighbours are equal, so a zero run marks a stretch
    # where the mask does not change.
    # NOTE(review): this also matches stretches where the mask is constantly
    # False (|d2F| above eps) - confirm that is intended.
    p = zero_runs(np.diff(smalld2F))

    # np.diff(p) gives the length of each range found.
    # only accept plateaus of min_length
    plateaus = p[(np.diff(p) > min_length).flatten()]
    return plateaus
# Detect plateaus, then flatten the (start, end) pairs into one flat list of
# index values.
plateaus = find_plateaus(test, min_length=5, tolerance=0.02, smoothing=11)
plateaus = np.ravel(plateaus, order='A').tolist()
print(plateaus)

# 'T&F' holds the data value on rows whose index label appears in `plateaus`
# and 0 everywhere else. One vectorized membership test replaces the per-row
# `i in plateaus` list scan and the row-by-row .loc assignment.
# NOTE(review): only the start/end boundary indices are marked, not the
# samples between them - confirm that is the intended highlight.
is_boundary = test2.index.isin(plateaus)
test2['T&F'] = test2['data'].where(is_boundary, 0)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(test2.index, test2['data'], color='black', label='time_series')
ax.scatter(test2.index, test2['T&F'], color='red', label='D910')
plt.legend()
plt.show()
Do you know any libraries or methods that can be used?
I want to recognize the parts marked in the picture below.
enter image description here
Still in progress, but found the answer.
First, make the np array multidimensional.
ex) time_step = 3
.....
Then, using np.std(), find the standard deviation,
After checking, you can set the standard deviation range to recognize the included range.

Optimizing torch mean over a dimension in a random batch

I am looking for a way to optimize the following code in pytorch.
I have a function f defined over space x,y and time t.
In a random batch, I need to compute the average over all the same timestamps. I was able to achieve this with the following inefficient for-loop
import torch

# Space (x,y) and time (t) coordinates in a random batch.
# After the transpose there is one row per sample.
x = torch.Tensor([
    [0, 0, 1, 0],
    [3, 2, 2, 1],
    [1, 3, 5, 5],
]).T

# compute a dummy function u = f(t,x,y); only the first two columns are kept
f = (x**2 + 0.5)[:, :2]

# timestamps are the first column of x
t = x[:, 0]

# get unique timestamps
val = torch.unique(t.squeeze())

for v in val:
    # rows that share the timestamp v
    same_time = t.eq(v)
    # replace each of those rows with their column-wise mean
    f[same_time] = f[same_time].mean(dim=0)

print(f)
Which results in
f = tensor([[0.5000, 5.1667],
[0.5000, 5.1667],
[1.5000, 4.5000],
[0.5000, 5.1667]])
Is there a way to make this computation faster?
I think you are looking for index_add_:
# Per-timestamp mean without a Python loop. Timestamps are used directly as
# row indices, so they must be non-negative whole numbers.
avg_size = int(t.max().item()) + 1  # number of rows in output tensor
z = torch.zeros((avg_size, f.shape[1]), dtype=f.dtype, device=f.device)
s = torch.index_add(z, 0, t.long(), f)  # sum the rows of f per timestamp
c = torch.index_add(z, 0, t.long(), torch.ones_like(f))  # count how many rows per timestamp
out = s / c  # divide to get the mean (NaN for timestamps that never occur)
# out has one row per timestamp; out[t.long()] expands it back to one row per sample.

Efficiently calculating grid-based point density in 3d point cloud

I have a 3d point cloud matrix, and I am trying to calculate the largest point density within a smaller volume inside the matrix. I am currently using a 3D grid-histogram system where I loop through every point in the matrix and increase the value of the corresponding grid square. Then, I can simply find the max value of the grid matrix.
I have already written code that works, but it is horribly slow for what I am trying to do
import numpy as np
def densityPointCloud(points, gridCount, gridSize):
    """Count points per cell of a cubic grid centred on the origin.

    ``points`` is an (N, 3) array. Each coordinate is divided by ``gridSize``,
    rounded to the nearest integer and shifted by ``int(gridCount / 2)`` to
    obtain a cell index. Points that fall outside ``[0, gridCount)`` on any
    axis are ignored.

    Returns a ``(gridCount, gridCount, gridCount)`` uint16 array of counts.
    """
    counts = np.zeros((gridCount,) * 3, np.uint16)
    cells = (np.rint(points / gridSize) + int(gridCount / 2)).astype(int)
    for cell in cells:
        # skip cells outside the grid on any axis
        if cell.min() < 0 or cell.max() >= gridCount:
            continue
        counts[cell[0], cell[1], cell[2]] += 1
    return counts
# 100000 random points, uniform in [-5, 5) along each axis
cloud = np.random.rand(100000, 3) * 10 - 5
# 50 x 50 x 50 grid with cell size 0.2
histogram = densityPointCloud(cloud, 50, 0.2)
# largest number of points found in a single cell
print(histogram.max())
Are there any shortcuts I can take to do this more efficiently?
Here's a start:
import numpy as np
import time
from collections import Counter
# if you need the whole histogram object
def dpc2(points, gridCount, gridSize):
    """Build the full grid histogram, bounds-checking all points at once.

    Same result as a per-point check: each row of the (N, 3) ``points`` array
    is mapped to a cell by ``rint(p / gridSize) + int(gridCount / 2)`` and
    counted if every coordinate lies in ``[0, gridCount)``. The bounds test is
    done once on the whole array, so the Python loop only visits points that
    are actually counted.

    Returns a ``(gridCount, gridCount, gridCount)`` uint16 array.
    """
    counts = np.zeros((gridCount,) * 3, np.uint16)
    cells = (np.rint(points / gridSize) + int(gridCount / 2)).astype(int)
    below_upper = cells.max(axis=1) < gridCount
    above_lower = cells.min(axis=1) >= 0
    for cell in cells[below_upper & above_lower]:
        counts[cell[0], cell[1], cell[2]] += 1
    return counts
# just care about a max point
def dpc3(points, gridCount, gridSize):
    """Return the densest grid cell and its point count without a histogram.

    Each row of the (N, 3) ``points`` array is mapped to a cell by
    ``rint(p / gridSize) + int(gridCount / 2)``. Cells outside
    ``[0, gridCount)`` on any axis are dropped.

    Returns ``(max_coord, max_count)``, where ``max_coord`` is the
    ``[i, j, k]`` index of a cell with the highest count. Raises IndexError
    when no point falls inside the grid.
    """
    cells = (np.rint(points / gridSize) + int(gridCount / 2)).astype(int)
    inside = (cells.max(axis=1) < gridCount) & (cells.min(axis=1) >= 0)
    kept = cells[inside]

    # cheap hashing: flatten (i, j, k) into the single integer i*g*g + j*g + k
    phashes = gridCount * gridCount * kept[:, 0] + gridCount * kept[:, 1] + kept[:, 2]
    max_h, max_v = Counter(phashes).most_common(1)[0]

    # undo the flattening, peeling off the last axis first
    rest, k = divmod(max_h, gridCount)
    i, j = divmod(rest, gridCount)
    max_coord = [i % gridCount, j, k]
    return (max_coord, max_v)
# TESTING
# 200000 random points, uniform in [-5, 5) along each axis
cloud = (np.random.rand(200000, 3)*10)-5
# Run the three implementations back to back on the same cloud and record
# the wall-clock time around each call.
t1 = time.perf_counter()
hist1 = densityPointCloud(cloud , 50, 0.2)
t2 = time.perf_counter()
hist2 = dpc2(cloud,50,0.2)
t3 = time.perf_counter()
# NOTE: despite its name, hist3 is the (max_coord, max_count) tuple returned
# by dpc3, not a histogram array.
hist3 = dpc3(cloud,50,0.2)
t4 = time.perf_counter()
print(f"task 1: {round(1000*(t2-t1))}ms\ntask 2: {round(1000*(t3-t2))}ms\ntask 3: {round(1000*(t4-t3))}ms")
print(f"max value is {hist3[1]}, achieved at {hist3[0]}")
# The two expressions below are bare expressions: their values are discarded
# when this runs as a script and only shown in an interactive session.
np.all(np.equal(hist1,hist2)) # check that results are identical
# check for equal max - histogram may be multi-modal so the point won't
# necessarily match
np.unravel_index(np.argmax(hist2, axis=None), hist2.shape)
The idea is to do all the if/and comparisons once: let numpy do them (effectively in C) rather than doing them 'manually' inside a Python loop. This also lets us iterate only over the points that will lead to hist being incremented.
You can also consider using a sparse data structure for hist if you think your cloud will have lots of empty space - memory allocation can become a bottleneck for very large data.
Did not scientifically benchmark this but appears to run ~2-3x faster (v2) and 6-8x faster (v3)! If you'd like all the points which are tied for the max. density, it would be easy to extract those from the Counter object.

Get all component stats of multiple arrays labeled by one of them

I already asked a similar question which got answered but now this is more in detail:
I need a really fast way to get all important component stats of two arrays, where one array is labeled by opencv2 and gives the component areas for both arrays. The stats for all components masked on the two arrays should then be saved to a dictionary. My approach works, but it is much too slow. Is there a way to avoid the loop, or a better approach than ndimage.labeled_comprehension?
from scipy import ndimage
import numpy as np
import cv2
def calculateMeanMaxMin(val):
    """Return ``[mean, max, min]`` of ``val`` as a single NumPy array."""
    reducers = (np.mean, np.max, np.min)
    return np.array([reduce(val) for reduce in reducers])
def getTheStatsForComponents(array1, array2):
    """Collect per-component statistics of two images labeled via ``array2``.

    ``array2`` is thresholded at 120 and labeled with
    ``cv2.connectedComponentsWithStats``. For every label from 1 to
    ``numLabels - 1`` a dict is built with the centroid, area, height and
    width reported by OpenCV, the mean / max / min of both arrays inside the
    component, and an ``elongation`` ratio (shorter side / longer side of the
    minimum-area rectangle around the component's first contour).

    Returns a list with one dict per component.
    """
    _, thresholded = cv2.threshold(array2, 120, 255, cv2.THRESH_BINARY)
    thresholded = thresholded.astype(np.uint8)
    numLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(thresholded, 8, cv2.CV_8UC1)

    # One [mean, max, min] entry per requested label, for each input array.
    label_ids = np.arange(1, numLabels + 1)
    meanmaxminArray2 = ndimage.labeled_comprehension(array2, labels, label_ids, calculateMeanMaxMin, np.ndarray, 0)
    meanmaxminArray1 = ndimage.labeled_comprehension(array1, labels, label_ids, calculateMeanMaxMin, np.ndarray, 0)

    allComponentStats = []
    for label in range(1, numLabels):
        # label ids start at 1, rows of the stat arrays start at 0
        row = label - 1

        # binary mask of this component only, used to extract its contour
        component_mask = np.uint8(labels == label)
        contours, _ = cv2.findContours(component_mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
        side1, side2 = cv2.minAreaRect(contours[0])[1]

        componentStat = stats[label]
        stats1 = meanmaxminArray1[row]
        stats2 = meanmaxminArray2[row]
        allstats = {
            'position': centroids[label, :],
            'area': componentStat[4],
            'height': componentStat[3],
            'width': componentStat[2],
            'meanArray1': stats1[0],
            'maxArray1': stats1[1],
            'minArray1': stats1[2],
            'meanArray2': stats2[0],
            'maxArray2': stats2[1],
            'minArray2': stats2[2],
        }

        # shorter side / longer side; 0 when the rectangle is degenerate
        if side1 >= side2 and side1 > 0:
            ratio = side2 / side1
        elif side2 > side1 and side2 > 0:
            ratio = side1 / side2
        else:
            ratio = 0
        allstats['elongation'] = np.float32(ratio)

        allComponentStats.append(allstats)
    return allComponentStats
EDIT
The two arrays are 2d arrays:
# Two random 512x512 uint8 test images; np.random.choice(255, ...) draws values from 0..254.
array1= np.random.choice(255,(512,512)).astype(np.uint8)
array2= np.random.choice(255,(512,512)).astype(np.uint8)
EDIT2
A small example with two data arrays and a labelArray containing two components (1 and 2, plus background 0). The min, max and mean can be calculated with ndimage.labeled_comprehension.
from scipy import ndimage
import numpy as np
# 3x4 label image: 0 = background, 1 and 2 = components
labelArray = np.array([[0,1,1,1],[2,2,1,1],[2,2,0,1]])
# two data arrays with the same shape as labelArray
data = np.array([[0.1,0.2,0.99,0.2],[0.34,0.43,0.87,0.33],[0.22,0.53,0.1,0.456]])
data2 = np.array([[0.1,0.2,0.99,0.2],[0.1,0.2,0.99,0.2],[0.1,0.2,0.99,0.2]])
numLabels = 2
# minimum of each data array inside labels 1..numLabels (background is not requested)
minimumDataForAllLabels = ndimage.labeled_comprehension(data, labelArray, np.arange(1, numLabels+1), np.min, np.ndarray, 0)
minimumData2ForallLabels = ndimage.labeled_comprehension(data2, labelArray, np.arange(1, numLabels+1), np.min, np.ndarray, 0)
print(minimumDataForAllLabels)
print(minimumData2ForallLabels)
# bin_and_do_simple_stats is not defined in this snippet; it bins by every label
# value present, so background label 0 gets an entry as well.
print(bin_and_do_simple_stats(labelArray.flatten(),data.flatten()))
Output:
[0.2 0.22] ##minimum of component 1 and 2 from data
[0.2 0.1] ##minimum of component 1 and 2 from data2
[0.1 0.2 0.22] ##minimum output of bin_and_do_simple_stats from data
labeled_comprehension is definitely slow.
At least the simple stats can be done much faster based on the linked post. For simplicity I'm only doing one data array, but as the procedure returns sort indices it can be easily extended to multiple arrays:
import numpy as np
from scipy import sparse
# Optional accelerated backend: use the compiled pythran module when it can be
# imported. Only ImportError is caught, so any other failure (e.g. a
# KeyboardInterrupt, or a bug inside the module) is not silently hidden.
try:
    from stb_pthr import sort_to_bins as _stb_pthr
except ImportError:
    HAVE_PYTHRAN = False
else:
    HAVE_PYTHRAN = True
# fallback if pythran not available
def sort_to_bins_sparse(idx, data, mx=-1):
    """Group ``data`` by the bin numbers in ``idx`` using a sparse-matrix trick.

    A CSR matrix with exactly one entry per row (row i holds ``data[i]`` in
    column ``idx[i]``) is converted to CSC, which stores the entries column by
    column, i.e. bin by bin.

    ``mx`` is the number of bins; -1 means ``idx.max() + 1``.

    Returns ``(sorted_data, order, indptr)``: the data grouped by bin, the
    original position of each grouped element, and the bin boundaries
    (bin k occupies ``sorted_data[indptr[k]:indptr[k + 1]]``).
    """
    n_items = len(idx)
    n_bins = idx.max() + 1 if mx == -1 else mx
    one_per_row = sparse.csr_matrix(
        (data, idx, np.arange(n_items + 1)), (n_items, n_bins)
    )
    by_bin = one_per_row.tocsc()
    return by_bin.data, by_bin.indices, by_bin.indptr
def sort_to_bins_pythran(idx, data, mx=-1):
    """Group ``data`` by bin using the compiled ``_stb_pthr`` helper.

    Returns ``(sorted_data, order, indptr)`` in the same layout as
    ``sort_to_bins_sparse``.
    """
    order, boundaries = _stb_pthr(idx, mx)
    return data[order], order, boundaries
# pick best available
if HAVE_PYTHRAN:
    sort_to_bins = sort_to_bins_pythran
else:
    sort_to_bins = sort_to_bins_sparse

# example data: 100000 values spread over bins 0..9
idx = np.random.randint(0, 10, 100000)
data = np.random.random(100000)

# if possible compare the two methods: both must give identical results
if HAVE_PYTHRAN:
    dsp, isp, psp = sort_to_bins_sparse(idx, data)
    dph, iph, pph = sort_to_bins_pythran(idx, data)
    for from_sparse, from_pythran in ((dsp, dph), (isp, iph), (psp, pph)):
        assert (from_sparse == from_pythran).all()
# example how to do simple vectorized calculations
def simple_stats(data, iptr):
    """Return per-bin ``(minimum, mean)`` of bin-sorted ``data``.

    ``iptr`` holds the bin boundaries: bin k is ``data[iptr[k]:iptr[k + 1]]``.

    When every bin is non-empty, the results keep the dtype produced by the
    reductions. If any bin is empty, both results are float arrays with NaN
    in the empty bins. A plain ``reduceat`` would return the first element of
    the *next* bin for an empty bin, and fail for trailing empty bins.
    """
    starts = iptr[:-1]
    counts = np.diff(iptr)
    nonempty = counts > 0

    if nonempty.all():
        minima = np.minimum.reduceat(data, starts)
        means = np.add.reduceat(data, starts) / counts
        return minima, means

    minima = np.full(counts.shape, np.nan)
    means = np.full(counts.shape, np.nan)
    if nonempty.any():
        # Reducing only at the starts of non-empty bins is exact: an empty bin
        # has zero width, so the next non-empty start is also this bin's end.
        filled_starts = starts[nonempty]
        minima[nonempty] = np.minimum.reduceat(data, filled_starts)
        means[nonempty] = np.add.reduceat(data, filled_starts) / counts[nonempty]
    return minima, means
def bin_and_do_simple_stats(idx, data, mx=-1):
    """Group ``data`` by the bin numbers in ``idx`` and return ``simple_stats`` of the bins.

    ``mx`` is forwarded to ``sort_to_bins`` (-1 lets it derive the bin count
    from ``idx``).
    """
    binned, _order, boundaries = sort_to_bins(idx, data, mx)
    return simple_stats(binned, boundaries)


print("minima: {}\n mean values: {}".format(*bin_and_do_simple_stats(idx,data)))
If you have pythran (not required but a bit faster), compile this as <stb_pthr.py>:
import numpy as np
#pythran export sort_to_bins(int[:], int)
def sort_to_bins(idx, mx):
    """Counting sort of positions by bin number.

    ``idx`` is a 1-D integer array of bin numbers; ``mx`` is the number of
    bins (-1 means ``idx.max() + 1``).

    Returns ``(order, indptr)``: ``order`` lists the positions of ``idx``
    grouped by bin (original order kept inside each bin), and bin k occupies
    ``order[indptr[k]:indptr[k + 1]]``.
    """
    if mx == -1:
        mx = idx.max() + 1

    # Histogram shifted by two slots, so that after the running sum below
    # cnts[k + 1] is the first output position of bin k.
    cnts = np.zeros(mx + 2, int)
    for label in idx:
        cnts[label + 2] += 1
    for j in range(2, cnts.size):
        cnts[j] += cnts[j - 1]

    # Scatter each position into the next free slot of its bin. Afterwards
    # cnts[k + 1] has advanced to the end of bin k.
    res = np.empty_like(idx)
    for i in range(idx.size):
        slot = cnts[idx[i] + 1]
        res[slot] = i
        cnts[idx[i] + 1] = slot + 1
    return res, cnts[:-1]

Vector quantization (convert a float vector into a short vector)

I want to quantize a float vector into a short one. I did some research online and found many vector quantization algorithms, such as LBG. However, I still do not understand how to map a float vector space into a short vector space. After further research, I found one article that does exactly what I want.
import numpy as np
from sklearn.preprocessing import normalize
def get_median_values_for_bins(bins):
    """Map each 1-based bin index to the representative value used for it.

    For ``n = bins.shape[0]`` the result is ``{k: bins[k - 1]}`` for
    ``k = 1 .. n``.

    NOTE: despite the name, the stored value is the bin edge ``bins[k - 1]``,
    not the median or midpoint of the bin.
    """
    n_bins = bins.shape[0]
    median_values = {k: bins[k - 1] for k in range(1, n_bins)}
    # The last index is set separately, as in the original layout; an empty
    # ``bins`` therefore raises IndexError here.
    median_values[n_bins] = bins[n_bins - 1]
    return median_values
def get_quantized_features(features, quantization_factor=30):
    """Quantize float feature vectors onto ``quantization_factor`` levels.

    Steps:
      1. Normalize every row of ``features`` (``normalize(..., axis=1)``).
      2. Shift all values by ``|min|`` so they become non-negative.
      3. Split ``[min, max]`` into ``quantization_factor`` evenly spaced edges
         and replace each value by the representative value of its bin.
      4. Scale by ``quantization_factor`` and floor.

    Returns an array with the same shape as ``features``.
    """
    normalized_features = normalize(features, axis=1)
    offset = np.abs(np.min(normalized_features))
    offset_features = normalized_features + offset  # Making all feature values positive

    # Let's proceed to quantize these positive feature values
    min_val = np.min(offset_features)
    max_val = np.max(offset_features)
    bins = np.linspace(start=min_val, stop=max_val, num=quantization_factor)
    median_values = get_median_values_for_bins(bins)

    # Every value lies in [bins[0], bins[-1]], so digitize yields indices
    # 1..len(bins), which are exactly the keys of median_values.
    original_quantized_features = np.digitize(offset_features, bins)

    # Array lookup instead of map() inside np.apply_along_axis: on Python 3
    # map() returns an iterator, which made the following np.floor fail.
    lookup = np.array([median_values[k] for k in range(1, bins.shape[0] + 1)])
    quantized_features = lookup[original_quantized_features - 1]

    quantized_features = np.floor(quantization_factor * quantized_features)
    return quantized_features
quantization_factor = 5000 # Adjust this depending on accuracy of quantized features.
quantized_features = get_quantized_features(features, quantization_factor)

Categories

Resources