I am performing semantic segmentation (with materials as classes) on images and want to calculate precision-recall curves for my results. Currently, I compute the true positives, false positives and false negatives for each class by counting the pixels for which ground truth and prediction agree on that class, for which only the prediction assigns that class, and for which only the ground truth assigns that class, respectively. Then I calculate precision and recall accordingly:
pixel_probs = np.array(pixel_probs)  # shape (num_pixels,): the classification certainty for each pixel
pixel_labels_pred, pixel_labels_gt = np.array(pixel_labels_pred).astype(bool), np.array(pixel_labels_gt).astype(bool)  # shape (num_pixels, num_classes): one-hot labels for each pixel

precision_mat, recall_mat = np.array([]).reshape(num_labels, 0), np.array([]).reshape(num_labels, 0)  # stores the precision-recall pairs for each certainty threshold

prev_num_pixels = sum(pixel_probs > 0.0)
for threshold in sorted(thresholds):
    pixel_mask = pixel_probs > threshold
    if sum(pixel_mask) == prev_num_pixels: continue
    prev_num_pixels == sum(pixel_mask)

    pixel_labels_pred_msk = pixel_labels_pred[pixel_mask]
    pixel_labels_gt_msk = pixel_labels_gt[pixel_mask]

    tps = np.sum(np.logical_and(pixel_labels_gt_msk, pixel_labels_pred_msk), axis=0)
    fps = np.sum(np.logical_and(np.logical_not(pixel_labels_gt_msk), pixel_labels_pred_msk), axis=0)
    fns = np.sum(np.logical_and(pixel_labels_gt_msk, np.logical_not(pixel_labels_pred_msk)), axis=0)

    precisions = tps / (tps + fps)
    recalls = tps / (tps + fns)

    precision_mat = np.concatenate([precision_mat, np.expand_dims(precisions, axis=-1)], axis=-1)
    recall_mat = np.concatenate([recall_mat, np.expand_dims(recalls, axis=-1)], axis=-1)

fig = plt.figure()
fig.set_size_inches(12, 5)

for label_index in range(precision_mat.shape[0]):
    r = recall_mat[label_index]
    p = precision_mat[label_index]
    sort_order = np.argsort(r)
    r = r[sort_order]
    p = p[sort_order]
    plt.plot(r, p, '-o', markersize=2, label=labels[label_index])

plt.title("Precision-recall curve")
plt.legend(loc='upper left', fontsize=8.5, ncol=1, bbox_to_anchor=(1, 1))
plt.xlabel('recall', fontsize=12)
plt.ylabel('precision', fontsize=12)
plt.savefig(dir + "test/pr_curves.png")
However, this produces some very strange-looking graphs:
It is true that my segmentation model is performing rather poorly, but I would at least expect the curves to follow a more or less downward slope.
Am I calculating my PR-curve correctly? Are there alternative ways of calculating such curves that I should consider? Is there perhaps a bug in my plotting code?
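For comparison, here is a minimal sketch of computing the per-class curves with scikit-learn's precision_recall_curve, which sweeps every distinct score as a threshold instead of a fixed list. It assumes the full per-class softmax scores are available as a (num_pixels, num_classes) array, called class_scores here (not just the single certainty per pixel used above); labels and num_labels are as in the code above:

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

plt.figure(figsize=(12, 5))
for c in range(num_labels):
    # One-vs-rest curve for class c, using the ground-truth membership
    # and the class's own score for every pixel.
    precision, recall, _ = precision_recall_curve(pixel_labels_gt[:, c], class_scores[:, c])
    plt.plot(recall, precision, '-o', markersize=2, label=labels[c])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='upper left', fontsize=8.5, bbox_to_anchor=(1, 1))
plt.show()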
I am writing code to remove plateau outliers from time series data. Following advice to use np.diff, I made some progress, but ran into the problem that a plateau is not recognized unless its values are exactly equal.
def find_plateaus(F, min_length=200, tolerance=0.75, smoothing=15):
    import numpy as np
    from scipy.ndimage import uniform_filter1d

    # calculate smoothed gradients
    smoothF = uniform_filter1d(F, size=smoothing)
    dF = uniform_filter1d(np.gradient(smoothF), size=smoothing)
    d2F = uniform_filter1d(np.gradient(dF), size=smoothing)

    def zero_runs(x):
        iszero = np.concatenate(([0], np.equal(x, 0).view(np.int8), [0]))
        absdiff = np.abs(np.diff(iszero))
        ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
        return ranges

    # Find ranges where the second derivative is zero.
    # Values under eps are assumed to be zero.
    eps = np.quantile(abs(d2F), tolerance)
    smalld2F = (abs(d2F) <= eps)

    # Find repetitions in the mask "smalld2F" (i.e. ranges where d2F is constantly zero)
    p = zero_runs(np.diff(smalld2F))

    # np.diff(p) gives the length of each range found.
    # Only accept plateaus of at least min_length.
    plateaus = p[(np.diff(p) > min_length).flatten()]
    return plateaus
plateaus = find_plateaus(test, min_length=5, tolerance=0.02, smoothing=11)
plateaus = np.ravel(plateaus, order='A')
plateaus = plateaus.tolist()
print(plateaus)

test2['T&F'] = np.nan
for i in test2.index:
    if i in plateaus:
        test2.loc[i, ['T&F']] = test2.loc[i, 'data']
    else:
        test2.loc[i, ['T&F']] = 0

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(test2.index, test2['data'], color='black', label='time_series')
ax.scatter(test2.index, test2['T&F'], color='red', label='D910')
plt.legend()
plt.show()
Do you know any libraries or methods that can be used?
I want to recognize the parts marked in the picture below.
Still in progress, but I found an answer.
First, reshape the NumPy array into windows of a fixed length, e.g. time_step = 3.
.....
Then compute the standard deviation of each window with np.std().
After inspecting the values, you can set a range on the standard deviation to recognize which windows belong to a plateau.
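A minimal sketch of that windowed-standard-deviation idea, assuming test is a 1-D NumPy array as above; the window length and the cutoff value are illustrative:

import numpy as np

time_step = 3        # window length, as in the example above
std_cutoff = 0.05    # illustrative cutoff on the per-window standard deviation

# Reshape the series into non-overlapping windows of length time_step
# (trimming the tail so the length divides evenly), then take the std per window.
series = np.asarray(test, dtype=float)
n = len(series) - len(series) % time_step
windows = series[:n].reshape(-1, time_step)
window_std = windows.std(axis=1)

# Windows with a small standard deviation are treated as plateau candidates.
flat_windows = np.where(window_std <= std_cutoff)[0]
plateau_indices = (flat_windows[:, None] * time_step + np.arange(time_step)).ravel()
print(plateau_indices)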
I am fitting some data I have with a function. With the fitted function I want to see where it crosses some threshold value, determine this threshold value, and get an error on it as well. When I use curve_fit with absolute_sigma=True, I get a large error in the threshold value; when I turn it off, I get an extremely small error. Neither seems very realistic to me.
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# The numbers I use for the fit
absaverage = 1.0979598702453246
Nlist = [0.31974162, 0.52558248, 0.77172549, 1.34829036, 1.91809528, 3.08342098, 5.33183816, 6.60457399, 5.93014992]
averagelist = [0.294913849040662, 0.4648514538342791, 0.6496807339899529, 1.014641822518085, 1.2981207560776595, 1.703857575892128, 2.0964571453613123, 2.1999054339799295, 2.151973289007941]

#%% Now fitting the threshold data to obtain <threshold value>
def fittie(x, a, b):
    return a*(1 - np.exp(-b*x))

# guess values for the fit
aguess = 2.3
bguess = 0.42

popt, pcov = curve_fit(fittie, Nlist, averagelist, p0=[aguess, bguess], maxfev=20000)

Nlistforplot = np.arange(0.1, 7, 0.05)

f = plt.figure()
plt.plot(Nlist, averagelist, 'bo', label='data', markersize=5)
plt.plot(Nlistforplot, fittie(Nlistforplot, *popt), 'r--', label='fit', linewidth=1)
plt.axhline(y=absaverage, color='black', lw='1')
plt.axvline(1.54, color='black', lw='1')
plt.fill_between([0, 12], absaverage, absaverage + max(averagelist) + 100, color='blue', alpha=0.15)
plt.ylabel('-Value', fontsize='xx-large')
plt.xlabel('Nlist', fontsize='xx-large')
plt.yscale('log')
plt.xscale('log')
plt.xlim(0.2, max(Nlist) + 1)
plt.ylim(0.15, max(averagelist) + 1)
plt.title('Threshold determination')
plt.legend()
plt.show()

afromfit = popt[0]
bfromfit = popt[1]

print("your threshold value is")
thresholdvalue = -(1/bfromfit)*np.log(1 - (absaverage/afromfit))
print(thresholdvalue)

# ERROR IN THRESHOLD PROPAGATION
dfdb = (1/bfromfit**2)*np.log(1 - (absaverage/afromfit))
dfda = -(1/bfromfit)*(1/(1 - absaverage/afromfit))*(absaverage/(afromfit**2))
sigmax2 = dfda**2*pcov[0, 0] + dfdb**2*pcov[1, 1] + 2*dfda*dfdb*pcov[1, 0]
print("sigma in threshold value is")
print(sigmax2**0.5)
So I obtain the same threshold value of about 1.50. The errors seem completely off though, either too large or too small. Any idea?
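One way to sanity-check the linearized error propagation above is to draw parameter pairs from the fitted covariance matrix and look at the spread of the resulting threshold values. A minimal sketch, reusing popt, pcov and absaverage from the code above (the sample size is arbitrary):

import numpy as np

# Draw (a, b) pairs consistent with the fit covariance and push each one
# through the same threshold formula as above.
rng = np.random.default_rng(0)
draws = rng.multivariate_normal(popt, pcov, size=10000)
a_s, b_s = draws[:, 0], draws[:, 1]

# Keep only draws for which the logarithm is defined (absaverage < a).
valid = absaverage / a_s < 1.0
thresholds_mc = -(1.0 / b_s[valid]) * np.log(1.0 - absaverage / a_s[valid])

print("MC threshold: %.3f +/- %.3f" % (thresholds_mc.mean(), thresholds_mc.std()))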
I wanted to know if there is a method that tells me how long my x-axis should be. I have a dataset containing some outliers. I can simply cut them off with plt.xlim(), but is there a statistical method to compute a sensible x-axis limit? In the attached picture, a logical cut would be after 150 km driven distance. Being able to compute that cutoff threshold would be perfect.
The dataframe passed to the function is a standard pandas DataFrame.
Code:
def yearly_distribution(dataframe):
    df_distr = dataframe
    h = sorted(df_distr['Distance'])
    l = len(h)

    fig, ax = plt.subplots(figsize=(16, 9))
    binwidth = np.arange(0, 501, 0.5)
    n, bins, patches = plt.hist(h, bins=binwidth, normed=1, facecolor='#023d6b', alpha=0.5, histtype='bar')

    lnspc = np.arange(0, 500.5, 0.5)

    gevfit = gev.fit(h)
    pdf_gev = gev.pdf(lnspc, *gevfit)
    plt.plot(lnspc, pdf_gev, label="GEV")

    logfit = stats.lognorm.fit(h)
    pdf_lognorm = stats.lognorm.pdf(lnspc, *logfit)
    plt.plot(lnspc, pdf_lognorm, label="LogNormal")

    weibfit = stats.weibull_min.fit(h)
    pdf_weib = stats.weibull_min.pdf(lnspc, *weibfit)
    plt.plot(lnspc, pdf_weib, label="Weibull")

    burrfit = stats.burr.fit(h)
    pdf_burr = stats.burr.pdf(lnspc, *burrfit)
    plt.plot(lnspc, pdf_burr, label="Burr Distribution")

    genparetofit = stats.genpareto.fit(h)
    pdf_genpareto = stats.genpareto.pdf(lnspc, *genparetofit)
    plt.plot(lnspc, pdf_genpareto, label="Generalized Pareto")

    myarray = np.array(h)
    clf = GMM(8, n_iter=500, random_state=3)
    myarray.shape = (myarray.shape[0], 1)
    clf = clf.fit(myarray)
    lnspc.shape = (lnspc.shape[0], 1)
    pdf_gmm = np.exp(clf.score(lnspc))
    plt.plot(lnspc, pdf_gmm, label="GMM")

    plt.xlim(0, 500)
    plt.xlabel('Distance')
    plt.ylabel('Probability')
    plt.title('Histogram')
    plt.ylim(0, 0.05)
You should remove outliers from your data before any plotting or fitting:
h = sorted(df_distr['Distance'])
out_threshold = 150.0
h = [i for i in h if i < out_threshold]
EDIT
That may not be the fastest way, but with numpy.std():
# std of h and its mirror image about zero (i.e. the RMS of the distances)
out_threshold = 2.0*np.std(h + [-a for a in h])
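If the standard-deviation rule is too sensitive to the outliers themselves, a percentile-based cutoff is a simple alternative. This is a sketch of a different rule, not part of the original answer, and the 99th percentile is an arbitrary choice:

import numpy as np

h = np.asarray(h)
# A handful of extreme values barely move a high percentile,
# whereas they can inflate the standard deviation considerably.
out_threshold = np.percentile(h, 99)
h = h[h < out_threshold]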
I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution, not to the whole dataset, but just to the first mode of the histogram (the first curve). The green plot in the previous graph corresponds to fitting the Gamma distribution on all the samples, using the following Python code which makes use of scipy.stats.gamma:
img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, normed=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print('(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta))
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the max of the previous histogram, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize
def truncated_gamma(x, max_data, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, normed=True)
# using minimize on a slice data below max of histogram
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, normed=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
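In symbols, with $f$ and $F$ the gamma pdf and cdf and $x_{\max}$ the truncation point, the function above implements the usual truncated density:

$$
f_{\mathrm{trunc}}(x \mid \alpha, \beta) =
\begin{cases}
\dfrac{f(x \mid \alpha, \beta)}{F(x_{\max} \mid \alpha, \beta)} & \text{if } x < x_{\max}, \\
0 & \text{otherwise},
\end{cases}
$$

which integrates to one on the truncated support for any choice of $\alpha$ and $\beta$.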
The rest is just optimization technicalities.
It's common practice to work with logarithms, so I used what's sometimes called "energy", or the logarithm of the inverse of the probability density.
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to
Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse hessian, which is useful for uncertainty estimation. Enforcement of nonnegativity for the parameters may also be a good idea.
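For example, L-BFGS-B accepts simple box constraints and returns an approximation of the inverse Hessian; a minimal sketch (the lower bounds are illustrative):

# Keep alpha and beta strictly positive; o.hess_inv then holds an
# approximation of the inverse Hessian at the solution.
o = minimize(energy, initial_guess, method='L-BFGS-B',
             bounds=[(1e-6, None), (1e-6, None)])
fit_alpha, fit_beta = o.x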
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach, using a manually created dataset in Excel that more or less matches the plot given.
Raw Data
Outline
Import the data into a Pandas dataframe.
Mask the indices after the max response index.
Create a mirror image of the remaining data.
Append the mirror image while leaving a buffer of empty space.
Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width.
Working Script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)

# Mask indices after index at max Y.
mask = df.index.values <= df.Y.argmax()
df = df.loc[mask, :]
scaled_y = 100*df.Y.values

# Create new df with mirror image of Y appended.
sep = 6
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)

# Using Scipy-cookbook to fit a normal by method of moments.
idxs = np.arange(new_y.size)  # idxs = [0, 1, 2, ..., len(new_y) - 1]
mid_idxs = idxs.mean()        # midpoint of the index range
scaling_param = np.sqrt(np.abs(np.sum((idxs - mid_idxs)**2 * new_y) / np.sum(new_y)))

# adjust amplitude
fmax = new_y.max()*1.2  # adjusted function max to 120% of max y.
# adjust width
scaling_param = scaling_param*.7  # adjusted by 70%.

# Fit normal.
fit = lambda t: fmax*np.exp(-(t - mid_idxs)**2/(2*scaling_param**2))

# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.
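For reference, the width estimate in the script above is the usual second moment of the (now symmetric) profile about the midpoint of the index range:

$$
\hat{\sigma} = \sqrt{\frac{\sum_i (i - \bar{i})^2\, y_i}{\sum_i y_i}},
$$

where $y_i$ is the mirrored profile new_y and $\bar{i}$ is the mean index mid_idxs.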
While learning TensorFlow, I am working on implementing k-means clustering with it. I am following a TensorFlow tutorial that first introduces k-means and then introduces gradient descent optimization.
We first generate samples:
def create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed):
    np.random.seed(seed)
    slices = []
    centroids = []
    # Create samples for each cluster
    for i in range(n_clusters):
        samples = tf.random_normal((n_samples_per_cluster, n_features),
                                   mean=0.0, stddev=5.0, dtype=tf.float32, seed=seed, name="cluster_{}".format(i))
        current_centroid = (np.random.random((1, n_features)) * embiggen_factor) - (embiggen_factor/2)
        centroids.append(current_centroid)
        samples += current_centroid
        slices.append(samples)
    # Create a big "samples" dataset
    samples = tf.concat(0, slices, name='samples')
    centroids = tf.concat(0, centroids, name='centroids')
    return centroids, samples
then define two functions, assign and update (plus a Euclidean distance helper), as usual:
def assign(data, centroids):
    # Explanations here: http://learningtensorflow.com/lesson6/
    expanded_vectors = tf.expand_dims(samples, 0)
    expanded_centroids = tf.expand_dims(centroids, 1)
    # nice trick here: use 'sub' "pairwisely" (that's why we just used "expand")
    distances = tf.reduce_sum(tf.square(
        tf.sub(expanded_vectors, expanded_centroids)), 2)
    mins = tf.argmin(distances, 0)
    nearest_indices = mins
    return nearest_indices

def update(data, nearest_indices, n_clusters):
    # Updates the centroid to be the mean of all samples associated with it.
    nearest_indices = tf.to_int32(nearest_indices)
    partitions = tf.dynamic_partition(samples, nearest_indices, n_clusters)
    new_centroids = tf.concat(0, [tf.expand_dims(tf.reduce_mean(partition, 0), 0) for partition in partitions])
    return new_centroids

def euclidian_distance(x, y):
    sqd = tf.squared_difference(tf.cast(x, "float32"), tf.cast(y, "float32"))
    sumsqd = tf.reduce_sum(sqd)
    sqrtsumsqd = tf.sqrt(sumsqd)
    return sqrtsumsqd
Then define the TensorFlow model to run:
import tensorflow as tf
import numpy as np
nclusters = 3
nsamplespercluster = 500
nfeatures = 2
embiggenfactor = 70
seed = 700
np.random.seed(seed)
ocentroids, samples = create_samples(nclusters, nsamplespercluster, nfeatures, embiggenfactor, seed)
X = tf.placeholder("float32", [nclusters*nsamplespercluster, 2])
# choosing random sample points as initial centroids.
centroids = tf.Variable([samples[i] for i in np.random.choice(range(nclusters*nsamplespercluster), nclusters)])  #, [10.,10.]])
# leftover from an alternative random initialization:
# mean=0.0, stddev=150, dtype=tf.float32, seed=seed))
nearest_indices = assign(X, centroids)
new_centroids = update(X, nearest_indices, nclusters)
# Our error is the Euclidean distance between the current centroids and the updated centroids
error = euclidian_distance(centroids, new_centroids)
# The Gradient Descent Optimizer
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(error)
model = tf.initialize_all_variables()
with tf.Session() as session:
    data = session.run(samples)
    session.run(model)
    epsilon = 0.08
    err = float("inf")
    count = 0
    while err > epsilon:
        _, err = session.run([train_op, error], {X: data})
        print(err)
        clustering = session.run(nearest_indices)
        centers = session.run(centroids)
        count += 1
        # Plot every 100 iterations to see progress
        if (count % 100) == 0:
            print(count)
            plt.figure()
            plt.scatter(data[:, 0], data[:, 1], c=clustering)
            plt.scatter(centers[:, 0], centers[:, 1], s=300, c="orange", marker="x", linewidth=5)
    print("%d iterations" % count)
    plt.figure()
    plt.scatter(data[:, 0], data[:, 1], c=clustering)
    plt.scatter(centers[:, 0], centers[:, 1], s=300, c="orange", marker="x", linewidth=5)
This is actually working (running), but the result is disappointing:
After around 1600 iterations the result is still very bad. I don't even understand how some points can be "lost" (i.e. assigned to a cluster whose color they are far away from). To my mind, k-means should converge really fast on a case like this, but here it is not even converging to a good solution. Is it due to gradient descent? (I don't see how it could be, but...)
Thanks for any advice!
pltrdy
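For comparison, here is a minimal sketch of the classical k-means iteration, which replaces the gradient step with a direct assignment of the recomputed centroids. It reuses the tensors defined in the code above (same deprecated TF 0.x API), and the stopping rule is illustrative:

# Assign the recomputed means directly instead of taking a gradient step.
update_op = tf.assign(centroids, new_centroids)

init = tf.initialize_all_variables()
with tf.Session() as session:
    data = session.run(samples)
    session.run(init)
    err, epsilon = float("inf"), 0.08
    while err > epsilon:
        session.run(update_op, {X: data})
        # Distance between the current centroids and the next recomputed means.
        err = session.run(error, {X: data})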