I have a histogram like this:
and I have this code that finds the maxima (-21.5 in my case):
from scipy.stats import gaussian_kde
def find_range(column):
    """Locate the mode of ``column`` from a Gaussian kernel density estimate.

    The KDE is evaluated on ``len(column)`` evenly spaced points spanning
    ``column.min()`` to ``column.max()``. The estimated density is shown as a
    scatter plot, and the grid position with the highest density is returned
    in a one-element list.
    """
    density = gaussian_kde(column)
    grid = np.linspace(column.min(), column.max(), len(column))
    heights = density.evaluate(grid)
    peak_position = grid[np.argmax(heights)]

    # Visual check of the estimated density curve.
    plt.scatter(grid, heights)
    plt.show()
    return [peak_position]
But I need to find the range of the most dominant values of the histogram (in this histogram for example: -30 : -5).
Something like the value on each side of the peak where the probability drops to 20% of the maximum probability.
How can I achieve it?
I had tried the following:
t_right = list(filter(lambda tup:np.logical_and(tup[1] > maxima , probs[tup[0]] <= max(probs)*0.2), enumerate(samples)))
but this returns many values; I want only the single value on each side where the threshold cuts the curve.
I'm not sure if this is what you are looking for, but I found this article on Towards Data Science. The code from that article is as follows:
Link: https://towardsdatascience.com/take-your-histograms-to-the-next-level-using-matplotlib-5f093ad7b9d3
# Histogram + KDE of `avocado` with dotted vertical lines at selected quantiles.
# NOTE(review): `avocado`, `ax` and `plt` are not defined in this snippet;
# presumably `ax` is the Matplotlib axes the pandas plots draw on -- confirm.

# Plot histogram
avocado.plot(kind = "hist", density = True, alpha = 0.65, bins = 15) # change density to true, because KDE uses density
# Plot KDE
avocado.plot(kind = "kde")

# Quantile lines
quant_5, quant_25, quant_50, quant_75, quant_95 = avocado.quantile(0.05), avocado.quantile(0.25), avocado.quantile(0.5), avocado.quantile(0.75), avocado.quantile(0.95)
# Each entry is [x position, line alpha, ymax as a fraction of the axes height].
quants = [[quant_5, 0.6, 0.16], [quant_25, 0.8, 0.26], [quant_50, 1, 0.36], [quant_75, 0.8, 0.46], [quant_95, 0.6, 0.56]]
for i in quants:
    ax.axvline(i[0], alpha = i[1], ymax = i[2], linestyle = ":")

# X
ax.set_xlabel("Average Price ($)")
# Limit x range to 0-4
x_start, x_end = 0, 4
ax.set_xlim(x_start, x_end)

# Y
ax.set_ylim(0, 1)
ax.set_yticklabels([])
ax.set_ylabel("")

# Annotations
ax.text(quant_5-.1, 0.17, "5th", size = 10, alpha = 0.8)
ax.text(quant_25-.13, 0.27, "25th", size = 11, alpha = 0.85)
ax.text(quant_50-.13, 0.37, "50th", size = 12, alpha = 1)
ax.text(quant_75-.13, 0.47, "75th", size = 11, alpha = 0.85)
ax.text(quant_95-.25, 0.57, "95th Percentile", size = 10, alpha =.8)

# Overall
ax.grid(False)
ax.set_title("Avocado Prices in U.S. Markets", size = 17, pad = 10)

# Remove ticks and spines
ax.tick_params(left = False, bottom = False)
# Iterate over the spine objects only, so the loop does not rebind `ax`.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.show()
The output of above is something like that:
I hope that could be helpful for you! :)
This is my solution, will be glad to get other ideas:
from scipy.stats import gaussian_kde
def find_range(column):
    """Return the dominant value range of ``column`` around its KDE peak.

    A Gaussian KDE is evaluated on ``len(column)`` evenly spaced points
    between ``column.min()`` and ``column.max()``. Starting from the highest
    point of that curve, the nearest sample on each side whose density has
    dropped to 20% of the peak density (or below) marks the range limits.
    The sampled density is also shown as a scatter plot.

    Args:
        column: data exposing ``min()``, ``max()`` and ``len()`` and accepted
            by ``scipy.stats.gaussian_kde``.

    Returns:
        ``[t_left, maxima, t_right]``: left limit, peak position, right limit.
        If the density never falls to the threshold on one side, the edge of
        the sampled range is returned for that side.
    """
    kde = gaussian_kde(column)
    no_samples = len(column)
    samples = np.linspace(column.min(), column.max(), no_samples)
    probs = kde.evaluate(samples)
    maxima_index = probs.argmax()
    maxima = samples[maxima_index]
    threshold = probs[maxima_index] * 0.2

    # Look for the first threshold crossing on each side of the peak rather
    # than for samples that happen to equal the threshold within a tolerance:
    # a tolerance match can be empty, or can mix several unrelated crossings.
    below = probs <= threshold
    right_hits = np.flatnonzero(below[maxima_index:])
    left_hits = np.flatnonzero(below[:maxima_index])
    t_right = samples[maxima_index + right_hits[0]] if right_hits.size else samples[-1]
    t_left = samples[left_hits[-1]] if left_hits.size else samples[0]

    plt.scatter(samples, probs)
    plt.show()
    return [t_left, maxima, t_right]
If more than one value is retrieved for t_right/t_left (because of the abs_tol tolerance), the median is used to reduce them to a single value.
Related
dataset: https://github.com/rashida048/Datasets/blob/master/StudentsPerformance.csv
from bokeh.models import Range1d #used to set x and y limits #p.y_range=Range1d(120, 230)
def box_plot(df, vals, label, ylabel=None, xlabel=None, title=None):
    """Build a Bokeh box plot of ``df[vals]`` with one box per ``label`` group.

    Args:
        df: DataFrame holding the data.
        vals: name of the numeric column to summarise.
        label: name of the column to group by; each group becomes one box.
        ylabel: y-axis label.
        xlabel: x-axis label.
        title: figure title.

    Returns:
        The Bokeh figure with stems, boxes, median marks, whiskers and
        outlier points drawn.
    """
    # Group Data frame
    df_gb = df.groupby(label)
    # Get the categories
    cats = list(df_gb.groups.keys())

    # Compute quartiles for each group
    q1 = df_gb[vals].quantile(q=0.25)
    q2 = df_gb[vals].quantile(q=0.5)
    q3 = df_gb[vals].quantile(q=0.75)

    # Compute interquartile region and upper and lower bounds for outliers
    iqr = q3 - q1
    upper_cutoff = q3 + 1.5*iqr
    lower_cutoff = q1 - 1.5*iqr

    # Find the outliers for each category
    def outliers(group):
        cat = group.name
        outlier_inds = (group[vals] > upper_cutoff[cat]) \
            | (group[vals] < lower_cutoff[cat])
        return group[vals][outlier_inds]

    # Apply outlier finder
    out = df_gb.apply(outliers).dropna()

    # Points of outliers for plotting
    outx = []
    outy = []
    for cat in cats:
        # only add outliers if they exist
        if cat in out and not out[cat].empty:
            for value in out[cat]:
                outx.append(cat)
                outy.append(value)

    # If outliers, shrink whiskers to smallest and largest non-outlier
    qmin = df_gb[vals].min()
    qmax = df_gb[vals].max()
    upper = [min([x, y]) for (x, y) in zip(qmax, upper_cutoff)]
    lower = [max([x, y]) for (x, y) in zip(qmin, lower_cutoff)]

    # The categorical x-range needs string factors.
    cats = [str(i) for i in cats]

    # Build figure
    p = figure(sizing_mode='stretch_width', x_range=cats, height=300, toolbar_location=None)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_width = 2
    p.yaxis.axis_label = ylabel
    p.xaxis.axis_label = xlabel
    p.title = title
    p.y_range.start = 0
    p.title.align = 'center'

    # stems
    p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
    p.segment(cats, lower, cats, q1, line_width=2, line_color="black")

    # boxes: one colour per category, cycling through the palette so the
    # colour list always has the same length as the data.
    palette = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9']
    box_colors = [palette[i % len(palette)] for i in range(len(cats))]
    p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=box_colors,
           alpha=0.7, line_width=2, line_color="black")

    # median (almost-0 height rects simpler than segments)
    p.rect(cats, q2, 0.5, 0.01, line_color="black", line_width=2)

    # whiskers (almost-0 height rects simpler than segments)
    p.rect(cats, lower, 0.2, 0.01, line_color="black")
    p.rect(cats, upper, 0.2, 0.01, line_color="black")

    # outliers
    p.circle(outx, outy, size=6, color="black")
    return p
# Build one box of the 'Total' column per 'race/ethnicity' group and display it.
p = box_plot(df, 'Total', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
show(p)
Hi there. With the code and dataset above I can produce a box plot as long as I pass a categorical variable to group by. However, I am unable to produce anything when I try to draw a box plot for a single column, for example just checking the spread of the math scores. I tried:
cats = df['math score']
but it didnt work. any suggestions?
I am not sure it is best to implement both cases in one function, but if that is your goal, one solution is to add a few if-else conditions.
Here is a description of the changes:
First give label a default.
# old
# def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# new
def box_plot(df, vals, label=None, ylabel=None,xlabel=None,title=None):
Then add a if-else part for the groupby section.
# old
# # Group Data frame
# df_gb = df.groupby(label)
# # Get the categories
# cats = list(df_gb.groups.keys())
# new
if label is not None:
# Group Data frame
df_gb = df.groupby(label)
# Get the categories
cats = list(df_gb.groups.keys())
else:
df_gb = df[[vals]]
cats = [vals]
Now the calculation of the outliers is a bit different, because we don't have to loop over several groups. Only one column is left.
if label is not None:
out = df_gb.apply(outliers).dropna()
else:
out = df[(df[vals] > upper_cutoff) | (df[vals] < lower_cutoff)]
The upper and lower part are now floats and not a list.
if label is not None:
upper = [min([x,y]) for (x,y) in zip(qmax, upper_cutoff)]
lower = [max([x,y]) for (x,y) in zip(qmin, lower_cutoff)]
else:
upper =min(qmax, upper_cutoff)
lower =max(qmin, lower_cutoff)
I also added (changed) the line below, to avoid a warning.
colors = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'][:len(cats)]
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors, alpha=0.7, line_width=2, line_color="black")
With these changes the output for
p = box_plot(df, 'math score', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
is still the same, but
p = box_plot(df, 'math score', ylabel='Total spread',xlabel='',title='BoxPlot')
gives us now a boxplot.
I have created a histogram using matplotlib of my experimental data, which consists of the value measured and the weight. Using the weights argument of plt.hist it is no problem weighting together the events, but when I look at options for errorbars none seem to take event weights into account. There are solutions to this problem where Poisson errors or the same error is used everywhere, like this one, but that does not solve my problem.
The error of one bin should mathematically be calculated as err(bin) = sqrt( sum {w_i^2} ) where w_i are the individual weights of the events that belong in that bin.
A simplified example of my histogram is given below.
import matplotlib.pyplot as plt

# Measured values and the weight attached to each measurement.
data = [1, 8, 5, 4, 1, 10, 8, 3, 6, 7]
weights = [1.3, 0.2, 0.01, 0.9, 0.4, 1.05, 0.6, 0.6, 0.8, 1.8]

# Four equal-width bins covering 0 to 10; each bar is the sum of the weights
# of the values falling in that bin.
bin_edges = [0.0, 2.5, 5.0, 7.5, 10.0]
plt.hist(data, bins=bin_edges, weights=weights)
plt.show()
You have to manually compute the errors for each bin and plot that separately.
import matplotlib.pyplot as plt  # type: ignore
import numpy as np  # type: ignore

# Weighted histogram with per-bin error bars err = sqrt(sum(w_i ** 2)).
data = np.array([1, 8, 5, 4, 1, 10, 8, 3, 6, 7])
weights = np.array([1.3, 0.2, 0.01, 0.9, 0.4, 1.05, 0.6, 0.6, 0.8, 1.8])
bin_edges = [0.0, 2.5, 5.0, 7.5, 10.0]
bin_y, _, bars = plt.hist(data, bins=bin_edges, weights=weights)
print(f"bin_y {bin_y}")
print(f"bin_edges {bin_edges}")

errors = []
bin_centers = []
last_bin_index = len(bin_edges) - 2
for bin_index in range(len(bin_edges) - 1):
    # find which data points are inside this bin
    bin_left = bin_edges[bin_index]
    bin_right = bin_edges[bin_index + 1]
    # Use the same membership rule as the histogram: every bin is
    # [left, right) except the last one, which also includes its right edge.
    # Otherwise a value sitting exactly on an edge (5.0 here) contributes to
    # the bar of one bin but to the error of its neighbour.
    if bin_index == last_bin_index:
        in_bin = np.logical_and(bin_left <= data, data <= bin_right)
    else:
        in_bin = np.logical_and(bin_left <= data, data < bin_right)
    print(f"in_bin {in_bin}")

    # filter the weights to only those inside the bin
    weights_in_bin = weights[in_bin]
    print(f"weights_in_bin {weights_in_bin}")

    # compute the error however you want
    error = np.sqrt(np.sum(weights_in_bin ** 2))
    errors.append(error)
    print(f"error {error}")

    # save the center of the bins to plot the errorbar in the right place
    bin_center = (bin_right + bin_left) / 2
    bin_centers.append(bin_center)
    print(f"bin_center {bin_center}")

# plot the error bars
plt.errorbar(bin_centers, bin_y, yerr=errors, linestyle="none")
plt.show()
Which produces this
By the time you added the edits I had done the plot with the stddev for each bin, just change errors to stddevs computed as
data_in_bin = data[in_bin]
variance = np.average((data_in_bin - bin_center) ** 2, weights=weights_in_bin)
stddev = np.sqrt(variance)
print(f"stddev {stddev}")
stddevs.append(stddev)
But you should check that the stddev computation makes sense for your use case. This results in :
Cheers!
I am really confused by the function pywt.cwt, as I've not been able to get it to work. The function seems to integrate instead of differentiating. I would like to work it as the following: Example CWT, but my graph looks like this: My CWT. The idea is to integrate the raw signal (av) with cumtrapz, then differentiate with a gaussian CWT (=> S1), and then once more differentiate with gaussian CWT (=> S2).
As you can see in the pictures, the bottom peaks of the red line should line up with the valleys, but for me they land under the top peaks, and the green line should move a quarter period to the left but moves to the right instead... which makes me think it integrates for some reason.
I currently have no idea what causes this... Does anyone happen to know what is going on?
Thanks in advance!
# Band-pass filter the signal, integrate it, then apply the 'gaus1' CWT twice
# (S1, S2) and plot the curves with the minima of S1 and the maxima of S2.

# Sampling rate: used to normalise the filter band to the Nyquist frequency
# and, as 1/fs, as the sampling period handed to pywt.cwt.
fs = 55.2

#Get data from pandas
av = dfRange['y']

#remove gravity & turns av right way up
av = av - dfRange['y'].mean()
av = av * -1

#Filter
[b,a] = signal.butter(4, [0.9/(fs/2), 20/(fs/2)], 'bandpass')
av = signal.filtfilt(b,a, av)

#Integrate and differentiate av => S1
# NOTE(review): newer SciPy releases name this function
# `cumulative_trapezoid`; confirm the installed version still provides
# `cumtrapz`. The result is one sample shorter than `av`, hence the `[:-1]`
# on the time axis when plotting below.
integrated_av = integrate.cumtrapz(av)
[CWT_av1, frequency1] = pywt.cwt(integrated_av, 8.8 , 'gaus1', 1/fs)
CWT_av1 = CWT_av1[0]
CWT_av1 = CWT_av1 * 0.05

#differentiate S1 => S2
[CWT_av2, frequency2] = pywt.cwt(CWT_av1, 8.8 , 'gaus1', 1/fs)
CWT_av2 = CWT_av2[0]
CWT_av2 = CWT_av2 * 0.8

#Find Peaks
inv_CWT_av1 = CWT_av1 * -1
av1_min, _ = signal.find_peaks(inv_CWT_av1)
av2_max, _ = signal.find_peaks(CWT_av2)

#Plot
plt.style.use('seaborn')
plt.figure(figsize=(25, 7), dpi = 300)
plt.plot_date(dfRange['recorded_naive'], av, linestyle = 'solid', marker = None, color = 'steelblue')
plt.plot_date(dfRange['recorded_naive'][:-1], CWT_av1[:], linestyle = 'solid', marker = None, color = 'red')
# Marker-only format string: the colour comes from `color=` alone, so it is
# not specified twice with conflicting values.
plt.plot(dfRange['recorded_naive'].iloc[av1_min], CWT_av1[av1_min], "o", color = 'red')
plt.plot_date(dfRange['recorded_naive'][:-1], CWT_av2[:], linestyle = 'solid', marker = None, color = 'green')
plt.plot(dfRange['recorded_naive'].iloc[av2_max], CWT_av2[av2_max], "o", color = 'green')
plt.gcf().autofmt_xdate()
plt.show()
I'm not sure this is your answer, but an observation from playing with pywt...
From the documentation the wavelets are basically given by the differentials of a Gaussian but there is an order dependent normalisation constant.
Plotting the derivatives of a Gaussian against the wavelets (extracted by feeding in a unit impulse) gives the following:
The interesting observation is that the order dependent normalisation constant sometimes seems to include a '-1'. In particular, it does for the first order gaus1.
So, my question is, could you actually have differentiation as you expect, but also multiplication by -1?
Code for the graph:
import numpy as np
import matplotlib.pyplot as plt
import pywt

# Compare the numerically differentiated Gaussian (orders 1-8) with the
# response of the 'gaus1'..'gaus8' wavelets to a unit impulse, each curve
# scaled to a peak magnitude of 1.
dt = 0.01
t = dt * np.arange(100)
mid = len(t) // 2

# Gaussian y = exp(-(x - x_0) ^ 2 / dt) centred on the middle sample, then
# repeated numerical differentiation: gaus_quad[k] is the (k+1)-th derivative.
ctr = t[mid]
gaus = np.exp(-np.power(t - ctr, 2)/dt)
gaus_quad = [np.gradient(gaus, dt)]
for _ in range(7):
    gaus_quad.append(np.gradient(gaus_quad[-1], dt))

# A single impulse in the middle of the record makes the CWT output trace
# out the wavelet itself.
y = np.zeros(len(t))
y[mid] = 1
gaus_cwt = [pywt.cwt(y, 10, f'gaus{i}', dt)[0][0] for i in range(1, 9)]

fig, axs = plt.subplots(4, 2)
for i, (ax, wavelet, derivative) in enumerate(zip(axs.flatten(), gaus_cwt, gaus_quad)):
    ax.plot(t, wavelet / np.max(np.abs(wavelet)))
    ax.plot(t, derivative / np.max(np.abs(derivative)))
    ax.set_title(f'gaus {i+1}', x=0.2, y=1.0, pad=-14)
    ax.axhline(0, c='k')
    ax.set_xticks([])
    ax.set_yticks([])
I am fitting some data I have with a function. With the fitted function I want to see where it crosses some threshold value, determine this threshold value and get an error on this value as well. When using the curve_fit toolbox with the absolute_sigma = True, I get a large error in the threshold value. When I turn it off, I get an extremely small error in the threshold value, both which seem not very realistic to me.
#The numbers I use for the fit
# absaverage: the level whose crossing point with the fitted curve is sought.
absaverage = 1.0979598702453246
# Nlist / averagelist: x and y values of the data points being fitted.
Nlist = [0.31974162, 0.52558248, 0.77172549, 1.34829036, 1.91809528, 3.08342098, 5.33183816, 6.60457399, 5.93014992]
averagelist = [0.294913849040662, 0.4648514538342791,0.6496807339899529,1.014641822518085,1.2981207560776595,1.703857575892128,2.0964571453613123,2.1999054339799295,2.151973289007941]
#%%Now fitting the threshold data to obtain <threshold value>
def fittie(x, a, b):
    """Evaluate the saturating exponential ``a * (1 - exp(-b * x))``.

    The value is 0 at ``x = 0`` and approaches ``a`` as ``b * x`` grows.
    """
    decay = np.exp(-b*x)
    return a*(1 - decay)
# Fit `fittie` to the data, plot data and fit on log-log axes, then solve the
# fitted curve for the x value where it reaches `absaverage` and propagate the
# fit covariance into an uncertainty on that x value.

#guess values for the fit
aguess = 2.3
bguess = 0.42
popt,pcov = curve_fit(fittie, Nlist, averagelist, p0=[aguess,bguess], maxfev = 20000)

Nlistforplot = np.arange(0.1, 7, 0.05)
f=plt.figure()
plt.plot(Nlist, averagelist, 'bo', label='data', markersize = 5)
plt.plot(Nlistforplot, fittie(Nlistforplot, *popt), 'r--', label='fit', linewidth = 1)
plt.axhline(y = absaverage, color = 'black', lw = 1)
plt.axvline(1.54, color = 'black', lw = 1)
plt.fill_between([0,12], absaverage, absaverage+max(averagelist)+100, color = 'blue', alpha = 0.15)
plt.ylabel('-Value',fontsize='xx-large')
plt.xlabel('Nlist',fontsize='xx-large')
plt.yscale('log')
plt.xscale('log')
plt.xlim(0.2,max(Nlist)+1)
plt.ylim(0.15,max(averagelist)+1)
plt.title('Threshold determination')
plt.legend()
plt.show()

afromfit = popt[0]
bfromfit = popt[1]

# Invert a*(1 - exp(-b*x)) = absaverage for x.
# print(...) with a single argument behaves the same on Python 2 and 3.
print("your threshold value is")
thresholdvalue = -(1/bfromfit)*np.log(1-(absaverage/afromfit))
print(thresholdvalue)

#ERROR IN THRESHOLD PROPAGATION
# First-order propagation: partial derivatives of the threshold with respect
# to b and a, combined with the variances and the a-b covariance from pcov.
dfdb = (1/bfromfit**2)*np.log(1-(absaverage/afromfit))
dfda = -(1/bfromfit)*(1/(1-absaverage/afromfit))*(absaverage/(afromfit**2))
sigmax2 = dfda**2*pcov[0,0]+dfdb**2*pcov[1,1]+2*dfda*dfdb*pcov[1,0]
print("sigma in threshold value is")
print(sigmax2**0.5)
So I obtain the same threshold value of about 1.50. The errors seem completely off though, either too large or too small. Any idea?
I would like to know whether there is a method that tells me how long my x-axis should be. I have a data set with various outliers. I can simply cut them off with plt.xlim(), but is there a statistical method to compute a sensible x-axis limit? In the attached picture a logical cut would be after 150 km of driven distance. Being able to compute that cut-off threshold would be perfect.
The dataframe that the definition gets is a standard pandas dataframe
Code:
def yearly_distribution(dataframe):
    """Plot the 'Distance' histogram with several fitted density curves.

    The sorted ``dataframe['Distance']`` values are drawn as a normalised
    histogram on 0.5-wide bins from 0 to 500. GEV, log-normal, Weibull, Burr
    and generalised Pareto distributions are fitted to the data and their
    PDFs are overlaid, followed by an 8-component mixture-model density.
    """
    h = sorted(dataframe['Distance'])
    plt.subplots(figsize=(16, 9))
    binwidth = np.arange(0, 501, 0.5)
    # `density=True` replaces the `normed` keyword, which current Matplotlib
    # no longer accepts; the bars are still scaled to integrate to 1.
    plt.hist(h, bins=binwidth, density=True, facecolor='#023d6b', alpha=0.5, histtype='bar')
    lnspc = np.arange(0, 500.5, 0.5)

    # Fit each candidate distribution and overlay its PDF on the histogram.
    candidates = [
        (gev, "GEV"),
        (stats.lognorm, "LogNormal"),
        (stats.weibull_min, "Weibull"),
        (stats.burr, "Burr Distribution"),
        (stats.genpareto, "Generalized Pareto"),
    ]
    for dist, dist_label in candidates:
        fit_params = dist.fit(h)
        plt.plot(lnspc, dist.pdf(lnspc, *fit_params), label=dist_label)

    # Mixture model: both the data and the evaluation grid are reshaped to
    # column vectors before fitting / scoring.
    # NOTE(review): `GMM` is defined outside this snippet; the code relies on
    # `clf.score` returning one log-density per grid point -- confirm against
    # the library version in use.
    myarray = np.array(h)
    clf = GMM(8, n_iter=500, random_state=3)
    myarray.shape = (myarray.shape[0], 1)
    clf = clf.fit(myarray)
    lnspc.shape = (lnspc.shape[0], 1)
    pdf_gmm = np.exp(clf.score(lnspc))
    plt.plot(lnspc, pdf_gmm, label="GMM")

    plt.xlim(0, 500)
    plt.xlabel('Distance')
    plt.ylabel('Probability')
    plt.title('Histogram')
    plt.ylim(0, 0.05)
You should remove the outliers from your data before any plotting or fitting:
h=sorted(df_distr['Distance'])
out_threshold= 150.0
h=[i for i in h if i<out_threshold]
EDIT
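This may not be the fastest way, but the threshold can be computed with numpy.std() (the data is mirrored around zero so that the standard deviation measures the spread about 0):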
out_threshold= 2.0*np.std(h+[-a for a in h])