how can I create a single box plot? - python

dataset: https://github.com/rashida048/Datasets/blob/master/StudentsPerformance.csv
from bokeh.models import Range1d #used to set x and y limits #p.y_range=Range1d(120, 230)
def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# Group Data frame
df_gb = df.groupby(label)
# Get the categories
cats = list(df_gb.groups.keys())
# Compute quartiles for each group
q1 = df_gb[vals].quantile(q=0.25)
q2 = df_gb[vals].quantile(q=0.5)
q3 = df_gb[vals].quantile(q=0.75)
# Compute interquartile region and upper and lower bounds for outliers
iqr = q3 - q1
upper_cutoff = q3 + 1.5*iqr
lower_cutoff = q1 - 1.5*iqr
# Find the outliers for each category
def outliers(group):
cat = group.name
outlier_inds = (group[vals] > upper_cutoff[cat]) \
| (group[vals] < lower_cutoff[cat])
return group[vals][outlier_inds]
# Apply outlier finder
out = df_gb.apply(outliers).dropna()
# Points of outliers for plotting
outx = []
outy = []
for cat in cats:
# only add outliers if they exist
if cat in out and not out[cat].empty:
for value in out[cat]:
outx.append(cat)
outy.append(value)
# If outliers, shrink whiskers to smallest and largest non-outlier
qmin = df_gb[vals].min()
qmax = df_gb[vals].max()
upper = [min([x,y]) for (x,y) in zip(qmax, upper_cutoff)]
lower = [max([x,y]) for (x,y) in zip(qmin, lower_cutoff)]
cats = [str(i) for i in cats]
# Build figure
p = figure(sizing_mode='stretch_width', x_range=cats,height=300,toolbar_location=None)
p.xgrid.grid_line_color = None
p.ygrid.grid_line_width = 2
p.yaxis.axis_label = ylabel
p.xaxis.axis_label = xlabel
p.title=title
p.y_range.start=0
p.title.align = 'center'
# stems
p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
p.segment(cats, lower, cats, q1, line_width=2, line_color="black")
# boxes
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'],
alpha=0.7, line_width=2, line_color="black")
# median (almost-0 height rects simpler than segments)
p.rect(cats, q2, 0.5, 0.01, line_color="black", line_width=2)
# whiskers (almost-0 height rects simpler than segments)
p.rect(cats, lower, 0.2, 0.01, line_color="black")
p.rect(cats, upper, 0.2, 0.01, line_color="black")
# outliers
p.circle(outx, outy, size=6, color="black")
return p
p = box_plot(df, 'Total', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
show(p)
Hi there, from the code and dataset above I am able to produce a boxplot considering I pass through categorical variables. however I am unable to produce anything when I try to produce a boxplot for a single column. for example just checking the spread of the math scores. i tried to do
cats = df['math score']
but it didnt work. any suggestions?

I am not sute if this it is the best to implement this both in one function, but if this is your goal, one solution can be, to add a few if-else conditions.
Here is a description of the changes:
First give label a default.
# old
# def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# new
def box_plot(df, vals, label=None, ylabel=None,xlabel=None,title=None):
Then add a if-else part for the groupby section.
# old
# # Group Data frame
# df_gb = df.groupby(label)
# # Get the categories
# cats = list(df_gb.groups.keys())
# new
if label is not None:
# Group Data frame
df_gb = df.groupby(label)
# Get the categories
cats = list(df_gb.groups.keys())
else:
df_gb = df[[vals]]
cats = [vals]
Now the calculation for the outliners is a bit different, because we don't have to loop over a number of columns. Only onw column is left.
if label is not None:
out = df_gb.apply(outliers).dropna()
else:
out = df[(df[vals] > upper_cutoff) | (df[vals] < lower_cutoff)]
The upper and lower part are now floats and not a list.
if label is not None:
upper = [min([x,y]) for (x,y) in zip(qmax, upper_cutoff)]
lower = [max([x,y]) for (x,y) in zip(qmin, lower_cutoff)]
else:
upper =min(qmax, upper_cutoff)
lower =max(qmin, lower_cutoff)
I also added (changed) the line below, to avoid a warning.
colors = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'][:len(cats)]
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors, alpha=0.7, line_width=2, line_color="black")
With these changes the output for
p = box_plot(df, 'math score', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
is still the same, but
p = box_plot(df, 'math score', ylabel='Total spread',xlabel='',title='BoxPlot')
gives us now a boxplot.

Related

Best range of dominant values of histogram curve

I have such histogram:
and I have this code that finds the maxima (-21.5 in my case):
from scipy.stats import gaussian_kde
def find_range(column):
kde = gaussian_kde(column)
no_samples = len(column)
samples = np.linspace(column.min(), column.max(), no_samples)
probs = kde.evaluate(samples)
maxima_index = probs.argmax()
maxima = samples[maxima_index]
plt.scatter(samples, probs) #, color='b',linewidths=0.05)
plt.show()
return [maxima]
But I need to find the range of the most dominant values of the histogram (in this histogram for example: -30 : -5).
Something like, the value from both sides where it's probability is equal to 20% of the maxima probability.
How can I achieve it?
I had tried the following:
t_right = list(filter(lambda tup:np.logical_and(tup[1] > maxima , probs[tup[0]] <= max(probs)*0.2), enumerate(samples)))
but getting many values, I want only one value that cut the curve
I'm not sure if that is what you are looking for but I've found this article on Towards data Science code form that article is as follow:
Link: https://towardsdatascience.com/take-your-histograms-to-the-next-level-using-matplotlib-5f093ad7b9d3
# Plot
# Plot histogram
avocado.plot(kind = "hist", density = True, alpha = 0.65, bins = 15) # change density to true, because KDE uses density
# Plot KDE
avocado.plot(kind = "kde")
# Quantile lines
quant_5, quant_25, quant_50, quant_75, quant_95 = avocado.quantile(0.05), avocado.quantile(0.25), avocado.quantile(0.5), avocado.quantile(0.75), avocado.quantile(0.95)
quants = [[quant_5, 0.6, 0.16], [quant_25, 0.8, 0.26], [quant_50, 1, 0.36], [quant_75, 0.8, 0.46], [quant_95, 0.6, 0.56]]
for i in quants:
ax.axvline(i[0], alpha = i[1], ymax = i[2], linestyle = ":")
# X
ax.set_xlabel("Average Price ($)")
# Limit x range to 0-4
x_start, x_end = 0, 4
ax.set_xlim(x_start, x_end)
# Y
ax.set_ylim(0, 1)
ax.set_yticklabels([])
ax.set_ylabel("")
# Annotations
ax.text(quant_5-.1, 0.17, "5th", size = 10, alpha = 0.8)
ax.text(quant_25-.13, 0.27, "25th", size = 11, alpha = 0.85)
ax.text(quant_50-.13, 0.37, "50th", size = 12, alpha = 1)
ax.text(quant_75-.13, 0.47, "75th", size = 11, alpha = 0.85)
ax.text(quant_95-.25, 0.57, "95th Percentile", size = 10, alpha =.8)
# Overall
ax.grid(False)
ax.set_title("Avocado Prices in U.S. Markets", size = 17, pad = 10)
# Remove ticks and spines
ax.tick_params(left = False, bottom = False)
for ax, spine in ax.spines.items():
spine.set_visible(False)
plt.show()
The output of above is something like that:
I hope that could be helpful for you! :)
This is my solution, will be glad to get other ideas:
from scipy.stats import gaussian_kde
def find_range(column):
kde = gaussian_kde(column)
no_samples = len(column)
samples = np.linspace(column.min(), column.max(), no_samples)
probs = kde.evaluate(samples)
maxima_index = probs.argmax()
maxima = samples[maxima_index]
t_right_list = list(filter(lambda tup:np.logical_and(tup[1] > maxima , math.isclose(probs[tup[0]], max(probs)*0.2, abs_tol=0.00001) ), enumerate(samples)))
t_right = np.median(list(zip(*t_right_list))[1])
t_left_list = list(filter(lambda tup:np.logical_and(tup[1] < maxima , math.isclose(probs[tup[0]], max(probs)*0.2, abs_tol=0.00001) ), enumerate(samples)))
t_left = np.median(list(zip(*t_left_list))[1])
plt.scatter(samples, probs) #, color='b',linewidths=0.05)
plt.show()
return [t_left, maxima, t_right]
In case more than one value will be retrieved in t_right/t_left (because of abs_tol param value), then median can be used (in order to get only one value)

Jupyter noterbook How to show percentage on bar graph for this code?

I want to show the percentage on the bar graph from the code below, how do I do it?
results = pd.Series([accu_dt , accu_svm, accu_rf, accu_lg, accu_knn, accu_nb ])
names = ['Decision Tree','SVm','Random Forest','Logistic Regression','KNN','Naive Bayes']
ax = results.plot(kind = 'bar',figsize=(13,7),color=['black','gray','brown','blue','pink','green'])
ax.set_title('Comparision of Models',fontsize=15)
ax.set_yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])
ax.set_xticklabels(names ,fontsize=15,rotation = 45)
ax.set_xlabel("Models",fontsize=15)
ax.set_ylabel("Accuracy",fontsize=15)
Is this your expected output?
If so, credit to Chris Adams for his answer.
I predefined the values for accu_dt , accu_svm, accu_rf, accu_lg, accu_knn, accu_nb as such:
accu_dt = [0.90,0.92,0.91,0.95,0.99,0.95,0.90]
accu_svm = [0.89,0.92,0.95,0.98,0.97,0.89,0.95]
accu_rf = [0.98,0.99,0.97,0.96,0.98,0.99,0.95]
accu_lg = [0.79,0.77,0.90,0.85,0.83,0.80,0.78]
accu_knn = [0.85,0.85,0.84,0.89,0.83,0.81,0.80]
accu_nb = [0.85,0.84,0.83,0.81,0.85,0.85,0.85]
I didn't change much of your code except the width = 0.7 and the figsize=(30,7) to make the numbers more readable. Noticed that I added loop at the bottom of your code
ax = results.plot(kind = 'bar',figsize=(30,7),color=['black','gray','brown','blue','pink','green'],width = 0.7)
ax.set_title('Comparision of Models',fontsize=15)
ax.set_yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])
ax.set_xticklabels(names ,fontsize=15,rotation = 45)
ax.set_xlabel("Models",fontsize=15)
ax.set_ylabel("Accuracy",fontsize=15)
# Newly Added Loop
for p in ax.patches:
width = p.get_width()
height = p.get_height()
x, y = p.get_xy()
ax.annotate(f'{height:.0%}', (x + width/2, y + height*1.02), ha='center')
Basically what the code does in simple term:
Getting the heights, width and (x,y) coordinate of the bars
Set the annotation at designated (x,y) coordinate
Set the VALUE of the annotation equals the height of the bar (the '.0%' means percentage with zero decimal place)

spacing nodes at networkx/plotly network and labeling

I created a network using networkx and plotly as following:
edges = df.stack().reset_index()
edges.columns = ['var_1','var_2','correlation']
edges = edges.loc[ (edges['correlation'] < -0.6) | (edges['correlation'] > 0.6) & (edges['var_1'] != edges['var_2']) ].copy()
#create undirected graph with weights corresponding to the correlation magnitude
G0 = nx.from_pandas_edgelist(edges, 'var_1', 'var_2', edge_attr=['correlation'])
mst = G0
# assign colours to edges depending on positive or negative correlation
# assign edge thickness depending on magnitude of correlation
edge_colours = []
edge_width = []
for key, value in nx.get_edge_attributes(mst, 'correlation').items():
edge_colours.append(assign_colour(value))
edge_width.append(assign_thickness(value))
node_size = []
degrees = [val for key, val in dict(G0.degree).items()]
max_deg = max(degrees)
min_deg = min(degrees)
for value in degrees:
node_size.append(assign_node_size(value,min_deg,max_deg))
#draw the network:
nx.draw(mst, pos=nx.fruchterman_reingold_layout(mst),
node_size=15, edge_color=edge_colours, node_colour="black",
edge_width=0.2)
plt.show()
def get_coordinates(G=mst):
"""Returns the positions of nodes and edges in a format for Plotly to draw the network"""
# get list of node positions
pos = nx.fruchterman_reingold_layout(mst)
Xnodes = [pos[n][0] for n in mst.nodes()]
Ynodes = [pos[n][1] for n in mst.nodes()]
Xedges_red = []
Yedges_red = []
Xedges_green = []
Yedges_green = []
def insert_edge(Xedges, Yedges):
Xedges.extend([pos[e[0]][0], pos[e[1]][0], None])
Yedges.extend([pos[e[0]][1], pos[e[1]][1], None])
search_dict = nx.get_edge_attributes(mst, 'correlation')
for e in mst.edges():
correlation = search_dict[(e[0], e[1])]
if correlation <= 0 : # red_edges
insert_edge(Xedges_red, Yedges_red)
else:
insert_edge(Xedges_green, Yedges_green)
# x coordinates of the nodes defining the edge e
return Xnodes, Ynodes, Xedges_red, Yedges_red, Xedges_green, Yedges_green
node_label = list(mst.nodes())
node_label = [fix_string(x) for x in node_label]
# get coordinates for nodes and edges
Xnodes, Ynodes, Xedges_red, Yedges_red, Xedges_green, Yedges_green = get_coordinates()
external_data = [list(x) for x in coding_names_df.values]
external_data = {fix_string(x[0]): x[1] for x in external_data}
external_data2 = [list(y) for y in coding_names_df.values]
external_data2 = {fix_string(y[0]): y[2] for y in external_data2}
external_data3 = [list(z) for z in coding_names_df.values]
external_data3 = {fix_string(z[0]): z[3] for z in external_data3}
external_data4 = [list(s) for s in coding_names_df.values]
external_data4 = {fix_string(s[0]): s[4] for s in external_data4}
# =============================================================================
description = [f"<b>{index}) {node}</b>"
"<br><br>Realm: " +
"<br>" + external_data.get(node, 'No external data found') +
"<br><br>Type: " +
"<br>" + external_data2.get(node, 'No external data found')
for index, node in enumerate(node_label)]
# =============================================================================
# def nodes colours:
node_colour = [assign_node_colour(node, external_data3, coding_names_df) for node in node_label]
node_shape = [assign_node_shape(node, external_data4, coding_names_df) for node in node_label]
# edges
# negative:
tracer_red = go.Scatter(x=Xedges_red, y=Yedges_red,
mode='lines',
line= dict(color="#FA0000", width=1),
hoverinfo='none',
showlegend=False)
# positive:
tracer_green = go.Scatter(x=Xedges_green, y=Yedges_green,
mode='lines',
line= dict(color= "#29A401", width=1),
hoverinfo='none',
showlegend=False)
# nodes
tracer_marker = go.Scatter(x=Xnodes, y=Ynodes,
mode='markers+text',
textposition='top center',
marker=dict(size=node_size,
line=dict(width=0.8, color='black'),
color=node_colour,
symbol=node_shape),
hovertext=description,
hoverinfo='text',
textfont=dict(size=7),
showlegend=False)
axis_style = dict(title='',
titlefont=dict(size=20),
showgrid=False,
zeroline=False,
showline=False,
ticks='',
showticklabels=False)
layout = dict(title='',
width=1300,
height=900,
autosize=False,
showlegend=False,
xaxis=axis_style,
yaxis=axis_style,
hovermode='closest',
plot_bgcolor = '#fff')
fig = dict(data=[tracer_red, tracer_green, tracer_marker], layout=layout)
display(HTML("""<p>Node sizes are proportional to the size of annualised returns.<br>
Node colours signify positive or negative returns since beginning of the timeframe.</p> """))
plot(fig)
and I got this plot: network
I want to add labels, but it's getting too crowded (especially in the middle)
so I have two questions:
How can I spacing the middle? (but still to keep the fruchterman_reingold possition)
How can I add just a few specific labels?
any help will be graet! Tnx :)
Something you could try is setting the k parameter in the layout algorithm, which as mentioned in the docs it sets:
k: (float (default=None)) – Optimal distance between nodes. If None the distance is set to 1/sqrt(n) where n is the number of nodes. Increase this value to move nodes farther apart.
So by playing a bit with this value, and increasing accordingly we can get a more spread out layout and avoid overlap between node labels.
Here's a simple example to illustrate what the behavior is:
result_set = {('plant','tree'), ('tree','oak'), ('flower', 'rose'), ('flower','daisy'), ('plant','flower'), ('tree','pine'), ('plant','roots'), ('animal','fish'),('animal','bird'), ('bird','robin'), ('bird','falcon'), ('animal', 'homo'),('homo','homo-sapiens'), ('animal','reptile'), ('reptile','snake'),('fungi','mushroom'), ('fungi','mold'), ('fungi','toadstool'),('reptile','crocodile'), ('mushroom','Portabello'), ('mushroom','Shiitake'),('pine','roig'),('pine','pinyer'), ('tree','eucaliptus'),('rose','Floribunda'),('rose','grandiflora')}
G=nx.from_edgelist(result_set)
pos=nx.fruchterman_reingold_layout(G)
plt.figure(figsize=(8,5))
nx.draw(G, pos=pos,
with_labels=True,
nodesize=1000,
node_color='lightgreen')
And if we increase the value of k to say 0.5, we get a nice spreading of the nodes in the layout:
pos_spaced=nx.fruchterman_reingold_layout(G, k=0.5, iterations=100)
plt.figure(figsize=(10,6))
nx.draw(G,
pos=pos_spaced,
with_labels=True,
nodesize=1000,
node_color='lightgreen')
How can I add just a few specific labels?
For this you set the labels parameters in draw to a dictionary containing the labels you want displayed. In the case the node names are the same as the labels, just create a dictionary mapping node->node as follows:
show_labels = ['plant', 'tree', 'oak', 'eucaliptus']
pos_spaced=nx.fruchterman_reingold_layout(G, k=0.54, iterations=100)
plt.figure(figsize=(10,6))
nx.draw(G,
pos=pos_spaced,
with_labels=True,
nodesize=1000,
labels=dict(zip(show_labels,show_labels)),
node_color='lightgreen')

Giving Custom inter quartile range for Boxplot in Matplotlib

The Matplotlib or Seaborn box plot gives the interquartile range between the 25th percentile and 75th percentile. Is there a way to give custom interquartile range for the Boxplot ? I need to get the box plot such that the interquartile range is between 10th percentile and 90th percentile. Looked up on google and other sources, came to know about getting custom whiskers on the box plot but not custom interquartile range. Hoping would get some useful solutions here.
Yes, it is possible to plot a boxplot with box edges at any percentiles you desire.
Convention
With box and whisker plots it is convention to plot the 25th and 75th percentiles of the data. Thus, you should be aware that departing from this convention puts you at risk of misleading readers. You should also carefully consider what altering the box percentiles means to outlier classification and the whiskers of the boxplot.
Quick solution
A quick fix (ignoring any implications for whisker locations) is to compute the boxplot statistics we desire, alter the locations of q1 and q3, and then plot with ax.bxp:
import matplotlib.cbook as cbook
import matplotlib.pyplot as plt
import numpy as np
# Generate some random data to visualise
np.random.seed(2019)
data = np.random.normal(size=100)
stats = {}
# Compute the boxplot stats (as in the default matplotlib implementation)
stats['A'] = cbook.boxplot_stats(data, labels='A')[0]
stats['B'] = cbook.boxplot_stats(data, labels='B')[0]
stats['C'] = cbook.boxplot_stats(data, labels='C')[0]
# For box A compute the 1st and 99th percentiles
stats['A']['q1'], stats['A']['q3'] = np.percentile(data, [1, 99])
# For box B compute the 10th and 90th percentiles
stats['B']['q1'], stats['B']['q3'] = np.percentile(data, [10, 90])
# For box C compute the 25th and 75th percentiles (matplotlib default)
stats['C']['q1'], stats['C']['q3'] = np.percentile(data, [25, 75])
fig, ax = plt.subplots(1, 1)
# Plot boxplots from our computed statistics
ax.bxp([stats['A'], stats['B'], stats['C']], positions=range(3))
However, viewing the plot produced we see that altering q1 and q3 whilst leaving the whiskers unchanged may not be a sensible idea. You could counter this by recomputing eg. stats['A']['iqr'] and the whisker locations stats['A']['whishi'] and stats['A']['whislo'].
A more complete solution
Looking through matplotlib's source code we find that matplotlib uses matplotlib.cbook.boxplot_stats to compute the statistics used in the boxplot.
Within boxplot_stats we find the code q1, med, q3 = np.percentile(x, [25, 50, 75]). This is the line we can alter to change the plotted percentiles.
So a potential solution would be to make a copy of matplotlib.cbook.boxplot_stats and alter it as we desire. Here I call the function my_boxplot_stats and add an argument percents to make it easy to alter the locations of q1 and q3.
import itertools
from matplotlib.cbook import _reshape_2D
import matplotlib.pyplot as plt
import numpy as np
# Function adapted from matplotlib.cbook
def my_boxplot_stats(X, whis=1.5, bootstrap=None, labels=None,
autorange=False, percents=[25, 75]):
def _bootstrap_median(data, N=5000):
# determine 95% confidence intervals of the median
M = len(data)
percentiles = [2.5, 97.5]
bs_index = np.random.randint(M, size=(N, M))
bsData = data[bs_index]
estimate = np.median(bsData, axis=1, overwrite_input=True)
CI = np.percentile(estimate, percentiles)
return CI
def _compute_conf_interval(data, med, iqr, bootstrap):
if bootstrap is not None:
# Do a bootstrap estimate of notch locations.
# get conf. intervals around median
CI = _bootstrap_median(data, N=bootstrap)
notch_min = CI[0]
notch_max = CI[1]
else:
N = len(data)
notch_min = med - 1.57 * iqr / np.sqrt(N)
notch_max = med + 1.57 * iqr / np.sqrt(N)
return notch_min, notch_max
# output is a list of dicts
bxpstats = []
# convert X to a list of lists
X = _reshape_2D(X, "X")
ncols = len(X)
if labels is None:
labels = itertools.repeat(None)
elif len(labels) != ncols:
raise ValueError("Dimensions of labels and X must be compatible")
input_whis = whis
for ii, (x, label) in enumerate(zip(X, labels)):
# empty dict
stats = {}
if label is not None:
stats['label'] = label
# restore whis to the input values in case it got changed in the loop
whis = input_whis
# note tricksyness, append up here and then mutate below
bxpstats.append(stats)
# if empty, bail
if len(x) == 0:
stats['fliers'] = np.array([])
stats['mean'] = np.nan
stats['med'] = np.nan
stats['q1'] = np.nan
stats['q3'] = np.nan
stats['cilo'] = np.nan
stats['cihi'] = np.nan
stats['whislo'] = np.nan
stats['whishi'] = np.nan
stats['med'] = np.nan
continue
# up-convert to an array, just to be safe
x = np.asarray(x)
# arithmetic mean
stats['mean'] = np.mean(x)
# median
med = np.percentile(x, 50)
## Altered line
q1, q3 = np.percentile(x, (percents[0], percents[1]))
# interquartile range
stats['iqr'] = q3 - q1
if stats['iqr'] == 0 and autorange:
whis = 'range'
# conf. interval around median
stats['cilo'], stats['cihi'] = _compute_conf_interval(
x, med, stats['iqr'], bootstrap
)
# lowest/highest non-outliers
if np.isscalar(whis):
if np.isreal(whis):
loval = q1 - whis * stats['iqr']
hival = q3 + whis * stats['iqr']
elif whis in ['range', 'limit', 'limits', 'min/max']:
loval = np.min(x)
hival = np.max(x)
else:
raise ValueError('whis must be a float, valid string, or list '
'of percentiles')
else:
loval = np.percentile(x, whis[0])
hival = np.percentile(x, whis[1])
# get high extreme
wiskhi = np.compress(x <= hival, x)
if len(wiskhi) == 0 or np.max(wiskhi) < q3:
stats['whishi'] = q3
else:
stats['whishi'] = np.max(wiskhi)
# get low extreme
wisklo = np.compress(x >= loval, x)
if len(wisklo) == 0 or np.min(wisklo) > q1:
stats['whislo'] = q1
else:
stats['whislo'] = np.min(wisklo)
# compute a single array of outliers
stats['fliers'] = np.hstack([
np.compress(x < stats['whislo'], x),
np.compress(x > stats['whishi'], x)
])
# add in the remaining stats
stats['q1'], stats['med'], stats['q3'] = q1, med, q3
return bxpstats
With this in place we can compute our statistics and then plot with plt.bxp.
# Generate some random data to visualise
np.random.seed(2019)
data = np.random.normal(size=100)
stats = {}
# Compute the boxplot stats with our desired percentiles
stats['A'] = my_boxplot_stats(data, labels='A', percents=[1, 99])[0]
stats['B'] = my_boxplot_stats(data, labels='B', percents=[10, 90])[0]
stats['C'] = my_boxplot_stats(data, labels='C', percents=[25, 75])[0]
fig, ax = plt.subplots(1, 1)
# Plot boxplots from our computed statistics
ax.bxp([stats['A'], stats['B'], stats['C']], positions=range(3))
See that with this solution the whiskers are adjusted in our function based on our selected percentiles.:

Cutting outliers in Histogram (Python)

I wanted to know, if there is a method that shows me how long my x-axis should be. I have a record with different outliers. I can just cut them with plt.xlim() but is there a statistical method to compute a senseful x-axis limit? In the added picture a logical cut would be after 150 km drived distance. To compute the threshold of the cutting would be perfect
The dataframe that the definition gets is a standard pandas dataframe
Code:
def yearly_distribution(dataframe):
df_distr = dataframe
h=sorted(df_distr['Distance'])
l=len(h)
fig, ax =plt.subplots(figsize=(16,9))
binwidth = np.arange(0,501,0.5)
n, bins, patches = plt.hist(h, bins=binwidth, normed=1, facecolor='#023d6b', alpha=0.5, histtype='bar')
lnspc =np.arange(0,500.5,0.5)
gevfit = gev.fit(h)
pdf_gev = gev.pdf(lnspc, *gevfit)
plt.plot(lnspc, pdf_gev, label="GEV")
logfit = stats.lognorm.fit(h)
pdf_lognorm = stats.lognorm.pdf(lnspc, *logfit)
plt.plot(lnspc, pdf_lognorm, label="LogNormal")
weibfit = stats.weibull_min.fit(h)
pdf_weib = stats.weibull_min.pdf(lnspc, *weibfit)
plt.plot(lnspc, pdf_weib, label="Weibull")
burrfit = stats.burr.fit(h)
pdf_burr = stats.burr.pdf(lnspc, *burrfit)
plt.plot(lnspc, pdf_burr, label="Burr Distribution")
genparetofit = stats.genpareto.fit(h)
pdf_genpareto = stats.genpareto.pdf(lnspc, *genparetofit)
plt.plot(lnspc, pdf_genpareto, label ="Generalized Pareto")
myarray = np.array(h)
clf = GMM(8,n_iter=500, random_state=3)
myarray.shape = (myarray.shape[0],1)
clf = clf.fit(myarray)
lnspc.shape = (lnspc.shape[0],1)
pdf_gmm = np.exp(clf.score(lnspc))
plt.plot(lnspc, pdf_gmm, label = "GMM")
plt.xlim(0,500)
plt.xlabel('Distance')
plt.ylabel('Probability')
plt.title('Histogram')
plt.ylim(0,0.05)
you should remove outliers from your data before any plot or fitting :
h=sorted(df_distr['Distance'])
out_threshold= 150.0
h=[i for i in h if i<out_threshold]
EDIT
that maybe not the fastest way but with numpy.std() :
out_threshold= 2.0*np.std(h+[-a for a in h])

Categories

Resources