I need to center the bars of a histogram.
x = array
y = [0,1,2,3,4,5,6,7,8,9,10]
num_bins = len(array)
n, bins, patches = plt.hist(x, num_bins, facecolor='green', alpha=0.5)
barWidth=20
x.bar(x, y, width=barWidth, align='center')
plt.show()
What I need, is that it looks like the one in this picture
I tried almost everything, but still can't go through.
Thank you all
For your task, I think it's better to calculate the histogram with NumPy and plot with bat function. Please refer to a following code and see how to use bin_edges.
import matplotlib.pyplot as plt
import numpy as np
num_samples = 100
num_bins = 10
lb, ub = 0, 10 # lower bound, upper bound
# create samples
y = np.random.random(num_samples) * ub
# caluculate histogram
hist, bin_edges = np.histogram(y, num_bins, range=(lb, ub))
width = (bin_edges[1] - bin_edges[0])
# plot histogram
plt.bar(bin_edges[:-1], hist, align='center',
width=width, edgecolor='k', facecolor='green', alpha=0.5)
plt.xticks(range(num_bins))
plt.xlim([lb-width/2, ub-width/2])
plt.show()
Related
I am looking for a way to color the intervals below the curve with different colors; on the interval x < 0, I would like to fill the area under the curve with one color and on the interval x >= 0 with another color, like the following image:
This is the code for basic kde plot:
fig, (ax1) = plt.subplots(1, 1, figsize = ((plot_size + 1.5) * 1,(plot_size + 1.5)))
sns.kdeplot(data=pd.DataFrame(w_contrast, columns=['contrast']), x="contrast", ax=ax1);
ax1.set_xlabel(f"Dry Yield Posterior Contrast (kg)");
Is there a way to fill the area under the curve with different colors using seaborn?
seaborn is a high level api for matplotlib, so the curve will have to be calculated; similar to, but simpler than this answer.
Calculate the values for the kde curve with scipy.stats.gaussian_kde
Use matplotlib.pyplot.fill_between to fill the areas.
Use scipy.integrate.simpson to calculate the area under the curve, which will be passed to matplotlib.pyplot.annotate to annotate.
import seaborn as sns
from scipy.stats import gaussian_kde
from scipy.integrate import simps
import numpy as np
# load sample data
df = sns.load_dataset('planets')
# create the kde model
kde = gaussian_kde(df.mass.dropna())
# plot
fig, ax = plt.subplots(figsize=(9, 6))
g = sns.kdeplot(data=df.mass, ax=ax, c='k')
# remove margins; optional
g.margins(x=0, y=0)
# get the min and max of the x-axis
xmin, xmax = g.get_xlim()
# create points between the min and max
x = np.linspace(xmin, xmax, 1000)
# calculate the y values from the model
kde_y = kde(x)
# select x values below 0
x0 = x[x < 0]
# get the len, which will be used for slicing the other arrays
x0_len = len(x0)
# slice the arrays
y0 = kde_y[:x0_len]
x1 = x[x0_len:]
y1 = kde_y[x0_len:]
# calculate the area under the curves
area0 = np.round(simps(y0, x0, dx=1) * 100, 0)
area1 = np.round(simps(y1, x1, dx=1) * 100, 0)
# fill the areas
g.fill_between(x=x0, y1=y0, color='r', alpha=.5)
g.fill_between(x=x1, y1=y1, color='b', alpha=.5)
# annotate
g.annotate(f'{area0:.0f}%', xy=(-1, 0.075), xytext=(10, 0.150), arrowprops=dict(arrowstyle="->", color='r', alpha=.5))
g.annotate(f'{area1:.0f}%', xy=(1, 0.05), xytext=(10, 0.125), arrowprops=dict(arrowstyle="->", color='b', alpha=.5))
I am plotting a polar 2d histogram in Python 3.7 using matplotlib and the following code (adapted from this answer to another question):
import numpy as np
import matplotlib.pyplot as plt
# input data
azimut = np.random.rand(3000)*2*np.pi
radius = np.random.rayleigh(9, size=3000)
# binning
rbins = np.linspace(0, radius.max(), 10)
abins = np.linspace(0, 2*np.pi, 10)
# histogram
hist, _, _ = np.histogram2d(azimut, radius, bins=(abins, rbins))
A, R = np.meshgrid(abins, rbins)
# plot
fig, ax = plt.subplots(subplot_kw=dict(projection="polar"))
pc = ax.pcolormesh(A, R, hist.T, cmap='inferno')
fig.colorbar(pc)
plt.show()
To produce the following plot:
Due to the larger bin sizes, the polar projection is appearing more like a polygon rather than a circle.
Is there any way to plot this so that the bins appear curved rather than straight? I.E. so that the plot is always circular, regardless of the bin size and doesn't become polygon-like when bins are larger?
A matplotlib solution would be preferable, but others are welcome.
Thanks very much for any help.
To get a rounded look, the mesh can be subdivided into more angles. Note that np.linspace(0, 2 * np.pi, 10) creates 9 bins (and 10 boundaries). For the subdivided mesh you need e.g. 90 bins, so 91 boundaries. The histogram values need to be repeated by the same factor.
The code below uses a different colormap for debugging purposes. An optional grid highlights the original boundaries.
import numpy as np
import matplotlib.pyplot as plt
# input data
azimut = np.random.rand(3000) * 2 * np.pi
radius = np.random.rayleigh(9, size=3000)
# binning
rbins = np.linspace(0, radius.max(), 7)
abins = np.linspace(0, 2 * np.pi, 10)
subdivs = 10
abins2 = np.linspace(0, 2 * np.pi, (len(abins) - 1) * subdivs + 1)
# histogram
hist, _, _ = np.histogram2d(azimut, radius, bins=(abins, rbins))
A1, R1 = np.meshgrid(abins, rbins)
A2, R2 = np.meshgrid(abins2, rbins)
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4), subplot_kw=dict(projection="polar"))
# plot with original mesh
pc1 = ax1.pcolormesh(A1, R1, hist.T, cmap='hsv')
ax1.tick_params(axis='y', labelcolor='white')
ax1.set_xticks(abins[:-1])
fig.colorbar(pc1, ax=ax1)
# plot with subdivided mesh
pc2 = ax2.pcolormesh(A2, R2, np.repeat(hist.T, subdivs, axis=1), cmap='hsv')
ax2.tick_params(axis='y', labelcolor='white')
ax2.set_xticks(abins[:-1])
ax2.set_yticks(rbins, minor=True)
ax2.grid(axis='x', color='white')
ax2.grid(axis='y', which='minor', color='white')
fig.colorbar(pc2, ax=ax2)
plt.tight_layout()
plt.show()
I am trying to create f-distributed random numbers with given degree of freedoms d1 and d2, and plot both a histogram with f-distributed random numbers, and plot an idealised f-distribution curve, but when I give small values to df's, the histogram does not show up. I am new at statistics and matplotlib, and I could not figure out how to deal with this problem.
This is my code:
def distF(request, distribution_id):
dist = get_object_or_404(Distribution, pk=distribution_id)
dfd = dist.var4
dfn = dist.var2
x = np.random.f(dfn, dfd, size = dist.var3)
num_bins = 50
fig, ax = plt.subplots()
print(x)
# the histogram of the data
n, bins, patches = ax.hist(x, num_bins, normed=True)
y = np.linspace(0, 5, 1001)[1:]
dist = st.f(dfn, dfd, 0)
#y = np.linspace(st.f.ppf(0.01, dfn, dfd), st.f.ppf(0.99, dfn, dfd), 100)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, 4])
ax.set_ylim([0, 3])
fig.tight_layout()
canvas = FigureCanvas(fig)
response = HttpResponse(content_type='image/png')
canvas.print_png(response)
plt.close(fig)
return response
This is how the plots look like:
F-distribution plot with small df values
F-distribution plot with large df values
The problem is that the f distribution with a dfd of 1 spreads out hugely towards large numbers. So let's say you have values of 2000 or so in your array x, but only 50 bins between 0 and 2000. That makes the bin rather large and hence rather low in height. I would think that if you anyway want to limit your view to some low number, it would be better to also limit the histogram to that number.
In the code below the limit would be 5 and the bin width is 0.2.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
dfn = 10
dfd =1
limit = 5
x = np.random.f(dfn, dfd, size = 100)
bins = np.arange(0, limit, 0.2)
fig, ax = plt.subplots()
# the histogram of the data
n, bins, patches = ax.hist(x, bins, normed=True)
y = np.linspace(0, limit, 1001)[1:]
dist = st.f(dfn, dfd, 0)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, limit])
fig.tight_layout()
plt.show()
I want to plot multiple histograms on the same plot and I need to compare the spread of the data. I want to do this by dividing each histogram by its maximum value so all the distributions have the same scale. However, the way matplotlib's histogram function works, I have not found an easy way to do this.
This is because n in
n, bins, patches = ax1.hist(y, bins = 20, histtype = 'step', color = 'k')
Is the number of counts in each bin but I can not repass this to hist since it will recalculate.
I have attempted the norm and density functions but these normalise the area of the distributions, rather than the height of the distribution. I could duplicate n and then repeat the bin edges using the bins output but this is tedious. Surely the hist function must allow for the bins values to be divided by a constant?
Example code is below, demonstrating the problem.
y1 = np.random.randn(100)
y2 = 2*np.random.randn(50)
x1 = np.linspace(1,101,100)
x2 = np.linspace(1,51,50)
gs = plt.GridSpec(1,2, wspace = 0, width_ratios = [3,1])
ax = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax1.yaxis.set_ticklabels([]) # remove the major ticks
ax.scatter(x1, y1, marker='+',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
ax.scatter(x2, y2, marker='o',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
n1, bins1, patches1 = ax1.hist(y1, bins = 20, histtype = 'step', color = 'k',linewidth = 2, orientation = 'horizontal')
n2, bins2, patched2 = ax1.hist(y2, bins = 20, histtype = 'step', linestyle = 'dashed', color = 'k', orientation = 'horizontal')
I do not know whether matplotlib allows this normalisation by default but I wrote a function to do it myself.
It takes the output of n and bins from plt.hist (as above) and then passes this through the function below.
def hist_norm_height(n,bins,const):
''' Function to normalise bin height by a constant.
Needs n and bins from np.histogram or ax.hist.'''
n = np.repeat(n,2)
n = float32(n) / const
new_bins = [bins[0]]
new_bins.extend(np.repeat(bins[1:],2))
return n,new_bins[:-1]
To plot now (I like step histograms), you pass it to plt.step.
Such as plt.step(new_bins,n). This will give you a histogram with height normalised by a constant.
You can assign the argument bins equal to a list of values. Use np.arange() or np.linspace() to generate the values. http://matplotlib.org/api/axes_api.html?highlight=hist#matplotlib.axes.Axes.hist
Slightly different approach set up for comparisons. Could be adapted to the step style:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
y = []
y.append(np.random.normal(2, 2, size=40))
y.append(np.random.normal(3, 1.5, size=40))
y.append(np.random.normal(4,4,size=40))
ls = ['dashed','dotted','solid']
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
for l, data in zip(ls, y):
n, b, p = ax1.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
ax2.hist(data, normed=True,
#histtype = 'step', color='k', linestyle=l,
alpha=0.2
)
n, b, p = ax3.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
high = float(max([r.get_height() for r in p]))
for r in p:
r.set_height(r.get_height()/high)
ax3.add_patch(r)
ax3.set_ylim(0,1)
ax1.set_title('hist')
ax2.set_title('area==1')
ax3.set_title('fix height')
plt.show()
a couple outputs:
This can be accomplished using numpy to obtain a priori histogram values, and then plotting them with a bar plot.
import numpy as np
import matplotlib.pyplot as plt
# Define random data and number of bins to use
x = np.random.randn(1000)
bins = 10
plt.figure()
# Obtain the bin values and edges using numpy
hist, bin_edges = np.histogram(x, bins=bins, density=True)
# Plot bars with the proper positioning, height, and width.
plt.bar(
(bin_edges[1:] + bin_edges[:-1]) * .5, hist / hist.max(),
width=(bin_edges[1] - bin_edges[0]), color="blue")
plt.show()
I have this simplified code:
import numpy, pylab, random
data = [ random.gauss(1, 0.2) for x in range(1000) ]
fig = pylab.figure()
weights = numpy.ones_like(data)/float(len(data))
n, bins, patches =pylab.hist(data, bins=20, histtype='stepfilled',
weights=weights)
pylab.xlim(min(bins), max(bins))
pylab.ylim(0, 1)
p, = pylab.plot(bins)
pylab.savefig("test.png")
And the plot looks like this:
What is that green line? How can I remove it?
That's because you are plotting the bins points of x-axis on the y-axis (that's the green line), you don't need the plot():
import numpy, pylab, random
data = [ random.gauss(1, 0.2) for x in range(1000) ]
fig = pylab.figure()
weights = numpy.ones_like(data)/float(len(data))
n, bins, patches =pylab.hist(data, bins=20, histtype='bar',
weights=weights)
pylab.xlim(min(bins), max(bins))
pylab.ylim(0, 1)
pylab.show()