Histogram does not show up in f-distribution plot - python

I am trying to create f-distributed random numbers with given degree of freedoms d1 and d2, and plot both a histogram with f-distributed random numbers, and plot an idealised f-distribution curve, but when I give small values to df's, the histogram does not show up. I am new at statistics and matplotlib, and I could not figure out how to deal with this problem.
This is my code:
def distF(request, distribution_id):
dist = get_object_or_404(Distribution, pk=distribution_id)
dfd = dist.var4
dfn = dist.var2
x = np.random.f(dfn, dfd, size = dist.var3)
num_bins = 50
fig, ax = plt.subplots()
print(x)
# the histogram of the data
n, bins, patches = ax.hist(x, num_bins, normed=True)
y = np.linspace(0, 5, 1001)[1:]
dist = st.f(dfn, dfd, 0)
#y = np.linspace(st.f.ppf(0.01, dfn, dfd), st.f.ppf(0.99, dfn, dfd), 100)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, 4])
ax.set_ylim([0, 3])
fig.tight_layout()
canvas = FigureCanvas(fig)
response = HttpResponse(content_type='image/png')
canvas.print_png(response)
plt.close(fig)
return response
This is how the plots look like:
F-distribution plot with small df values
F-distribution plot with large df values

The problem is that the f distribution with a dfd of 1 spreads out hugely towards large numbers. So let's say you have values of 2000 or so in your array x, but only 50 bins between 0 and 2000. That makes the bin rather large and hence rather low in height. I would think that if you anyway want to limit your view to some low number, it would be better to also limit the histogram to that number.
In the code below the limit would be 5 and the bin width is 0.2.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
dfn = 10
dfd =1
limit = 5
x = np.random.f(dfn, dfd, size = 100)
bins = np.arange(0, limit, 0.2)
fig, ax = plt.subplots()
# the histogram of the data
n, bins, patches = ax.hist(x, bins, normed=True)
y = np.linspace(0, limit, 1001)[1:]
dist = st.f(dfn, dfd, 0)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, limit])
fig.tight_layout()
plt.show()

Related

How can i plot data too far apart with MatPlotLib?

I'm developing a simple statistics calculator, and the last step is to plot the numbers given by the user.
Everything goes fine when the numbers are close apart, as shown below with the numbers [-2, -2, 0, 1, 3, 5]: Plot 1
However, when there'are numbers too far apar, the graph breaks because of the big amount of ticks:
Plot 2 (broken)
The code is here:
def plot(l = np.array(\[\]), m = float, M = float):
plt.style.use('classic')
plt.title(f'NUMBERS PLOT')
plt.ylabel('NUMBERS DENSITY')
plt.xlabel('GIVEN NUMBERS')
m = np.min(l) #Smallest given number
M = np.max(l) #Biggest given number
bins = np.arange(m, M+2) - 0.5 #-0.5 for aligning the bars with the ticks
freq_num = Counter(l) #Counting the occurrence of every number
most_freq = (freq_num.most_common(1)\[0\]\[1\]) #Taking the occurrence of the most dense number
plt.hist(l, bins = bins)
plt.yticks(np.arange(0, most_freq+2, 1.0))
plt.xticks(np.arange(m, M+2))
plt.xlim(\[m-1, M+2\])
return plt.show()
How can i, for example, exclude the zero-density numbers, for example the numbers between 20 and 60, from the x-axis?
Is there another way to avoid this result?
I think that a good option would be to apply the matplotlib broken axis feature.
As an example, I have plotted a bimodal distribution whose peaks are far apart from each other. This example should be pretty close to your data. The broken axis allows to get a better visualization of this plot.
Here is the code:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
N = 1000
mu = 400
sigma = 5
mu2 = 10
sigma2 = 20
dens_1 = np.random.normal(mu, sigma, N)
dens_2 = np.random.normal(mu2, sigma2, N)
dens = np.concatenate([dens_1, dens_2])
plt.hist(dens, bins=100)
plt.show()
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, facecolor='w')
ax1.hist(dens, bins=100)
ax2.hist(dens, bins=100)
ax1.set_xlim(-100, 150)
ax2.set_xlim(350, 450)
ax1.spines.right.set_visible(False)
ax2.spines.left.set_visible(False)
ax2.yaxis.tick_right()
ax1.tick_params(labeltop=False)
ax2.xaxis.tick_bottom()
d = .01
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((1-d, 1+d), (-d,+d), **kwargs)
ax1.plot((1-d, 1+d),(1-d, 1+d), **kwargs)
kwargs.update(transform=ax2.transAxes)
ax2.plot((-d, d), (1-d, 1+d), **kwargs)
ax2.plot((-d, d), (-d, +d), **kwargs)
plt.show()
Here is the plot without broken axis:
and here with broken axis:

SciPy Bernoulli random number generation, different behavior within a for loop for different sample sizes

The purpose of this code is to demonstrate CLT.
If I do the following:
num_samples = 10000
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=100))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(100))
z_ed = np.divide(sample_demeaned, denominator)
plt.hist(z_ed, bins=40, edgecolor='k', density=True)
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
plt.plot(x, y, color='red')
I get:
However, if I try to do it with a for loop for different sample sizes:
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(size))
z_ed = np.divide(sample_demeaned, denominator)
sample_std_means[:, col] = sample_means
And then plot each of them in a 2x2 grid:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[i], bins=40, edgecolor='k', color='midnightblue')
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
I get the following image:
I cannot debug the discrepancy here. Any help is highly appreciated.
Please note that scipy.stats and numpy have been imported as st and np respectively in both code blocks.
First, note that one numpy's strong points is that it allows operations which mix arrays and single numbers. This is called broadcasting. So, for example sample_demeaned = np.subtract(sample_means, 0.5) can be written more concise as sample_demeaned = sample_means - 0.5.
Several issues are going wrong:
sample_std_means[:, col] = sample_means should use the just calculated z_ed instead of sample_means.
ax.hist(sample_std_means[i], ...) uses the i'th row of the array. That row only contains 4 elements. You'd want sample_std_means[;,i] to take the i'th column.
The pdf is drawn in its normalized form (with an area below the curve equal to one). However, the histogram's height is proportional to the number of samples. Its total area is num_samples * bin_width, where the histogram's default bin width is the length from the first to the last element divided by the number of bins. To get both the pdf and histogram with similar sizes, either the histogram should be normalized (using density=True) or the pdf should be multiplied by the expected area of the histogram.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
sample_means[i] = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_demeaned = sample_means - 0.5
z_ed = sample_demeaned / (0.5 / np.sqrt(size))
sample_std_means[:, col] = z_ed
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 1000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[:, i], bins=40, edgecolor='k', color='midnightblue', density=True)
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
# bin_width = (sample_std_means[:, i].max() - sample_std_means[:, i].min()) / 40
# ax.plot(x, y * num_samples * bin_width, color='red')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
Now note the weird empty bars in the histograms. A histogram works best for continuous distributions. But the mean of n Bernoulli trials can have at most n+1 different outcomes. When all trials would be True, the mean would be n/n = 1. When all would be False, the mean would be 0. Combined, the possible means are 0, 1/n, 2/n, ..., 1. The histogram of such a discrete distribution should take these values into account for the boundaries between the bins.
The following code creates a scatter plot, using the position of the means and a random y-value to visualize how many there are per x. Also, the position of the bin boundaries is calculated and visualized by dotted vertical lines.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
for i, ax in enumerate(axes.flatten()):
ax.scatter(sample_std_means[:, i], np.random.uniform(0, 1, num_samples), color='r', alpha=0.5, lw=0, s=1)
# there are n+1 possible mean values for n bernoulli trials
# n+2 boundaries will be needed to separate the bins
bins = np.arange(-1, sample_sizes[i]+1) / sample_sizes[i]
bins += (bins[1] - bins[0]) / 2 # shift half a bin
bins -= 0.5 # subtract the mean
bins /= (0.5 / np.sqrt(sample_sizes[i])) # correction factor
for b in bins:
ax.axvline(b, color='g', ls=':')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.set_xlim((-3, 3))
And here are the histograms using these bins:
ax.hist(sample_std_means[:, i], bins=bins, edgecolor='k', color='midnightblue', density=True)

Align histogram bars - Python

I need to center the bars of a histogram.
x = array
y = [0,1,2,3,4,5,6,7,8,9,10]
num_bins = len(array)
n, bins, patches = plt.hist(x, num_bins, facecolor='green', alpha=0.5)
barWidth=20
x.bar(x, y, width=barWidth, align='center')
plt.show()
What I need, is that it looks like the one in this picture
I tried almost everything, but still can't go through.
Thank you all
For your task, I think it's better to calculate the histogram with NumPy and plot with bat function. Please refer to a following code and see how to use bin_edges.
import matplotlib.pyplot as plt
import numpy as np
num_samples = 100
num_bins = 10
lb, ub = 0, 10 # lower bound, upper bound
# create samples
y = np.random.random(num_samples) * ub
# caluculate histogram
hist, bin_edges = np.histogram(y, num_bins, range=(lb, ub))
width = (bin_edges[1] - bin_edges[0])
# plot histogram
plt.bar(bin_edges[:-1], hist, align='center',
width=width, edgecolor='k', facecolor='green', alpha=0.5)
plt.xticks(range(num_bins))
plt.xlim([lb-width/2, ub-width/2])
plt.show()

Dividing matplotlib histogram by maximum bin value

I want to plot multiple histograms on the same plot and I need to compare the spread of the data. I want to do this by dividing each histogram by its maximum value so all the distributions have the same scale. However, the way matplotlib's histogram function works, I have not found an easy way to do this.
This is because n in
n, bins, patches = ax1.hist(y, bins = 20, histtype = 'step', color = 'k')
Is the number of counts in each bin but I can not repass this to hist since it will recalculate.
I have attempted the norm and density functions but these normalise the area of the distributions, rather than the height of the distribution. I could duplicate n and then repeat the bin edges using the bins output but this is tedious. Surely the hist function must allow for the bins values to be divided by a constant?
Example code is below, demonstrating the problem.
y1 = np.random.randn(100)
y2 = 2*np.random.randn(50)
x1 = np.linspace(1,101,100)
x2 = np.linspace(1,51,50)
gs = plt.GridSpec(1,2, wspace = 0, width_ratios = [3,1])
ax = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax1.yaxis.set_ticklabels([]) # remove the major ticks
ax.scatter(x1, y1, marker='+',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
ax.scatter(x2, y2, marker='o',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
n1, bins1, patches1 = ax1.hist(y1, bins = 20, histtype = 'step', color = 'k',linewidth = 2, orientation = 'horizontal')
n2, bins2, patched2 = ax1.hist(y2, bins = 20, histtype = 'step', linestyle = 'dashed', color = 'k', orientation = 'horizontal')
I do not know whether matplotlib allows this normalisation by default but I wrote a function to do it myself.
It takes the output of n and bins from plt.hist (as above) and then passes this through the function below.
def hist_norm_height(n,bins,const):
''' Function to normalise bin height by a constant.
Needs n and bins from np.histogram or ax.hist.'''
n = np.repeat(n,2)
n = float32(n) / const
new_bins = [bins[0]]
new_bins.extend(np.repeat(bins[1:],2))
return n,new_bins[:-1]
To plot now (I like step histograms), you pass it to plt.step.
Such as plt.step(new_bins,n). This will give you a histogram with height normalised by a constant.
You can assign the argument bins equal to a list of values. Use np.arange() or np.linspace() to generate the values. http://matplotlib.org/api/axes_api.html?highlight=hist#matplotlib.axes.Axes.hist
Slightly different approach set up for comparisons. Could be adapted to the step style:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
y = []
y.append(np.random.normal(2, 2, size=40))
y.append(np.random.normal(3, 1.5, size=40))
y.append(np.random.normal(4,4,size=40))
ls = ['dashed','dotted','solid']
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
for l, data in zip(ls, y):
n, b, p = ax1.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
ax2.hist(data, normed=True,
#histtype = 'step', color='k', linestyle=l,
alpha=0.2
)
n, b, p = ax3.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
high = float(max([r.get_height() for r in p]))
for r in p:
r.set_height(r.get_height()/high)
ax3.add_patch(r)
ax3.set_ylim(0,1)
ax1.set_title('hist')
ax2.set_title('area==1')
ax3.set_title('fix height')
plt.show()
a couple outputs:
This can be accomplished using numpy to obtain a priori histogram values, and then plotting them with a bar plot.
import numpy as np
import matplotlib.pyplot as plt
# Define random data and number of bins to use
x = np.random.randn(1000)
bins = 10
plt.figure()
# Obtain the bin values and edges using numpy
hist, bin_edges = np.histogram(x, bins=bins, density=True)
# Plot bars with the proper positioning, height, and width.
plt.bar(
(bin_edges[1:] + bin_edges[:-1]) * .5, hist / hist.max(),
width=(bin_edges[1] - bin_edges[0]), color="blue")
plt.show()

Color-coding a histogram

I have a set of N objects with two properties: x and y.
I would like to depict the distribution of x with a histogram in MATPLOTLIB using hist(). Easy enough. Now, I would like to color-code EACH bar of the histogram with a color that represents the average value of y in that set with a colormap. Is there an easy way to do this? Here, x and y are both N-d numpy arrays. Thanks!
fig = plt.figure()
n, bins, patches = plt.hist(x, 100, normed=1, histtype='stepfilled')
plt.setp(patches, 'facecolor', 'g', 'alpha', 0.1)
plt.xlabel('x')
plt.ylabel('Normalized frequency')
plt.show()
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# set up the bins
Nbins = 10
bins = np.linspace(0, 1, Nbins +1, endpoint=True)
# get some fake data
x = np.random.rand(300)
y = np.arange(300)
# figure out which bin each x goes into
bin_num = np.digitize(x, bins, right=True) - 1
# compute the counts per bin
hist_vals = np.bincount(bin_num)
# set up array for bins
means = np.zeros(Nbins)
# numpy slicing magic to sum the y values by bin
means[bin_num] += y
# take the average
means /= hist_vals
# make the figure/axes objects
fig, ax = plt.subplots(1,1)
# get a color map
my_cmap = cm.get_cmap('jet')
# get normalize function (takes data in range [vmin, vmax] -> [0, 1])
my_norm = Normalize()
# use bar plot
ax.bar(bins[:-1], hist_vals, color=my_cmap(my_norm(means)), width=np.diff(bins))
# make sure the figure updates
plt.draw()
plt.show()
related: vary the color of each bar in bargraph using particular value

Categories

Resources