I'm developing a simple statistics calculator, and the last step is to plot the numbers given by the user.
Everything goes fine when the numbers are close apart, as shown below with the numbers [-2, -2, 0, 1, 3, 5]: Plot 1
However, when there'are numbers too far apar, the graph breaks because of the big amount of ticks:
Plot 2 (broken)
The code is here:
def plot(l = np.array(\[\]), m = float, M = float):
plt.style.use('classic')
plt.title(f'NUMBERS PLOT')
plt.ylabel('NUMBERS DENSITY')
plt.xlabel('GIVEN NUMBERS')
m = np.min(l) #Smallest given number
M = np.max(l) #Biggest given number
bins = np.arange(m, M+2) - 0.5 #-0.5 for aligning the bars with the ticks
freq_num = Counter(l) #Counting the occurrence of every number
most_freq = (freq_num.most_common(1)\[0\]\[1\]) #Taking the occurrence of the most dense number
plt.hist(l, bins = bins)
plt.yticks(np.arange(0, most_freq+2, 1.0))
plt.xticks(np.arange(m, M+2))
plt.xlim(\[m-1, M+2\])
return plt.show()
How can i, for example, exclude the zero-density numbers, for example the numbers between 20 and 60, from the x-axis?
Is there another way to avoid this result?
I think that a good option would be to apply the matplotlib broken axis feature.
As an example, I have plotted a bimodal distribution whose peaks are far apart from each other. This example should be pretty close to your data. The broken axis allows to get a better visualization of this plot.
Here is the code:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
N = 1000
mu = 400
sigma = 5
mu2 = 10
sigma2 = 20
dens_1 = np.random.normal(mu, sigma, N)
dens_2 = np.random.normal(mu2, sigma2, N)
dens = np.concatenate([dens_1, dens_2])
plt.hist(dens, bins=100)
plt.show()
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, facecolor='w')
ax1.hist(dens, bins=100)
ax2.hist(dens, bins=100)
ax1.set_xlim(-100, 150)
ax2.set_xlim(350, 450)
ax1.spines.right.set_visible(False)
ax2.spines.left.set_visible(False)
ax2.yaxis.tick_right()
ax1.tick_params(labeltop=False)
ax2.xaxis.tick_bottom()
d = .01
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((1-d, 1+d), (-d,+d), **kwargs)
ax1.plot((1-d, 1+d),(1-d, 1+d), **kwargs)
kwargs.update(transform=ax2.transAxes)
ax2.plot((-d, d), (1-d, 1+d), **kwargs)
ax2.plot((-d, d), (-d, +d), **kwargs)
plt.show()
Here is the plot without broken axis:
and here with broken axis:
Related
The purpose of this code is to demonstrate CLT.
If I do the following:
num_samples = 10000
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=100))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(100))
z_ed = np.divide(sample_demeaned, denominator)
plt.hist(z_ed, bins=40, edgecolor='k', density=True)
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
plt.plot(x, y, color='red')
I get:
However, if I try to do it with a for loop for different sample sizes:
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(size))
z_ed = np.divide(sample_demeaned, denominator)
sample_std_means[:, col] = sample_means
And then plot each of them in a 2x2 grid:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[i], bins=40, edgecolor='k', color='midnightblue')
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
I get the following image:
I cannot debug the discrepancy here. Any help is highly appreciated.
Please note that scipy.stats and numpy have been imported as st and np respectively in both code blocks.
First, note that one numpy's strong points is that it allows operations which mix arrays and single numbers. This is called broadcasting. So, for example sample_demeaned = np.subtract(sample_means, 0.5) can be written more concise as sample_demeaned = sample_means - 0.5.
Several issues are going wrong:
sample_std_means[:, col] = sample_means should use the just calculated z_ed instead of sample_means.
ax.hist(sample_std_means[i], ...) uses the i'th row of the array. That row only contains 4 elements. You'd want sample_std_means[;,i] to take the i'th column.
The pdf is drawn in its normalized form (with an area below the curve equal to one). However, the histogram's height is proportional to the number of samples. Its total area is num_samples * bin_width, where the histogram's default bin width is the length from the first to the last element divided by the number of bins. To get both the pdf and histogram with similar sizes, either the histogram should be normalized (using density=True) or the pdf should be multiplied by the expected area of the histogram.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
sample_means[i] = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_demeaned = sample_means - 0.5
z_ed = sample_demeaned / (0.5 / np.sqrt(size))
sample_std_means[:, col] = z_ed
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 1000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[:, i], bins=40, edgecolor='k', color='midnightblue', density=True)
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
# bin_width = (sample_std_means[:, i].max() - sample_std_means[:, i].min()) / 40
# ax.plot(x, y * num_samples * bin_width, color='red')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
Now note the weird empty bars in the histograms. A histogram works best for continuous distributions. But the mean of n Bernoulli trials can have at most n+1 different outcomes. When all trials would be True, the mean would be n/n = 1. When all would be False, the mean would be 0. Combined, the possible means are 0, 1/n, 2/n, ..., 1. The histogram of such a discrete distribution should take these values into account for the boundaries between the bins.
The following code creates a scatter plot, using the position of the means and a random y-value to visualize how many there are per x. Also, the position of the bin boundaries is calculated and visualized by dotted vertical lines.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
for i, ax in enumerate(axes.flatten()):
ax.scatter(sample_std_means[:, i], np.random.uniform(0, 1, num_samples), color='r', alpha=0.5, lw=0, s=1)
# there are n+1 possible mean values for n bernoulli trials
# n+2 boundaries will be needed to separate the bins
bins = np.arange(-1, sample_sizes[i]+1) / sample_sizes[i]
bins += (bins[1] - bins[0]) / 2 # shift half a bin
bins -= 0.5 # subtract the mean
bins /= (0.5 / np.sqrt(sample_sizes[i])) # correction factor
for b in bins:
ax.axvline(b, color='g', ls=':')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.set_xlim((-3, 3))
And here are the histograms using these bins:
ax.hist(sample_std_means[:, i], bins=bins, edgecolor='k', color='midnightblue', density=True)
I have a list of angles in degrees. I want to display a polar histogram in which the [0°, 360°) range of values is subdivided into equal bins, and display how many values in the angles list fall into each bin. I get histogram data using the following code (and I've checked it is correct):
bins_number = 8 # the [0, 360) interval will be subdivided into this number of equal bins
bins = np.linspace(0.0, 360.0, bins_number + 1)
n, _, _ = plt.hist(angles, bins)
Now, I've tried to plot this data into a polar histogram using the following code:
plt.clf()
width = 2 * np.pi / bins_number
ax = plt.subplot(1, 1, 1, projection='polar')
bars = ax.bar(bins[:bins_number], n, width=width, bottom=0.0)
for bar in bars:
bar.set_alpha(0.5)
plt.show()
but what I get is shown in this image:
As you can see, bars are not placed at the correct angle, and some of them overlap each other, while they should be all contiguous without overlapping.
What am I doing wrong? Thank you in advance.
As in the comment, using radians instead of degrees:
import numpy as np
import matplotlib.pyplot as plt
n_numbers = 100
bins_number = 8 # the [0, 360) interval will be subdivided into this
# number of equal bins
bins = np.linspace(0.0, 2 * np.pi, bins_number + 1)
angles = 2 * np.pi * np.random.rand(n_numbers)
n, _, _ = plt.hist(angles, bins)
plt.clf()
width = 2 * np.pi / bins_number
ax = plt.subplot(1, 1, 1, projection='polar')
bars = ax.bar(bins[:bins_number], n, width=width, bottom=0.0)
for bar in bars:
bar.set_alpha(0.5)
plt.show()
Here were are only plotting centres of bins versus the number of occurrence of the angles in each bin
import numpy as np
import matplotlib.pyplot as plt
degrees = np.random.randint(0, 360, size=200)
radians = np.deg2rad(degrees)
bin_size = 20
a , b=np.histogram(degrees, bins=np.arange(0, 360+bin_size, bin_size))
centers = np.deg2rad(np.ediff1d(b)//2 + b[:-1])
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection='polar')
ax.bar(centers, a, width=np.deg2rad(bin_size), bottom=0.0, color='.8', edgecolor='k')
ax.set_theta_zero_location("N")
ax.set_theta_direction(-1)
plt.show()
I am trying to create f-distributed random numbers with given degree of freedoms d1 and d2, and plot both a histogram with f-distributed random numbers, and plot an idealised f-distribution curve, but when I give small values to df's, the histogram does not show up. I am new at statistics and matplotlib, and I could not figure out how to deal with this problem.
This is my code:
def distF(request, distribution_id):
dist = get_object_or_404(Distribution, pk=distribution_id)
dfd = dist.var4
dfn = dist.var2
x = np.random.f(dfn, dfd, size = dist.var3)
num_bins = 50
fig, ax = plt.subplots()
print(x)
# the histogram of the data
n, bins, patches = ax.hist(x, num_bins, normed=True)
y = np.linspace(0, 5, 1001)[1:]
dist = st.f(dfn, dfd, 0)
#y = np.linspace(st.f.ppf(0.01, dfn, dfd), st.f.ppf(0.99, dfn, dfd), 100)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, 4])
ax.set_ylim([0, 3])
fig.tight_layout()
canvas = FigureCanvas(fig)
response = HttpResponse(content_type='image/png')
canvas.print_png(response)
plt.close(fig)
return response
This is how the plots look like:
F-distribution plot with small df values
F-distribution plot with large df values
The problem is that the f distribution with a dfd of 1 spreads out hugely towards large numbers. So let's say you have values of 2000 or so in your array x, but only 50 bins between 0 and 2000. That makes the bin rather large and hence rather low in height. I would think that if you anyway want to limit your view to some low number, it would be better to also limit the histogram to that number.
In the code below the limit would be 5 and the bin width is 0.2.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
dfn = 10
dfd =1
limit = 5
x = np.random.f(dfn, dfd, size = 100)
bins = np.arange(0, limit, 0.2)
fig, ax = plt.subplots()
# the histogram of the data
n, bins, patches = ax.hist(x, bins, normed=True)
y = np.linspace(0, limit, 1001)[1:]
dist = st.f(dfn, dfd, 0)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, limit])
fig.tight_layout()
plt.show()
I am trying to plot both a circular histogram and a vector (overlapping) on the same polar plot, but cannot get the vector to show up.
Basically, my data set consists of the times at which unitary events occur during a repeating cycle. This data is in the array "all_phases", which is just a list of degree values for each of these events. I want to show (1) the overall distribution of events w/ a circular histogram (bins corresponding to degree ranges) and (2) a vector sum as a measure of the coherence of all of these values (treating each event as a unit vector).
I can plot either one of these things individually on the subplot titled "histo", but when I try to plot both, only the histogram shows up. I have tried playing with the z-indexes of both objects to no use. The code is:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
array = np.array
all_phases = [array([-38.24240218]), array([-120.51570738]), array([-23.70224663]),
array([114.9540152]), array([ 2.94523445]), array([-2.16112692]), array([-18.72274284]),
array([13.2292216]), array([-95.5659992]), array([15.69046269]), array([ 51.12022047]),
array([-89.10567276]), array([ 41.77283949]), array([-9.92584921]), array([-7.59680678]),
array([166.71824996]), array([-178.94642752]), array([-23.75819463]), array([38.69481261]),
array([-52.26651244]), array([-57.40976514]), array([33.68226762]), array([-122.1818295]),
array([ 10.17007425]), array([-38.03726335]),array([44.9504975]), array([ 134.63972923]),
array([ 63.02516932]),array([-106.54049292]), array([-25.6527599])]
number_bins = 60
bin_size = 360/number_bins
cluster_num = 1
counts, theta = np.histogram(all_phases, np.arange(-180, 180 + bin_size, bin_size), density=True)
theta = theta[:-1]+ bin_size/2.
theta = theta * np.pi / 180
a_deg = map(lambda x: np.ndarray.item(x), all_phases)
a_rad = map(lambda x: math.radians(x), a_deg)
a_cos = map(lambda x: math.cos(x), a_rad)
a_sin = map(lambda x: math.sin(x), a_rad)
uv_x = sum(a_cos)/len(a_cos)
uv_y = sum(a_sin)/len(a_sin)
uv_radius = np.sqrt((uv_x*uv_x) + (uv_y*uv_y))
uv_phase = np.angle(complex(uv_x, uv_y))
"""
plot histogram and vector sum
"""
fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.16, 0.05, 0.56])
histo = fig.add_subplot(111, polar=True)
histo.yaxis.set_ticks(())
histo.arrow(0,0,0.11, 1, head_width=.01, zorder=2)
plt.suptitle("Phase distribution for Neuron #" + str(cluster_num), fontsize=15, y=.94)
plt.subplots_adjust(bottom=0.12, right=0.95, top=0.78, wspace=0.4)
width = (2*np.pi) / number_bins
bars = histo.bar(theta, counts, width = width, bottom=0.002)
for r, bar in zip(counts, bars):
bar.set_facecolor(plt.cm.jet(r / max(counts)))
bar.set_alpha(0.7)
bar.set_zorder(1)
norm = matplotlib.colors.Normalize(vmin (counts.min())*len(all_phases)*bin_size, vmax=(counts.max())*len(all_phases)*bin_size)
cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=plt.cm.jet,
orientation='vertical', norm=norm, alpha=0.4,
ticks=np.arange(0, (counts.max())*len(all_phases)*bin_size)+1, )
cb1.ax.tick_params(labelsize=9)
cb1.solids.set_rasterized(True)
cb1.set_label("# spikes")
cb1.ax.yaxis.set_label_position('left')
plt.show()
cluster_num = cluster_num + 1
vs_radius and vs_phase are the parameters for the vector sum arrow I want to put on the polar plot, which I end up calling w/ histo.arrow().
My suspicion is that it might have something to do with trying to put two things on a subplot object?
Any help or thoughts would be very much appreciated!!
The problem is that the FancyArrow that is used by Axes.arrow() does not play well with polar plots.
Instead, you could use the annotate() function to draw a simple arrow that works better in the case of polar plots.
for example:
# Compute pie slices
N = 20
theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
radii = 10 * np.random.rand(N)
width = np.pi / 4 * np.random.rand(N)
ax = plt.subplot(111, projection='polar')
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
v_angle = 0.275*np.pi
v_length = 4
ax.annotate('',xy=(v_angle, v_length), xytext=(v_angle,0), xycoords='data', arrowprops=dict(width=5, color='red'))
plt.show()
As a general rule, when you deal with polar plot, you have to work just as if you were working with a linear plot. That is to say, you shouldn't try to draw your arrow from (0,0) but rather from (uv_phase, 0)
fig, ax = plt.subplots()
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
ax.annotate('',xy=(v_angle, v_length), xytext=(v_angle,0), xycoords='data', arrowprops=dict(width=5, color='red'))
I want to plot multiple histograms on the same plot and I need to compare the spread of the data. I want to do this by dividing each histogram by its maximum value so all the distributions have the same scale. However, the way matplotlib's histogram function works, I have not found an easy way to do this.
This is because n in
n, bins, patches = ax1.hist(y, bins = 20, histtype = 'step', color = 'k')
Is the number of counts in each bin but I can not repass this to hist since it will recalculate.
I have attempted the norm and density functions but these normalise the area of the distributions, rather than the height of the distribution. I could duplicate n and then repeat the bin edges using the bins output but this is tedious. Surely the hist function must allow for the bins values to be divided by a constant?
Example code is below, demonstrating the problem.
y1 = np.random.randn(100)
y2 = 2*np.random.randn(50)
x1 = np.linspace(1,101,100)
x2 = np.linspace(1,51,50)
gs = plt.GridSpec(1,2, wspace = 0, width_ratios = [3,1])
ax = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax1.yaxis.set_ticklabels([]) # remove the major ticks
ax.scatter(x1, y1, marker='+',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
ax.scatter(x2, y2, marker='o',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
n1, bins1, patches1 = ax1.hist(y1, bins = 20, histtype = 'step', color = 'k',linewidth = 2, orientation = 'horizontal')
n2, bins2, patched2 = ax1.hist(y2, bins = 20, histtype = 'step', linestyle = 'dashed', color = 'k', orientation = 'horizontal')
I do not know whether matplotlib allows this normalisation by default but I wrote a function to do it myself.
It takes the output of n and bins from plt.hist (as above) and then passes this through the function below.
def hist_norm_height(n,bins,const):
''' Function to normalise bin height by a constant.
Needs n and bins from np.histogram or ax.hist.'''
n = np.repeat(n,2)
n = float32(n) / const
new_bins = [bins[0]]
new_bins.extend(np.repeat(bins[1:],2))
return n,new_bins[:-1]
To plot now (I like step histograms), you pass it to plt.step.
Such as plt.step(new_bins,n). This will give you a histogram with height normalised by a constant.
You can assign the argument bins equal to a list of values. Use np.arange() or np.linspace() to generate the values. http://matplotlib.org/api/axes_api.html?highlight=hist#matplotlib.axes.Axes.hist
Slightly different approach set up for comparisons. Could be adapted to the step style:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
y = []
y.append(np.random.normal(2, 2, size=40))
y.append(np.random.normal(3, 1.5, size=40))
y.append(np.random.normal(4,4,size=40))
ls = ['dashed','dotted','solid']
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
for l, data in zip(ls, y):
n, b, p = ax1.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
ax2.hist(data, normed=True,
#histtype = 'step', color='k', linestyle=l,
alpha=0.2
)
n, b, p = ax3.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
high = float(max([r.get_height() for r in p]))
for r in p:
r.set_height(r.get_height()/high)
ax3.add_patch(r)
ax3.set_ylim(0,1)
ax1.set_title('hist')
ax2.set_title('area==1')
ax3.set_title('fix height')
plt.show()
a couple outputs:
This can be accomplished using numpy to obtain a priori histogram values, and then plotting them with a bar plot.
import numpy as np
import matplotlib.pyplot as plt
# Define random data and number of bins to use
x = np.random.randn(1000)
bins = 10
plt.figure()
# Obtain the bin values and edges using numpy
hist, bin_edges = np.histogram(x, bins=bins, density=True)
# Plot bars with the proper positioning, height, and width.
plt.bar(
(bin_edges[1:] + bin_edges[:-1]) * .5, hist / hist.max(),
width=(bin_edges[1] - bin_edges[0]), color="blue")
plt.show()