I am using pyplot.hist2d to plot a 2D histogram (x vs.y) weighted by a third variable, z. Instead of summing z values in a given pixel [x_i,y_i] as done by hist2d, I'd like to obtain the average z of all data points falling in that pixel.
Is there a python script doing that ?
Thanks.
Numpy's histogram2d() can calculate both the counts (a standard histogram) as the sums (via the weights parameter). Dividing both gives the mean value.
The example below shows the 3 histograms together with a colorbar. The number of samples is chosen relatively small to demonstrate what would happen for cells with a count of zero (the division gives NaN, so the cell is left blank).
import numpy as np
import matplotlib.pyplot as plt
N = 1000
x = np.random.uniform(0, 10, N)
y = np.random.uniform(0, 10, N)
z = np.cos(x) * np.sin(y)
counts, xbins, ybins = np.histogram2d(x, y, bins=(30, 20))
sums, _, _ = np.histogram2d(x, y, weights=z, bins=(xbins, ybins))
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(15, 4))
m1 = ax1.pcolormesh(ybins, xbins, counts, cmap='coolwarm')
plt.colorbar(m1, ax=ax1)
ax1.set_title('counts')
m2 = ax2.pcolormesh(ybins, xbins, sums, cmap='coolwarm')
plt.colorbar(m2, ax=ax2)
ax2.set_title('sums')
with np.errstate(divide='ignore', invalid='ignore'): # suppress possible divide-by-zero warnings
m3 = ax3.pcolormesh(ybins, xbins, sums / counts, cmap='coolwarm')
plt.colorbar(m3, ax=ax3)
ax3.set_title('mean values')
plt.tight_layout()
plt.show()
Related
I am looking for a way to color the intervals below the curve with different colors; on the interval x < 0, I would like to fill the area under the curve with one color and on the interval x >= 0 with another color, like the following image:
This is the code for basic kde plot:
fig, (ax1) = plt.subplots(1, 1, figsize = ((plot_size + 1.5) * 1,(plot_size + 1.5)))
sns.kdeplot(data=pd.DataFrame(w_contrast, columns=['contrast']), x="contrast", ax=ax1);
ax1.set_xlabel(f"Dry Yield Posterior Contrast (kg)");
Is there a way to fill the area under the curve with different colors using seaborn?
seaborn is a high level api for matplotlib, so the curve will have to be calculated; similar to, but simpler than this answer.
Calculate the values for the kde curve with scipy.stats.gaussian_kde
Use matplotlib.pyplot.fill_between to fill the areas.
Use scipy.integrate.simpson to calculate the area under the curve, which will be passed to matplotlib.pyplot.annotate to annotate.
import seaborn as sns
from scipy.stats import gaussian_kde
from scipy.integrate import simps
import numpy as np
# load sample data
df = sns.load_dataset('planets')
# create the kde model
kde = gaussian_kde(df.mass.dropna())
# plot
fig, ax = plt.subplots(figsize=(9, 6))
g = sns.kdeplot(data=df.mass, ax=ax, c='k')
# remove margins; optional
g.margins(x=0, y=0)
# get the min and max of the x-axis
xmin, xmax = g.get_xlim()
# create points between the min and max
x = np.linspace(xmin, xmax, 1000)
# calculate the y values from the model
kde_y = kde(x)
# select x values below 0
x0 = x[x < 0]
# get the len, which will be used for slicing the other arrays
x0_len = len(x0)
# slice the arrays
y0 = kde_y[:x0_len]
x1 = x[x0_len:]
y1 = kde_y[x0_len:]
# calculate the area under the curves
area0 = np.round(simps(y0, x0, dx=1) * 100, 0)
area1 = np.round(simps(y1, x1, dx=1) * 100, 0)
# fill the areas
g.fill_between(x=x0, y1=y0, color='r', alpha=.5)
g.fill_between(x=x1, y1=y1, color='b', alpha=.5)
# annotate
g.annotate(f'{area0:.0f}%', xy=(-1, 0.075), xytext=(10, 0.150), arrowprops=dict(arrowstyle="->", color='r', alpha=.5))
g.annotate(f'{area1:.0f}%', xy=(1, 0.05), xytext=(10, 0.125), arrowprops=dict(arrowstyle="->", color='b', alpha=.5))
The purpose of this code is to demonstrate CLT.
If I do the following:
num_samples = 10000
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=100))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(100))
z_ed = np.divide(sample_demeaned, denominator)
plt.hist(z_ed, bins=40, edgecolor='k', density=True)
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
plt.plot(x, y, color='red')
I get:
However, if I try to do it with a for loop for different sample sizes:
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(size))
z_ed = np.divide(sample_demeaned, denominator)
sample_std_means[:, col] = sample_means
And then plot each of them in a 2x2 grid:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[i], bins=40, edgecolor='k', color='midnightblue')
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
I get the following image:
I cannot debug the discrepancy here. Any help is highly appreciated.
Please note that scipy.stats and numpy have been imported as st and np respectively in both code blocks.
First, note that one numpy's strong points is that it allows operations which mix arrays and single numbers. This is called broadcasting. So, for example sample_demeaned = np.subtract(sample_means, 0.5) can be written more concise as sample_demeaned = sample_means - 0.5.
Several issues are going wrong:
sample_std_means[:, col] = sample_means should use the just calculated z_ed instead of sample_means.
ax.hist(sample_std_means[i], ...) uses the i'th row of the array. That row only contains 4 elements. You'd want sample_std_means[;,i] to take the i'th column.
The pdf is drawn in its normalized form (with an area below the curve equal to one). However, the histogram's height is proportional to the number of samples. Its total area is num_samples * bin_width, where the histogram's default bin width is the length from the first to the last element divided by the number of bins. To get both the pdf and histogram with similar sizes, either the histogram should be normalized (using density=True) or the pdf should be multiplied by the expected area of the histogram.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
sample_means[i] = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_demeaned = sample_means - 0.5
z_ed = sample_demeaned / (0.5 / np.sqrt(size))
sample_std_means[:, col] = z_ed
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 1000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[:, i], bins=40, edgecolor='k', color='midnightblue', density=True)
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
# bin_width = (sample_std_means[:, i].max() - sample_std_means[:, i].min()) / 40
# ax.plot(x, y * num_samples * bin_width, color='red')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
Now note the weird empty bars in the histograms. A histogram works best for continuous distributions. But the mean of n Bernoulli trials can have at most n+1 different outcomes. When all trials would be True, the mean would be n/n = 1. When all would be False, the mean would be 0. Combined, the possible means are 0, 1/n, 2/n, ..., 1. The histogram of such a discrete distribution should take these values into account for the boundaries between the bins.
The following code creates a scatter plot, using the position of the means and a random y-value to visualize how many there are per x. Also, the position of the bin boundaries is calculated and visualized by dotted vertical lines.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
for i, ax in enumerate(axes.flatten()):
ax.scatter(sample_std_means[:, i], np.random.uniform(0, 1, num_samples), color='r', alpha=0.5, lw=0, s=1)
# there are n+1 possible mean values for n bernoulli trials
# n+2 boundaries will be needed to separate the bins
bins = np.arange(-1, sample_sizes[i]+1) / sample_sizes[i]
bins += (bins[1] - bins[0]) / 2 # shift half a bin
bins -= 0.5 # subtract the mean
bins /= (0.5 / np.sqrt(sample_sizes[i])) # correction factor
for b in bins:
ax.axvline(b, color='g', ls=':')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.set_xlim((-3, 3))
And here are the histograms using these bins:
ax.hist(sample_std_means[:, i], bins=bins, edgecolor='k', color='midnightblue', density=True)
My code so far, I'm very new to programming and have been trying for a while.
Here I apply the Box-Muller transform to approximate two Gaussian normal distributions starting from a random uniform sampling. Then, I create a histogram for both of them.
Now, I would like to compare the obtained histograms with "the real thing": a standard Bell curve. How to draw such a curve to match the histograms?
import numpy as np
import matplotlib.pyplot as plt
N = 10000
z1 = np.random.uniform(0, 1.0, N)
z2 = np.random.uniform(0, 1.0, N)
R_sq = -2 * np.log(z1)
theta = 2 * np.pi * z2
z1 = np.sqrt(R_sq) * np.cos(theta)
z2 = np.sqrt(R_sq) * np.sin(theta)
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.hist(z1, bins=40, range=(-4, 4), color='red')
plt.title("Histgram")
plt.xlabel("z1")
plt.ylabel("frequency")
ax2 = fig.add_subplot(2, 1, 2)
ax2.hist(z2, bins=40, range=(-4, 4), color='blue')
plt.xlabel("z2")
plt.show()
To obtain the 'kernel density estimation', scipy.stats.gaussian_kde calculates a function to fit the data.
To just draw a Gaussian normal curve, there is [scipy.stats.norm]. Subtracting the mean and dividing by the standard deviation, adapts the position to the given data.
Both curves would be drawn such that the area below the curve sums to one. To adjust them to the size of the histogram, these curves need to be scaled by the length of the data times the bin-width. Alternatively, this scaling can stay at 1, and the histogram scaled by adding the parameter hist(..., density=True).
In the demo code the data is mutilated to illustrate the difference between the kde and the Gaussian normal.
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
x = np.linspace(-4,4,1000)
N = 10000
z1 = np.random.randint(1, 3, N) * np.random.uniform(0, .4, N)
z2 = np.random.uniform(0, 1, N)
R_sq = -2 * np.log(z1)
theta = 2 * np.pi * z2
z1 = np.sqrt(R_sq) * np.cos(theta)
z2 = np.sqrt(R_sq) * np.sin(theta)
fig = plt.figure(figsize=(12,4))
for ind_subplot, zi, col in zip((1, 2), (z1, z2), ('crimson', 'dodgerblue')):
ax = fig.add_subplot(1, 2, ind_subplot)
ax.hist(zi, bins=40, range=(-4, 4), color=col, label='histogram')
ax.set_xlabel("z"+str(ind_subplot))
ax.set_ylabel("frequency")
binwidth = 8 / 40
scale_factor = len(zi) * binwidth
gaussian_kde_zi = stats.gaussian_kde(z1)
ax.plot(x, gaussian_kde_zi(x)*scale_factor, color='springgreen', linewidth=3, label='kde')
std_zi = np.std(zi)
mean_zi = np.mean(zi)
ax.plot(x, stats.norm.pdf((x-mean_zi)/std_zi)*scale_factor, color='black', linewidth=2, label='normal')
ax.legend()
plt.show()
The original values for z1 and z2 very much resemble a normal distribution, and so the black line (the Gaussian normal for the data) and the green line (the KDE) very much resemble each other.
The current code first calculates the real mean and the real standard deviation of the data. As you want to mimic a perfect Gaussian normal, you should compare to the curve with mean zero and standard deviatio one. You'll see they're almost identical on the plot.
I am making a colormap of a 2D numpy meshgrid:
X, Y = np.meshgrid(fields, frequencies)
cs = ax.contourf(X, Y, fields_freqs_abs_grid, cmap="viridis", N=256)
The values in fields_freqs_abs_grid, which are plotted by color, have already been logarithmically scaled.
The colormap produced by python's matplotlib is coarse -- it scales over 8 colors even though I use "N=256" for the number of RGB pixels. Increasing N to 2048 did not change anything. A plot using the MatLab language on the same data produces a colormap with significantly higher color resolution. How do I increase the number of colors mapped in Python?
The result is:
But I want the result to be:
Thank you!
Warren Weckesser's comments definitely works and can give you a high resolution image. I implemented his idea in the example below.
In regarding to use contourf(), I'm not sure if this is a version dependent issue, but in the most recent version,
contourf() doesn't have a kwarg for N.
As you can see in the document, you want to use N as an arg (in syntax: contourf(X,Y,Z,N)) to specify how many levels you want to plot rather than the number of RGB pixels. contourf() draws filled contours and the resolution depends on the number of levels to draw. Your N=256 won't do anything and contourf() will automatically choose 7 levels.
The following code is modified from the official example, comparing resolutions with different N. In case there is a version issue, this code gives the following plot with python 3.5.2; matplotlib 1.5.3:
import numpy as np
import matplotlib.pyplot as plt
delta = 0.025
x = y = np.arange(-3.0, 3.01, delta)
X, Y = np.meshgrid(x, y)
Z1 = plt.mlab.bivariate_normal(X, Y, 1.0, 1.0, 0.0, 0.0)
Z2 = plt.mlab.bivariate_normal(X, Y, 1.5, 0.5, 1, 1)
Z = 10 * (Z1 - Z2)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
fig.set_size_inches(8, 6)
# Your code sample
CS1 = ax1.contourf(X, Y, Z, cmap="viridis", N=256)
ax1.set_title('Your code sample')
ax1.set_xlabel('word length anomaly')
ax1.set_ylabel('sentence length anomaly')
cbar1 = fig.colorbar(CS1, ax=ax1)
# Contour up to N=7 automatically-chosen levels,
# which should give the same as your code.
N = 7
CS2 = ax2.contourf(X, Y, Z, N, cmap="viridis")
ax2.set_title('N=7')
ax2.set_xlabel('word length anomaly')
ax2.set_ylabel('sentence length anomaly')
cbar2 = fig.colorbar(CS2, ax=ax2)
# Contour up to N=100 automatically-chosen levels.
# The resolution is still not as high as using imshow().
N = 100
CS3 = ax3.contourf(X, Y, Z, N, cmap="viridis")
ax3.set_title('N=100')
ax3.set_xlabel('word length anomaly')
ax3.set_ylabel('sentence length anomaly')
cbar3 = fig.colorbar(CS3, ax=ax3)
IM = ax4.imshow(Z, cmap="viridis", origin='lower', extent=(-3, 3, -3, 3))
ax4.set_title("Warren Weckesser's idea")
ax4.set_xlabel('word length anomaly')
ax4.set_ylabel('sentence length anomaly')
cbar4 = fig.colorbar(IM, ax=ax4)
fig.tight_layout()
plt.show()
I want to plot multiple histograms on the same plot and I need to compare the spread of the data. I want to do this by dividing each histogram by its maximum value so all the distributions have the same scale. However, the way matplotlib's histogram function works, I have not found an easy way to do this.
This is because n in
n, bins, patches = ax1.hist(y, bins = 20, histtype = 'step', color = 'k')
Is the number of counts in each bin but I can not repass this to hist since it will recalculate.
I have attempted the norm and density functions but these normalise the area of the distributions, rather than the height of the distribution. I could duplicate n and then repeat the bin edges using the bins output but this is tedious. Surely the hist function must allow for the bins values to be divided by a constant?
Example code is below, demonstrating the problem.
y1 = np.random.randn(100)
y2 = 2*np.random.randn(50)
x1 = np.linspace(1,101,100)
x2 = np.linspace(1,51,50)
gs = plt.GridSpec(1,2, wspace = 0, width_ratios = [3,1])
ax = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax1.yaxis.set_ticklabels([]) # remove the major ticks
ax.scatter(x1, y1, marker='+',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
ax.scatter(x2, y2, marker='o',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
n1, bins1, patches1 = ax1.hist(y1, bins = 20, histtype = 'step', color = 'k',linewidth = 2, orientation = 'horizontal')
n2, bins2, patched2 = ax1.hist(y2, bins = 20, histtype = 'step', linestyle = 'dashed', color = 'k', orientation = 'horizontal')
I do not know whether matplotlib allows this normalisation by default but I wrote a function to do it myself.
It takes the output of n and bins from plt.hist (as above) and then passes this through the function below.
def hist_norm_height(n,bins,const):
''' Function to normalise bin height by a constant.
Needs n and bins from np.histogram or ax.hist.'''
n = np.repeat(n,2)
n = float32(n) / const
new_bins = [bins[0]]
new_bins.extend(np.repeat(bins[1:],2))
return n,new_bins[:-1]
To plot now (I like step histograms), you pass it to plt.step.
Such as plt.step(new_bins,n). This will give you a histogram with height normalised by a constant.
You can assign the argument bins equal to a list of values. Use np.arange() or np.linspace() to generate the values. http://matplotlib.org/api/axes_api.html?highlight=hist#matplotlib.axes.Axes.hist
Slightly different approach set up for comparisons. Could be adapted to the step style:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
y = []
y.append(np.random.normal(2, 2, size=40))
y.append(np.random.normal(3, 1.5, size=40))
y.append(np.random.normal(4,4,size=40))
ls = ['dashed','dotted','solid']
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
for l, data in zip(ls, y):
n, b, p = ax1.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
ax2.hist(data, normed=True,
#histtype = 'step', color='k', linestyle=l,
alpha=0.2
)
n, b, p = ax3.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
high = float(max([r.get_height() for r in p]))
for r in p:
r.set_height(r.get_height()/high)
ax3.add_patch(r)
ax3.set_ylim(0,1)
ax1.set_title('hist')
ax2.set_title('area==1')
ax3.set_title('fix height')
plt.show()
a couple outputs:
This can be accomplished using numpy to obtain a priori histogram values, and then plotting them with a bar plot.
import numpy as np
import matplotlib.pyplot as plt
# Define random data and number of bins to use
x = np.random.randn(1000)
bins = 10
plt.figure()
# Obtain the bin values and edges using numpy
hist, bin_edges = np.histogram(x, bins=bins, density=True)
# Plot bars with the proper positioning, height, and width.
plt.bar(
(bin_edges[1:] + bin_edges[:-1]) * .5, hist / hist.max(),
width=(bin_edges[1] - bin_edges[0]), color="blue")
plt.show()