python: scatter plot with median and CI - python

I am looking for a python plot on the lines of http://www.r-bloggers.com/visually-weighted-watercolor-plots-new-variants-please-vote/

This gives the equivalent of the standard deviation bands:
# generate random variables
x,y = generate_random()
# bin the values and determine the envelopes
df = bin_by(x, y, nbins=25, bins = None)
###
# Plot 1
###
# determine the colors
cols = ['#EE7550', '#F19463', '#F6B176']
with plt.style.context('fivethirtyeight'):
# plot the 3rd stdv
plt.fill_between(df.x, df['5th'], df['95th'], alpha=0.7,color = cols[2])
plt.fill_between(df.x, df['10th'], df['90th'], alpha=0.7,color = cols[1])
plt.fill_between(df.x, df['25th'], df['75th'], alpha=0.7,color = cols[0])
# plt the line
plt.plot(df.x, df['median'], color = '1', alpha = 0.7, linewidth = 1)
# plot the points
plt.scatter(x, y, facecolors='white', edgecolors='0', s = 5, lw = 0.7)
plt.savefig('fig1.png', facecolor='white', edgecolor='none')
plt.show()
def bin_by(x, y, nbins=30, bins = None):
"""
Divide the x axis into sections and return groups of y based on its x value
"""
if bins is None:
bins = np.linspace(x.min(), x.max(), nbins)
bin_space = (bins[-1] - bins[0])/(len(bins)-1)/2
indicies = np.digitize(x, bins + bin_space)
Bit of a discussion and link to my Github from my blog

cut-paste from my larger piece of code. It does not give what I want. I am posting per Evert's suggestion
fig = plt.figure(figsize=(8, 8))
plt.plot(xlist, ylist, 'b,')
plt.plot([0.0,0.8],[0.0,0.8],'y-')
data2d=zip(xlist,ylist)
bins = np.linspace(0.0, 0.2, 21)
medianlist=binpercentile(data2d,bins)
c10list=binpercentile(data2d,bins,0.1)
c90list=binpercentile(data2d,bins,0.9)
centerbins=[(x+y)/2.0 for x,y in zip(bins[:-1],bins[1:])]
centerbins.insert(0,0)
medianlist.insert(0,0)
c10list.insert(0,0)
c90list.insert(0,0)
plt.plot(centerbins,c10list,'r--')
plt.plot(centerbins,c90list,'r--')
plt.plot(centerbins,medianlist,'r-')
imagefilename='%s.%s'%('.'.join(infile.split('.')[0:-1]),'diffmed.pdf')
plt.savefig(imagefilename)

Related

Calculating PDF given a histogram

I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetimevalues (Area under the curve, the PDF). For instance, the probability that the Lifetime value is in (0-0.01)
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing SKlearn's KernelDensity, however, after fitting it to the histogram it does not capture the over-represented 0s.
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch, field, x_start, x_end):
plt.figure(figsize = (10, 7))
unit = 0
x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
# Do kernel density estimation
kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch) #0.00187
# Plot the estimated densty
kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
plt.plot(x, kd_vals_lunch, color='orange')
plt.axvline(x=x_start,color='red',linestyle='dashed')
plt.axvline(x=x_end,color='red',linestyle='dashed')
# Show the plots
plt.xlabel(field, fontsize=15)
plt.ylabel('Probability Density', fontsize=15)
plt.legend(fontsize=15)
plt.show()
gc.collect()
return kd_lunch
kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1,1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
.format(get_probability(start_value = 0,
end_value = 0.01,
eval_points = 100,
kd = kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
PLot:
I have used more or less similar script for my work, here is my script may be it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
plt.figure(figsize = (4, 3.5))
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
#plt.show
# Do kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
# Plot the estimated densty
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth = 2)
plt.axvline(x=9.95,color='green',linestyle='dashed', linewidth = 2.0, label='$β_o$')
plt.axvline(x=1.9,color='black',linestyle='dashed', linewidth = 2.0, label='$β_b$')
plt.axvline(x=x_end,color='red',linestyle='dashed', linewidth = 2, label='$β_{95\%}$')
# Show the plots
plt.xlabel('Beta', fontsize=10)
plt.ylabel('Probability Density', fontsize=10)
plt.title('02 hours window', fontsize=12)
plt.xlim(-20, 20)
plt.ylim(0, 0.3)
plt.yticks([0, 0.1, 0.2, 0.3])
plt.legend(fontsize=12, loc='upper left', frameon=False)
plt.show()
gc.collect()
return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))

SciPy Bernoulli random number generation, different behavior within a for loop for different sample sizes

The purpose of this code is to demonstrate CLT.
If I do the following:
num_samples = 10000
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=100))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(100))
z_ed = np.divide(sample_demeaned, denominator)
plt.hist(z_ed, bins=40, edgecolor='k', density=True)
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
plt.plot(x, y, color='red')
I get:
However, if I try to do it with a for loop for different sample sizes:
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
mean = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_means[i] = mean
sample_demeaned = np.subtract(sample_means, 0.5)
denominator = np.divide(0.5, np.sqrt(size))
z_ed = np.divide(sample_demeaned, denominator)
sample_std_means[:, col] = sample_means
And then plot each of them in a 2x2 grid:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 10000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[i], bins=40, edgecolor='k', color='midnightblue')
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
I get the following image:
I cannot debug the discrepancy here. Any help is highly appreciated.
Please note that scipy.stats and numpy have been imported as st and np respectively in both code blocks.
First, note that one numpy's strong points is that it allows operations which mix arrays and single numbers. This is called broadcasting. So, for example sample_demeaned = np.subtract(sample_means, 0.5) can be written more concise as sample_demeaned = sample_means - 0.5.
Several issues are going wrong:
sample_std_means[:, col] = sample_means should use the just calculated z_ed instead of sample_means.
ax.hist(sample_std_means[i], ...) uses the i'th row of the array. That row only contains 4 elements. You'd want sample_std_means[;,i] to take the i'th column.
The pdf is drawn in its normalized form (with an area below the curve equal to one). However, the histogram's height is proportional to the number of samples. Its total area is num_samples * bin_width, where the histogram's default bin width is the length from the first to the last element divided by the number of bins. To get both the pdf and histogram with similar sizes, either the histogram should be normalized (using density=True) or the pdf should be multiplied by the expected area of the histogram.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
num_samples = 10000
sample_sizes = np.array([5, 20, 75, 100])
sample_std_means = np.empty(shape=(num_samples, len(sample_sizes)))
for col, size in enumerate(sample_sizes):
sample_means = np.empty(num_samples)
for i in range(num_samples):
sample_means[i] = np.mean(st.bernoulli.rvs(p=0.5, size=size))
sample_demeaned = sample_means - 0.5
z_ed = sample_demeaned / (0.5 / np.sqrt(size))
sample_std_means[:, col] = z_ed
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
x = np.linspace(st.norm.ppf(0.001), st.norm.ppf(0.999), 1000)
y = st.norm.pdf(x)
for i, ax in enumerate(axes.flatten()):
ax.hist(sample_std_means[:, i], bins=40, edgecolor='k', color='midnightblue', density=True)
ax.set_ylabel('Density')
ax.set_xlabel(f'n = {sample_sizes[i]}')
# bin_width = (sample_std_means[:, i].max() - sample_std_means[:, i].min()) / 40
# ax.plot(x, y * num_samples * bin_width, color='red')
ax.plot(x, y, color='red')
ax.set_xlim((-3, 3))
plt.show()
Now note the weird empty bars in the histograms. A histogram works best for continuous distributions. But the mean of n Bernoulli trials can have at most n+1 different outcomes. When all trials would be True, the mean would be n/n = 1. When all would be False, the mean would be 0. Combined, the possible means are 0, 1/n, 2/n, ..., 1. The histogram of such a discrete distribution should take these values into account for the boundaries between the bins.
The following code creates a scatter plot, using the position of the means and a random y-value to visualize how many there are per x. Also, the position of the bin boundaries is calculated and visualized by dotted vertical lines.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
for i, ax in enumerate(axes.flatten()):
ax.scatter(sample_std_means[:, i], np.random.uniform(0, 1, num_samples), color='r', alpha=0.5, lw=0, s=1)
# there are n+1 possible mean values for n bernoulli trials
# n+2 boundaries will be needed to separate the bins
bins = np.arange(-1, sample_sizes[i]+1) / sample_sizes[i]
bins += (bins[1] - bins[0]) / 2 # shift half a bin
bins -= 0.5 # subtract the mean
bins /= (0.5 / np.sqrt(sample_sizes[i])) # correction factor
for b in bins:
ax.axvline(b, color='g', ls=':')
ax.set_xlabel(f'n = {sample_sizes[i]}')
ax.set_xlim((-3, 3))
And here are the histograms using these bins:
ax.hist(sample_std_means[:, i], bins=bins, edgecolor='k', color='midnightblue', density=True)

Matrix Contour Plot of 3 Parameters

I have run an MCMC chain for parameter estimation and have obtained accepted parameter values. I have 3 parameters and about 300 000 accepted values for each parameter.
I would now like to do a contour plot (doable) but in a 3-choose-2 triangular matrix type (a very specific requirement) Please see the attached photo contour-plot. The image shows some unrelated contour-plots from a paper but I want to have a similar type of plot for my parameters.
In total, I will have 6 plots: 3 single parameter histograms (like the top plot in each column in the image) and 3-choose-2 = 3 contour plots (as the lower triangle). Again, I need it to look as much as possible like the image.
How to achieve this on Python?
Update:
I have been able to write the below code which gives me a plot as my-plot-here.
However, I need an exact / as best as possible match with the type of figure 1. i.e. I need my xticks, yticks to show inside and not outside, the spaces between the figues to go away, a better way to show the left vertical plot labels (I'm currently using set_ylabel), the outer crooked contour level to go away, and have detailed (long-short) ticks along the x-axis of the pdfs.
def plot_histogram_fig(param, nbins, subplot_index, subplot_title):
counts, bins = np.histogram(param, bins = nbins)
plotcounts = np.insert(counts, -1, counts[-1])
bincentres = (bins[:-1] + bins[1:])/2
ax = fig.add_subplot(3, 3, subplot_index)
#ax.step(bins, plotcounts, where='post', c='y')
ax.plot(bincentres, counts, 'b')
#ax.plot([bins[np.argmax(counts)], bins[np.argmax(counts)]], [0, np.max(counts)], 'y')
ax.set_yticks([])
return [ax, counts, bincentres]
def plot_contour_fig(p1, p2, nbins, subplot_index):
H, xedges, yedges = np.histogram2d(p1, p2, bins = nbins)
Z = H.T
#Z_gauss = scipy.ndimage.gaussian_filter(Z, sigma = 0.8, order = 0) #filtering
X, Y = np.meshgrid(xedges[:-1], yedges[:-1])
ax = fig.add_subplot(3, 3, subplot_index)
im = ax.contour(X, Y, Z, levels = 6)
#plt.colorbar(im, ax = ax)
ax.clabel(im, inline=True, fontsize=4)
return [ax, H, xedges, yedges]
nbins = 50
fig = plt.figure(figsize = (10, 6))
#Histograms
ax1 = plot_histogram_fig(all_alphas, nbins, 1, subplot_title = 'alpha')
ax1[0].set_xticks([])
ax1[0].set_ylabel('alpha')
ax5 = plot_histogram_fig(all_betas, nbins, 5, subplot_title = 'beta')
ax5[0].set_xticks([])
ax9 = plot_histogram_fig(all_gammas, nbins, 9, subplot_title = 'gamma')
ax9[0].set_title('gamma', y = -0.5)
#Contours
ax4 = plot_contour_fig(all_alphas, all_betas, nbins, 4)
ax4[0].set_xticklabels([])
ax4[0].set_ylabel('beta')
ax7 = plot_contour_fig(all_alphas, all_gammas, nbins, 7)
ax7[0].set_title('alpha', y = -0.5)
ax7[0].set_ylabel('gamma')
ax8 = plot_contour_fig(all_betas, all_gammas, nbins, 8)
ax8[0].set_yticklabels([])
ax8[0].set_title('beta', y = -0.5)
plt.show()
all_alphas, all_betas, all_gammas are 1d numpy arrays storing the accepted parameter values.

Histogram does not show up in f-distribution plot

I am trying to create f-distributed random numbers with given degree of freedoms d1 and d2, and plot both a histogram with f-distributed random numbers, and plot an idealised f-distribution curve, but when I give small values to df's, the histogram does not show up. I am new at statistics and matplotlib, and I could not figure out how to deal with this problem.
This is my code:
def distF(request, distribution_id):
dist = get_object_or_404(Distribution, pk=distribution_id)
dfd = dist.var4
dfn = dist.var2
x = np.random.f(dfn, dfd, size = dist.var3)
num_bins = 50
fig, ax = plt.subplots()
print(x)
# the histogram of the data
n, bins, patches = ax.hist(x, num_bins, normed=True)
y = np.linspace(0, 5, 1001)[1:]
dist = st.f(dfn, dfd, 0)
#y = np.linspace(st.f.ppf(0.01, dfn, dfd), st.f.ppf(0.99, dfn, dfd), 100)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, 4])
ax.set_ylim([0, 3])
fig.tight_layout()
canvas = FigureCanvas(fig)
response = HttpResponse(content_type='image/png')
canvas.print_png(response)
plt.close(fig)
return response
This is how the plots look like:
F-distribution plot with small df values
F-distribution plot with large df values
The problem is that the f distribution with a dfd of 1 spreads out hugely towards large numbers. So let's say you have values of 2000 or so in your array x, but only 50 bins between 0 and 2000. That makes the bin rather large and hence rather low in height. I would think that if you anyway want to limit your view to some low number, it would be better to also limit the histogram to that number.
In the code below the limit would be 5 and the bin width is 0.2.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
dfn = 10
dfd =1
limit = 5
x = np.random.f(dfn, dfd, size = 100)
bins = np.arange(0, limit, 0.2)
fig, ax = plt.subplots()
# the histogram of the data
n, bins, patches = ax.hist(x, bins, normed=True)
y = np.linspace(0, limit, 1001)[1:]
dist = st.f(dfn, dfd, 0)
ax.plot(y, dist.pdf(y), '--')
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, limit])
fig.tight_layout()
plt.show()

Dividing matplotlib histogram by maximum bin value

I want to plot multiple histograms on the same plot and I need to compare the spread of the data. I want to do this by dividing each histogram by its maximum value so all the distributions have the same scale. However, the way matplotlib's histogram function works, I have not found an easy way to do this.
This is because n in
n, bins, patches = ax1.hist(y, bins = 20, histtype = 'step', color = 'k')
Is the number of counts in each bin but I can not repass this to hist since it will recalculate.
I have attempted the norm and density functions but these normalise the area of the distributions, rather than the height of the distribution. I could duplicate n and then repeat the bin edges using the bins output but this is tedious. Surely the hist function must allow for the bins values to be divided by a constant?
Example code is below, demonstrating the problem.
y1 = np.random.randn(100)
y2 = 2*np.random.randn(50)
x1 = np.linspace(1,101,100)
x2 = np.linspace(1,51,50)
gs = plt.GridSpec(1,2, wspace = 0, width_ratios = [3,1])
ax = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax1.yaxis.set_ticklabels([]) # remove the major ticks
ax.scatter(x1, y1, marker='+',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
ax.scatter(x2, y2, marker='o',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
n1, bins1, patches1 = ax1.hist(y1, bins = 20, histtype = 'step', color = 'k',linewidth = 2, orientation = 'horizontal')
n2, bins2, patched2 = ax1.hist(y2, bins = 20, histtype = 'step', linestyle = 'dashed', color = 'k', orientation = 'horizontal')
I do not know whether matplotlib allows this normalisation by default but I wrote a function to do it myself.
It takes the output of n and bins from plt.hist (as above) and then passes this through the function below.
def hist_norm_height(n,bins,const):
''' Function to normalise bin height by a constant.
Needs n and bins from np.histogram or ax.hist.'''
n = np.repeat(n,2)
n = float32(n) / const
new_bins = [bins[0]]
new_bins.extend(np.repeat(bins[1:],2))
return n,new_bins[:-1]
To plot now (I like step histograms), you pass it to plt.step.
Such as plt.step(new_bins,n). This will give you a histogram with height normalised by a constant.
You can assign the argument bins equal to a list of values. Use np.arange() or np.linspace() to generate the values. http://matplotlib.org/api/axes_api.html?highlight=hist#matplotlib.axes.Axes.hist
Slightly different approach set up for comparisons. Could be adapted to the step style:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
y = []
y.append(np.random.normal(2, 2, size=40))
y.append(np.random.normal(3, 1.5, size=40))
y.append(np.random.normal(4,4,size=40))
ls = ['dashed','dotted','solid']
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
for l, data in zip(ls, y):
n, b, p = ax1.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
ax2.hist(data, normed=True,
#histtype = 'step', color='k', linestyle=l,
alpha=0.2
)
n, b, p = ax3.hist(data, normed=False,
#histtype='step', #step's too much of a pain to get the bins
#color='k', linestyle=l,
alpha=0.2
)
high = float(max([r.get_height() for r in p]))
for r in p:
r.set_height(r.get_height()/high)
ax3.add_patch(r)
ax3.set_ylim(0,1)
ax1.set_title('hist')
ax2.set_title('area==1')
ax3.set_title('fix height')
plt.show()
a couple outputs:
This can be accomplished using numpy to obtain a priori histogram values, and then plotting them with a bar plot.
import numpy as np
import matplotlib.pyplot as plt
# Define random data and number of bins to use
x = np.random.randn(1000)
bins = 10
plt.figure()
# Obtain the bin values and edges using numpy
hist, bin_edges = np.histogram(x, bins=bins, density=True)
# Plot bars with the proper positioning, height, and width.
plt.bar(
(bin_edges[1:] + bin_edges[:-1]) * .5, hist / hist.max(),
width=(bin_edges[1] - bin_edges[0]), color="blue")
plt.show()

Categories

Resources