Sample Distribution Simulation not resulting in Normal - python

I was trying to simulate the "Sampling Distribution of Sample Proportions" using Python. I tried it with a Bernoulli variable as in the example here.
The crux is that, out of a large number of gumballs, we have yellow balls with a true proportion of 0.6. If we take samples (of some size, say 10), take the mean of each sample and plot the means, we should get a normal distribution.
I tried to do this in Python but I always get a uniform-looking distribution (or one that flattens out in the middle). I am not able to understand what I am missing.
Program:
from SDSP import create_bernoulli_population, get_frequency_df
from random import shuffle, choices
from bi_to_nor_demo import get_metrics, bare_minimal_plot
import matplotlib.pyplot as plt
N = 10000 # 10000 balls
p = 0.6 # probability of yellow ball is 0.6, and others (1-0.6)=>0.4
n_pickups = 1000 # sample size
n_experiments = 100 # I don't know what this is called
# generate population
population = create_bernoulli_population(N,p)
theor_df = get_frequency_df(population)
theor_df
# choose sample, take mean and add to X_mean_list. Do this for n_experiments times
X_hat = []
X_mean_list = []
for each_experiment in range(n_experiments):
    X_hat = choices(population, k=n_pickups) # this method is with replacement
    shuffle(population)
    X_mean = sum(X_hat)/len(X_hat)
    X_mean_list.append(X_mean)
# plot X_mean_list as bar graph
stats_df = get_frequency_df(X_mean_list)
fig, ax = plt.subplots(1,1, figsize=(5,5))
X = stats_df['x'].tolist()
P = stats_df['p(x)'].tolist()
ax.bar(X, P, color="C0")
plt.show()
Dependent functions:
bi_to_nor_demo
SDSP
Output:
Update:
I even tried uniform distribution as below but getting similar output. Not converging to normal :(. (using below function in place of create_bernoulli_population)
def create_uniform_population(N, Y=[]):
    """
    Given the total population size N and the list of possible outcomes Y,
    this function generates a population list in which the outcomes
    are uniformly distributed.
    N - population size, e.g. N=10000
    Y - list of possible outcomes
    Returns the outcomes spread out in the population as a list
    """
    uniform_p = 1/len(Y)
    print(uniform_p)
    total_pops = []
    for i in range(0, len(Y)):
        each_o = [i]*(int(uniform_p*N))
        total_pops += each_o
    shuffle(total_pops)
    return total_pops

Can you please share your matplotlib settings? I think your plot is truncated. You are correct that the sampling distribution of the sample proportion of a Bernoulli variable should be approximately normally distributed around the population expected value. Perhaps try something like:
plt.tight_layout()
to check whether the graph itself is the issue.

import numpy as np
import matplotlib.pyplot as plt

f = plt.figure()

def plotHist(nr, N, n_):
    ''' plots the RVs'''
    x = np.zeros((N))
    sp = f.add_subplot(3, 2, n_)
    for i in range(N):
        for j in range(nr):
            x[i] += np.random.binomial(10, 0.6)/10
        x[i] *= 1/nr
    plt.hist(x, 100, density=True, color='#348ABD', label=" %d RVs" % (nr))  # density=True replaces the removed normed=True
    plt.setp(sp.get_yticklabels(), visible=False)

N = 1000000  # number of samples taken
nr = [1, 2, 4, 8, 16, 32]
for i in range(np.size(nr)):
    plotHist(nr[i], N, i+1)
plt.show()
The code sample above is based on a blog post I wrote on the CLT: https://rajeshrinet.github.io/blog/2014/central-limit-theorem/
Essentially, I generate several random numbers (nr) from a distribution on the range (0, 1) and sum them, then watch how the result converges as I increase the number of random numbers.
Here is a screenshot of the code and the result.

Solution:
I think I have arrived at the solution. By reverse-engineering Rajesh's approach and taking Daniel's hint that the graph itself could be the issue, I finally found the culprit: the default bar width of 0.8 is so wide that the bars overlap and the plot looks flat on top. Below is the modified code and output.
from SDSP import create_bernoulli_population, get_frequency_df
from random import shuffle, choices
from bi_to_nor_demo import get_metrics, bare_minimal_plot
import matplotlib.pyplot as plt
N = 10000 # 10000 balls
p = 0.6 # probability of yellow ball is 0.6, and others (1-0.6)=>0.4
n_pickups = 10 # sample size
n_experiments = 2000 # I don't know what this is called
# THEORETICAL PDF
# generate population and calculate theoretical bernoulli pdf
population = create_bernoulli_population(N,p)
theor_df = get_frequency_df(population)
# STATISTICAL PDF
# choose sample, take mean and add to X_mean_list. Do this for n_experiments times.
X_hat = []
X_mean_list = []
for each_experiment in range(n_experiments):
    X_hat = choices(population, k=n_pickups) # choose, say 10 samples from population (with replacement)
    X_mean = sum(X_hat)/len(X_hat)
    X_mean_list.append(X_mean)
stats_df = get_frequency_df(X_mean_list)
# plot both theoretical and statistical outcomes
fig, (ax1,ax2) = plt.subplots(2,1, figsize=(5,10))
from SDSP import plot_pdf
mu,var,sigma = get_metrics(theor_df)
plot_pdf(theor_df, ax1, mu, sigma, p, title='True Population Parameters')
mu,var,sigma = get_metrics(stats_df)
plot_pdf(stats_df, ax2, mu, sigma, p=mu, bar_width=round(0.5/n_pickups,3),title='Sampling Distribution of\n a Sample Proportion')
plt.tight_layout()
plt.show()
Output:
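For readers who don't have the SDSP / bi_to_nor_demo helper modules, here is a minimal self-contained sketch of the same experiment using only numpy and matplotlib (the variable names and plotting choices here are my own, not taken from those modules):
import numpy as np
import matplotlib.pyplot as plt

N, p = 10000, 0.6        # population size and true proportion of yellow balls
n_pickups = 10           # sample size
n_experiments = 2000     # number of repeated samples

rng = np.random.default_rng(0)
population = rng.binomial(1, p, size=N)   # 1 = yellow, 0 = other

# sample proportion (mean of 0/1 outcomes) for each experiment
sample_means = [rng.choice(population, size=n_pickups, replace=True).mean()
                for _ in range(n_experiments)]

# with n_pickups = 10 the possible means are 0.0, 0.1, ..., 1.0,
# so use a bar width much smaller than that spacing to see the bell shape
values, counts = np.unique(sample_means, return_counts=True)
plt.bar(values, counts / n_experiments, width=0.05)
plt.xlabel('sample proportion')
plt.ylabel('relative frequency')
plt.title('Sampling distribution of the sample proportion')
plt.show()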

Related

QQplot for discrete distribution

I have a set whose samples are discrete values (in particular, the size of a queue over time). Now I'd like to find which distribution they belong to. To achieve this goal I'd proceed the same way I did for the other quantities, i.e. plot a qqplot by running
import statsmodels.api as sm
sm.qqplot(df, dist = 'geom', sparams = (.5,), line ='s', alpha = 0.3, marker ='.')
This works if dist is not a discrete random variable (e.g. 'exp' or 'norm'), and indeed I got some results, but when the distribution is discrete (say, 'geom'), I get
AttributeError: 'geom_gen' object has no attribute 'fit'
I searched on the Internet for how to make a qqplot (or something similar) to spot which distribution my samples belong to, but I found nothing.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def discreteQQ(x_sample):
    p_test = np.linspace(0, 1, 1001)   # probabilities 0.000, 0.001, ..., 1.000
    x_sample = np.sort(x_sample)
    ecdf_sample = np.arange(1, len(x_sample) + 1)/(len(x_sample) + 1)
    x_theor = stats.geom.ppf(ecdf_sample, p=0.5)   # theoretical geometric quantiles
    for p in p_test:
        plt.scatter(np.quantile(x_theor, p), np.quantile(x_sample, p), c='blue')
    plt.xlabel('Theoretical quantiles')
    plt.ylabel('Sample quantiles')
    plt.show()
Generate a theoretical geometric distribution using scipy.stats.geom, convert the sample and theoretical data using statsmodels' ProbPlot and pass these to statsmodels' qqplot_2samples.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.graphics.gofplots import qqplot_2samples
p_theor = 1/4 # The probability we check for
p_sample = 1/5 # The true probability of the sample distribution
# The experimental data
x_sample = stats.geom.rvs(p_sample, size=50)
# The model data
x_theor = stats.geom.rvs(p_theor, size=100)
qqplot_2samples(ProbPlot(x_sample), ProbPlot(x_theor), line='45')
plt.show()

How to visually represent time evolution in 2-d Brownian motion simulation

I have modeled Brownian motion in both the x and y directions as random walks. I have plotted the data on a 2-d plot but, while it is not so difficult to trace the simulated particle's path from the origin, I want to be able to see the time-evolution of the particle's path visually represented on the plot, whether it be by changing the color of the line over time, or by adding a third dimension to the plot to represent time, or by using some sort of dynamic graph type.
I haven't tried implementing anything, but I have tried to look at what options are available to me. I want to avoid using a 3d plot if possible. That said, I am open to using something other than matplotlib if it makes sense for this situation (like pyqtgraph).
Here is my code:
import random
import numpy as np
import matplotlib.pyplot as plt

# n is how many trajectory evaluations
n = 1000
t = np.linspace(0, 10000, num=n)

def brownianMotion(time):
    B = [0]
    for t in range(len(time)-1):
        nrand = random.gauss(0, (time[t+1] - time[t])**.5)
        B.append(B[t] + nrand)
    return B

xpath = brownianMotion(t)
ypath = brownianMotion(t)

def plot(x, y):
    plt.figure()
    xplot = np.insert(x, 0, 0)
    yplot = np.insert(y, 0, 0)
    plt.plot(xplot, yplot, 'go-', lw=1, ms=.1)
    #np.arange(0,n+1),'go-', lw=1, ms = .1)
    plt.xlim([-150, 150])
    plt.ylim([-150, 150])
    plt.title('Brownian Motion')
    plt.xlabel('xDisplacement')
    plt.ylabel('yDisplacement')
    plt.show()

plot(xpath, ypath)
All in all, this is just for fun and something I did while bored at work. All suggestions are welcome! Thank you for your time!
Please let me know if I should post a picture of my code's output.
Edit: Additionally, if I wanted to represent multiple particles in the same graph, how could I do that so that the multiple paths are distinguishable? I have modified my code for this purpose, as shown below, but currently this code outputs a messy green mixture of particles.
import random
import numpy as np
import matplotlib.pyplot as plt

nparticles = 20
# n is how many trajectory evaluations
n = 100
t = np.linspace(0, 1000, num=n)

def brownianMotion(time):
    B = [0]
    for t in range(len(time)-1):
        nrand = random.gauss(0, (time[t+1] - time[t])**.5)
        B.append(B[t] + nrand)
    return B

xs = []
ys = []
for i in range(nparticles):
    xs.append(brownianMotion(t))
    ys.append(brownianMotion(t))

#xpath = brownianMotion(t)
#ypath = brownianMotion(t)

def plot(x, y):
    plt.figure()
    for xpath, ypath in zip(x, y):
        xplot = np.insert(xpath, 0, 0)
        yplot = np.insert(ypath, 0, 0)
        plt.plot(xplot, yplot, 'go-', lw=1, ms=.1)
        #np.arange(0,n+1),'go-', lw=1, ms = .1)
    plt.xlim([np.amin(x), np.amax(x)])
    plt.ylim([np.amin(y), np.amax(y)])
    plt.title('Brownian Motion')
    plt.xlabel('xDisplacement')
    plt.ylabel('yDisplacement')
    plt.show()

plot(xs, ys)
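One way to show the time evolution (a sketch, not from the original thread; it assumes the brownianMotion function defined above): build line segments from consecutive points and color each segment by its time using a LineCollection and a colormap. For the multi-particle version, simply dropping the fixed 'g' color in plt.plot and letting matplotlib's color cycle pick a new color per particle already makes the paths distinguishable.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

t = np.linspace(0, 10000, num=1000)
x = np.asarray(brownianMotion(t))   # brownianMotion as defined in the question
y = np.asarray(brownianMotion(t))

# consecutive (x, y) pairs -> segments of shape (n-1, 2, 2)
points = np.column_stack([x, y]).reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)

fig, ax = plt.subplots()
lc = LineCollection(segments, cmap='viridis')
lc.set_array(t[:-1])                # color each segment by its starting time
ax.add_collection(lc)
ax.autoscale()
fig.colorbar(lc, ax=ax, label='time')
ax.set_title('Brownian Motion colored by time')
ax.set_xlabel('xDisplacement')
ax.set_ylabel('yDisplacement')
plt.show()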

get bins coordinates with hexbin in matplotlib

I use matplotlib's method hexbin to compute 2d histograms on my data.
But I would like to get the coordinates of the centers of the hexagons in order to further process the results.
I got the values using the get_array() method on the result, but I cannot figure out how to get the bin coordinates.
I tried to compute them from the number of bins and the extent of my data, but I don't know the exact number of bins in each direction. gridsize=(10,2) should do the trick, but it does not seem to work.
Any idea?
I think this works.
from __future__ import division
import numpy as np
import math
import matplotlib.pyplot as plt

def generate_data(n):
    """Make random, correlated x & y arrays"""
    points = np.random.multivariate_normal(mean=(0, 0),
                                           cov=[[0.4, 9], [9, 10]], size=int(n))
    return points

if __name__ == '__main__':
    color_map = plt.cm.Spectral_r
    n = 1e4
    points = generate_data(n)
    xbnds = np.array([-20.0, 20.0])
    ybnds = np.array([-20.0, 20.0])
    extent = [xbnds[0], xbnds[1], ybnds[0], ybnds[1]]

    fig = plt.figure(figsize=(10, 9))
    ax = fig.add_subplot(111)
    x, y = points.T

    # Set gridsize just to make them visually large
    image = plt.hexbin(x, y, cmap=color_map, gridsize=20, extent=extent, mincnt=1, bins='log')
    # Note that mincnt=1 adds 1 to each count
    counts = image.get_array()
    ncnts = np.count_nonzero(np.power(10, counts))
    verts = image.get_offsets()
    for offc in range(verts.shape[0]):
        binx, biny = verts[offc][0], verts[offc][1]
        if counts[offc]:
            plt.plot(binx, biny, 'k.', zorder=100)

    ax.set_xlim(xbnds)
    ax.set_ylim(ybnds)
    plt.grid(True)
    cb = plt.colorbar(image, spacing='uniform', extend='max')
    plt.show()
I would love to confirm that the code by Hooked using get_offsets() works, but I tried several iterations of the code mentioned above to retrieve center positions and, as Dave mentioned, get_offsets() remains empty. The workaround that I found is to use the non-empty image.get_paths() option. My code takes the mean of each path's vertices to find the center, which makes it just a smidge longer, but it does work.
The get_paths() option returns a set of embedded x,y coordinates that can be looped over and then averaged to return the center position of each hexagon.
The code that I have is as follows:
counts = image.get_array()   # counts in each hexagon, works great
verts = image.get_offsets()  # empty, don't use this
b = image.get_paths()        # this does work, gives Path objects that can be plotted
for x in range(len(b)):
    xav = np.mean(b[x].vertices[0:6, 0])  # center in x (RA)
    yav = np.mean(b[x].vertices[0:6, 1])  # center in y (DEC)
    plt.plot(xav, yav, 'k.', zorder=100)
I had this same problem. I think what needs to be developed is a framework with a HexagonalGrid object which can then be applied to many different data sets (and it would be awesome to do it for N dimensions). This is possible, and it surprises me that neither SciPy nor NumPy has anything for it (furthermore, there seems to be nothing else like it except perhaps binify).
That said, I assume you want to use hexbinning to compare multiple binned data sets. This requires some common base. I got this to work using matplotlib's hexbin in the following way:
import numpy as np
import matplotlib.pyplot as plt

def get_data(mean, cov, n=1e3):
    """
    Quick fake data builder
    """
    np.random.seed(101)
    points = np.random.multivariate_normal(mean=mean, cov=cov, size=int(n))
    x, y = points.T
    return x, y

def get_centers(hexbin_output):
    """
    about 40% faster than previous post only cause you're not calculating the
    min/max every time
    """
    paths = hexbin_output.get_paths()
    v = paths[0].vertices[:-1]  # adds a value [0,0] to the end
    vx, vy = v.T
    idx = [3, 0, 5, 2]  # index for [xmin,xmax,ymin,ymax]
    xmin, xmax, ymin, ymax = vx[idx[0]], vx[idx[1]], vy[idx[2]], vy[idx[3]]
    half_width_x = abs(xmax-xmin)/2.0
    half_width_y = abs(ymax-ymin)/2.0
    centers = []
    for i in range(len(paths)):
        cx = paths[i].vertices[idx[0], 0] + half_width_x
        cy = paths[i].vertices[idx[2], 1] + half_width_y
        centers.append((cx, cy))
    return np.asarray(centers)

# important parts ==>
class Hexagonal2DGrid(object):
    """
    Used to fix the gridsize, extent, and bins
    """
    def __init__(self, gridsize, extent, bins=None):
        self.gridsize = gridsize
        self.extent = extent
        self.bins = bins

def hexbin(x, y, hexgrid):
    """
    To hexagonally bin the data in 2 dimensions
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Note mincnt=0 so that it will return a value for every point in the
    # hexgrid, not just those with count>mincnt
    # Basically you fix the gridsize, extent, and bins to keep them the same
    # then the resulting count array is the same
    hexbin = plt.hexbin(x, y, mincnt=0,
                        gridsize=hexgrid.gridsize,
                        extent=hexgrid.extent,
                        bins=hexgrid.bins)
    # you could close the figure if you don't want it
    # plt.close(fig.number)
    counts = hexbin.get_array().copy()
    return counts, hexbin

# Example ===>
if __name__ == "__main__":
    hexgrid = Hexagonal2DGrid((21, 5), [-70, 70, -20, 20])
    x_data, y_data = get_data((0, 0), [[-40, 95], [90, 10]])
    x_model, y_model = get_data((0, 10), [[100, 30], [3, 30]])
    counts_data, hexbin_data = hexbin(x_data, y_data, hexgrid)
    counts_model, hexbin_model = hexbin(x_model, y_model, hexgrid)
    # if you want the centers, they will be the same for both
    centers = get_centers(hexbin_data)
    # if you want to ignore the cells with zeros then use the following mask.
    # But if you want zeros for some bins and not others I'm not sure of an
    # elegant way to do this without using the centers
    nonzero = counts_data != 0
    # now you can compare the two data sets
    variance_data = counts_data[nonzero]
    square_diffs = (counts_data[nonzero]-counts_model[nonzero])**2
    chi2 = np.sum(square_diffs/variance_data)
    print(" chi2={}".format(chi2))

scipy.interpolate.UnivariateSpline not smoothing regardless of parameters

I'm having trouble getting scipy.interpolate.UnivariateSpline to use any smoothing when interpolating. Based on the function's page as well as some previous posts, I believe it should provide smoothing with the s parameter.
Here is my code:
# Imports
import scipy
import scipy.interpolate
import pylab

# Set up and plot actual data
x = [0, 5024.2059124920379, 7933.1645067836089, 7990.4664106277542, 9879.9717114947653, 13738.60563208926, 15113.277958924193]
y = [0.0, 3072.5653360000988, 5477.2689107965398, 5851.6866463790966, 6056.3852496014106, 7895.2332350173638, 9154.2956175610598]
pylab.plot(x, y, "o", label="Actual")

# Plot estimates using splines with a range of degrees
for k in range(1, 4):
    mySpline = scipy.interpolate.UnivariateSpline(x=x, y=y, k=k, s=2)
    xi = range(0, 15100, 20)
    yi = mySpline(xi)
    pylab.plot(xi, yi, label="Predicted k=%d" % k)

# Show the plot
pylab.grid(True)
pylab.xticks(rotation=45)
pylab.legend(loc="lower right")
pylab.show()
Here is the result:
I have tried this with a range of s values (0.01, 0.1, 1, 2, 5, 50), as well as explicit weights, set to either the same thing (1.0) or randomized. I still can't get any smoothing, and the number of knots is always the same as the number of data points. In particular, I'm looking for outliers like that 4th point (7990.4664106277542, 5851.6866463790966) to be smoothed over.
Is it because I don't have enough data? If so, is there a similar spline function or cluster technique I can apply to achieve smoothing with this few datapoints?
Short answer: you need to choose the value for s more carefully.
The documentation for UnivariateSpline states that:
Positive smoothing factor used to choose the number of knots. Number of
knots will be increased until the smoothing condition is satisfied:
sum((w[i]*(y[i]-s(x[i])))**2,axis=0) <= s
From this one can deduce that "reasonable" values for smoothing, if you don't pass in explicit weights, are around s = m * v where m is the number of data points and v the variance of the data. In this case, s_good ~ 5e7.
EDIT: sensible values for s depend of course also on the noise level in the data. The docs seem to recommend choosing s in the range (m - sqrt(2*m)) * std**2 <= s <= (m + sqrt(2*m)) * std**2 where std is the standard deviation associated with the "noise" you want to smooth over.
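To make that concrete, here is a minimal sketch (my own, using the data from the question) that picks s on the order of m * var(y) as suggested above:
import numpy as np
import scipy.interpolate
import matplotlib.pyplot as plt

x = np.array([0, 5024.2059124920379, 7933.1645067836089, 7990.4664106277542,
              9879.9717114947653, 13738.60563208926, 15113.277958924193])
y = np.array([0.0, 3072.5653360000988, 5477.2689107965398, 5851.6866463790966,
              6056.3852496014106, 7895.2332350173638, 9154.2956175610598])

s_good = len(y) * np.var(y)          # roughly 5e7 for this data
spline = scipy.interpolate.UnivariateSpline(x, y, k=3, s=s_good)
print(len(spline.get_knots()))       # far fewer knots than data points now

xi = np.linspace(x.min(), x.max(), 500)
plt.plot(x, y, 'o', label='data')
plt.plot(xi, spline(xi), label='s = m * var(y)')
plt.legend()
plt.show()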
Zhenya's answer of manually setting knots in between datapoints was too rough to deliver good results in noisy data without being selective about how the technique is applied. However, inspired by his/her suggestion, I have had success with Mean-Shift clustering from the scikit-learn package. It performs auto-determination of the cluster count and seems to do a fairly good smoothing job (very smooth in fact).
# Imports
import numpy
import pylab
import scipy
import sklearn.cluster
# Set up original data - note that it's monotonically increasing by X value!
data = {}
data['original'] = {}
data['original']['x'] = [0, 5024.2059124920379, 7933.1645067836089, 7990.4664106277542, 9879.9717114947653, 13738.60563208926, 15113.277958924193]
data['original']['y'] = [0.0, 3072.5653360000988, 5477.2689107965398, 5851.6866463790966, 6056.3852496014106, 7895.2332350173638, 9154.2956175610598]
# Cluster data, sort it and save
inputNumpy = numpy.array([[data['original']['x'][i], data['original']['y'][i]] for i in range(0, len(data['original']['x']))])
meanShift = sklearn.cluster.MeanShift()
meanShift.fit(inputNumpy)
clusteredData = [[pair[0], pair[1]] for pair in meanShift.cluster_centers_]
clusteredData.sort(lambda pair1, pair2: cmp(pair1[0],pair2[0]))
data['clustered'] = {}
data['clustered']['x'] = [pair[0] for pair in clusteredData]
data['clustered']['y'] = [pair[1] for pair in clusteredData]
# Build a spline using the clustered data and predict
mySpline = scipy.interpolate.UnivariateSpline(x=data['clustered']['x'], y=data['clustered']['y'], k=1)
xi = range(0, round(max(data['original']['x']), -3) + 3000, 20)
yi = mySpline(xi)
# Plot the datapoints
pylab.plot(data['clustered']['x'], data['clustered']['y'], "D", label="Datapoints (%s)" % 'clustered')
pylab.plot(xi, yi, label="Predicted (%s)" % 'clustered')
pylab.plot(data['original']['x'], data['original']['y'], "o", label="Datapoints (%s)" % 'original')
# Show the plot
pylab.grid(True)
pylab.xticks(rotation=45)
pylab.legend( loc="lower right" )
pylab.show()
While I'm not aware of any library which will do it for you off-hand, I'd try a bit more DIY approach: I'd start from making a spline with knots in between the raw data points, in both x and y. In your particular example, having a single knot in between the 4th and 5th points should do the trick, since it'd remove the huge derivative at around x=8000.
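For reference, a sketch of this idea using scipy's LSQUnivariateSpline, which lets you place the interior knots yourself (the knot position 8900 here is just a value I picked between the 4th and 5th x values, not taken from the original answer):
import numpy as np
from scipy.interpolate import LSQUnivariateSpline
import matplotlib.pyplot as plt

x = np.array([0, 5024.2059124920379, 7933.1645067836089, 7990.4664106277542,
              9879.9717114947653, 13738.60563208926, 15113.277958924193])
y = np.array([0.0, 3072.5653360000988, 5477.2689107965398, 5851.6866463790966,
              6056.3852496014106, 7895.2332350173638, 9154.2956175610598])

knots = [8900.0]   # a single interior knot between the 4th and 5th points
spline = LSQUnivariateSpline(x, y, knots, k=3)

xi = np.linspace(x.min(), x.max(), 500)
plt.plot(x, y, 'o', label='data')
plt.plot(xi, spline(xi), label='LSQ spline, one interior knot')
plt.legend()
plt.show()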
I had trouble getting BigChef's answer running; here is a variation that works on Python 3.6:
# Imports
import pylab
import scipy
import scipy.interpolate
import sklearn.cluster
# Set up original data - note that it's monotonically increasing by X value!
data = {}
data['original'] = {}
data['original']['x'] = [0, 5024.2059124920379, 7933.1645067836089, 7990.4664106277542, 9879.9717114947653, 13738.60563208926, 15113.277958924193]
data['original']['y'] = [0.0, 3072.5653360000988, 5477.2689107965398, 5851.6866463790966, 6056.3852496014106, 7895.2332350173638, 9154.2956175610598]
# Cluster data, sort it and save
import numpy
inputNumpy = numpy.array([[data['original']['x'][i], data['original']['y'][i]] for i in range(0, len(data['original']['x']))])
meanShift = sklearn.cluster.MeanShift()
meanShift.fit(inputNumpy)
clusteredData = [[pair[0], pair[1]] for pair in meanShift.cluster_centers_]
clusteredData.sort(key=lambda li: li[0])
data['clustered'] = {}
data['clustered']['x'] = [pair[0] for pair in clusteredData]
data['clustered']['y'] = [pair[1] for pair in clusteredData]
# Build a spline using the clustered data and predict
mySpline = scipy.interpolate.UnivariateSpline(x=data['clustered']['x'], y=data['clustered']['y'], k=1)
xi = range(0, int(round(max(data['original']['x']), -3)) + 3000, 20)
yi = mySpline(xi)
# Plot the datapoints
pylab.plot(data['clustered']['x'], data['clustered']['y'], "D", label="Datapoints (%s)" % 'clustered')
pylab.plot(xi, yi, label="Predicted (%s)" % 'clustered')
pylab.plot(data['original']['x'], data['original']['y'], "o", label="Datapoints (%s)" % 'original')
# Show the plot
pylab.grid(True)
pylab.xticks(rotation=45)
pylab.show()

Fitting distributions, goodness of fit, p-value. Is it possible to do this with Scipy (Python)?

INTRODUCTION: I'm a bioinformatician. In my analysis, which I perform on all human genes (about 20,000), I search for a particular short sequence motif and count how many times it occurs in each gene.
Genes are 'written' as a linear sequence of four letters (A, T, G, C), for example: CGTAGGGGGTTTAC... This four-letter alphabet of the genetic code is how DNA stores information in each cell.
I suspect that frequent repetitions of a particular short motif (AGTGGAC) in some genes are crucial to a specific biochemical process in the cell. Since the motif itself is very short, it is difficult with computational tools to distinguish true functional occurrences in genes from those that look similar by chance. To avoid this problem, I take the sequences of all genes, concatenate them into a single string, and shuffle it. The length of each of the original genes is stored. Then, for each original sequence length, a random sequence is constructed by repeatedly picking A, T, G or C at random from the concatenated sequence and transferring it to the random sequence. In this way, the resulting set of randomized sequences has the same length distribution as well as the same overall A, T, G, C composition. Then I search for the motif in these randomized sequences. I performed this procedure 1000 times and averaged the results, which look like this:
15000 genes that do not contain a given motif
5000 genes that contain 1 motif
3000 genes that contain 2 motifs
1000 genes that contain 3 motifs
...
1 gene that contain 6 motifs
So even after 1000 randomizations of the true genetic code, there aren't any genes with more than 6 motifs. But in the true genetic code, there are a few genes which contain more than 20 occurrences of the motif, which suggests that these repetitions might be functional and that it's unlikely to find them in such abundance by pure chance.
PROBLEM:
I would like to know the probability of finding a gene with, let's say, 20 occurrences of the motif in my distribution. In other words, I want to know the probability of finding such a gene by chance. I would like to implement this in Python, but I don't know how.
Can I do such an analysis in Python?
Any help would be appreciated.
In the SciPy documentation you will find a list of all implemented continuous distributions. Each one has a fit() method, which returns the corresponding shape parameters.
Even if you don't know which distribution to use, you can try many distributions simultaneously and choose the one that fits your data best, as in the code below. Note that if you have no idea about the underlying distribution, it may be difficult to fit the sample.
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

size = 20000
x = np.arange(size)   # numpy is used here because scipy.arange/scipy.round_ were removed from SciPy
# creating the dummy sample (using beta distribution)
y = np.round(scipy.stats.beta.rvs(6, 2, size=size)*47).astype(int)
# creating the histogram
h = plt.hist(y, bins=range(48))

dist_names = ['alpha', 'beta', 'arcsine',
              'weibull_min', 'weibull_max', 'rayleigh']
for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    params = dist.fit(y)
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    if arg:
        pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
    else:
        pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
    plt.plot(pdf_fitted, label=dist_name)
    plt.xlim(0, 47)
plt.legend(loc='upper left')
plt.show()
References:
- Distribution fitting with Scipy
- Fitting empirical distribution to theoretical ones with Scipy (Python)?
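As a follow-up (not part of the original answer): once you have settled on a distribution, the fitted parameters can be plugged into its survival function to get the tail probability the question asks about. Using the dist, arg, loc and scale variables from the loop above for whichever distribution you keep (20 is just the example count from the question):
# P(X > 20) under the fitted model; for a continuous fit this is
# effectively the chance of seeing at least 20 occurrences by chance
p_tail = dist.sf(20, *arg, loc=loc, scale=scale)
print("tail probability for >= 20 occurrences: %.3g" % p_tail)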
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import random

mpl.style.use("ggplot")

def danoes_formula(data):
    """
    DOANE'S FORMULA
    https://en.wikipedia.org/wiki/Histogram#Doane's_formula
    """
    N = len(data)
    skewness = st.skew(data)
    sigma_g1 = math.sqrt((6*(N-2))/((N+1)*(N+3)))
    num_bins = 1 + math.log(N, 2) + math.log(1+abs(skewness)/sigma_g1, 2)
    num_bins = round(num_bins)
    return num_bins

def plot_histogram(data, results, n):
    ## n first distributions of the ranking
    N_DISTRIBUTIONS = {k: results[k] for k in list(results)[:n]}

    ## Histogram of data
    plt.figure(figsize=(10, 5))
    plt.hist(data, density=True, ec='white', color=(63/235, 149/235, 170/235))
    plt.title('HISTOGRAM')
    plt.xlabel('Values')
    plt.ylabel('Frequencies')

    ## Plot n distributions
    for distribution, result in N_DISTRIBUTIONS.items():
        # print(i, distribution)
        sse = result[0]
        arg = result[1]
        loc = result[2]
        scale = result[3]
        x_plot = np.linspace(min(data), max(data), 1000)
        y_plot = distribution.pdf(x_plot, loc=loc, scale=scale, *arg)
        plt.plot(x_plot, y_plot, label=str(distribution)[32:-34] + ": " + str(sse)[0:6], color=(random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)))

    plt.legend(title='DISTRIBUTIONS', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

def fit_data(data):
    ## st.frechet_r, st.frechet_l: are disabled in the current SciPy version
    ## st.levy_stable: a lot of time of estimation parameters
    ALL_DISTRIBUTIONS = [
        st.alpha, st.anglit, st.arcsine, st.beta, st.betaprime, st.bradford, st.burr, st.cauchy, st.chi, st.chi2, st.cosine,
        st.dgamma, st.dweibull, st.erlang, st.expon, st.exponnorm, st.exponweib, st.exponpow, st.f, st.fatiguelife, st.fisk,
        st.foldcauchy, st.foldnorm, st.genlogistic, st.genpareto, st.gennorm, st.genexpon,
        st.genextreme, st.gausshyper, st.gamma, st.gengamma, st.genhalflogistic, st.gilbrat, st.gompertz, st.gumbel_r,
        st.gumbel_l, st.halfcauchy, st.halflogistic, st.halfnorm, st.halfgennorm, st.hypsecant, st.invgamma, st.invgauss,
        st.invweibull, st.johnsonsb, st.johnsonsu, st.ksone, st.kstwobign, st.laplace, st.levy, st.levy_l,
        st.logistic, st.loggamma, st.loglaplace, st.lognorm, st.lomax, st.maxwell, st.mielke, st.nakagami, st.ncx2, st.ncf,
        st.nct, st.norm, st.pareto, st.pearson3, st.powerlaw, st.powerlognorm, st.powernorm, st.rdist, st.reciprocal,
        st.rayleigh, st.rice, st.recipinvgauss, st.semicircular, st.t, st.triang, st.truncexpon, st.truncnorm, st.tukeylambda,
        st.uniform, st.vonmises, st.vonmises_line, st.wald, st.weibull_min, st.weibull_max, st.wrapcauchy
    ]

    MY_DISTRIBUTIONS = [st.beta, st.expon, st.norm, st.uniform, st.johnsonsb, st.gennorm, st.gausshyper]

    ## Calculate histogram
    num_bins = danoes_formula(data)
    frequencies, bin_edges = np.histogram(data, num_bins, density=True)
    central_values = [(bin_edges[i] + bin_edges[i+1])/2 for i in range(len(bin_edges)-1)]

    results = {}
    for distribution in MY_DISTRIBUTIONS:
        ## Get parameters of distribution
        params = distribution.fit(data)

        ## Separate parts of parameters
        arg = params[:-2]
        loc = params[-2]
        scale = params[-1]

        ## Calculate fitted PDF and error with fit in distribution
        pdf_values = [distribution.pdf(c, loc=loc, scale=scale, *arg) for c in central_values]

        ## Calculate SSE (sum of squared estimate of errors)
        sse = np.sum(np.power(frequencies - pdf_values, 2.0))

        ## Build results and sort by sse
        results[distribution] = [sse, arg, loc, scale]

    results = {k: results[k] for k in sorted(results, key=results.get)}
    return results

def main():
    ## Import data
    data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())
    results = fit_data(data)
    plot_histogram(data, results, 5)

if __name__ == "__main__":
    main()
