What is the best way to fit multivariate linear equation? - python

I want to fit the linear equation of the form Y=coff1A+coff2B+C and calculate constant values coff1, coff2, and C. I did several ways but I got a huge difference in coefficient values. What is the efficient way to fit this type of equation?
import pandas as pd
import lmfit
dataset = {'A': [0.021426, -0.003970,0.001040, -0.003789, 0.009423, 0.046421, 0.039426, 0.027010, 0.024423, 0.022277],
'B': [ 0.000056, 0.000098, 0.000057, 0.000066, 0.000047 ,-0.009798,-0.008069,-0.005124,-0.004505,-0.004006],
'y': [242245.852, 153763.713, 205788.950, 161561.380, 250021.084,235739.216, 283089.372, 429715.097, 480362.889, 531978.557]}
data = pd.DataFrame(dataset)
A=data['A']
B=data['B']
y=data['y']
def resid(params,A, B, ydata):
f_rot = params['f_rot'].value
xip = params['xip'].value
xim = params['xim'].value
f_obs =f_rot+A*xip-B*xim
return f_obs - ydata
#data = np.loadtxt("C:/Users/USER/Desktop/Combined_values_FINAL_EAP.txt")
params = lmfit.Parameters()
params.add('f_rot', 15)
params.add('xip', 2)
params.add('xim', 10)
fit =lmfit.minimize(resid, params, args=(y,A,B), method='least_squares')
lmfit.report_fit(fit)
#Answer of this fit
#f_rot: 3.9748e-04 +/- 0.00139658 (351.36%) (init = 150000)
# xip: 3.7470e-10 +/- 4.7904e-09 (1278.48%) (init = 2)
# xim: 0.19744062 +/- 0.03694633 (18.71%) (init = 10)

Related

Python Fit Arbitrary Gaussian Functions - Bad Fits?

I'm trying to generalise some code to be able to fit multiple (n from 1 to >10) gaussian curves/peaks within a single dataset.
Using Scipy Optimise Curve_fit I can get pretty good fits when I hard code functions for 1-3 gaussians, and I've managed to produce functions which run without error for a generalise, arbitrary number of gaussians. However, the output fit is very poor. This is despite giving the input parameters which are identical to those used to generate the 'raw' data - i.e. a best case scenario.
Also, there is a non-zero chance the specific function may need to be modified from a simple gaussian at some point, but for now it should be OK.
Below is my code example, and the output figure is shown below.
import numpy as np
import pandas as pd
import scipy
import scipy.optimize
import matplotlib.pyplot as plt
from matplotlib import gridspec
amp1 = 1
cen1 = 1
sigma1 = 0.05
df=pd.DataFrame(index=np.linspace(0,10,num=1000),columns=['int'])
def _ngaussian(x, amps,cens,sigmas):
fn = 0
if len(amps)== len(cens)== len(sigmas):
for i in range(len(amps)):
fn = fn+amps[i]*(1/(sigmas[i]*(np.sqrt(2*np.pi))))*\
(np.exp((-1.0/2.0)*(((x-cens[i])/sigmas[i])**2)))
else:
print('Your inputs have unequal lengths')
return fn
amps = [1,1.1,0.9]
cens = [1,2,1.7]
sigmas=[0.05]*3
popt_peaks = [amps,cens,sigmas]
df['peaks'] = _ngaussian(df.index, *popt_peaks)
# Optionally adding noise to the raw data
#noise = np.random.normal(0,0.1,len(df['peaks']))
#df['peaks'] = df['peaks']+noise
def wrapper_fit_func(x, *args):
N = len(args)
a, b, c = list(args[0][:N]),list(args[0][N:N*2]),list(args[0][2*N:3*N])
return _ngaussian(x, a, b, c)
def unwrapper_fit_func(x, *args):
N = int(len(args)/3)
a, b, c = list(args[:N]),list(args[N:N*2]),list(args[2*N:3*N])
return _ngaussian(x, a, b, c)
popt_fitpeaks, pcov_fitpeaks = scipy.optimize.curve_fit(lambda x, *popt_peaks: wrapper_fit_func(x, popt_peaks),
df.index, df['peaks'], p0=popt_peaks,
method='lm')
df['peaks_fit'] = unwrapper_fit_func(df.index, *popt_fitpeaks)
fig = plt.figure(figsize=(8,8))
gs = gridspec.GridSpec(1,1)
ax1 = fig.add_subplot(gs[0])
ax1.set_xlim(0,3)
ax1.plot(df.index, df['peaks'], "b",label='ideal data')
ax1.plot(df.index, df['peaks_fit'], "g",label='fit data')
ax1.legend(loc='upper right')
If you're interested, the context is in analytical chemistry, nuclear magnetic resonance (NMR) and Fourier transform ion cyclotron resonance mass spectrometry (FTICR MS) signal processing.
You might find lmfit (https://lmfit.github.io/lmfit-py/, disclosure: I am a lead author) useful for this. It provides an easy-to-use Model class for modeling data, including builtin Models for Gaussian, Voigt, and similar lineshapes making it easy to compare model functions.
Lmfit models can be added (or mulitplied) to make a Composite Model, making it easy to support 1, 2, 3, etc Gaussians and include different baseline functions as well. There are docs and several examples at the link above. A small rewrite of your example (including adding a bit of noise) might look like this:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lmfit.models import GaussianModel
amp1 = 1
cen1 = 1
sigma1 = 0.05
df=pd.DataFrame(index=np.linspace(0,10,num=1000),columns=['int'])
def _ngaussian(x, amps,cens,sigmas):
fn = 0
if len(amps)== len(cens)== len(sigmas):
for i in range(len(amps)):
fn = fn+amps[i]*(1/(sigmas[i]*(np.sqrt(2*np.pi))))*\
(np.exp((-1.0/2.0)*(((x-cens[i])/sigmas[i])**2)))
fn = fn+np.random.normal(size=len(x), scale=0.05)
else:
print('Your inputs have unequal lengths')
return fn
amps = [1.30, 0.92, 2.11]
cens = [1.10, 1.73, 2.06]
sigmas=[0.05, 0.09, 0.07]
popt_peaks = [amps,cens,sigmas]
df['peaks'] = _ngaussian(df.index, *popt_peaks)
# create a model with 3 Gaussians: pretty easy to generalize
# to a loop to make N peaks
model = (GaussianModel(prefix='p1_') +
GaussianModel(prefix='p2_') +
GaussianModel(prefix='p3_') )
# create Parameters (named from function arguments). For
# Gaussian, Lorentzian, Voigt, etc these are "center", "amplitude", "sigma"
params = model.make_params(p1_center=1.0, p1_amplitude=2, p1_sigma=0.1,
p2_center=1.5, p2_amplitude=2, p2_sigma=0.1,
p3_center=2.0, p3_amplitude=2, p3_sigma=0.1)
# Parameters can have min/max bounds, be fixed (`.vary = False`)
# or constrained to a mathematical expression of other Parameter values
params['p1_center'].min = 0.8
params['p1_center'].max = 1.5
params['p2_center'].min = 1.1
params['p2_center'].max = 1.9
params['p3_center'].min = 1.88
params['p3_center'].max = 3.00
# run the fit
result = model.fit(df['peaks'], params, x=df.index)
# print out the fit results
print(result.fit_report())
# plot results
plt.plot(df.index, df['peaks'], 'o', label='data')
plt.plot(df.index, result.best_fit, '-', label='fit')
plt.legend()
plt.gca().set_xlim(0, 3)
plt.show()
This will produce a fit plot like this:
and print out a report of
[[Model]]
((Model(gaussian, prefix='p1_') + Model(gaussian, prefix='p2_')) + Model(gaussian, prefix='p3_'))
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 102
# data points = 1000
# variables = 9
chi-square = 6.88439024
reduced chi-square = 0.00694691
Akaike info crit = -4960.49871
Bayesian info crit = -4916.32892
[[Variables]]
p1_amplitude: 1.29432022 +/- 0.00428720 (0.33%) (init = 2)
p1_center: 1.09993745 +/- 1.9012e-04 (0.02%) (init = 1)
p1_sigma: 0.04970776 +/- 1.9012e-04 (0.38%) (init = 0.1)
p2_amplitude: 0.91875183 +/- 0.00604913 (0.66%) (init = 2)
p2_center: 1.73039597 +/- 6.7594e-04 (0.04%) (init = 1.5)
p2_sigma: 0.09054027 +/- 7.0994e-04 (0.78%) (init = 0.1)
p3_amplitude: 2.10077395 +/- 0.00533617 (0.25%) (init = 2)
p3_center: 2.06019332 +/- 2.0105e-04 (0.01%) (init = 2)
p3_sigma: 0.06970239 +/- 2.0752e-04 (0.30%) (init = 0.1)
p1_fwhm: 0.11705282 +/- 4.4770e-04 (0.38%) == '2.3548200*p1_sigma'
p1_height: 10.3878975 +/- 0.03440799 (0.33%) == '0.3989423*p1_amplitude/max(2.220446049250313e-16, p1_sigma)'
p2_fwhm: 0.21320604 +/- 0.00167179 (0.78%) == '2.3548200*p2_sigma'
p2_height: 4.04824243 +/- 0.02582408 (0.64%) == '0.3989423*p2_amplitude/max(2.220446049250313e-16, p2_sigma)'
p3_fwhm: 0.16413657 +/- 4.8866e-04 (0.30%) == '2.3548200*p3_sigma'
p3_height: 12.0238006 +/- 0.02922330 (0.24%) == '0.3989423*p3_amplitude/max(2.220446049250313e-16, p3_sigma)'
[[Correlations]] (unreported correlations are < 0.100)
C(p3_amplitude, p3_sigma) = 0.622
C(p2_amplitude, p2_sigma) = 0.621
C(p1_amplitude, p1_sigma) = 0.577
C(p2_sigma, p3_sigma) = -0.299
C(p2_sigma, p3_amplitude) = -0.271
C(p2_amplitude, p3_sigma) = -0.239
C(p2_sigma, p3_center) = 0.226
C(p2_amplitude, p3_amplitude) = -0.210
C(p2_center, p3_sigma) = -0.192
C(p2_amplitude, p3_center) = 0.171
C(p2_center, p3_amplitude) = -0.160
C(p2_center, p3_center) = 0.126

Scipy.optimize_curve_fit says that the covariance of the parameters cannot be estimated

I am trying to perform a curve fit in order to estimate values of the parameters of a function for a set of data using scipy.optimize.curve_fit. My data has uncertainties in both the x and y data. When I look up the documentation for scipy.optimize.curve_fit, it says that sigma=the uncertainty of the y values. But when I put that in, I get an OptimizeWarning: OptimizeWarning: Covariance of the parameters could not be estimated
How do I fix this?
This is my code:
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
frequency = [111.11, 160, 540.54, 740.74, 909.09, 1250, 1538.46, 2000,
2352.94, 2666.67, 2941.18, 3333.33, 3571.43, 3846.15, 4347.83, 4545.45, 5000]
ufrequency = [3.70, 3.84, 32.14, 17.49, 34.94, 46.80, 37.47, 61.12, 83.32, 106.10, 52.42, 72.22, 81.76, 93.64, 117.39, 127.69, 152.75]
yvalues = [88/90, 175/200, 76/99, 17/26, 30/53, 25/53, 11/27, 8/27, 4/15,
7/30, 29/135, 5/27, 23/135, 22/135, 8/53, 37/265, 33/260]
uyvalues = [0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 4.94e-3, 4.67e-3, 4.37e-3, 3.93e-3, 3.70e-3, 3.60e-3, 3.46e-3, 3.32e-3, 3.2e-3]
plt.errorbar(frequency, yvalues, xerr=ufrequency, yerr = uyvalues, fmt = 'b+', label = "Data")
plt.show()
defining the model function
def f(freq, C, R, d, mu):
return 1/(np.sqrt(1+((R*d)/(2/mu*C*2*np.pi*freq))**2))
define initial guesses for the parameters
C0 = 70000000
R0 = 0.01012
d0 = 0.0004
mu0 = 1.256629e-6
p0 = [C0, R0, d0, mu0]
name = ["C", "R", "d", "mu"]
tmodel = np.linspace(100, 5000, 1000)
ystart = f(tmodel,*p0)
popt, pcov = curve_fit(f, frequency, yvalues, p0, sigma=uyvalues, absolute_sigma=True)
Your model us overdetermined: note that it onky depends on the product d*R, so there's no way to find these two separately. Ditto for mu*C.

Fit the gamma distribution only to a subset of the samples

I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution but not on the whole data but just to the first curve of the histogram (the first mode). The green plot in the previous graph corresponds to when I fitted the Gamma distribution on all the samples using the following python code which makes use of scipy.stats.gamma:
img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, normed=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print '(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta)
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the max of the previous histogram, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize
def truncated_gamma(x, max_data, alpha, beta):
gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
return np.where(x < max_data, gammapdf / norm, 0)
# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, normed=True)
# using minimize on a slice data below max of histogram
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, normed=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
return np.where(x<max_data, gammapdf/norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
The rest is just optimization technicalities.
It's common practise to work with logarithms, so I used what's sometimes called "energy", or the logarithm of the inverse of the probability density.
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to
Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse hessian, which is useful for uncertainty estimation. Enforcement of nonnegativity for the parameters may also be a good idea.
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach using a manually created dataset in excel that more or less matched the plot given.
Raw Data
Outline
Imported data into a Pandas dataframe.
Mask the indices after the
max response index.
Create a mirror image of the remaining data.
Append the mirror image while leaving a buffer of empty space.
Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width.
Working Script
# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)
# Mask indices after index at max Y.
mask = df.index.values <= df.Y.argmax()
df = df.loc[mask, :]
scaled_y = 100*df.Y.values
# Create new df with mirror image of Y appended.
sep = 6
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=np.float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)
# Using Scipy-cookbook to fit a normal by method of moments.
idxs = np.arange(new_y.size) # idxs=[0, 1, 2,...,len(data)]
mid_idxs = idxs.mean() # len(data)/2
# idxs-mid_idxs is [-53.5, -52.5, ..., 52.5, len(data)/2]
scaling_param = np.sqrt(np.abs(np.sum((idxs-mid_idxs)**2*new_y)/np.sum(new_y)))
# adjust amplitude
fmax = new_y.max()*1.2 # adjusted function max to 120% max y.
# adjust width
scaling_param = scaling_param*.7 # adjusted by 70%.
# Fit normal.
fit = lambda t: fmax*np.exp(-(t-mid_idxs)**2/(2*scaling_param**2))
# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.

Python Pseudo-Voigt fit experimental data

I would like to use Pseudo-Voigt function to fit the data points below.
I looked at matplotlib and numpy but haven't found a way yet.
The data looks like this:
[3.3487290833206163, 3.441076831745743, 7.7932863251851305, 7.519064207516034, 7.394406511652473, 11.251458210206666, 4.679476113847004, 8.313048016542345, 9.348006472917458, 6.086336477997078, 10.765370342398741, 11.402519337778239, 11.151689287913552, 8.546151698722557, 8.323886291540909, 7.133249200994414, 10.242189407441712, 8.887686444395982, 10.759444780127321, 9.21095463298772, 15.693160143294264, 9.239683298899614, 9.476116297451632, 10.128625585058783, 10.94392508956097, 10.274287987647595, 9.552394167463973, 9.51931115335406, 9.923989117054466, 8.646255122559495, 12.207746464070603, 15.249531807666745, 9.820667193850705, 11.913964012172858, 9.506862412612637, 15.858588835799232, 14.918486963658015, 15.089436171053094, 14.38496801289269, 14.42394419048644, 15.759311758218061, 17.063349232010786, 12.232863723786215, 10.988245956134314, 19.109899560493286, 18.344353100589824, 17.397232553539542, 12.372706600456558, 13.038720878764792, 19.100965014037367, 17.094480819566147, 20.801679461435484, 15.763762333448557, 22.302320507719728, 23.394129891315963, 19.884812694503303, 22.09743700979689, 16.995815335935077, 24.286037929073284, 25.214705826961016, 25.305223543285013, 22.656121668613896, 30.185701748800568, 28.28382587095781, 35.63753811848088, 35.59816270398698, 35.64529822281625, 36.213428394807224, 39.56541841125095, 46.360702383473075, 55.84449512752349, 64.50142387788203, 77.75090937376423, 83.00423387164669, 111.98365374689226, 121.05211901294848, 176.82062069814936, 198.46769832454626, 210.52624393366017, 215.36708238568033, 221.58003148955638, 209.7551225151964, 198.4104196333782, 168.13949002992925, 126.0081896958841, 110.39003569380478, 90.88743461485616, 60.5443025644061, 71.00628698937221, 61.616294708485384, 45.32803695045095, 43.85638472551629, 48.863070901568086, 44.65252243455522, 41.209120125948104, 36.63478075990383, 36.098369542551325, 37.75419965137265, 41.102019290969956, 26.874409332756752, 24.63314900554918, 26.05340465966265, 26.787053802870535, 16.51559065528567, 19.367731289491633, 17.794958746427422, 19.52785218727518, 15.437635249660396, 21.96712662378481, 15.311043443598177, 16.49893493905559, 16.41202114648668, 17.904512123179114, 14.198812322372405, 15.296623848360126, 14.39383356078112, 10.807540004905345, 17.405310725810278, 15.309786310492559, 15.117665282794073, 15.926377010540376, 14.000223621497955, 15.827757539949431, 19.22355433703294, 12.278007446886507, 14.822245428954957, 13.226674931853903, 10.551237809932955, 8.58081654372226, 10.329123069771072, 13.709943935412294, 11.778442391614956, 14.454930746849122, 10.023352452542506, 11.01463585064886, 10.621062477382623, 9.29665510291416, 9.633579419680572, 11.482703531988037, 9.819073927883121, 12.095918617534196, 9.820590920621864, 9.620109753045565, 13.215701804432598, 8.092085538619543, 9.828015669152578, 8.259655585415379, 9.424189583067022, 13.149985946123934, 7.471175119197948, 10.947567075630904, 10.777888096711512, 8.477442195191612, 9.585429992609711, 7.032549866566089, 5.103962051624133, 9.285999577275545, 7.421574444036404, 5.740841317806245, 2.3672530845679]
You can use lmfit (pip install --user lmfit):
http://lmfit.github.io/lmfit-py/builtin_models.html#pseudovoigtmodel
http://lmfit.github.io/lmfit-py/builtin_models.html#example-1-fit-peaked-data-to-gaussian-lorentzian-and-voigt-profiles
import numpy as np
from lmfit.models import PseudoVoigtModel
x = np.arange(0, 160)
y = # grabbed from your post
mod = PseudoVoigtModel()
pars = mod.guess(y, x=x)
out = mod.fit(y, pars, x=x)
print(out.fit_report(min_correl=0.25))
out.plot()
which results in:
[[Model]]
Model(pvoigt)
[[Fit Statistics]]
# function evals = 73
# data points = 160
# variables = 4
chi-square = 10762.372
reduced chi-square = 68.990
[[Variables]]
amplitude: 4405.17064 +/- 83.84199 (1.90%) (init= 2740.16)
sigma: 5.63732815 +/- 0.236117 (4.19%) (init= 5)
center: 79.5249321 +/- 0.103164 (0.13%) (init= 79)
fraction: 1.21222411 +/- 0.052349 (4.32%) (init= 0.5)
fwhm: 11.2746563 +/- 0.472234 (4.19%) == '2.0000000*sigma'
[[Correlations]] (unreported correlations are < 0.250)
C(sigma, fraction) = -0.774
C(amplitude, fraction) = 0.314
You could use the nmrglue library:
from nmrglue import linshapes1d as ls
ls.sim_pvoigt_fwhm(x, x0, fwhm, eta)
where
x: Array of values at which to evalutate distribution.
x0: Center of the distribution.
fwhm: Full-width at half-maximum of the Pseudo Voigt profile.
eta: Lorentzian/Gaussian mixing parameter.

Fitting negative binomial in python

In scipy there is no support for fitting a negative binomial distribution using data
(maybe due to the fact that the negative binomial in scipy is only discrete).
For a normal distribution I would just do:
from scipy.stats import norm
param = norm.fit(samp)
Is there something similar 'ready to use' function in any other library?
Statsmodels has discrete.discrete_model.NegativeBinomial.fit(), see here:
https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.NegativeBinomial.fit.html#statsmodels.discrete.discrete_model.NegativeBinomial.fit
Not only because it is discrete, also because maximum likelihood fit to negative binomial can be quite involving, especially with an additional location parameter. That would be the reason why .fit() method is not provided for it (and other discrete distributions in Scipy), here is an example:
In [163]:
import scipy.stats as ss
import scipy.optimize as so
In [164]:
#define a likelihood function
def likelihood_f(P, x, neg=1):
n=np.round(P[0]) #by definition, it should be an integer
p=P[1]
loc=np.round(P[2])
return neg*(np.log(ss.nbinom.pmf(x, n, p, loc))).sum()
In [165]:
#generate a random variable
X=ss.nbinom.rvs(n=100, p=0.4, loc=0, size=1000)
In [166]:
#The likelihood
likelihood_f([100,0.4,0], X)
Out[166]:
-4400.3696690513316
In [167]:
#A simple fit, the fit is not good and the parameter estimate is way off
result=so.fmin(likelihood_f, [50, 1, 1], args=(X,-1), full_output=True, disp=False)
P1=result[0]
(result[1], result[0])
Out[167]:
(4418.599495886474, array([ 59.61196161, 0.28650831, 1.15141838]))
In [168]:
#Try a different set of start paramters, the fit is still not good and the parameter estimate is still way off
result=so.fmin(likelihood_f, [50, 0.5, 0], args=(X,-1), full_output=True, disp=False)
P1=result[0]
(result[1], result[0])
Out[168]:
(4417.1495981801972,
array([ 6.24809397e+01, 2.91877405e-01, 6.63343536e-04]))
In [169]:
#In this case we need a loop to get it right
result=[]
for i in range(40, 120): #in fact (80, 120) should probably be enough
_=so.fmin(likelihood_f, [i, 0.5, 0], args=(X,-1), full_output=True, disp=False)
result.append((_[1], _[0]))
In [170]:
#get the MLE
P2=sorted(result, key=lambda x: x[0])[0][1]
sorted(result, key=lambda x: x[0])[0]
Out[170]:
(4399.780263084549,
array([ 9.37289361e+01, 3.84587087e-01, 3.36856705e-04]))
In [171]:
#Which one is visually better?
plt.hist(X, bins=20, normed=True)
plt.plot(range(260), ss.nbinom.pmf(range(260), np.round(P1[0]), P1[1], np.round(P1[2])), 'g-')
plt.plot(range(260), ss.nbinom.pmf(range(260), np.round(P2[0]), P2[1], np.round(P2[2])), 'r-')
Out[171]:
[<matplotlib.lines.Line2D at 0x109776c10>]
I know this thread is quite old, but current readers may want to look at this repo which is made for this purpose: https://github.com/gokceneraslan/fit_nbinom
There's also an implementation here, though part of a larger package: https://github.com/ernstlab/ChromTime/blob/master/optimize.py
I stumbled across this thread, and found an answer for anyone else wondering.
If you simply need the n, p parameterisation used by scipy.stats.nbinom you can convert the mean and variance estimates:
mu = np.mean(sample)
sigma_sqr = np.var(sample)
n = mu**2 / (sigma_sqr - mu)
p = mu / sigma_sqr
If you the dispersionparameter you can use a negative binomial regression model from statsmodels with just an interaction term. This will find the dispersionparameter alpha using MLE.
# Data processing
import pandas as pd
import numpy as np
# Analysis models
import statsmodels.formula.api as smf
from scipy.stats import nbinom
def convert_params(mu, alpha):
"""
Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports
Parameters
----------
mu : float
Mean of NB distribution.
alpha : float
Overdispersion parameter used for variance calculation.
See https://en.wikipedia.org/wiki/Negative_binomial_distribution#Alternative_formulations
"""
var = mu + alpha * mu ** 2
p = mu / var
r = mu ** 2 / (var - mu)
return r, p
# Generate sample data
n = 2
p = 0.9
sample = nbinom.rvs(n=n, p=p, size=10000)
# Estimate parameters
## Mean estimates expectation parameter for negative binomial distribution
mu = np.mean(sample)
## Dispersion parameter from nb model with only interaction term
nbfit = smf.negativebinomial("nbdata ~ 1", data=pd.DataFrame({"nbdata": sample})).fit()
alpha = nbfit.params[1] # Dispersion parameter
# Convert parameters to n, p parameterization
n_est, p_est = convert_params(mu, alpha)
# Check that estimates are close to the true values:
print("""
{:<3} {:<3}
True parameters: {:<3} {:<3}
Estimates : {:<3} {:<3}""".format('n', 'p', n, p,
np.round(n_est, 2), np.round(p_est, 2)))

Categories

Resources