Scipy stats.rv_continuous to find MLE from 2-dimensional data - python

Referring to here, I would like to find the MLE of alpha and lam, given the following PDF:
import scipy.stats as st
import numpy as np
class Weib(st.rv_continuous):
    def _pdf(self, data, alpha, lam):
        t = data[0]
        delta = data[1]
        fx = (alpha * lam * (t**(alpha-1)))**(delta) * np.exp(-lam * (t**alpha))
        return fx

    def _argcheck(self, alpha, lam):
        a = alpha > 0
        l = lam > 0
        return (a & l)
And I tried
Weib_inst = Weib(name='Weib')
Samples = Weib_inst.rvs(alpha=1, lam=3, size = 1000)
And it says
'float' object is not subscriptable
Weib_inst._fitstart([[1,2],[2,4]]) also raises the same error.
It seems this occurs because the data is not 1-dimensional, but I cannot find a way around it.
Any help would be appreciated.

You may try to define _fitstart in your subclass. The framework assumes univariate distributions, however.
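For example, here is a minimal sketch that sidesteps rv_continuous entirely and maximizes the (censored) Weibull log-likelihood of the PDF above directly with scipy.optimize.minimize; the data arrays below are hypothetical:
import numpy as np
from scipy.optimize import minimize

def neg_log_lik(params, t, delta):
    alpha, lam = params
    if alpha <= 0 or lam <= 0:
        return np.inf
    # log of the PDF above: delta*log(alpha*lam*t**(alpha-1)) - lam*t**alpha
    return -np.sum(delta * np.log(alpha * lam * t**(alpha - 1)) - lam * t**alpha)

# hypothetical observations: t = times, delta = event indicators
t = np.array([0.5, 1.2, 2.0, 3.1, 0.8])
delta = np.array([1, 1, 0, 1, 0])
res = minimize(neg_log_lik, x0=[1.0, 1.0], args=(t, delta), method='Nelder-Mead')
print(res.x)  # estimated (alpha, lam)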

Related

How to convert symbolic expression so it works with curve_fit?

What I'm trying to do is fit a specific model to a dataset of x-y values and obtain the model's constants. I can get the constant (in this case there is only one) and then the fitted y_opt. Below is a working example of doing so:
import pandas as pd
from scipy.optimize import curve_fit

data = pd.read_csv(r'')
x_measured = data['x[-]'].values
y_measured = data['y[-]'].values

def y_NH(x_eng, D):
    y_comp = D * x_eng*(x_eng**2 + 3 * x_eng + 3) / (1 + x_eng)**2
    return y_comp

D = curve_fit(y_NH, x_measured, y_measured)
y_opt = y_NH(x_measured, D[0])
This works, but it's not exactly what I need.
The formula for y_comp is something I had to derive manually - originally I had another variable, let's say Y_comp, and obtained y_comp by differentiating Y_comp (with respect to x_eng, obviously). What I would like to achieve is to feed my function with Y_comp (because there will be more like Z_comp, F_comp etc.), have it differentiate that to produce y_comp (z_comp, f_comp), and then fit the model to my dataset - the result would then be the constant(s) of the particular model.
I started on this, but I'm still not proficient enough and would appreciate some help on this topic. The buggy code is:
import sympy as sy
from sympy.utilities.lambdify import lambdify

def y_NH2(x_eng, D):
    lambda1 = sy.Symbol('lambda1')
    x_eng = sy.Symbol('x_eng')
    #Gi = sy.Symbol('Gi')
    lambda1 = x_eng + 1
    W = lambda1**2 + 2 / lambda1
    y_comp_symb = sy.diff(W, x_eng)
    y_comp = lambdify(x_eng, y_comp_symb, 'numpy')
    y_return = D / 2 * y_comp(x_eng)
    return y_return

y_p = y_NH2(x_measured, 12)
print(y_p)
D = curve_fit(y_NH2, x_measured, y_measured)
y_opt = y_NH2(x_measured, D[0])
This raises an error in curve_fit that is: "error: Result from function call is not a proper array of floats."
Could you please give me a hint?
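A minimal sketch of how this workflow could look (the names and data here are illustrative, not from the original post): differentiate the symbolic expression once, lambdify it outside the fitting function, and hand curve_fit a purely numeric model:
import numpy as np
import sympy as sy
from scipy.optimize import curve_fit

# build the derivative symbolically, once
x_sym = sy.Symbol('x_eng')
lambda1 = x_sym + 1
W = lambda1**2 + 2 / lambda1                           # symbolic Y_comp analogue (assumed form)
y_comp_symb = sy.diff(W, x_sym)                        # differentiate with respect to x_eng
y_comp_num = sy.lambdify(x_sym, y_comp_symb, 'numpy')  # plain numeric function of x

def y_model(x_eng, D):
    # purely numeric, so curve_fit can evaluate it on float arrays
    return D / 2 * y_comp_num(x_eng)

# hypothetical data standing in for x_measured / y_measured
x_measured = np.linspace(0.1, 2.0, 50)
y_measured = y_model(x_measured, 12.0) + np.random.normal(0, 0.01, x_measured.size)

popt, pcov = curve_fit(y_model, x_measured, y_measured)
y_opt = y_model(x_measured, popt[0])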

Simultaneous numerical fit of two equations using Numpy least square method

I am trying to fit the two equations below using Python's leastsq method, but I am not sure whether this is the right approach. The first equation contains an incomplete gamma function, while the second one is slightly more complex and, along with an exponential, contains a term obtained from a separate fitting formula.
J_mg = T_incomplete(hw/T_mag)
J_nmg = e^(-hw/T)*g(w,T)
Here g is a function of w and T and is calculated using a given fitting formula.
I am following the steps outlined in this question.
Here is what I have done
import numpy as np
from scipy.optimize import leastsq
from scipy.special import gammaincc
from scipy.special import gamma
from matplotlib.pyplot import plot

# generating data
NPTS = 10
hw = np.linspace(0.5, 10, NPTS)
j1 = np.linspace(0.001, 10, NPTS)
j2 = np.linspace(0.003, 10, NPTS)
T_mag = np.linspace(0.3, 0.5, NPTS)

# defining functions
def calc_gaunt_factor(hw, T):
    fitting_coeff = np.loadtxt('fitting_coeff.txt', skiprows=1)
    # T is in keV
    # K_b = 8.6173303(50)e-5 eV/K
    g = 0
    gamma = 0.0136/T
    theta = hw/T
    A = (np.log10(gamma**2) + 0.5)*0.4
    B = (np.log10(theta) + 1.5)*0.4
    for i in range(11):
        for j in range(11):
            g_ij = fitting_coeff[i][j]*(A**i)*(B**j)
            g = g_ij + g
    return g

def j_w_mag(hw, T_mag):
    order = 0.001
    return np.sqrt(1/T_mag)*gamma(order)*gammaincc(order, hw/T_mag)

def j_w_nonmag(hw, T):
    gamma = 0.0136/T
    theta = hw/T
    return np.sqrt(1/T)*np.exp((-hw)/T)*calc_gaunt_factor(hw, T)

def residual_func(T, T_mag, hw, j1, j2):
    err_unmag = np.nan_to_num(j1 - j_w_nonmag(hw, T))
    err_mag = np.nan_to_num(j2 - j_w_mag(hw, T_mag))
    err = np.concatenate((err_unmag, err_mag))
    return err

par_init = np.array([.35])
best, cov, info, message, ler = leastsq(residual_func, par_init, args=(T_mag, hw, j1, j2), full_output=True)
print("Best-Fit Parameters:")
print("T=%s" % (best[0]))
I am getting a weird value for my fitting parameter T. Is this the right approach? Thanks.
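One detail that may be worth a closer look (just a sketch, under the assumption that both T and T_mag are meant to be fitted as scalars): leastsq only optimizes what is inside the parameter vector, so to fit the two equations simultaneously both temperatures would need to be packed into par_init rather than passed through args. Reusing j_w_nonmag and j_w_mag from above:
def residual_both(params, hw, j1, j2):
    T, T_mag = params                      # both unknowns come from the parameter vector
    err_unmag = np.nan_to_num(j1 - j_w_nonmag(hw, T))
    err_mag = np.nan_to_num(j2 - j_w_mag(hw, T_mag))
    return np.concatenate((err_unmag, err_mag))

par_init = np.array([0.35, 0.4])           # initial guesses for T and T_mag (assumed values)
best, cov, info, message, ier = leastsq(residual_both, par_init, args=(hw, j1, j2), full_output=True)
print("T=%s, T_mag=%s" % (best[0], best[1]))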

more accuracy with scipy interp1d

I am trying to implement a non-parametric estimation of the KL divergence shown in this paper.
Here is my code:
import numpy as np
import math
import itertools
import random
from scipy.interpolate import interp1d
def log(x):
    if x > 0:
        return math.log(x)
    else:
        return 0

g = lambda x, inp, N: sum(0.5 + 0.5 * np.sign(x - inp)) / N

def ecdf(x, N):
    out = [g(i, x, N) for i in x]
    fun = interp1d(x, out, kind='linear', bounds_error=False, fill_value=(0, 1))
    return fun

def KL_est(x, y):
    ex = min(np.diff(sorted(np.unique(x))))
    ey = min(np.diff(sorted(np.unique(y))))
    e = min(ex, ey) * 0.9
    N = len(x)
    x.sort()
    y.sort()
    P = ecdf(x, N)
    Q = ecdf(y, N)
    KL = sum(log(v) for v in ((P(x) - P(x - e)) / (Q(x) - Q(x - e)))) / N
    return KL
My trouble is with scipy interp1d. I am using the function returned from interp1d to find the value of new inputs. The problem is, some of the input values are very close (10^-5 apart) and the function returns the same value for both. In my code above, Q(x) - Q(x-e) leads to a divide by zero error.
Here is some test code that reproduces the problem:
x = np.random.normal(0, 1, 10)
y = np.random.normal(0, 1, 10)
ex = min(np.diff(sorted(np.unique(x))))
ey = min(np.diff(sorted(np.unique(y))))
e = min(ex,ey) * 0.9
N = len(x)
x.sort()
y.sort()
P = ecdf(x,N)
Q = ecdf(y,N)
KL = sum(log(v) for v in ((P(x)-P(x-e))/(Q(x)-Q(x-e))) ) / N
How would I go about getting a more accurate interpolation?
As e gets small you are effectively trying to compute the ratio of derivatives of P and Q numerically. As you are finding, you run out of precision really quickly in floating point doing it this way.
An alternate approach would be to use an interpolation function that can return derivatives directly. For example, you could try scipy.interpolate.InterpolatedUnivariateSpline. You were saying kind='linear' to interp1d, so the equivalent is k=1. Once you construct it, the spline has method derivatives() that gives you all the derivatives at different points. For small values of e you could switch to using the derivative.
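A rough sketch of that idea (the sample data here is made up, and the ECDF values are just the usual i/N step heights at the sorted sample points):
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline

x = np.sort(np.random.normal(0, 1, 10))
N = len(x)
ecdf_vals = np.arange(1, N + 1) / N                          # ECDF heights at the sorted points
P_spline = InterpolatedUnivariateSpline(x, ecdf_vals, k=1)   # linear, like kind='linear'
dP = P_spline.derivative()                                   # spline representing the first derivative
print(dP(x))                                                 # density estimates at the sample points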

How do I calculate PDF (probability density function) in Python?

I have the following code below that prints the PDF graph for a particular mean and standard deviation.
http://imgur.com/a/oVgML
Now I need to find the actual probability of a particular value. So for example, if my mean is 0 and my value is 0, my probability is 1. This is usually done by calculating the area under the curve. Similar to this:
http://homepage.divms.uiowa.edu/~mbognar/applets/normal.html
I am not sure how to approach this problem.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

def normal(power, mean, std, val):
    a = 1/(np.sqrt(2*np.pi)*std)
    diff = np.abs(np.power(val-mean, power))
    b = np.exp(-(diff)/(2*std*std))
    return a*b

pdf_array = []
array = np.arange(-2, 2, 0.1)
print(array)
for i in array:
    print(i)
    pdf = normal(2, 0, 0.1, i)
    print(pdf)
    pdf_array.append(pdf)

plt.plot(array, pdf_array)
plt.ylabel('some numbers')
plt.axis([-2, 2, 0, 5])
plt.show()
Unless you have a reason to implement this yourself, all these functions are available in scipy.stats.norm.
I think you are asking for the CDF; in that case use this code:
from scipy.stats import norm
print(norm.cdf(x, mean, std))
If you want to write it from scratch:
import math

class PDF():
    def __init__(self, mu=0, sigma=1):
        self.mean = mu
        self.stdev = sigma
        self.data = []

    def calculate_mean(self):
        self.mean = sum(self.data) / len(self.data)
        return self.mean

    def calculate_stdev(self, sample=True):
        if sample:
            n = len(self.data) - 1
        else:
            n = len(self.data)
        mean = self.mean
        sigma = 0
        for el in self.data:
            sigma += (el - mean)**2
        sigma = math.sqrt(sigma / n)
        self.stdev = sigma
        return self.stdev

    def pdf(self, x):
        return (1.0 / (self.stdev * math.sqrt(2*math.pi))) * math.exp(-0.5*((x - self.mean) / self.stdev) ** 2)
The area under a curve y = f(x) from x = a to x = b is the same as the integral of f(x)dx from x = a to x = b, and scipy has a quick, easy way to do such integrals (see the sketch below). And just so you understand, the probability of finding a single point in that area cannot be one, because the idea is that the total area under the curve is one (unless MAYBE it's a delta function). So you should get 0 ≤ probability of value < 1 for any particular value of interest. There may be different ways of doing it, but a conventional way is to assign confidence intervals along the x-axis like this. I would read up on Gaussian curves and normalization before continuing to code it.
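For instance, a minimal sketch of integrating the normal PDF over an interval with scipy (the mean, standard deviation and interval here are only illustrative):
from scipy.stats import norm
from scipy.integrate import quad

mean, std = 0.0, 0.1
prob, err = quad(lambda x: norm.pdf(x, mean, std), -0.1, 0.1)   # area under the curve on [-0.1, 0.1]
print(prob)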

Density of multivariate t distribution in Python for large number of observations

I am trying to evaluate the density of multivariate t distribution of a 13-d vector. Using the dmvt function from the mvtnorm package in R, the result I get is
[1] 1.009831e-13
When I tried to write the function myself in Python (thanks to the suggestions in this post:
multivariate student t-distribution with python), I realized that the gamma function was taking very high values (given that I have n=7512 observations), making my function go out of range.
I tried to modify the algorithm, using the math.lgamma() and np.linalg.slogdet() functions to transform it to the log scale, but the result I got was
8.97669876e-15
The function that I used in Python is the following:
import math
from math import pi
import numpy as np

def dmvt(x, mu, Sigma, df, d):
    '''
    Multivariate t-student density:
    output:
        the density of the given element
    input:
        x = parameter (d dimensional numpy array or scalar)
        mu = mean (d dimensional numpy array or scalar)
        Sigma = scale matrix (dxd numpy array)
        df = degrees of freedom
        d: dimension
    '''
    Num = math.lgamma(1.*(d+df)/2) - math.lgamma(1.*df/2)
    (sign, logdet) = np.linalg.slogdet(Sigma)
    Denom = 1/2*logdet + d/2*(np.log(pi) + np.log(df)) \
        + 1.*((d+df)/2)*np.log(1 + (1./df)*np.dot(np.dot((x - mu), np.linalg.inv(Sigma)), (x - mu)))
    d = 1.*(Num - Denom)
    return np.exp(d)
Any ideas why this function does not produce the same results as the R equivalent?
Using x = (0,0) produces similar results (up to a point, due to rounding), but with x = (1,1) I get a significant difference!
I finally managed to 'translate' the code from the mvtnorm package in R and the following script works without numerical underflows.
import numpy as np
import scipy.stats
import math
from math import lgamma
from numpy import matrix
from numpy import linalg
from numpy.linalg import slogdet
import scipy.special
from scipy.special import gammaln

mu = np.array([3, 3])
x = np.array([1, 1])
Sigma = np.array([[1, 0], [0, 1]])
p = 2
df = 1

def dmvt(x, mu, Sigma, df, log):
    '''
    Multivariate t-student density. Returns the density
    of the function at points specified by x.
    input:
        x = parameter (n x d numpy array)
        mu = mean (d dimensional numpy array)
        Sigma = scale matrix (d x d numpy array)
        df = degrees of freedom
        log = log scale or not
    '''
    p = Sigma.shape[0]  # Dimensionality
    dec = np.linalg.cholesky(Sigma)
    R_x_m = np.linalg.solve(dec, np.matrix.transpose(x) - mu)
    rss = np.power(R_x_m, 2).sum(axis=0)
    logretval = lgamma(1.0*(p + df)/2) - (lgamma(1.0*df/2) + np.sum(np.log(dec.diagonal())) \
        + p/2 * np.log(math.pi * df)) - 0.5 * (df + p) * math.log1p((rss/df))
    if log == False:
        return(np.exp(logretval))
    else:
        return(logretval)

print(dmvt(x, mu, Sigma, df, True))
print(dmvt(x, mu, Sigma, df, False))
