How to get a p-value from numpy.linalg.lstsq? (Python)

I would like to calculate the p-value of the fit I got from numpy.linalg.lstsq. Here is a toy example:
import numpy as np
x = np.array([[ 58295.62187335],[ 45420.95483714],[ 3398.64920064],[ 977.22166306],[ 5515.32801851],[ 14184.57621022],[ 16027.2803392 ],[ 15313.01865824],[ 6443.2448182 ]])
y = np.array([ 143547.79123381, 22996.69597427, 2591.56411049, 661.93115277, 8826.96549102, 17735.13549851, 11629.13003263, 14438.33177173, 6997.89334741])
a, res, rank, s = np.linalg.lstsq(x, y, rcond=None)
From a previous question (get the R² value from scipy.linalg.lstsq) I know how to get R²; however, I would also like to compute the p-value.
Thanks in advance.

You could use scipy.stats.pearsonr, which returns both the correlation coefficient and the two-sided p-value:
import numpy as np
from scipy.stats import pearsonr
x = np.array([ 58295.62187335, 45420.95483714, 3398.64920064, 977.22166306, 5515.32801851, 14184.57621022, 16027.2803392 , 15313.01865824, 6443.2448182 ])
y = np.array([ 143547.79123381, 22996.69597427, 2591.56411049, 661.93115277, 8826.96549102, 17735.13549851, 11629.13003263, 14438.33177173, 6997.89334741])
r, p = pearsonr(x,y)
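If you want the p-value of the lstsq coefficient itself rather than of the correlation, you can also compute a t-statistic by hand from the residual sum of squares that lstsq returns. A minimal sketch, assuming the same no-intercept model as in the question:
import numpy as np
from scipy import stats
x = np.array([[58295.62187335], [45420.95483714], [3398.64920064], [977.22166306], [5515.32801851], [14184.57621022], [16027.2803392], [15313.01865824], [6443.2448182]])
y = np.array([143547.79123381, 22996.69597427, 2591.56411049, 661.93115277, 8826.96549102, 17735.13549851, 11629.13003263, 14438.33177173, 6997.89334741])
a, res, rank, s = np.linalg.lstsq(x, y, rcond=None)
n, k = x.shape
sigma2 = res[0] / (n - k)  # residual variance
se = np.sqrt(sigma2 * np.linalg.inv(x.T @ x).diagonal())  # standard error of each coefficient
t = a / se  # t-statistic
p = 2 * stats.t.sf(np.abs(t), df=n - k)  # two-sided p-value
print(p)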

Related

Inverse problem with scipy.optimize.differential_evolution

I've been writing code to fit y = ax + b given values for x and y, so I want to determine a and b.
For example, I have y = 2x + 1, so
xexp =[ 0., 0.1111111, 0.2222222, 0.3333333, 0.4444444, 0.5555556, 0.6666667, 0.7777778, 0.8888889, 1.] and
yexp=[1., 1.2222222, 1.4444444, 1.6666667, 1.8888889, 2.1111111, 2.3333333, 2.5555556, 2.7777778, 3.]
I need a loop in my code that calculates every ycalc[i] and returns, as the objective function, the mean error over all the given data.
from scipy.optimize import differential_evolution
import numpy as np
from matplotlib import pyplot
from scipy.optimize import LinearConstraint
from scipy.optimize import NonlinearConstraint
# y = 2*x + 1; fitting y = a*x + b --> have xexp and yexp, need to calculate a and b with xcalc = xexp
errototal=[]
ycalc=[]
erro=[]
#experimental values
xexp =[ 0., 0.1111111, 0.2222222, 0.3333333, 0.4444444, 0.5555556, 0.6666667, 0.7777778, 0.8888889, 1.]
yexp=[1., 1.2222222, 1.4444444, 1.6666667, 1.8888889, 2.1111111, 2.3333333, 2.5555556, 2.7777778, 3.]
#obj function
def func(xcalc):
    return np.mean(sum(errototal))
bounds = [(-5, 5), (-5, 5)]
plot1=[]
def condition(xk, convergence):
    for i in range(0, 9):
        ycalc.append(xk[0]*xexp[i] + xk[1])
        # ycalc.append(xk[0]*xk[2]+xk[1])
        erro.append(abs(yexp[i] - ycalc[i]))
        errototal.append(erro[i])
    plot1.append(func(sum(errototal)))
    print(xk, errototal, ycalc, yexp)
    print(sum(errototal))
result = differential_evolution(func, bounds, disp=True, callback=condition)
# line plot of best objective function values
pyplot.plot(plot1, '.-')
pyplot.xlabel('Improvement Number')
pyplot.ylabel('Evaluation f(x)')
pyplot.show()
My code stops in the first step. Any tips?
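For what it's worth, differential_evolution expects the objective function itself to compute the error from the candidate parameters; accumulating errors in global lists inside the callback never feeds back into the optimisation, which is why the search makes no progress. A minimal sketch of one possible restructuring (not the original code):
import numpy as np
from scipy.optimize import differential_evolution
xexp = np.array([0., 0.1111111, 0.2222222, 0.3333333, 0.4444444, 0.5555556, 0.6666667, 0.7777778, 0.8888889, 1.])
yexp = np.array([1., 1.2222222, 1.4444444, 1.6666667, 1.8888889, 2.1111111, 2.3333333, 2.5555556, 2.7777778, 3.])
def func(params):
    a, b = params
    ycalc = a * xexp + b
    return np.mean(np.abs(yexp - ycalc))  # mean absolute error over all data
bounds = [(-5, 5), (-5, 5)]
result = differential_evolution(func, bounds, disp=True)
print(result.x)  # should converge near [2, 1] for y = 2x + 1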

SKLearn ElasticNetCV: Looking for a similar cross-validation-error-plot to Matlab's lassoPlot or R's plot(cv.glmnet(x,y))

I use sklearn.linear_model.ElasticNetCV and I would like to get a figure similar to what Matlab provides with lassoPlot with plottype=CV, or R's plot(cv.glmnet(x,y)): a plot of the cross-validation errors over the various alphas (note: in Matlab and R this parameter is called lambda). Here is an example:
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.stats as stats
# toy example
# generate 200 samples of five-dimensional artificial data X from
# exponential distributions with various means:
X = np.zeros( (200,5 ) )
for col in range(5):
    X[:, col] = stats.expon.rvs(scale=1.0/(col+1), size=200)
# generate response data Y = X*r + eps where r has just two nonzero
# components, and the noise eps is normal with standard deviation 0.1:
r = np.array( [ 0, 2, 0, -3, 0 ] )
Y = np.dot(X, r) + np.random.randn(200)*0.1
enet = lm.ElasticNetCV()
alphas,coefs, _ = enet.path( X, Y )
# plot regularization paths
plt.plot( -np.log10(alphas), coefs.T, linestyle='-' )
plt.show()
I would also like to plot, in a separate figure, the cross-validation error for each alpha. But it seems that ElasticNetCV.path() does not return an MSE vector. Is there functionality in sklearn similar to Matlab's lassoPlot with plottype='CV' (see http://de.mathworks.com/help/stats/lasso-and-elastic-net.html) or R's cv.glmnet(x,y) (https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html)? Alternatively, I would implement it using sklearn.cross_validation. Do you have any suggestions?
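One option, assuming the fitted estimator is acceptable instead of the static path() method: after fit(), ElasticNetCV stores the per-fold cross-validation errors in its mse_path_ attribute alongside alphas_, which is enough to reproduce the Matlab/R style CV plot. A sketch, continuing from the example above:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
# X, Y as generated in the question above
enet = lm.ElasticNetCV()
enet.fit(X, Y)
# mse_path_ holds one MSE per (alpha, CV fold); average over the folds
mean_mse = np.squeeze(enet.mse_path_).mean(axis=-1)
plt.plot(-np.log10(enet.alphas_), np.atleast_2d(mean_mse).T, '.-')
plt.xlabel('-log10(alpha)')
plt.ylabel('mean cross-validation MSE')
plt.show()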

Flatten a numpy array

I am solving an ODE as follows:
import numpy as np
import scipy as sp
import math
from math import *
from scipy.integrate import odeint
import matplotlib.pyplot as plt
def g(y, x):
    y0 = y[0]
    return x  #formula##
# Initial conditions on y, y' at x=0
init = 0 #value##
# First integrate from 0 to 100
xplotval=np.linspace(4,8,4) #linspacefunction
print(xplotval)
I am getting output as:
[[ 7. ]
[ 5.76455273 ]
[ 5.41898906 ]
[ 6.49185668 ]]
I'd like to output a single dimensional array as follows:
[7., 5.76455273, 5.41898906, 6.49185668]
How can I do that?
Maybe you want flatten:
print(xplotval.flatten())
Unless you actually want the transposed vector (note it stays two-dimensional, with shape (1, N)), which you would get with numpy.transpose:
print(np.transpose(xplotval))
You can also simply use a list comprehension, something like:
oneD = [l[0] for l in xplotval]
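Note that numpy.ravel does the same job as flatten but returns a view instead of a copy whenever it can, which avoids allocating new memory:
print(xplotval.ravel())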

How does "statsmodels.regression.linear_model.WLS" work?

I have used statsmodels.regression.linear_model to do WLS, but I have no idea how to give weights to my regression.
Does anyone know how the weights are given and how they work?
import numpy as np
import statsmodels.api as sm
Y = [1,2,3,4,5,6,7]
X = range(1,8)
W= [1,1,1,1,1,1,1]
X = sm.add_constant(X)
wls_model = sm.WLS(Y,X, weights=W)
results = wls_model.fit()
print(results.params)
#[ -1.55431223e-15 1.00000000e+00]
import numpy as np
import statsmodels.api as sm
Y = [1,2,3,4,5,6,7]
X = range(1,8)
W= range(1,8)
X = sm.add_constant(X)
wls_model = sm.WLS(Y,X, weights=W)
results = wls_model.fit()
print(results.params)
#[0 1]
Why, when the weights are range(1,8), are the slope and intercept exactly 1 and 0, but when every weight is 1 the intercept is not 0?
In your example the data is exactly linear, so the regression will be a perfect fit no matter what your weights are. But if you change your data to have an outlier in the first position, like this
Y = [-5,2,3,4,5,6,7]
then with constant weights you get
[-3.42857143 1.64285714]
but with W = range(1,8) you get
[-1.64285714 1.28571429]
which is closer to what you want without the outlier.
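Under the hood, WLS minimizes the weighted sum of squared residuals sum(w_i * (y_i - x_i.b)**2), which leads to the weighted normal equations beta = (X'WX)^-1 X'Wy; larger weights pull the fit toward those observations. A minimal numpy sketch of that formula (illustrative only, not statsmodels internals) that reproduces the weighted result above:
import numpy as np
X = np.column_stack([np.ones(7), np.arange(1, 8)])  # constant column + x, as sm.add_constant builds
Y = np.array([-5, 2, 3, 4, 5, 6, 7])
W = np.diag(np.arange(1, 8))  # weights on the diagonal
beta = np.linalg.solve(X.T @ W @ X, X.T @ W @ Y)
print(beta)  # [-1.64285714  1.28571429], matching WLS with weights range(1,8)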

Vectorised code for sampling from truncated normal distributions with different intervals

The following code generates a sample of size 100 from each of 100 truncated normal distributions with different intervals. Is there an efficient (vectorised) way of doing this?
from scipy.stats import truncnorm
import numpy as np
sample=[]
a_s=np.random.uniform(0,1,size=100)
b_s=a_s+0.2
for i in range(100):
    sample.append(truncnorm.rvs(a_s[i], b_s[i], size=100))
print(sample)
One day in the not so distant future, all NumPy/SciPy functions will broadcast all their arguments, and you will be able to do truncnorm.rvs(a_s, b_s, size=100), but since we are not there yet, you could manually generate your random samples from a uniform distribution and the CDF and PPF of a normal distribution:
import numpy as np
from scipy.stats import truncnorm, norm
a_s = np.random.uniform(0, 1, size=100)
b_s = a_s + 0.2
cdf_start = norm.cdf(a_s)
cdf_stop = norm.cdf(b_s)
cdf_samples = np.random.uniform(0, 1, size=(100, 100))
cdf_samples *= (cdf_stop - cdf_start)[:, None]
cdf_samples += cdf_start[:, None]
truncnorm_samples = norm.ppf(cdf_samples)
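As a quick sanity check on the vectorised version, every sample in row i should fall inside its own interval [a_s[i], b_s[i]]:
assert np.all((truncnorm_samples >= a_s[:, None]) & (truncnorm_samples <= b_s[:, None]))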
