How does "statsmodels.regression.linear_model. WLS" work? - python

I have used 'statsmodels.regression.linear_model' to do WLS.
But I have no idea about how to give weight my regression.
Does anyone know how the weight be given and how it work?
import numpy as np
import statsmodels.api as sm
Y = [1,2,3,4,5,6,7]
X = range(1,8)
W= [1,1,1,1,1,1,1]
X = sm.add_constant(X)
wls_model = sm.WLS(Y,X, weights=W)
results = wls_model.fit()
results.params
print results.params
#[ -1.55431223e-15 1.00000000e+00]
import numpy as np
import statsmodels.api as sm
Y = [1,2,3,4,5,6,7]
X = range(1,8)
W= range(1,8)
X = sm.add_constant(X)
wls_model = sm.WLS(Y,X, weights=W)
results = wls_model.fit()
results.params
print results.params
#[0 1]
why when weight is range(1,8) the slope and intercept is 1 and 0.
but when weight is "1" the intercept is not 0.

In your example, the data is linear anyway, so the the regression will be a perfect fit, no matter what your weights. But if you change your data to have an outlier in the first position like this
Y = [-5,2,3,4,5,6,7]
then with constant weights you get
[-3.42857143 1.64285714]
but with W = range(1,8) you get
[-1.64285714 1.28571429]
which is closer to what you want without the outlier.

Related

PCA algorithm makes iris dataset to be reversed on y axis

I am making PCA in python with this code:
def OWN_PCA(X,num_components):
#Step-1
X_meaned = X - np.mean(X , axis = 0)
#creating covariance matrix
cov_mat = np.cov(X_meaned,rowvar=False)
#calculating eigenvector and eigen value
eigen_values , eigen_vectors = np.linalg.eigh(cov_mat)
#sorting the vectors based on eigen values
sorted_index = np.argsort(eigen_values)[::-1]
sorted_eigenvalue = eigen_values[sorted_index]
sorted_eigenvectors = eigen_vectors[:,sorted_index]
#choosing number of components
eigenvector_subset = sorted_eigenvectors[:,0:num_components]
X_reduced = np.dot( eigenvector_subset.transpose(), X.transpose() ).transpose()
return X_reduced
The problem is when I apply it to the iris dataset and plot it, I will get this:
and when I use PCA in sklearn the image is reversed:
What is wrong with my code?

How can I get a value from a polynomial defined with np.poly1d?

I made a model from a series of data. My model is represented by the red line which has the following formula:
p4=np.poly1d(np.polyfit(x,y,4)) #0.04253 x - 3.593 x + 89.6 x - 470.3 x + 666.4
How can I retrieve a value from my model (from the red polynomial line)?
I tried with this code but results are not coherent:
y=np.arange(len(x))
X=scale.fit_transform(y.values)
X=np.array(X)
X.reshape(-1,1)
est = sm.OLS(y, X).fit()
scaled = scale.transform(50)
predicted = est.predict(scaled[0])
With x=50 I retrieve 1 as prediction that's obviously not coherent with the model.
Could you help me?
You can get the value by using the polynomial function returned by np.poly1d.
See the example shown in the documentation:
import numpy as np
p = np.poly1d([1, 2, 3])
print(np.poly1d(p))
# Evaluate the polynomial at x = 0.5:
y = p(0.5)
print(y)

Python Polynomial Regression with Gradient Descent

I try to implement Polynomial Regression with Gradient Descent. I want to fit the following function:
The code I use is:
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
from sklearn.preprocessing import PolynomialFeatures
np.random.seed(seed=42)
def create_data():
x = PolynomialFeatures(degree=5).fit_transform(np.linspace(-10,10,100).reshape(100,-1))
l = lambda x_i: (1/3)*x_i**3-2*x_i**2+2*x_i+2
data = l(x[:,1])
noise = np.random.normal(0,0.1,size=np.shape(data))
y = data+noise
y= y.reshape(100,1)
return {'x':x,'y':y}
def plot_function(x,y):
fig = plt.figure(figsize=(10,10))
plt.plot(x[:,1],[(1/3)*x_i**3-2*x_i**2+2*x_i+2 for x_i in x[:,1]],c='lightgreen',linewidth=3,zorder=0)
plt.scatter(x[:,1],y)
plt.show()
def w_update(y,x,batch,w_old,eta):
derivative = np.sum([(y[i]-np.dot(w_old.T,x[i,:]))*x[i,:] for i in range(np.shape(x)[0])])
print(derivative)
return w_old+eta*(1/batch)*derivative
# initialize variables
w = np.random.normal(size=(6,1))
data = create_data()
x = data['x']
y = data['y']
plot_function(x,y)
# Update w
w_s = []
Error = []
for i in range(500):
error = (1/2)*np.sum([(y[i]-np.dot(w.T,x[i,:]))**2 for i in range(len(x))])
Error.append(error)
w_prime = w_update(y,x,np.shape(x)[0],w,0.001)
w = w_prime
w_s.append(w)
# Plot the predicted function
plt.plot(x[:,1],np.dot(x,w))
plt.show()
# Plot the error
fig3 = plt.figure()
plt.scatter(range(len(Error[10:])),Error[10:])
plt.show()
But as result I receive smth. strange which is completely out of bounds...I have also tried to alter the number of iterations as well as the parameter theta but it did not help. I assume I have made an mistake in the update of w.
I have found the solution. The Problem is indeed in the part where I calculate the weights. Specifically in:
np.sum([(y[d]-np.dot(w_old.T,x[d,:]))*x[d,:] for d in range(np.shape(x)[0])])
which should be like:
np.sum([-(y[d]-np.dot(w.T.copy(),x[d,:]))*x[d,:].reshape(np.shape(w)) for d in range(len(x))],axis=0)
We have to add np.sum(axis=0) to get the dimensionality we want --> Dimensionality must be equal to w. The numpy sum documentation sais
The default, axis=None, will sum all of the elements of the input
array.
This is not what we want to achieve. Adding axis = 0 sums over the first axis of our array which is of dimensionality (100,7,1) hence the 100 elements of dimensionality (7,1) are summed up and the resulting array is of dimensionality (7,1) which is exactly what we want. Implementing this and cleaning up the code yields:
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
np.random.seed(seed=42)
def create_data():
x = PolynomialFeatures(degree=6).fit_transform(np.linspace(-2,2,100).reshape(100,-1))
x[:,1:] = MinMaxScaler(feature_range=(-2,2),copy=False).fit_transform(x[:,1:])
l = lambda x_i: np.cos(0.8*np.pi*x_i)
data = l(x[:,1])
noise = np.random.normal(0,0.1,size=np.shape(data))
y = data+noise
y= y.reshape(100,1)
# Normalize Data
return {'x':x,'y':y}
def plot_function(x,y,w,Error,w_s):
fig,ax = plt.subplots(nrows=1,ncols=2,figsize=(40,10))
ax[0].plot(x[:,1],[np.cos(0.8*np.pi*x_i) for x_i in x[:,1]],c='lightgreen',linewidth=3,zorder=0)
ax[0].scatter(x[:,1],y)
ax[0].plot(x[:,1],np.dot(x,w))
ax[0].set_title('Function')
ax[1].scatter(range(iterations),Error)
ax[1].set_title('Error')
plt.show()
# initialize variables
data = create_data()
x = data['x']
y = data['y']
w = np.random.normal(size=(np.shape(x)[1],1))
eta = 0.1
iterations = 10000
batch = 10
def stochastic_gradient_descent(x,y,w,eta):
derivative = -(y-np.dot(w.T,x))*x.reshape(np.shape(w))
return eta*derivative
def batch_gradient_descent(x,y,w,eta):
derivative = np.sum([-(y[d]-np.dot(w.T.copy(),x[d,:]))*x[d,:].reshape(np.shape(w)) for d in range(len(x))],axis=0)
return eta*(1/len(x))*derivative
def mini_batch_gradient_descent(x,y,w,eta,batch):
gradient_sum = np.zeros(shape=np.shape(w))
for b in range(batch):
choice = np.random.choice(list(range(len(x))))
gradient_sum += -(y[choice]-np.dot(w.T,x[choice,:]))*x[choice,:].reshape(np.shape(w))
return eta*(1/batch)*gradient_sum
# Update w
w_s = []
Error = []
for i in range(iterations):
# Calculate error
error = (1/2)*np.sum([(y[i]-np.dot(w.T,x[i,:]))**2 for i in range(len(x))])
Error.append(error)
# Stochastic Gradient Descent
"""
for d in range(len(x)):
w-= stochastic_gradient_descent(x[d,:],y[d],w,eta)
w_s.append(w.copy())
"""
# Minibatch Gradient Descent
"""
w-= mini_batch_gradient_descent(x,y,w,eta,batch)
"""
# Batch Gradient Descent
w -= batch_gradient_descent(x,y,w,eta)
# Show predicted weights
print(w_s)
# Plot the predicted function and the Error
plot_function(x,y,w,Error,w_s)
As result we receive:
Which surely can be improved by altering eta and the number of iterations as well as switching to Stochastic or Mini Batch Gradient Descent or more sophisticated optimization algorithms.

getting the standard error of linear regression coefficient using bootstrap

I would like to calculate the standard error of linear regression coefficient using bootstrap technique (100 resamples) but the result I got is zero, which is not normal. I think something is wrong with the bootstrap part of the code. Do you know how to fix my code?
x, y = np.genfromtxt("input.txt", unpack=True)
#regression part
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
print std_err
#bootstrap part of the code
A = np.random.choice(x, size=100, replace=True)
B = np.random.choice(y, size=100, replace=True)
slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(A,B)
print std_err2
input.txt:
-1.08 -1.07
-2.62 -2.56
-2.84 -2.79
-2.22 -2.16
-3.47 -3.55
-2.81 -2.79
-2.86 -2.71
-3.41 -3.42
-4.18 -4.21
-3.50 -3.48
-5.67 -5.55
-6.83 -6.95
-6.13 -6.13
-8.34 -8.19
-7.82 -7.83
-9.86 -9.58
-8.67 -8.62
-9.81 -9.81
-8.39 -8.30
I had no issues with your above code running in Python 3.6.1. Maybe check that your scipy version is current?
from scipy import stats
import numpy as np
x, y = np.genfromtxt("./input.txt", unpack=True)
slope_1, intercept_1, r_val_1, p_val_1, stderr_1 = stats.linregress(x, y)
print(slope_1) # 0.9913080927081567
print(stderr_1) # 0.007414734102169809
A = np.random.choice(x, size=100, replace=True)
B = np.random.choice(y, size=100, replace=True)
slope_2, incercept_2, r_val_2, p_val_2, stderr_2 = stats.linregress(A, B)
print(slope_2) # 0.11429903085322253
print(stderr_2) # 0.10158283281966374
Correctly Bootstrapping the Data
The correct way to do this would be to use the resample method from sklearn.utils. This method handles the data in a consistent array format. Since your data is an x, y pair, the y value is dependent on your x value. If you randomly sample x and y independently you lose that dependency and your resampled data will not accurately represent your population.
from scipy import stats
from sklearn.utils import resample
import numpy as np
x, y = np.genfromtxt("./input.txt", unpack=True)
slope_1, intercept_1, r_val_1, p_val_1, stderr_1 = stats.linregress(x, y)
print(slope_1) # 0.9913080927081567
print(stderr_1) # 0.007414734102169809
A, B = resample(x, y, n_samples=100) # defaults to w/ replacement
slope_2, incercept_2, r_val_2, p_val_2, stderr_2 = stats.linregress(A, B)
print(slope_2) # 0.9864339054638176
print(stderr_2) # 0.002669659193615103

Linear Regression Returns Different Results Than Synthetic Parameters

trying this code:
from sklearn import linear_model
import numpy as np
x1 = np.arange(0,10,0.1)
x2 = x1*10
y = 2*x1 + 3*x2
X = np.vstack((x1, x2)).transpose()
reg_model = linear_model.LinearRegression()
reg_model.fit(X,y)
print reg_model.coef_
# should be [2,3]
print reg_model.predict([5,6])
# should be 2*5 + 3*6 = 28
print reg_model.intercept_
# perfectly at the expected value of 0
print reg_model.score(X,y)
# seems to be rather confident to be right
The results are
[ 0.31683168 3.16831683]
20.5940594059
0.0
1.0
and therefore not what I expected - they are not the same as the parameters used to synthesize the data. Why is this so?
Your problem is with the uniqueness of solutions, as both dimensions are the same (applying a linear transform to one dimension does not make unique data in the eyes of this model), you get an infinite number of possible solutions that will fit you data. Applying a non-linear transformation to your second dimension you will see the desired output.
from sklearn import linear_model
import numpy as np
x1 = np.arange(0,10,0.1)
x2 = x1**2
X = np.vstack((x1, x2)).transpose()
y = 2*x1 + 3*x2
reg_model = linear_model.LinearRegression()
reg_model.fit(X,y)
print reg_model.coef_
# should be [2,3]
print reg_model.predict([[5,6]])
# should be 2*5 + 3*6 = 28
print reg_model.intercept_
# perfectly at the expected value of 0
print reg_model.score(X,y)
Outputs are
[ 2. 3.]
[ 28.]
-2.84217094304e-14
1.0

Categories

Resources