Understanding Sklearn's Linear Regression Weighting - python

I'm having difficulty getting the weighting array in sklearn's Linear Regression to affect the output.
Here's an example with no weighting.
import numpy as np
import seaborn as sns
from sklearn import linear_model
x = np.arange(0,100.)
y = (x**2.0)
xr = np.array(x).reshape(-1, 1)
yr = np.array(y).reshape(-1, 1)
regr = linear_model.LinearRegression()
regr.fit(xr, yr)
y_pred = regr.predict(xr)
sns.scatterplot(x=x, y = y)
sns.lineplot(x=x, y = y_pred.T[0].tolist())
Now when adding weights, I get the same best fit line back. I expected to see the regression favor the steeper part of the curve. What am I doing wrong?
w = [p**2 for p in x.reshape(-1)]
wregr = linear_model.LinearRegression()
wregr.fit(xr,yr, sample_weight=w)
yw_pred = regr.predict(xr)
wregr = linear_model.LinearRegression(fit_intercept=True)
wregr.fit(xr,yr, sample_weight=w)
yw_pred = regr.predict(xr)
sns.scatterplot(x=x, y = y) #plot curve
sns.lineplot(x=x, y = y_pred.T[0].tolist()) #plot non-weighted best fit line
sns.lineplot(x=x, y = yw_pred.T[0].tolist()) #plot weighted best fit line

This is due to an error in your code. Fitting of your weighted model should be:
yw_pred = wregr.predict(xr)
rather than
yw_pred = regr.predict(xr)
With this you get:

Related

Plotting classification results with confusion matrices on python

I am performing least squares classification on my data and I was able to obtain my weights and I decided to plot a decision boundary line. However I require to use a confusion matrix to show my classification results. I was going to use from sklearn.metrics import confusion_matrix and I was going to assign t as my prediction however I am not sure how to obtain my actual results to work out the matrix. I have never plotted one so I might be getting all this wrong.
import numpy as np
import matplotlib.pyplot as plt
data=np.loadtxt("MyData_A.txt")
x=data[:,0:2] #the data points
t=data[:,2] #class which data points belong to either 1s or 0s
x0=np.ones((len(x),1)) # creat array of ones as matrix (nx1) where n is number of points
X=np.append(x, x0, axis=1) # add column x0 to data
# w= ( (((X^T)X)^-1 )X^T )t
XT_X=np.dot(X.T, X) # (X^T)X
inv_XT_X=np.linalg.inv(XT_X) # (X^T)X)^-1
X_tot=np.dot(inv_XT_X, X.T) # ((X^T)X)^-1 )X^T
w=np.dot(X_tot, t) # ( (((X^T)X)^-1 )X^T )t
x1_line = np.array([-1, 2])
x2_line = -w[2] / w[1] - (w[0] / w[1]) * x1_line
color_cond=['r' if t==1 else 'b' for t in t]
plt.scatter(x[:,0],x[:,1],color=color_cond)
plt.plot(x1_line,x2_line,color='k')
plt.xlabel('X1')
plt.ylabel('X2')
plt.ylim(-2,2)
plt.title('Training Data (X1,X2)')
plt.show()
The following is the plot obtained.
from sklearn.metrics import confusion_matrix
import seaborn as sns
def predict(x1_line, x2_line, x):
d = (x[0] - x1_line[0]) * (x2_line[1] - x2_line[0]) - (x[1] - x2_line[0]) * (x1_line[1] - x1_line[0])
pred = 0 if d > 0 else 1
return pred
preds = np.array([predict(x1_line, x2_line, x12) for x12 in x])
conf_mat = confusion_matrix(t, preds)
sns.heatmap(conf_mat, annot=True);
plt.show()
LogisticRegression, confusion_matrix and ConfusionMatrixDisplay get the job done:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
data = np.loadtxt("MyData_A.txt")
X = data[:, :-1]
y = data[:, -1].astype(int)
clf = LogisticRegression().fit(X, y)
pred = clf.predict(X)
cm = confusion_matrix(y, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

dbscan not making sense for small amounts of points

I am playing around with a dbscan example in order to see if it will work for me. In my case, I have clusters of a few points (3-5) close together with a fairly long distance in between clusters. I have tried to replicate the situation in the following code. I figured with a low epsilon and low min_samples,this should work, but instead it is telling me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is dbscan not good for this type of problem. I went with dbscan instead of kmeans because I dont know beforehand exactly how many clusters there will be (1-5).
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
np.save('./clusters.npy', X)
X = np.load('./clusters.npy')
# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)
print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)
# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
plt.show()
ended up going with kmeans and doing a modified elbow method:
print(__doc__)
# Author: Phil Roth <mr.phil.roth#gmail.com>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
random_state = 170
#y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X)
#plt.scatter(X[:, 0], X[:, 1], c=y_pred)
#kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
#maybe I dont have to look for an elbow, just go until the value drops below 1.
#also if I do go too far, it just means that the same shape will be shown twice.
clusterIdx = 0
inertia = 100
while inertia > 1:
clusterIdx = clusterIdx + 1
kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
inertia = kmeans.inertia_
print(inertia)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
print(clusterIdx)
plt.show()

Trouble calculating slope and intercept in Numpy/Scypy using linear regression

i'm new in this forum.
I have a small problem to understand how to calcolate slope and intercept from value that are in a csv file.
This is my working codes(minquadbasso.py is the programme's name):
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
data = pd.read_csv('TelefonoverticaleAsseY.csv') # load data set
X = data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = data.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='black')
plt.show()
If I use:
from scipy.stats import linregress
linregress(X, Y)
compiler give me this error:
Traceback (most recent call last):
File "minquadbasso.py", line 11, in <module>
linregress(X, Y)
File "/usr/local/lib/python3.7/dist-packages/scipy/stats/_stats_mstats_common.py", line 116, in linregress
ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
ValueError: too many values to unpack (expected 4)
Can you make me understand what i'm doing wrong and suggest what change in order to calculate seccesfully slope and intercept?
My go-to for linear regression is np.polyfit. If you have an array (or list) of x data, and an array or list of y data just use
coeff = np.polyfit(x,y, deg = 1)
coeff is now a list of least square coefficients to fit your data, with highest degree of x first. So for a first degree fit y = ax + b,
a = coeff[0] and b = coeff[1] 'deg' is the degree of the polynomial you want to fit to your data. To evaluate your regression (predict) you can use np.polyval
y_prediction = np.polyval(coeff, x)
If you want the covariance matrix for the fit
coeff, cov = np.polyfit(x,y, deg = 1, cov = True)
you can find more on it here.

getting the standard error of linear regression coefficient using bootstrap

I would like to calculate the standard error of linear regression coefficient using bootstrap technique (100 resamples) but the result I got is zero, which is not normal. I think something is wrong with the bootstrap part of the code. Do you know how to fix my code?
x, y = np.genfromtxt("input.txt", unpack=True)
#regression part
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
print std_err
#bootstrap part of the code
A = np.random.choice(x, size=100, replace=True)
B = np.random.choice(y, size=100, replace=True)
slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(A,B)
print std_err2
input.txt:
-1.08 -1.07
-2.62 -2.56
-2.84 -2.79
-2.22 -2.16
-3.47 -3.55
-2.81 -2.79
-2.86 -2.71
-3.41 -3.42
-4.18 -4.21
-3.50 -3.48
-5.67 -5.55
-6.83 -6.95
-6.13 -6.13
-8.34 -8.19
-7.82 -7.83
-9.86 -9.58
-8.67 -8.62
-9.81 -9.81
-8.39 -8.30
I had no issues with your above code running in Python 3.6.1. Maybe check that your scipy version is current?
from scipy import stats
import numpy as np
x, y = np.genfromtxt("./input.txt", unpack=True)
slope_1, intercept_1, r_val_1, p_val_1, stderr_1 = stats.linregress(x, y)
print(slope_1) # 0.9913080927081567
print(stderr_1) # 0.007414734102169809
A = np.random.choice(x, size=100, replace=True)
B = np.random.choice(y, size=100, replace=True)
slope_2, incercept_2, r_val_2, p_val_2, stderr_2 = stats.linregress(A, B)
print(slope_2) # 0.11429903085322253
print(stderr_2) # 0.10158283281966374
Correctly Bootstrapping the Data
The correct way to do this would be to use the resample method from sklearn.utils. This method handles the data in a consistent array format. Since your data is an x, y pair, the y value is dependent on your x value. If you randomly sample x and y independently you lose that dependency and your resampled data will not accurately represent your population.
from scipy import stats
from sklearn.utils import resample
import numpy as np
x, y = np.genfromtxt("./input.txt", unpack=True)
slope_1, intercept_1, r_val_1, p_val_1, stderr_1 = stats.linregress(x, y)
print(slope_1) # 0.9913080927081567
print(stderr_1) # 0.007414734102169809
A, B = resample(x, y, n_samples=100) # defaults to w/ replacement
slope_2, incercept_2, r_val_2, p_val_2, stderr_2 = stats.linregress(A, B)
print(slope_2) # 0.9864339054638176
print(stderr_2) # 0.002669659193615103

Canonical Discriminant Function in Python sklearn

I am learning about Linear Discriminant Analysis and am using the scikit-learn module. I am confused by the "coef_" attribute from the LinearDiscriminantAnalysis class. As far as I understand, these are the discriminant function coefficients (sklearn calls them weight vectors). Since there should be (n_classes-1) discriminant functions, I would expect the coef_ attribute to be an array with shape (n_components, n_features), but instead it prints an (n_classes, n_features) array. Below is an example of this using the Iris dataset example from sklearn. Since there are 3 classes and 2 components, I would expect print(lda.coef_) to give me a 2x4 array instead of a 3x4 array...
Maybe I'm misinterpreting what the weight vectors are, perhaps they are the coefficients for the classification function?
And how do I get the coefficients for each variable in each discriminant/canonical function?
screenshot of jupyter notebook
Code here:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
lda = LinearDiscriminantAnalysis(n_components=2,store_covariance=True)
X_r = lda.fit(X, y).transform(X)
plt.figure()
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,
label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('Function 1 (%.2f%%)' %(lda.explained_variance_ratio_[0]*100))
plt.ylabel('Function 2 (%.2f%%)' %(lda.explained_variance_ratio_[1]*100))
plt.title('LDA of IRIS dataset')
print(lda.coef_)
#output -> [[ 6.24621637 12.24610757 -16.83743427 -21.13723331]
# [ -1.51666857 -4.36791652 4.64982565 3.18640594]
# [ -4.72954779 -7.87819105 12.18760862 17.95082737]]
You can calculate the coefficients with the following code:
def LDA_coefficients(X,lda):
nb_col = X.shape[1]
matrix= np.zeros((nb_col+1,nb_col), dtype=int)
Z=pd.DataFrame(data=matrix,columns=X.columns)
for j in range(0,nb_col):
Z.iloc[j,j] = 1
LD = lda.transform(Z)
nb_funct= LD.shape[1]
results = pd.DataFrame();
index = ['const']
for j in range(0,LD.shape[0]-1):
index = np.append(index,'C'+str(j+1))
for i in range(0,LD.shape[1]):
coef = [LD[-1][i]]
for j in range(0,LD.shape[0]-1):
coef = np.append(coef,LD[j][i]-LD[-1][i])
result = pd.Series(coef)
result.index = index
column_name = 'LD' + str(i+1)
results[column_name] = result
return results
Before calling this function you need to complete the linear discriminant analysis:
lda = LinearDiscriminantAnalysis()
lda.fit(X,y)

Categories

Resources