Why isn't this Linear Regression line a straight line? - python

I have points with x and y coordinates I want to fit a straight line to with Linear Regression but I get a jagged looking line.
I am attemting to use LinearRegression from sklearn.
To create the points run a for loop that randomly crates one hundred points into an array that is 100 x 2 in shape. I slice the left side of it for the xs and the right side of it for the ys.
I expect to have a straight line when I print m.predict.
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LinearRegression
X = []
adder = 0
for z in range(100):
r = random.random() * 20
r2 = random.random() * 15
X.append([r+adder-0.4, r2+adder])
adder += 0.6
X = np.array(X)
plt.scatter(X[:,0], X[:,1], s=10)
plt.show()
m = LinearRegression()
m.fit(X[:,0].reshape(1, -1), X[:,1].reshape(1, -1))
plt.plot(m.predict(X[:,0].reshape(1, -1))[0])

I am not good with numpy but, I think it is because the use of reshape() function to convert X[:,0] and X[:,1] from 1D to 2D, the resulting 2D array contains only one element, instead of creating a 2D array of len(X[:,0]) and len(X[:,1]) respectively. And resulting into an undesired regressor.
I am able to recreate this model using pandas and able to plot the desired result. Code as follows
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LinearRegression
import pandas as pd
X = []
adder = 0
for z in range(100):
r = random.random() * 20
r2 = random.random() * 15
X.append([r+adder-0.4, r2+adder])
adder += 0.6
X = np.array(X)
y_train = pd.DataFrame(X[:,1],columns=['y'])
X_train = pd.DataFrame(X[:,0],columns=['X'])
//plt.scatter(X_train, y_train, s=10)
//plt.show()
m = LinearRegression()
m.fit(X_train, y_train)
plt.scatter(X_train,y_train)
plt.plot(X_train,m.predict(X_train),color='red')

Related

Plotting classification results with confusion matrices on python

I am performing least squares classification on my data and I was able to obtain my weights and I decided to plot a decision boundary line. However I require to use a confusion matrix to show my classification results. I was going to use from sklearn.metrics import confusion_matrix and I was going to assign t as my prediction however I am not sure how to obtain my actual results to work out the matrix. I have never plotted one so I might be getting all this wrong.
import numpy as np
import matplotlib.pyplot as plt
data=np.loadtxt("MyData_A.txt")
x=data[:,0:2] #the data points
t=data[:,2] #class which data points belong to either 1s or 0s
x0=np.ones((len(x),1)) # creat array of ones as matrix (nx1) where n is number of points
X=np.append(x, x0, axis=1) # add column x0 to data
# w= ( (((X^T)X)^-1 )X^T )t
XT_X=np.dot(X.T, X) # (X^T)X
inv_XT_X=np.linalg.inv(XT_X) # (X^T)X)^-1
X_tot=np.dot(inv_XT_X, X.T) # ((X^T)X)^-1 )X^T
w=np.dot(X_tot, t) # ( (((X^T)X)^-1 )X^T )t
x1_line = np.array([-1, 2])
x2_line = -w[2] / w[1] - (w[0] / w[1]) * x1_line
color_cond=['r' if t==1 else 'b' for t in t]
plt.scatter(x[:,0],x[:,1],color=color_cond)
plt.plot(x1_line,x2_line,color='k')
plt.xlabel('X1')
plt.ylabel('X2')
plt.ylim(-2,2)
plt.title('Training Data (X1,X2)')
plt.show()
The following is the plot obtained.
from sklearn.metrics import confusion_matrix
import seaborn as sns
def predict(x1_line, x2_line, x):
d = (x[0] - x1_line[0]) * (x2_line[1] - x2_line[0]) - (x[1] - x2_line[0]) * (x1_line[1] - x1_line[0])
pred = 0 if d > 0 else 1
return pred
preds = np.array([predict(x1_line, x2_line, x12) for x12 in x])
conf_mat = confusion_matrix(t, preds)
sns.heatmap(conf_mat, annot=True);
plt.show()
LogisticRegression, confusion_matrix and ConfusionMatrixDisplay get the job done:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
data = np.loadtxt("MyData_A.txt")
X = data[:, :-1]
y = data[:, -1].astype(int)
clf = LogisticRegression().fit(X, y)
pred = clf.predict(X)
cm = confusion_matrix(y, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

Obtaining an isosurface from 3D data and the corresponding indices

I have a 3D numpy array of temperature values on a grid. From this I can compute the gradients using dTdx, dTdy, dYdz = np.gradient(T). Now I'm only interested in the values of the gradients on the isosurface where the temperature is 900. What I want to do is something like (pseudo-codish):
import nympy as np
def regular(x,y,z,q=100,k=175,a=7.1e-5):
R = np.sqrt(x**2+y**2+z**2)
return 100 / (2*np.pi*k) * (1/R) * np.exp(-0.5/a*(R+x))
x = np.arange(-1.5,0.5+res/2,res)*1e-3
y = np.arange(-1.0,1.0+res/2,res)*1e-3
z = np.arange(0.0,0.5+res/2,res)*1e-3
Y,X,Z = np.meshgrid(y,x,z)
T = regular(X,Y,Z)
dTdx, dTdy, dYdz = np.gradient(T)
(xind,yind,zind) = <package>.get_contour_indices(X,Y,Z,T,value=900)
x_gradients_at_isosurface = dTdx[xind,yind,zind]
...
I've tried:
import numpy as np
from skimage import measure
contour_data = measure.find_contours(T[:,:,0],900)
contour_data = np.int_(np.round(contour_data[0]))
xs,ys = contour_data[:,0],contour_data[:,1]
gradients_of_interest = np.array([G[x,y,0] for x,y in zip( xs,ys )])
which works fine, but only works for 2D data. I'm looking for the 3D equivalent. I've found the following:
import plotly.graph_objects as go
surf = go.Isosurface(x=X.flatten(),y=Y.flatten(),z=Z.flatten(),value=T.flatten(),isomin=900,isomax=900)
fig = go.Figure(data=surf)
plt.show()
But I'm not interested in plotting it. I want to know the indices where the temperature is T=900 so I can use it on the gradients. Any ideas?
You need skimage.measure.marching_cubes.

IndexError when ploting sklearn manifold TSNE

I try to run a t-sne but python shows me this error:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
Data is being provided by this link.
Here's the code:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clearn the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no descriminative information
dataframe_all = dataframe_all.ix[:,7:]
#Step 3: Create feature vectors
x = dataframe_all.ix[:,:-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
#scatter plot the sample points among 5 classes
markers=('s','d','o','^','v')
color_map = {0:'red', 1:'blue', 2:'lightgreen', 3:'purple', 4:'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(x_test_2d)):
plt.scatter(x=x_test_2d[cl, 0],y =x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx], label=cl)
plt.show()
What do I have to change in order to make this work?
The error is due to the following line:
plt.scatter(x_test_2d[cl, 0], x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx])
Here, cl can take and takes not integer values (from np.unique(x_test_2d)) and this raises the error, e.g. the last value that cl takes is 99.46295 and then you use: x_test_2d[cl, 0] which translates into x_test_2d[99.46295, 0]
Define a variable y that hold the class labels, then use:
# variable holding the classes
y = dataframe_all.classe.values
y = np.array([ord(i) for i in y])
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()
FULL CODE:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clearn the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no descriminative information
dataframe_all = dataframe_all.ix[:,7:]
#Step 3: Create feature vectors
x = dataframe_all.ix[:,:-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
# variable holding the classes
y = dataframe_all.classe.values # you need this for the colors
y = np.array([ord(i) for i in y]) # convert letters to numbers
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()

Fitting a quadratic function in python without numpy polyfit

I am trying to fit a quadratic function to some data, and I'm trying to do this without using numpy's polyfit function.
Mathematically I tried to follow this website https://neutrium.net/mathematics/least-squares-fitting-of-a-polynomial/ but somehow I don't think that I'm doing it right. If anyone could assist me that would be great, or If you could suggest another way to do it that would also be awesome.
What I've tried so far:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
ones = np.ones(3)
A = np.array( ((0,1),(1,1),(2,1)))
xfeature = A.T[0]
squaredfeature = A.T[0] ** 2
b = np.array( (1,2,0), ndmin=2 ).T
b = b.reshape(3)
features = np.concatenate((np.vstack(ones), np.vstack(xfeature), np.vstack(squaredfeature)), axis = 1)
featuresc = features.copy()
print(features)
m_det = np.linalg.det(features)
print(m_det)
determinants = []
for i in range(3):
featuresc.T[i] = b
print(featuresc)
det = np.linalg.det(featuresc)
determinants.append(det)
print(det)
featuresc = features.copy()
determinants = determinants / m_det
print(determinants)
plt.scatter(A.T[0],b)
u = np.linspace(0,3,100)
plt.plot(u, u**2*determinants[2] + u*determinants[1] + determinants[0] )
p2 = np.polyfit(A.T[0],b,2)
plt.plot(u, np.polyval(p2,u), 'b--')
plt.show()
As you can see my curve doesn't compare well to nnumpy's polyfit curve.
Update:
I went through my code and removed all the stupid mistakes and now it works, when I try to fit it over 3 points, but I have no idea how to fit over more than three points.
This is the new code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
ones = np.ones(3)
A = np.array( ((0,1),(1,1),(2,1)))
xfeature = A.T[0]
squaredfeature = A.T[0] ** 2
b = np.array( (1,2,0), ndmin=2 ).T
b = b.reshape(3)
features = np.concatenate((np.vstack(ones), np.vstack(xfeature), np.vstack(squaredfeature)), axis = 1)
featuresc = features.copy()
print(features)
m_det = np.linalg.det(features)
print(m_det)
determinants = []
for i in range(3):
featuresc.T[i] = b
print(featuresc)
det = np.linalg.det(featuresc)
determinants.append(det)
print(det)
featuresc = features.copy()
determinants = determinants / m_det
print(determinants)
plt.scatter(A.T[0],b)
u = np.linspace(0,3,100)
plt.plot(u, u**2*determinants[2] + u*determinants[1] + determinants[0] )
p2 = np.polyfit(A.T[0],b,2)
plt.plot(u, np.polyval(p2,u), 'r--')
plt.show()
Instead using Cramer's Rule, actually solve the system using least squares. Remember that Cramer's Rule will only work if the total number of points you have equals the desired order of polynomial plus 1.
If you don't have this, then Cramer's Rule will not work as you're trying to find an exact solution to the problem. If you have more points, the method is unsuitable as we will create an overdetermined system of equations.
To adapt this to more points, numpy.linalg.lstsq would be a better fit as it solves the solution to the Ax = b by computing the vector x that minimizes the Euclidean norm using the matrix A. Therefore, remove the y values from the last column of the features matrix and solve for the coefficients and use numpy.linalg.lstsq to solve for the coefficients:
import numpy as np
import matplotlib.pyplot as plt
ones = np.ones(4)
xfeature = np.asarray([0,1,2,3])
squaredfeature = xfeature ** 2
b = np.asarray([1,2,0,3])
features = np.concatenate((np.vstack(ones),np.vstack(xfeature),np.vstack(squaredfeature)), axis = 1) # Change - remove the y values
determinants = np.linalg.lstsq(features, b)[0] # Change - use least squares
plt.scatter(xfeature,b)
u = np.linspace(0,3,100)
plt.plot(u, u**2*determinants[2] + u*determinants[1] + determinants[0] )
plt.show()
I get this plot now, which matches what the dashed curve is in your graph, also matching what numpy.polyfit gives you:

graphing non-linear decision boundary

data can be found here: ex2data2.txt
I'm not sure what call to plt.contour() I should be using to reproduce this.
the related Matlab function call would be:
contour(u, v, z, [0, 0], 'LineWidth', 2)
I'm trying to plot the decision boundary for a non-linear logistic regression like the following image
import scikitplot.plotters as skplt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn import metrics
from ggplot import *
import time
def mapFeature(X1, X2, df=True):
"""
X1, X2: dtype = pd.DataFrame, float, int
either a single value or a vector of values
df : dtype = boolean
whether it's a single scalar value or a vector of values
----------
Return: dtype = m row vector or m x n vector of feature values
Calculates each feature and returns its value
"""
# add a column of ones for intercept parameter
out = pd.DataFrame({'1':np.ones(X1.size)})
# max 6th degree polynomial
for i in range(1,7):
for j in range(i+1):
# all the combinations of polynomials up to 7th degree
value = (X1**(i-j))*(X2**j)
col_name = 'X1**{} * X2**{}'.format(i-j, j)
# When we give a vector with only one dimension, we need to specify
# whether to add it as a column or a row. 0 denotes adding a row,
# and 1 would be a column.
if df:
out = out.join(pd.DataFrame({col_name: value}))
else:
out = out.join(pd.DataFrame({col_name: value}, index=[0]))
return out
if __name__ == '__main__':
data = pd.read_csv('ex2data2.txt', header=None,
names=['Test1', 'Test2', 'Pass'])
X = data.iloc[:, :2]
y = data.iloc[:,2]
X = mapFeature(X.iloc[:,0], X.iloc[:,1])
clf = LogisticRegression().fit(X, y)
theta = clf.coef_
u = np.linspace(start, end, 30)
v = np.linspace(start, end, 30)
uu, vv = np.meshgrid(u, v)
z = np.zeros((30, 30))
for i in range(30):
for j in range(30):
z[i,j] = mapFeature(u[i], v[i], df=False).values.dot(theta.T)
plt.contour(uu, vv, z, [0])
plt.show()

Categories

Resources