Python Sklearn Linear Regression Yields Incorrect Coefficient Values - python

I'm trying to find the slope and y-intercept coefficients for a linear equation. I created a test domain and range to make sure the numbers I was receiving were correct. The equation should be y = 2x + 1, but the model is saying the slope is 24 and the y-intercept is 40.3125. The model accurately predicts every value I give it, but I'm questioning how I can get the proper values.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = np.arange(0, 40)
y = (2 * X) + 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)
X_train = [[i] for i in X_train]
X_test = [[i] for i in X_test]
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print('Coefficients: \n', regr.coef_)
print('Y-intercept: \n', regr.intercept_)
print('Mean squared error: %.2f'
% mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f'
% r2_score(y_test, y_pred))
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
print(X_test)
plt.xticks()
plt.yticks()
plt.show()

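This is happening because you scaled your training and testing data. Even though you generated y as a linear function of X, you moved X_train and X_test onto another scale by standardizing them (subtracting the mean and dividing by the standard deviation). The model is therefore fitted against the standardized feature z = (x - mean) / std, so it reports a slope of 2 * std (about 24 here) and an intercept of 2 * mean + 1 (40.3125 here). The original-scale values can be recovered as regr.coef_ / sc.scale_ and regr.intercept_ - regr.coef_ * sc.mean_ / sc.scale_.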
If we run your code but omit the lines where you scale the data, you get the expected results.
X = np.arange(0, 40)
y = (2 * X) + 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)
X_train = [[i] for i in X_train]
X_test = [[i] for i in X_test]
# Skip the scaling of X_train and X_test
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print('Coefficients: \n', regr.coef_)
> Coefficients:
[2.]
print('Y-intercept: \n', regr.intercept_)
> Y-intercept:
1.0

Related

Python: An index error that inhibits a Random forest model from working

I have some kind of an indexing flaw in my Random forest model. I did not encounter this with Gaussian process regressor. The data is the same in both cases.
Data
#Decision matrix X
sampler = qmc.LatinHypercube(d=4)
u_bounds = np.array([5.0, 5.0, 10.0, 10.0])
l_bounds = np.array([0.125, 0.125, 0.1, 0.1])
data = sampler.lhs_method(100)*(u_bounds-(l_bounds)) + (l_bounds)
#Output value matrix y
y = np.zeros((100, 2))
# Evaluate the two objectives for each row of the decision matrix and store
# them as the two columns of y.
for i in range(np.shape(data)[0]):
    y[i, 0], y[i, 1] = objectives(data[i, 0], data[i, 1], data[i, 2], data[i, 3])
Training the model and implementing it. This works.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
test_size=0.2, random_state = 42)
kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel,
random_state=0).fit(X_train, y_train)
gpr.score(X_test, y_test)
gpr.fit(data, y[:,0])
gpr.score(data, y[:,0])
f1_score = gpr.score(data, y[:,0])
print(f1_score)
The same with Random forest
X, y = make_regression(n_features=4, n_informative=2,
random_state=0, shuffle=False)
RF = RandomForestRegressor(max_depth=2, random_state=0)
RF.fit(X, y)
RF.score(X, y)
But why does this not work?
RF.fit(data, y[:,0])
RF.score(data, y[:,0])
Error message: IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed
Should I try to fix it with something like this: np.atleast_2d(y[i,0])? This gives the same error.

Predicting the percentage accuracy of predicted handwritten numbers

I have two arrays with some differences when predicting handwritten numbers. How would I calculate the percentage accuracy for each number?
Importing the libraries:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
Loading data:
digits = load_digits()
X = digits.data
y = digits.target
Splitting the data into training and test sets:
#Perform test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Logistic regression:
#Create a logistic regression object
clf = LogisticRegression(random_state=0,penalty='none')
#Fit model to data
clf.fit(X_train,y_train)
#Print the coefficients
b0 = clf.intercept_[0]
b1 = clf.coef_[0,0]
print('beta_0 =', b0)
print('beta_1 =', b1)
#Calculate the test error rate
yp = clf.predict(X_test)
err = (yp!=y_test).mean()
print('Error rate = {}'.format(err))
I want to calculate the percentage error for each digit separately, i.e. how often yp differs from y_test for each number.

How to solve ValueError: x and y must be the same size issue on Python?

I'm trying to do a linear regression, however I keep running into the same problem of "ValueError: x and y must be the same size". I'm very confused, and have been on every single website there is to try to fix it. If anyone would know that would be a massive help. I don't understand what to do.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
#load datatset
df = pd.read_csv('Real_estate.csv')
X = df[['transaction date', 'house age', 'distance to the nearest MRT station','number of convenience stores', 'latitude','longitude']]
y = df['house price of unit area']
x= df.iloc[:,0:-7].values
y= df.iloc[:,1:].values
x, y = np.array(x), np.array(y)
model = LinearRegression()
model.fit(x, y)
model = LinearRegression().fit(x, y)
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size = 0.4)
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)
regr = linear_model.LinearRegression()
regr.fit(x_train_std, y_train)
y_pred = regr.predict(x_test)
r_sq = model.score(x, y)
print("Intercept: ", regr.intercept_)
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
##Model evaluation
print("Mean absolute error: %.2f" % mean_absolute_error(y_test,y_pred))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
y_pred = model.predict(x)
print('predicted response:', y_pred, sep='\n')
plt.scatter(x_test,y_test, color="black")
plt.plot(x_test, y_pred, color="blue", linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
This is my code, but I don't understand what's going wrong. I'm trying to use 7 columns, including the y value. I'm a beginner to Python, so I apologize if this is a very silly question. Thank you.
plt.plot(x_test, y_pred, color="blue", linewidth=3)
Both arguments need to have the same shape, but y_pred holds the predictions over the entire x instead of only x_test.
change
y_pred = model.predict(x)
to
y_pred = model.predict(x_test)

I'm not sure what needs to be reshaped in my data

I'm trying to use a LinearRegression() algorithm to predict the price of a house.
Here's my code:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
df = pd.read_csv('data.csv')
df = df.drop(columns=['date', 'street', 'city', 'statezip', 'country'])
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
pred.reshape((-1, 1))
acc = lr.score(pred, y_test)
However, I keep on getting this error:
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
I've tried to reshape all the attributes in my data, but the only thing that I'm able to reshape is pred, and I still get the same error after doing that.
How should I fix this error?
Thanks in advance.
Based on the documentation of sklearn.linear_model.LinearRegression.score:
score(X, y, sample_weight=None)
Returns the R^2 score of self.predict(X) with respect to y.
You need to pass the feature matrix X (not the predictions) as the first argument, as shown below:
lr.fit(X_train, y_train)
acc = lr.score(X_test, y_test)
print(acc)
Or you can use sklearn.metrics.r2_score:
from sklearn.metrics import r2_score
acc = r2_score(y_test, pred)
print(acc)
Example:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = np.dot(X, np.array([1, 2])) + 3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
acc = lr.score(X_test, y_test)
print(acc)
# Or
from sklearn.metrics import r2_score
acc = r2_score(y_test, pred)
print(acc)
Output:
0.8888888888888888
0.8888888888888888

UndefinedMetricWarning in Sklearn

I'm not able to see my resultant accuracy score in my final graph and I get precision/recall being ill-defined where I don't see any 0's.
I'm using this yeast data: https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data
I've tried making the whole set my training set by making train_frac=1.
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB
df = pd.read_csv("<my_dir>",names = ['sample','mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc','site'])
df=df.drop(columns=['sample'])
model_type = GaussianNB()
target = 'site'
train_frac = 0.5
Y = df[target]
df2 = df.drop(columns=[target])
# df2.columns is an Index with dtype='object' holding everything but site.
X = df[df2.columns[:]]
def naive_split(X, Y, n):
    """Split X and Y positionally into training and test parts.

    The first ``n`` entries of each input form the training portion and the
    remaining entries form the test portion. No shuffling is performed.

    Args:
        X: Sliceable collection of samples.
        Y: Sliceable collection of targets, aligned with ``X``.
        n: Number of leading entries to use for training.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Take first n lines of X and Y for training and the rest for testing
    X_train = X[:n]
    X_test = X[n:]
    Y_train = Y[:n]
    Y_test = Y[n:]
    return (X_train, X_test, Y_train, Y_test)
def train_model(n=int(train_frac*df.shape[0])):
    """Fit the module-level estimator on the first ``n`` rows of the data.

    Args:
        n: Number of leading rows used for training. The default is computed
            once, when the function is defined, from the module-level
            ``train_frac`` and ``df``.

    Returns:
        Tuple ``(X_test, Y_test, clf)``: the held-out samples, their labels,
        and the object returned by ``fit``.
    """
    X_train, X_test, Y_train, Y_test = naive_split(X, Y, n)
    # model_type is an estimator instance (not a class), so every call reuses
    # the same object.
    clf = model_type
    clf = clf.fit(X_train, Y_train)
    return (X_test, Y_test, clf)
X_test, Y_test, clf = train_model()
import sklearn.metrics as metrics
from sklearn import model_selection
sizes = np.arange(0.98,0.01, -0.02)
result = {}
# For each test-set fraction, refit the model on a random split and record
# accuracy, weighted precision and weighted recall.
for size in sizes:
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=size, random_state=200)
    clf = model_type
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    precision = metrics.precision_score(Y_test, clf.predict(X_test), average='weighted')
    recall = metrics.recall_score(Y_test, clf.predict(X_test), average='weighted')
    # Key each row of results by the number of training samples used.
    result[len(Y_train)] = (score, precision, recall)
result = pd.DataFrame(result).transpose()
result.columns = ['Accuracy','Precision', 'Recall']
result.plot(marker='*', figsize=(15,5))
plt.title('Metrics measures using random train/test splitting')
plt.xlabel('Size of training set')
plt.ylabel('Value');
I get the following results when I expect it to run without error:
C:\Users\<user>\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.'precision', 'predicted', average, warn_for)
C:\Users\<user>\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 'recall', 'true', average, warn_for)

Categories

Resources