I'm not sure what needs to be reshaped in my data - python

I'm trying to use a LinearRegression() algorithm to predict the price of a house.
Here's my code:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
df = pd.read_csv('data.csv')
df = df.drop(columns=['date', 'street', 'city', 'statezip', 'country'])
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
pred.reshape((-1, 1))
acc = lr.score(pred, y_test)
However, I keep on getting this error:
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
I've tried to reshape all the attributes in my data, but the only thing that I'm able to reshape is pred, and I still get the same error after doing that?
How should I fix this error?
Thanks in advance.

Base on Documentation of sklearn.linear_model.LinearRegression.score:
score(X, y, sample_weight=None)
return R^2 score of self.predict(X) wrt. y.
You need to pass X as the first argument like below:
lr.fit(X_train, y_train)
acc = lr.score(X_test, y_test)
print(acc)
Or You can use sklearn.metrics.r2_score:
from sklearn.metrics import r2_score
acc = r2_score(y_test, pred)
print(acc)
Example:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = np.dot(X, np.array([1, 2])) + 3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
acc = lr.score(X_test, y_test)
print(acc)
# Or
from sklearn.metrics import r2_score
acc = r2_score(y_test, pred)
print(acc)
Output:
0.8888888888888888
0.8888888888888888

Related

How to merge predicted values to original pandas test data frame where X_test has been converted using CountVectorizer before splitting

I want to merge my predicted results of my test data to my X_test. I was able to merge it with y_test but since my X_test is a corpus I'm not sure how I can identify the indexes to merge.
My codes are as below
def lr_model(df):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
# Create corpus as a list
corpus = df['text'].tolist()
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values
# Splitting to testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Train Logistic Regression on Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Merge true vs predicted labels
true_vs_pred = pd.DataFrame(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
return true_vs_pred
This gives me the y_test and y_pred but I'm not sure how I can add the X_test as an original data frame (the ids of the X_test) to this.
Any guidance is much appreciated. Thanks
Using a pipeline can help you link the original X_test with the prediction:
def lr_model(df):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.pipeline import Pipeline
# Defining X and y
cv = CountVectorizer()
X = df['text']
y = df.iloc[:, -1].values
# Splitting to testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Create a pipeline
pipeline = Pipeline([
('CountVectorizer', cv),
('LogisticRegression', LogisticRegression(random_state = 0)),
])
# Train pipeline on Training set
pipeline.fit(X_train, y_train)
# Predicting the Test set results
y_pred = pipeline.predict(X_test)
return X_test, y_test, y_pred

How to fetch R2 for each target in sklearn MultiOutputRegressor() rather than the overall R2?

For example, Xs has 5 independent variables, and Ys has 5 dependent variables:
x_train, x_test, y_train, y_test = train_test_split(Xs, Ys, test_size=0.2, random_state=2)
model = lgb.LGBMRegressor()
wrapper = MultiOutputRegressor(model)
model.fit(x_train, y_train)
model.score(x_test, y_test)
Could only get the overall R2 through the code above, what if I want to check the R2 for each Y?
Is it possible?
Thanks
You can use scikit-learn r2_score with multioutput='raw_values':
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
import lightgbm as lgb
# generate the data
X, Y = make_regression(n_targets=5, n_features=10, n_samples=1000, random_state=42)
# split the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# instantiate the model
model = MultiOutputRegressor(estimator=lgb.LGBMRegressor())
# fit the model
model.fit(X_train, Y_train)
# generate the model predictions
Y_pred = model.predict(X_test)
# calculate the individual R2's
print(r2_score(Y_test, Y_pred, multioutput='raw_values'))
# [0.907924 0.925267 0.906492 0.939653 0.881619]
print([r2_score(Y_test[:, i], Y_pred[:, i]) for i in range(Y_test.shape[1])])
# [0.907924, 0.925267, 0.906492, 0.939653, 0.881619]
# calculate the overall R2
print(model.score(X_test, Y_test))
# 0.9121908184618046
print(r2_score(Y_test, Y_pred, multioutput='uniform_average'))
# 0.9121908184618046

Why Score from LinearRegression is giving different result than r2_score from sklearn.metrics

Ideally I should get same result as score is nothing but R-Square. But not sure why results are coming different.
from sklearn.datasets import california_housing
data = california_housing.fetch_california_housing()
data.data.shape
data.feature_names
data.target_names
import pandas as pd
house_data = pd.DataFrame(data.data, columns=data.feature_names)
house_data.describe()
house_data['Price'] = data.target
X = house_data.iloc[:, 0:8].values
y = house_data.iloc[:, -1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
#Check R-square on training data
from sklearn.metrics import mean_squared_error, r2_score
y_pred = linear_model.predict(X_test)
print(linear_model.score(X_test, y_test))
print(r2_score(y_pred, y_test))
Output
0.5957643114594776
0.34460597952465033
from the docs: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html
sklearn.metrics.r2_score(y_true, y_pred,...)
You are passing y_true and y_pred the wrong way around. If you switch them you get the correct result.
print(linear_model.score(X_test, y_test))
print(r2_score(y_test, y_pred))
0.5957643114594777
0.5957643114594777

how to predict the output value using logistic regression?

I simulated successfully my classification function to predict the single value of output binary by ANN utilizing pandas and sklearn libraries. Now I want to simulate my model to predict another feature which is not binary, as the input columns are (0,1,4,6,7,8,11,12,13,14) and the output column is (15) of my data set. A typical example of the input data is [4096,0.07324,1.7,20,5.2,64,0.142,0.5,35,30,584.232] as some values are float. How can I predict 584.232 by the first ten numbers utilizing logistic regression?
thank you all.
dataset = pd.read_csv("DataSet.csv")
X = dataset.iloc[:, [0,1,4,6,7,8,11,12,13,14]].values
y = dataset.iloc[:, 15].values
for avoiding type error, I converted the input values into float using the following way:
dataset['ColumnsName'] = dataset['ColumnsName'].astype(float)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEncoder_X_delay_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_delay_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])
# normalizing the input
X = X.T
X = X / np.amax(X, axis=1)
X = X.T
# splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =
train_test_split(X, y, test_size = 0.2, random_state = 0)
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# fitting logestic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
but after compiling the code up to now, it gives the error:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
Traceback (most recent call last):
File "<ipython-input-5-f18c8875152f>", line 3, in <module>
classifier.fit(X_train, y_train)
File "C:\Users\ali\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1528, in fit
check_classification_targets(y)
File "C:\Users\ali\anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
I have already converted the predefined columns from string to float!
dataset = pd.read_csv("DataSet.csv")
X = dataset.iloc[:, [0,1,4,6,7,8,11,12,13,14]].values
y = dataset.iloc[:, 15].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEncoder_X_delay_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_delay_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])
# normalizing the input
X = X.T
X = X / np.amax(X, axis=1)
X = X.T
# splitting the dataset into the training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Activation Function
model = Sequential()
model.add(Dense(6, input_dim=9, activation= "relu"))
model.add(Dense(6, activation= "relu"))
model.add(Dense(6, activation= "relu"))
model.add(Dense(1))
# splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =
train_test_split(X, y, test_size = 0.2, random_state = 0)
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# fitting logestic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

k nearest neighbors with cross validation for accuracy score and confusion matrix

I have the following data where for each column, the rows with numbers are the input and the letter is the output.
A,A,A,B,B,B
-0.979090189,0.338819904,-0.253746508,0.213454999,-0.580601104,-0.441683968
-0.48395313,0.436456904,-1.427424032,-0.107093825,0.320813402,0.060866105
-1.098818173,-0.999161692,-1.371721698,-1.057324962,-1.161752652,-0.854872591
-1.53191442,-1.465454248,-1.350414216,-1.732518018,-1.674040715,-1.561568496
2.522796162,2.498153298,3.11756171,2.125738509,3.003929536,2.514411247
-0.060161596,-0.487513844,-1.083513761,-0.908023322,-1.047536921,-0.48276759
0.241962669,0.181365373,0.174042637,-0.048013217,-0.177434916,0.42738621
-0.603856395,-1.020531402,-1.091134021,-0.863008165,-0.683233589,-0.849059931
-0.626159165,-0.348144322,-0.518640038,-0.394482485,-0.249935646,-0.543947259
-1.407263942,-1.387660115,-1.612988118,-1.141282747,-0.944745366,-1.030944216
-0.682567673,-0.043613473,-0.105679403,0.135431139,0.059104888,-0.132060832
-1.10107164,-1.030047313,-1.239075022,-0.651818656,-1.043589073,-0.765992541
I am trying to perform KNN LOOCV to get accuracy score and confusion matrix.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
import pandas as pd
def main():
csv = 'data.csv'
df = pd.read_csv(csv)
X = df.values.T
y = df.columns.values
clf = KNeighborsClassifier()
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf.fit(X_train, y_train)
y_true = y_test
y_pred = clf.predict(X_test)
ac = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
print ac
print cm
if __name__ == '__main__':
main()
However my results are all 0s. Where am I going wrong?
I think your model does not get trained properly and because it only has to guess one value it doesn't get it right. May I suggest switching to KFold or StratifiedKFold. LOO has the disadvantage that for large samples it becomes extemely time consuming. Here is what happened when I implemented StratifiedKFold with 3 splits on your X data. I have randomly filled y with 0 and 1, instead of using A and B and have not trasposed the data so it has 12 rows:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
import pandas as pd
csv = 'C:\df_low_X.csv'
df = pd.read_csv(csv, header=None)
print(df)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
clf = KNeighborsClassifier()
kf = StratifiedKFold(n_splits = 3)
ac = []
cm = []
for train_index, test_index in kf.split(X,y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
print(X_train, X_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
ac.append(accuracy_score(y_test, y_pred))
cm.append(confusion_matrix(y_test, y_pred))
print(ac)
print(cm)
# ac
[0.25, 0.75, 0.5]
# cm
[array([[1, 1],
[2, 0]], dtype=int64),
array([[1, 1],
[0, 2]], dtype=int64),
array([[0, 2],
[0, 2]], dtype=int64)]

Categories

Resources