How to predict the output value using logistic regression? - Python

I have successfully built a classification model that predicts a single binary output with an ANN, using the pandas and sklearn libraries. Now I want to adapt the model to predict another feature which is not binary. The input columns are (0,1,4,6,7,8,11,12,13,14) and the output column is (15) of my data set. A typical row (inputs plus the output) is [4096, 0.07324, 1.7, 20, 5.2, 64, 0.142, 0.5, 35, 30, 584.232], where some values are floats. How can I predict 584.232 from the first ten numbers using logistic regression?
Thank you all.
import numpy as np
import pandas as pd
dataset = pd.read_csv("DataSet.csv")
X = dataset.iloc[:, [0,1,4,6,7,8,11,12,13,14]].values
y = dataset.iloc[:, 15].values
To avoid a type error, I converted the input values to float as follows:
dataset['ColumnsName'] = dataset['ColumnsName'].astype(float)
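If several columns need the conversion, astype can also be applied to a whole selection at once; a short sketch (the column names here are hypothetical placeholders):
cols = ['Col1', 'Col2']  # hypothetical column names standing in for the real ones
dataset[cols] = dataset[cols].astype(float)  # convert all selected columns to float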
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEncoder_X_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])
# normalizing the input
X = X.T
X = X / np.amax(X, axis=1)
X = X.T
# splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# fitting logistic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
But when running the code up to this point, it gives the following error:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
Traceback (most recent call last):
File "<ipython-input-5-f18c8875152f>", line 3, in <module>
classifier.fit(X_train, y_train)
File "C:\Users\ali\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1528, in fit
check_classification_targets(y)
File "C:\Users\ali\anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
I have already converted the predefined columns from string to float! For reference, here is my full code:

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
dataset = pd.read_csv("DataSet.csv")
X = dataset.iloc[:, [0,1,4,6,7,8,11,12,13,14]].values
y = dataset.iloc[:, 15].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEncoder_X_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])
# normalizing the input
X = X.T
X = X / np.amax(X, axis=1)
X = X.T
# splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# defining the ANN model (10 input features)
model = Sequential()
model.add(Dense(6, input_dim=10, activation="relu"))
model.add(Dense(6, activation="relu"))
model.add(Dense(6, activation="relu"))
model.add(Dense(1))
# fitting logistic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
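The traceback pins down the actual problem: LogisticRegression is a classifier, so it rejects a continuous target such as 584.232 no matter how the inputs are typed, which is exactly what ValueError: Unknown label type: 'continuous' means. Predicting a continuous value is a regression task, so a regression estimator is needed instead. A minimal sketch, assuming X_train, X_test, y_train and y_test are prepared as above:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# a regressor, unlike LogisticRegression, accepts a continuous target
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(r2_score(y_test, y_pred))  # goodness of fit on the held-out set
Any other sklearn regressor (for example RandomForestRegressor) can be swapped in the same way.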

Related

How to merge predicted values to original pandas test data frame where X_test has been converted using CountVectorizer before splitting

I want to merge the predicted results for my test data back onto X_test. I was able to merge them with y_test, but since my X_test is a vectorized corpus, I'm not sure how to identify the indexes to merge on.
My code is as below:
def lr_model(df):
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    # Create corpus as a list
    corpus = df['text'].tolist()
    cv = CountVectorizer()
    X = cv.fit_transform(corpus).toarray()
    y = df.iloc[:, -1].values
    # Splitting into testing and training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    # Train Logistic Regression on Training set
    classifier = LogisticRegression(random_state = 0)
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    # Merge true vs predicted labels
    true_vs_pred = pd.DataFrame(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))
    return true_vs_pred
This gives me y_test and y_pred, but I'm not sure how to add X_test as the original data frame (the ids of the X_test rows) to this.
Any guidance is much appreciated. Thanks
Using a pipeline can help you link the original X_test with the prediction:
def lr_model(df):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    import pandas as pd
    from sklearn.pipeline import Pipeline
    # Defining X and y
    cv = CountVectorizer()
    X = df['text']
    y = df.iloc[:, -1].values
    # Splitting to testing and training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    # Create a pipeline
    pipeline = Pipeline([
        ('CountVectorizer', cv),
        ('LogisticRegression', LogisticRegression(random_state = 0)),
    ])
    # Train pipeline on Training set
    pipeline.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = pipeline.predict(X_test)
    return X_test, y_test, y_pred
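Because X stays a pandas Series here (the CountVectorizer is fitted inside the pipeline instead of before the split), X_test keeps the row labels of the original data frame, so the predictions can be merged back by index. A short usage sketch; the 'true' and 'pred' column names are just illustrative:
import pandas as pd
X_test, y_test, y_pred = lr_model(df)
# X_test.index holds the original row labels of df, so everything aligns by index
results = pd.DataFrame({'text': X_test, 'true': y_test, 'pred': y_pred}, index=X_test.index)
print(results.head())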

I'm not sure what needs to be reshaped in my data

I'm trying to use a LinearRegression() algorithm to predict the price of a house.
Here's my code:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
df = pd.read_csv('data.csv')
df = df.drop(columns=['date', 'street', 'city', 'statezip', 'country'])
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
pred.reshape((-1, 1))
acc = lr.score(pred, y_test)
However, I keep on getting this error:
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
I've tried reshaping all the attributes in my data, but the only thing I'm able to reshape is pred, and I still get the same error after doing that.
How should I fix this error?
Thanks in advance.
Based on the documentation of sklearn.linear_model.LinearRegression.score:
score(X, y, sample_weight=None)
Returns the R² score of self.predict(X) with respect to y.
score expects the feature matrix X as its first argument; validating the 1-D pred array as if it were X is what raises the reshape error. Pass X_test instead, like below:
lr.fit(X_train, y_train)
acc = lr.score(X_test, y_test)
print(acc)
Or you can use sklearn.metrics.r2_score:
from sklearn.metrics import r2_score
acc = r2_score(y_test, pred)
print(acc)
Example:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = np.dot(X, np.array([1, 2])) + 3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
acc = lr.score(X_test, y_test)
print(acc)
# Or
from sklearn.metrics import r2_score
acc = r2_score(y_test, pred)
print(acc)
Output:
0.8888888888888888
0.8888888888888888

Sklearn - Pipeline with StandardScaler, PolynomialFeatures and Regression

I have the following model which scales the data, then uses polynomial features and finally feeds the data into a regression model with regularization, like so:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
polynomial = PolynomialFeatures(degree=3, include_bias=False)
polynomial.fit(X_train_scaled)
X_train_model = polynomial.transform(X_train_scaled)
X_test_model = polynomial.transform(X_test_scaled)
reg_model = Ridge(alpha=alpha)  # alpha is defined elsewhere
reg_model.fit(X_train_model, y_train)
y_pred_train_model = reg_model.predict(X_train_model)
r2_train = r2_score(y_train, y_pred_train_model)
y_pred_test_model = reg_model.predict(X_test_model)
r2_test = r2_score(y_test, y_pred_test_model)
It works fine, but it seems a bit cumbersome with so many separate fits and transforms. I've heard about sklearn's Pipeline(). How can I use it here to simplify the process?
You can rewrite your code with Pipeline() as follows:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
# generate the data
X, y = make_regression(n_samples=1000, n_features=100, noise=10, bias=1, random_state=42)
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# define the pipeline
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('preprocessor', PolynomialFeatures(degree=3, include_bias=False)),
    ('estimator', Ridge(alpha=1))
])
# fit the pipeline
pipe.fit(X_train, y_train)
# generate the model predictions
y_pred_train_pipe = pipe.predict(X_train)
print(y_pred_train_pipe[:5])
# [11.37182811 89.22027129 -106.51012773 79.5912864 -241.0138516]
y_pred_test_pipe = pipe.predict(X_test)
print(y_pred_test_pipe[:5])
# [16.88238278 57.50116009 50.35705205 -20.92005052 -76.04156972]
# calculate the r-squared
print(pipe.score(X_train, y_train))
# 0.9999999999787197
print(pipe.score(X_test, y_test))
# 0.463044896596684
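A further benefit of bundling the steps is that the whole pipeline behaves as a single estimator, so it can be cross-validated or grid-searched without leaking test data into the scaler. A small sketch; the 'estimator__alpha' key follows from the step names chosen above:
from sklearn.model_selection import GridSearchCV
# step-name prefixes address parameters inside the pipeline
param_grid = {'estimator__alpha': [0.1, 1, 10]}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)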
Equivalent code without Pipeline():
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
# generate the data
X, y = make_regression(n_samples=1000, n_features=100, noise=10, bias=1, random_state=42)
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# extract the polynomial features
polynomial = PolynomialFeatures(degree=3, include_bias=False)
polynomial.fit(X_train_scaled)
X_train_model = polynomial.transform(X_train_scaled)
X_test_model = polynomial.transform(X_test_scaled)
# fit the model
reg_model = Ridge(alpha=1)
reg_model.fit(X_train_model, y_train)
# generate the model predictions
y_pred_train_model = reg_model.predict(X_train_model)
print(y_pred_train_model[:5])
# [11.37182811 89.22027129 -106.51012773 79.5912864 -241.0138516]
y_pred_test_model = reg_model.predict(X_test_model)
print(y_pred_test_model[:5])
# [16.88238278 57.50116009 50.35705205 -20.92005052 -76.04156972]
# calculate the r-squared
print(r2_score(y_train, y_pred_train_model))
# 0.9999999999787197
print(r2_score(y_test, y_pred_test_model))
# 0.463044896596684

scikit multilearn: accuracy_score ValueError: multiclass-multioutput is not supported

I want to predict samples that can be in more than 1 label at a time (multi label classification). So I use the scikit-multilearn library and have successfully fitted a classifier, and can even predict test data. It just fails at outputting the accuracy of the classifier.
My data (up to 1100 rows): the dependent variables (the ones I'm predicting) are the last 4 columns, N/xN, Sex, Maturity, and CType; the rest are the independent variables.
The accuracy I'm talking about is how close the classifier gets to predicting all the labels.
Here's the code:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance
# Prepare data
df = pd.read_csv("Data_Numeric.csv")
# remove crab_id for now
del df['Crab_id']
# independent vars: the rest
# dependent vars: N/xN, Gender, Maturity, CType
# n_samples = 1100
# n_features = 6
# n_labels = 4
X = df.iloc[:, :6].values
y = df.iloc[:, 6:df.shape[1]].astype(np.int64).values
X = sparse.csr_matrix(X)
y = sparse.csr_matrix(y, dtype=np.int64)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# generate model
classifier = BinaryRelevance(SVC())
# train
classifier.fit(X_train, y_train)
# predict
y_pred = classifier.predict(X_test)
y_pred_array = y_pred.toarray()
# my_data = X_test[0:4, :]
# my_data[0] = [64.7, 46, 12, 13, 0, 0]
# my_data_prediction = classifier.predict(my_data).toarray()
# my_data_true = y_test[0:4, :].toarray()
# error here
score = accuracy_score(y_test.toarray(), y_pred.toarray())
The error is
Traceback (most recent call last):
File "<input>", line 42, in <module>
File "/home/f4ww4z/anaconda3/envs/ayah/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 185, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/home/f4ww4z/anaconda3/envs/ayah/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 97, in _check_targets
raise ValueError("{0} is not supported".format(y_type))
ValueError: multiclass-multioutput is not supported
For reference, y_test and y_pred are both 330x4 sparse matrices:
>>> y_test
<330x4 sparse matrix of type '<class 'numpy.longlong'>'
with 578 stored elements in Compressed Sparse Row format>
>>> y_pred
<330x4 sparse matrix of type '<class 'numpy.longlong'>'
with 408 stored elements in Compressed Sparse Column format>
How do I correctly see the accuracy of the classifier?
from sklearn.model_selection import cross_validate, cross_val_score, KFold
clf = BinaryRelevance(SVC())
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(clf, X_train, y_train, cv=k_fold, scoring=['accuracy'])
or
scores = cross_val_score(clf, X_train, y_train, cv=5)
By using cross-validation you obtain five accuracy scores and can then take their mean.
Alternatively, you can go basic by using a MultiOutputClassifier with a RandomForestClassifier:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.multioutput import MultiOutputClassifier
clf = MultiOutputClassifier(RandomForestClassifier(random_state=42, class_weight="balanced"))
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(clf, X_train, y_train, cv=k_fold, scoring=['f1_weighted'])
Maybe this will help you :)
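As for the error itself: accuracy_score supports multilabel targets only when they are binary indicator matrices, and since at least one of the four outputs here apparently takes more than two values, sklearn types the target as multiclass-multioutput and refuses it. A minimal sketch, assuming y_test and y_pred as above, that computes the accuracy by hand with numpy:
import numpy as np
y_true_arr = y_test.toarray()
y_pred_arr = y_pred.toarray()
# exact-match ("subset") accuracy: fraction of rows where all 4 labels are correct
subset_accuracy = (y_true_arr == y_pred_arr).all(axis=1).mean()
# per-label accuracy: how often each individual output is correct
per_label_accuracy = (y_true_arr == y_pred_arr).mean(axis=0)
print(subset_accuracy, per_label_accuracy)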

LDA(n_components = 2) + fit_transform returns a 1-dim matrix instead of 2-dim

While applying LDA to my Churn_Modelling.csv file, everything goes well until the point where my X_train comes back with shape (8000, 1) instead of (8000, 2) as expected:
lda = LDA(n_components = 2)
X_train = lda.fit_transform(X_train, y_train)
X is one-hot encoded and feature scaled beforehand, as follows:
# LDA
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
While doing the same on another .csv file I have no trouble... do you have any idea why?
Thank you very much for your help!
I think I have the answer, but I would prefer confirmation if possible :-)
The maximal number of components I can hope to obtain from transform is n_classes - 1, so in my case 2 classes (True, False) yield at most 1 column.
Am I right? Thank you again.
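That is correct: LDA projects onto discriminant directions between classes, and with n_classes classes there are at most min(n_features, n_classes - 1) of them. Newer sklearn versions raise an error when n_components exceeds this cap, while older ones silently clipped it, which is what produced the (8000, 1) result here. A small sketch on synthetic data illustrating the cap:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
rng = np.random.RandomState(0)
X = rng.rand(100, 5)             # 100 samples, 5 features
y = rng.randint(0, 2, size=100)  # binary target, i.e. n_classes = 2
lda = LDA(n_components=None)     # defaults to min(n_classes - 1, n_features)
X_r = lda.fit_transform(X, y)
print(X_r.shape)                 # (100, 1): a single discriminant component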
