Unable to execute RFECV over LogisticRegression - python

My ML project is about "Loan Eligibility Prediction".
For that I used the data from https://www.kaggle.com/code/sazid28/home-loan-prediction/data?select=train.csv
and my code is shown below:
import random
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import \
SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import \
OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import RFECV
df_train_original = pd.read_csv("train.csv.xls")
df = df_train_original.drop(df_train_original.columns[0], axis=1)
# Replace missing 'Credit_History' values with a random draw (0 or 1)
# from the observed values of the column
np.random.seed(0)  # seed NumPy's RNG, since np.random.choice is used below
df['Credit_History'] = df['Credit_History'].apply(
    lambda x: np.random.choice(df['Credit_History'].dropna().values)
    if np.isnan(x) else x)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Data pre-processing
numerical_feature, categorical_feature = [], []
for i in X.columns:
    if X[i].dtype == 'O':
        categorical_feature.append(i)
    else:
        numerical_feature.append(i)
imputer = IterativeImputer(random_state=0)
scaler = StandardScaler()
encoder = OrdinalEncoder()
# Replace categorical features with the most frequent value of the column
# Gender (-13) , Married (-3), Self_Employed (-32)
# LoanAmount (-22) Loan_Amount_Term (-14) Credit_History (-50)
numerical_pipeline = make_pipeline(imputer, scaler)
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), encoder)
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_feature),
    (categorical_pipeline, categorical_feature),
    remainder='passthrough')
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)
params = {
    'logisticregression__class_weight': [None, 'balanced'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logisticregression__C': np.linspace(0.001, 0.1, 30),
}
model = make_pipeline(preprocessor, clf)
selector = RFECV(model, step=1, min_features_to_select=2, cv=5)
selector.fit(X_train, y_train)
When I run the code I get:
ValueError: could not convert string to float: 'Male'
I think the data is not being fitted and transformed before going through RFECV.
How can I fix this?

RFECV does not work with a pipeline as the estimator, because it requires the estimator to expose either a coef_ or a feature_importances_ attribute. Pipelines do not, and even if they did, there would be no guarantee that the feature importances of the final estimator correspond to the features fed into the pipeline, given the arbitrary transformations applied by the intermediate steps.
What you can do is make the RFECV transformer an element of your pipeline, between the preprocessing and the final estimator, i.e.:
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_feature),
    (categorical_pipeline, categorical_feature),
    remainder='passthrough')
clf_fs = LogisticRegression(random_state=0, max_iter=df.shape[0])
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
feature_selector = RFECV(clf_fs, step=1, min_features_to_select=2, cv=5)
model = make_pipeline(preprocessor, feature_selector, clf)
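The pipeline can then be fitted and inspected as usual. A minimal sketch, assuming the X_train and y_train from the question (with make_pipeline, the RFECV step is named 'rfecv'):
model.fit(X_train, y_train)
# boolean mask and count of the features kept by RFECV; note these refer to
# the columns after the ColumnTransformer, not the raw input columns
print(model.named_steps['rfecv'].support_)
print(model.named_steps['rfecv'].n_features_)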

Related

Extract feature names after Pipeline usage with ColumnTransformer (sklearn)

I have the following toy code.
I use a pipeline to automatically normalize the numerical variables and apply one-hot encoding to the categorical ones.
I can get the coefficients of the logistic regression model easily using pipe['logisticregression'].coef_, but how can I get all the feature names, in the right order, as they appear in the coef_ matrix?
from sklearn.compose import ColumnTransformer
import numpy as np, pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# data from https://www.kaggle.com/datasets/uciml/adult-census-income
data = pd.read_csv("adult.csv")
data = data.iloc[0:3000,:]
target = "workclass"
y = data[target]
X = data.drop(columns=target)
numerical_columns_selector = make_column_selector(dtype_exclude=object)
categorical_columns_selector = make_column_selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)
ct = ColumnTransformer([('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
                        ('std', StandardScaler(), numerical_columns)])
model = LogisticRegression(max_iter=500)
pipe = make_pipeline(ct, model)
data_train, data_test, target_train, target_test = train_test_split(
    X, y, random_state=42)
pipe.fit(data_train, target_train)
pipe['logisticregression'].coef_.shape
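A hedged sketch of one way to recover the names: with scikit-learn 1.0 or later, ColumnTransformer exposes get_feature_names_out(), and its output order matches the columns of coef_ (with make_pipeline, the ColumnTransformer step is named 'columntransformer'):
feature_names = pipe.named_steps['columntransformer'].get_feature_names_out()
coefs = pipe['logisticregression'].coef_  # shape: (n_classes, n_features)
# feature_names[i] labels column i of coefs
for name, coef in zip(feature_names, coefs[0]):
    print(name, coef)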

How to implement my for loop and call my function to get a result in my Ridge regression model?

I am a novice in data science, and I am trying to predict house prices with a regression model (a Kaggle exercise, file: train.csv). I start by exploring the data:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score , cross_val_predict, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, LassoCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
%matplotlib inline
data = pd.read_csv('train.csv', index_col = 0)
pd.set_option('display.max_row',80)
pd.set_option('display.max_column',80)
hp = data.copy()
hp = hp.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna(hp.mean()))
hp.isna().sum()
from sklearn.preprocessing import StandardScaler
numeric_features = hp.select_dtypes(exclude = ['object'])
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_features = hp.select_dtypes('object')
categorical_features = categorical_features.fillna('Z')
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        # ColumnTransformer expects column names, not DataFrames
        ('num', numeric_transformer, numeric_features.columns),
        ('cat', categorical_transformer, categorical_features.columns)])
y = numeric_features['SalePrice']
X = numeric_features.drop('SalePrice', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
scaler = preprocessing.StandardScaler().fit(X)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train_scaled, y_train,
                                    scoring="neg_mean_squared_error", cv=5))
    return rmse
How to implement my for loop and call my function to get a result in my Ridge regression model?
coefs = []
rmse = []
alphas = [0.01, 0.05, 0.1, 0.3, 0.8, 1, 5, 10, 15, 30, 50]
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=True)
    ridge.fit(X_train_scaled, y_train)
    coefs.append(ridge.coef_)
    rmse_cv()
The documentation of sklearn includes a method that returns the predictions. Another online source, smith.edu, explains how to use the method with an example.
Here is how you can integrate them into your solution:
coefs = []
rmse = []
alphas = [0.01, 0.05, 0.1, 0.3, 0.8, 1, 5, 10, 15, 30, 50]
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=True)
    ridge.fit(X_train_scaled, y_train)
    pred = ridge.predict(X_test_scaled)  # predict on the scaled test set
    coefs.append(ridge.coef_)
    rmse.append(mean_squared_error(y_test, pred))  # collect the test MSE
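If you also want the cross-validated RMSE that rmse_cv was written for, a minimal sketch (assuming the corrected rmse_cv above, which takes the model as its argument):
cv_rmse = []
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=True)
    cv_rmse.append(rmse_cv(ridge).mean())  # mean RMSE over the 5 CV folds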

Calculating AUC for LogisticRegression model

Let's take this data:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
data = load_breast_cancer()
X = data.data
y = data.target
I want to create a model using only the first principal component and calculate the AUC for it.
My work so far:
scaler = StandardScaler()
scaler.fit(X)  # no train/test split here, so fit the scaler on all of X
X_scaled = scaler.transform(X)
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X_scaled)  # use the scaled data
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1'])
clf = LogisticRegression()
clf = clf.fit(principalDf, y)
pred = clf.predict_proba(principalDf)
But when I try to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
the following error occurs:
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data:
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
but that didn't solve the issue; it outputs:
multilabel-indicator format is not supported
Do you have any idea how I can compute the AUC for this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
X,y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
scaler = StandardScaler()
pca = PCA(2)
clf = LogisticRegression()
ppl = Pipeline([("scaler",scaler),("pca",pca),("clf",clf)])
ppl.fit(X_train, y_train)
# use probability scores, not hard class predictions, for the ROC curve
preds = ppl.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)
metrics.RocCurveDisplay.from_estimator(ppl, X_test, y_test)
The problem is that predict_proba returns one column per class. With binary classification the classes are 0 and 1, and you usually want the probability of the positive (second) class, so it's quite common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]
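From there the AUC follows directly; a minimal sketch using the sliced probabilities (note that pos_label should be 1 here, since the targets are 0/1, not 2):
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1)
print(metrics.auc(fpr, tpr))
# or in one call:
print(metrics.roc_auc_score(y, pred))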

Bad MSE while using Pipes

I'm trying to predict some prices from a dataset that I scraped. I've never used Python for this (I usually use the tidyverse), but this time I wanted to explore scikit-learn pipelines.
So here is the code snippet:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
df = pd.read_csv("https://raw.githubusercontent.com/norhther/idealista/main/idealistaBCN.csv")
df.drop("info", axis = 1, inplace = True)
df["floor"].fillna(1, inplace=True)
df.drop("neigh", axis = 1, inplace = True)
df.dropna(inplace = True)
df = df[df["habs"] < 11]
X = df.drop("price", axis = 1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
ct = ColumnTransformer(
    [("standardScaler", StandardScaler(), ["habs", "m2", "floor"]),
     ("onehot", OneHotEncoder(), ["type"])],
    remainder="passthrough")
pipe = Pipeline(steps=[("Transformer", ct),
                       ("svr", SVR())])
param_grid = {
    "svr__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "svr__degree": range(3, 6),
    "svr__gamma": ['scale', 'auto'],
    "svr__coef0": np.linspace(0.01, 1, 2)
}
search = GridSearchCV(pipe, param_grid, scoring = ['neg_mean_squared_error'], refit='neg_mean_squared_error')
search.fit(X_train, y_train)
print(search.best_score_)
pipe = Pipeline(steps=[("Transformer", ct),
                       ("svr", SVR(coef0=search.best_params_["svr__coef0"],
                                   degree=search.best_params_["svr__degree"],
                                   kernel=search.best_params_["svr__kernel"]))])
from sklearn.metrics import mean_squared_error
pipe.fit(X_train, y_train)
preds = pipe.predict(X_train)
mean_squared_error(preds, y_train)
And search.best_score_ here is -443829697806.1671, and the MSE is 608953977916.3896.
I think I messed something up, maybe with the transformer, but I'm not completely sure; this seems like an exaggerated MSE. I took a fairly similar approach with tidymodels and got much better results.
So I wanted to know whether there is something wrong with the transformer, or whether the model is just this bad.
The reason is that you did not include C in the parameter grid, and you need to cover a whole range of C values to fit this data. If we fit with the default C = 1, you can see where the problem lies:
import matplotlib.pyplot as plt
o = pipe.named_steps["Transformer"].fit_transform(X_train)
mdl = SVR(C=1)
mdl.fit(o,y_train)
plt.scatter(mdl.predict(o),y_train)
There are some price values that are 10x the average (1e7 versus a median of 5e5). If you use MSE or R^2, these metrics will be heavily dominated by those extreme values. So we need to follow the data a bit more closely, and this is controlled by C, which you can read more about here. We try a range:
ct = ColumnTransformer(
    [("standardScaler", StandardScaler(), ["habs", "m2", "floor"]),
     ("onehot", OneHotEncoder(), ["type"])],
    remainder="passthrough")
pipe = Pipeline(steps=[("Transformer", ct),
                       ("svr", SVR())])
# kernels considered: 'linear', 'poly', 'rbf', 'sigmoid'
param_grid = {
    "svr__kernel": ['rbf'],
    "svr__gamma": ['auto'],
    "svr__coef0": [1, 2],
    "svr__C": [1e-03, 1e-01, 1e1, 1e3, 1e5, 1e7]
}
search = GridSearchCV(pipe, param_grid, scoring=['neg_mean_squared_error'],
                      refit='neg_mean_squared_error')
search.fit(X_train, y_train)
print(search.best_score_)
-132061065775.25969
Your y values are large, and MSE values will be on the scale of the variance of your y values, so let's check that:
y_train.var()
545423126823.4545
132061065775.25969 / y_train.var()
0.24212590057261346
That is pretty OK: the MSE is reduced to about 25% of the variance (MSE over the variance of y is roughly 1 - R^2, so this corresponds to an R^2 of about 0.76). We can check this with the test data, and I guess in this case it is quite lucky that the chosen C values work well:
from sklearn.metrics import mean_squared_error
o = pipe.named_steps["Transformer"].fit_transform(X_train)
mdl = SVR(C=10000000.0, coef0=1, gamma='auto')
mdl.fit(o, y_train)
# transform (not fit_transform) the test set with the transformer fitted on train
o_test = pipe.named_steps["Transformer"].transform(X_test)
pred = mdl.predict(o_test)
print(mean_squared_error(pred, y_test), mean_squared_error(pred, y_test) / y_test.var())
plt.scatter(mdl.predict(o_test), y_test)
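A simpler alternative, as a sketch assuming the fitted search from above: because refit is set, GridSearchCV keeps a best_estimator_ whose pipeline already applies the transformer fitted on the training data, so nothing has to be transformed by hand:
pred = search.best_estimator_.predict(X_test)
print(mean_squared_error(y_test, pred), mean_squared_error(y_test, pred) / y_test.var())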

How to combine SGD like SVM Kernel Approximation with Feature Selection on a Multiclass Dataset

I want to train on a relatively large dataset (200,000 rows and 400 columns) in a pipeline; only a weak notebook is available for the task.
The dataset has 15 target classes and mixed categorical and numerical features. An SVM-like algorithm should be chosen.
I have already tried to put some code together:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.multiclass import OneVsRestClassifier
X, y = make_classification(n_samples=200000, n_features=130, n_informative=105,
                           n_redundant=25, n_classes=15, n_clusters_per_class=15)
# add some categorical columns
X[:, :2] = np.abs(X[:, :2]).astype(int)
X = pd.DataFrame(X, columns=[f'F{i}' for i in range(X.shape[1])])
cols = X.columns.tolist()
y = LabelBinarizer().fit_transform(y)
#%% Transformation
full_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), cols[2:]),
    ('categorical', OneHotEncoder(categories='auto'), cols[:2])
])
# sparse matrix
X = full_pipeline.fit_transform(X)
# set up the estimator
rbf = RBFSampler(gamma=0.1, random_state=42)
semi_svm = SGDClassifier(loss="hinge", penalty="l2", max_iter=50)
clf_pipe = Pipeline([
    ('rbf', rbf),
    ('svm', semi_svm)
])
cv = StratifiedShuffleSplit(n_splits=5)
grid_search = RFECV(estimator=OneVsRestClassifier(clf_pipe), step=3, cv=cv,
                    scoring='accuracy', n_jobs=-1, verbose=10)
grid_search.fit(X, y)
ValueError: bad input shape (200000, 15)
How to handle the multiclass error in this case?
The following solution worked for me:
...
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

y = LabelEncoder().fit_transform(y)
...
rbf = RBFSampler(gamma=0.1, random_state=42)
semi_svm = OneVsOneClassifier(SGDClassifier(loss="hinge", penalty="l2", max_iter=5000))
selection = SelectKBest(k=1)
clf_pipe = Pipeline([
    ('rbf', rbf),
    ('features', selection),
    ('svm', semi_svm)
])
cv = StratifiedShuffleSplit(n_splits=5)
param_grid = dict(features__k=np.logspace(1, 6, num=5, base=2).round().astype(int),
                  rbf__gamma=[0.1, 1])
# 'f1_macro' instead of plain 'f1', since 'f1' only supports binary targets
grid_search = GridSearchCV(estimator=clf_pipe, cv=cv, param_grid=param_grid,
                           scoring='f1_macro', n_jobs=-1, verbose=10)
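A minimal usage sketch, assuming the transformed X and label-encoded y from above:
grid_search.fit(X, y)
print(grid_search.best_params_)
print(grid_search.best_score_)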
