How to find most relevant variables with PCA dimension reduction? - python

I'm new to python coding, and working on a project but stuck at coding part.
I have one target variable and 23 relevant variables.
My dataset is 11(simples)*23(descriptors) and one target dataset 11(simples)*1(target variable).
How do I find the most relevant variables from these 23 descriptors with PCA dimensional reduction?
pca = PCA()
pca.fit(df2)
transformed = pca.transform(df2)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
steps = [('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)`
from sklearn.preprocessing import MinMaxScaler
steps = [('norm', MinMaxScaler()), ('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=11, n_features=23, n_informative=5, n_redundant=18, random_state=7)
print(X.shape, y.shape)
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=11, n_features=23, n_informative=5, n_redundant=18, random_state=7)
steps = [('pca', PCA(n_components=4)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
enter image description heredata
enter image description here
I'm supposed to get this image but I don't know how to do.

Related

Extract feature names after Pipeline usage with ColumnTransformer (sklearn)

I have the following toy code.
I use a pipeline to automatically normalize numerical variables and apply one-hot-encoding to the categorical ones.
I can get the coefficients of the linear regression model easily using pipe['logisticregression'].coef_ but how can I get all the feature names in the right order as this appearing in the coef matrix?
from sklearn.compose import ColumnTransformer
import numpy as np, pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# data from https://www.kaggle.com/datasets/uciml/adult-census-income
data = pd.read_csv("adult.csv")
data = data.iloc[0:3000,:]
target = "workclass"
y = data[target]
X = data.drop(columns=target)
numerical_columns_selector = make_column_selector(dtype_exclude=object)
categorical_columns_selector = make_column_selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)
ct = ColumnTransformer([ ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns) ,
('std', StandardScaler(), numerical_columns)])
model = LogisticRegression(max_iter=500)
pipe = make_pipeline(ct, model)
data_train, data_test, target_train, target_test = train_test_split(
X, y, random_state=42)
pipe.fit(data_train, target_train)
pipe['logisticregression'].coef_.shape

How to use Gridsearchcv to tune BaseEstimators within AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
# generate dataset
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.50], flip_y=0, random_state=4)
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'base_estimator__max_depth':[i for i in range(2,11,2)],
'base_estimator__min_samples_leaf':[1,2],
'n_estimators':[10,50,250,1000],
'learning_rate':[0.01,0.1]}
clf = GridSearchCV(abc, parameters,verbose=3,scoring='f1',n_jobs=-1)
clf.fit(X,y)
Here I am using GridsearchCV to tune AdaBoostClassifier as well as the base estimator which is
DecisionTreeClassifier. Can I use GridsearchCV to tune AdaBoostClassifier and more than one base estimator, (decisiontree and SVC,...) at the same time?**
Specify a list of parameter grids of every base estimator:
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
parameters = [
{'boost': [AdaBoostClassifier(base_estimator=DecisionTreeClassifier())],
'boost__base_estimator__max_depth': (None, 10),
'boost__learning_rate': (0.01, 0.1)},
{'boost': [AdaBoostClassifier(base_estimator=SVC())],
'boost__base_estimator__degree': (2, 3),
'boost__learning_rate': (0.01, 0.1)},
]
boost_pipe = Pipeline([('boost', DummyClassifier())])
grid = GridSearchCV(boost_pipe, parameters)

Scikit-learn SequentialFeatureSelector Input contains NaN, infinity or a value too large for dtype('float64'). even with pipeline

I'm trying to use SequentialFeatureSelector and for estimator parameter I'm passing it a pipeline that includes a step that inputes the missing values:
model = Pipeline(steps=[('preprocessing',
ColumnTransformer(transformers=[('pipeline-1',
Pipeline(steps=[('imputing',
SimpleImputer(fill_value=-1,
strategy='constant')),
('preprocessing',
StandardScaler())]),
<sklearn.compose._column_transformer.make_column_selector object at 0x1300013d0>),
('pipeline-2',
Pipeline(steps=[('imputing',
SimpleImputer(fill_value='missing',
strategy='constant')),
('encoding',
OrdinalEncoder(handle_unknown='ignore'))]),
<sklearn.compose._column_transformer.make_column_selector object at 0x1300015b0>)])),
('model',
LGBMClassifier(class_weight='balanced', random_state=1,
reg_lambda=0.1))])
Nonetheless when passing this to selector it shows an error, what does not make any sense since I have already fit and evaluated my model and it runs ok
fselector = SequentialFeatureSelector(estimator = model, scoring= "roc_auc", cv = 3, n_jobs= -1, ).fit(X, target)
_assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
EDIT:
Reproducible example:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
clf = Pipeline([("preprocessing", SimpleImputer(missing_values= np.NaN)),("model",LogisticRegression(random_state = 1))])
SequentialFeatureSelector(estimator = clf,
scoring= "accuracy",
cv = 3).fit(X, y)
It shows the same error, in spite of the clf can be fit without problems
ScikitLearn's documentation does not state that the SequentialFeatureSelector works with pipeline objects. It only states that the class accepts an unfitted estimator. In view of this, you could remove the classifier from your pipeline, preprocess X, and then pass it along with an unfitted classifier for feature selection as shown in the example below.
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
pipe = Pipeline([("preprocessing", SimpleImputer(missing_values= np.NaN)),
('scaler', MaxAbsScaler())])
# Preprocess your data
X = pipe.fit_transform(X)
# Run the SequentialFeatureSelector
sfs = SequentialFeatureSelector(estimator = LogisticRegression(),
scoring= "accuracy",
cv = 3).fit(X, y)
# Check which features are important and transform X
sfs.get_support()
X = sfs.transform(X)
You can use SequentialFeatureSelection from mlxtend package
https://rasbt.github.io/mlxtend/
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
clf = Pipeline([
("preprocessing", SimpleImputer(missing_values= np.NaN)),
("model",LogisticRegression(random_state = 1))
])
sfs = SequentialFeatureSelector(estimator = clf,
forward = True,
k_features = 'best',
scoring = "accuracy",
cv = 3, n_jobs=-1).fit(X, y)
sfs.k_feature_idx_
>>> (0, 1, 2, 3)

Calculating AUC for LogisticRegression model

Let's take data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
data = load_breast_cancer()
X = data.data
y = data.target
I want to create model using only first principal component and calculate AUC for it.
My work so far
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X)
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1'])
clf = LogisticRegression()
clf = clf.fit(principalDf, y)
pred = clf.predict_proba(principalDf)
But while I'm trying to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
Following error occurs :
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
But it didn't solve the issue (it outputs) :
multilabel-indicator format is not supported
Do you have any idea how can I perform AUC on this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
X,y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
scaler = StandardScaler()
pca = PCA(2)
clf = LogisticRegression()
ppl = Pipeline([("scaler",scaler),("pca",pca),("clf",clf)])
ppl.fit(X_train, y_train)
preds = ppl.predict(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)
metrics.plot_roc_curve(ppl, X_test, y_test)
The problem is that predict_proba returns a column for each class. Generally with binary classification, your classes are 0 and 1, so you want the probability of the second class, so it's quite common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]

sklearn.exceptions.NotFittedError: Estimator not fitted, call `fit` before exploiting the model

I tried Random Forests regression.
The code is given below.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
np.random.seed(0)
d1 = np.random.randint(2, size=(50, 10))
d2 = np.random.randint(3, size=(50, 10))
d3 = np.random.randint(4, size=(50, 10))
Y = np.random.randint(7, size=(50,))
X = np.column_stack([d1, d2, d3])
n_smples, n_feats = X.shape
print (n_smples, n_feats)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
regr = RandomForestRegressor(max_features=None,random_state=0)
pipe = make_pipeline(RFECV(estimator=regr, step=3, cv=kf, scoring =
'neg_mean_squared_error', n_jobs=-1),
GridSearchCV(regr, param_grid={'n_estimators': [100, 300]},
cv=kf, scoring = 'neg_mean_squared_error',
n_jobs=-1))
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)
rmse = mean_squared_error(Y, ypredicts)
print (rmse)
However, I got the following error:
sklearn.exceptions.NotFittedError: Estimator not fitted, call fit before exploiting the model.
I also tried:
model = pipe.fit(X,Y)
ypredicts = cross_val_predict(model, X, Y, cv=kf, n_jobs=-1)
But got the same error.
Edit 1:
I also tried:
pipe.fit(X,Y)
But got the same error.
In Python 2.7 (Sklearn 0.20), for the same code I got different error:
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
In Python 2.7 (Sklearn 0.20.3):
NotFittedError: Estimator not fitted, callfitbefore exploiting the model.
It seems that you are trying to select best parameters for your classifier by using grid search their is a another to do so. You are using pipelines but in this method I am not using pipeline but I am getting best parameters through random search.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
np.random.seed(0)
d1 = np.random.randint(2, size=(50, 10))
d2 = np.random.randint(3, size=(50, 10))
d3 = np.random.randint(4, size=(50, 10))
Y = np.random.randint(7, size=(50,))
X = np.column_stack([d1, d2, d3])
n_smples, n_feats = X.shape
print (n_smples, n_feats)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
regr = RandomForestRegressor(max_features=None,random_state=0)
n_iter_search = 20
random_search = RandomizedSearchCV(regr, param_distributions={'n_estimators': [100, 300]},
n_iter=20, cv=kf,verbose=1,return_train_score=True)
random_search.fit(X, Y)
ypredicts=random_search.predict(X)
rmse = mean_squared_error(Y, ypredicts)
print(rmse)
print(random_search.best_params_)
random_search.cv_results_
Try this piece of code. I hope this code is fullfilling your problem.
Instead of
model = pipe.fit(X,Y)
have you tried
pipe.fit(X,Y)
instead?
so that would be
pipe.fit(X,Y)
# change model to pipe
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)

Categories

Resources