How could I use t-SNE inside my pipeline?
Without a pipeline, I have managed to run t-SNE successfully and then train a classification algorithm on its output.
Do I need to write a custom method that can be called in the pipeline that returns a dataframe, or how does it work?
# How I used t-SNE
%%time
from sklearn.manifold import TSNE
# Standardise the raw features, then embed them with t-SNE (default settings).
X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
print(X_tsne.shape)

# One column name per embedding dimension: TSNE1, TSNE2, ...
feature_list = ["TSNE" + str(dim) for dim in range(1, X_tsne.shape[1] + 1)]
df_new = pd.DataFrame(X_tsne, columns=feature_list)
df_new['label'] = y

# Split the frame back into the embedded features and the target column.
X = df_new.drop(columns=['label'])
y = df_new['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Fit a random forest on the embedded training rows.
rfc = RandomForestClassifier()
rfc = rfc.fit(X_train, y_train)

# Predict the labels of the held-out rows.
y_pred = rfc.predict(X_test)
How I want to use it:
# How could I use TSNE() inside the pipeline?
%%time
# Scale -> t-SNE -> random forest, chained as one estimator.
steps = [
    ('standardscaler', StandardScaler()),
    ('tsne', TSNE()),
    ('rfc', RandomForestClassifier()),
]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Grid over the forest step only: depths 1..12 and both split criteria.
parameteres = {
    'rfc__max_depth': list(range(1, 13)),
    'rfc__criterion': ['gini', 'entropy'],
}
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)

# Scores of the refitted best estimator.
print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))
print(grid.best_params_)

# Classification metrics on the held-out split.
y_pred = grid.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precison:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
[OUT] TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'TSNE()' (type <class 'sklearn.manifold._t_sne.TSNE'>) doesn't
Should I build a custom transformer? If so, what should it look like?
# Draft of a custom pipeline step wrapping t-SNE (incomplete sketch, not runnable as written).
class TestTSNE(BaseEstimator, TransformerMixin):
def __init__(self):
# don't know
# NOTE(review): fit ignores its X argument and embeds the global
# dfListingsFeature_classification; X_tsne is only a local here, so nothing
# is stored on self for transform to use.
def fit(self, X, y = None):
X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
return self
# NOTE(review): `shelf` below is presumably a typo for `self`, and the bare
# X_tsne used a few lines later is not defined in this scope.
def transform(self, X, y = None):
feature_list = []
for i in range(1,shelf.X_tsne.shape[1]+1):
feature_list .append("TSNE" + str(i))
df_new = pd.DataFrame(X_tsne, columns= feature_list )
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']
# NOTE(review): returns an (X, y) tuple; a pipeline step is presumably
# expected to return only the transformed features - TODO confirm.
return X, y
...
steps = [('standardscaler', StandardScaler()),
('testTSNE', TestTSNE()),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
I think you misunderstood the use of pipeline. From help page:
Pipeline of transforms with a final estimator.
Sequentially apply a list of transforms and a final estimator.
Intermediate steps of the pipeline must be ‘transforms’, that is, they
must implement fit and transform methods. The final estimator only
needs to implement fit
So this means if your pipeline is:
steps = [('standardscaler', StandardScaler()),
('tsne', TSNE()),
('rfc', RandomForestClassifier())]
You are going to apply StandardScaler to your features first, then transform the result with t-SNE, before passing it to the classifier. I don't think it makes much sense to train on the t-SNE output.
If you really want to use a pipeline, you will need to store the result of t-SNE as an attribute during fit and have transform return the input features unchanged, so that the classifier can still work on them.
Something like
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
class TestTSNE(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that records a t-SNE embedding during ``fit``.

    ``fit`` runs t-SNE on the incoming data and keeps the embedding on
    ``X_tsne``; ``transform`` hands its input back untouched, so later
    pipeline steps still receive the original features.
    """

    def __init__(self, n_components, random_state=None, method='exact'):
        self.n_components = n_components
        self.random_state = random_state
        self.method = method

    def fit(self, X, y=None):
        """Embed ``X`` with t-SNE, store it on ``self.X_tsne`` and return ``self``."""
        tsne_settings = {
            'n_components': self.n_components,
            'method': self.method,
            'random_state': self.random_state,
        }
        embedder = TSNE(**tsne_settings)
        self.X_tsne = embedder.fit_transform(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` unchanged."""
        return X
Then:
steps = [('standardscaler', StandardScaler()),
('testTSNE', TestTSNE(2)),
('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X,y = make_classification()
pipeline.fit(X,y)
You can retrieve your tsne like this:
pd.DataFrame(pipeline.steps[1][1].X_tsne)
0 1
0 -38.756626 -4.693253
1 46.516308 53.633842
2 49.107910 16.482645
3 18.306377 9.432504
4 33.551056 -27.441383
.. ... ...
95 -31.337574 -16.913471
96 -57.918224 -39.959976
97 55.282658 37.582535
98 66.425125 19.717241
99 -50.692646 11.545088
Related
I am trying to scale my data within the crossvalidation folds of a MLENs Superlearner pipeline. When I use StandardScaler in the pipeline (as demonstrated below), I receive the following warning:
/miniconda3/envs/r_env/lib/python3.7/site-packages/mlens/parallel/_base_functions.py:226: MetricWarning: [pipeline-1.mlpclassifier.0.2] Could not score pipeline-1.mlpclassifier. Details:
ValueError("Classification metrics can't handle a mix of binary and continuous-multioutput targets")
(name, inst_name, exc), MetricWarning)
Of note, when I omit the StandardScaler() the warning disappears, but the data is not scaled.
breast_cancer_data = load_breast_cancer()
X = breast_cancer_data['data']
y = breast_cancer_data['target']
from sklearn.model_selection import train_test_split
X, X_val, y, y_val = train_test_split(X, y, test_size=.3, random_state=0)
from sklearn.base import BaseEstimator
class RFBasedFeatureSelector(BaseEstimator):
    """Select features by random-forest importance.

    ``fit`` trains a class-balanced random forest and wraps it in a prefit
    ``SelectFromModel`` with an importance threshold of 0.001; ``transform``
    keeps only the columns that selector retains.
    """

    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        self.selector = None

    def fit(self, X, y):
        """Train the forest on ``(X, y)`` and build the selector.

        Returns:
            ``self``, so that calls can be chained (``est.fit(X, y).transform(X)``)
            the way scikit-learn estimators allow.
        """
        clf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            random_state=RANDOM_STATE,
            class_weight='balanced',
        )
        clf.fit(X, y)
        self.selector = SelectFromModel(clf, prefit=True, threshold=0.001)
        return self

    def transform(self, X):
        """Return ``X`` reduced to the selected columns.

        Raises:
            AttributeError: if called before ``fit`` / ``fit_transform``.
        """
        if self.selector is None:
            raise AttributeError('The selector attribute has not been assigned. You cannot call transform before first calling fit or fit_transform.')
        return self.selector.transform(X)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        return self.fit(X, y).transform(X)
N_FOLDS = 5
RF_ESTIMATORS = 1000
N_ESTIMATORS = 1000
RANDOM_STATE = 42
from mlens.metrics import make_scorer
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
accuracy_scorer = make_scorer(balanced_accuracy_score, average='micro', greater_is_better=True)
from mlens.ensemble.super_learner import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
ensemble = SuperLearner(folds=N_FOLDS, shuffle=True, random_state=RANDOM_STATE, n_jobs=10, scorer=balanced_accuracy_score, backend="multiprocessing")
preprocessing1 = {'pipeline-1': [StandardScaler()]
}
preprocessing2 = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS)]
}
estimators = {'pipeline-1': [RandomForestClassifier(RF_ESTIMATORS, random_state=RANDOM_STATE, class_weight='balanced'),
MLPClassifier(hidden_layer_sizes=(10, 10, 10), activation='relu', solver='sgd',
max_iter=5000)
]
}
ensemble.add(estimators, preprocessing2, preprocessing1)
ensemble.add_meta(LogisticRegression(solver='liblinear', class_weight = 'balanced'))
ensemble.fit(X,y)
yhat = ensemble.predict(X_val)
balanced_accuracy_score(y_val, yhat)
>Error text: /miniconda3/envs/r_env/lib/python3.7/site-packages/mlens/parallel/_base_functions.py:226: MetricWarning: [pipeline-1.mlpclassifier.0.2] Could not score pipeline-1.mlpclassifier. Details:
ValueError("Classification metrics can't handle a mix of binary and continuous-multioutput targets")
(name, inst_name, exc), MetricWarning)
You are currently passing your preprocessing steps as two separate arguments when calling the add method.
You can instead combine them as follows:
preprocessing = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS),StandardScaler()]}
Please refer to the documentation for the add method found here:
https://mlens.readthedocs.io/en/0.1.x/source/mlens.ensemble.super_learner/
I am working on a Machine Learning project and I want to evaluate the accuracy of multiple algorithms. I am using this CSV and loading only the Date, Time and CO columns (I manually renamed them in the CSV). After I prepare my training data, I try to perform the evaluations, but I get:
ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.
The shapes for the vectors used for evaluations (X_train and Y_train) are:
(9357, 2)
(9357,)
The class:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
class Models:
    """Split a dataset and cross-validate a fixed set of classifiers."""

    test_size: float
    random_state: int

    def __init__(self, test_size: float = 0.20, random_state: int = 1) -> None:
        super().__init__()
        self.test_size = test_size
        self.random_state = random_state

    @staticmethod
    def init_models() -> list:
        """Return the ``(short_name, estimator)`` pairs to evaluate."""
        return [
            ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
            ('LDA', LinearDiscriminantAnalysis()),
            ('KNN', KNeighborsClassifier()),
            ('CART', DecisionTreeClassifier()),
            ('NB', GaussianNB()),
            ('SVM', SVC(gamma='auto')),
        ]

    def train(self, x, y):
        """Split ``x`` / ``y`` into train and validation parts.

        Returns:
            ``(x_train, x_validation, y_train, y_validation)``.
        """
        x_train, x_validation, y_train, y_validation = train_test_split(
            x, y, test_size=self.test_size, random_state=self.random_state
        )
        return x_train, x_validation, y_train, y_validation

    def evaluate(self, x_train, y_train, splits: int = 10, random_state: int = 1):
        """Cross-validate every model from ``init_models`` and print its accuracy.

        Returns:
            ``(names, results)``: the model names and, in the same order, the
            per-fold accuracy arrays.
        """
        # shuffle=True is required for random_state to have any effect;
        # recent scikit-learn raises if random_state is set without it.
        kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)
        results = []
        names = []
        for name, model in self.init_models():
            cv_results = cross_val_score(
                estimator=model, X=x_train, y=y_train, cv=kfold, scoring='accuracy'
            )
            results.append(cv_results)
            names.append(name)
            print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
        return names, results
And I am calling my class as:
models_helper = Models()
array = dataset.values
X = array[:, 1:3]
Y = array[:, 2]
prepared = models_helper.train(X, Y)
classification = models_helper.evaluate(prepared[0], prepared[2])
I avoided this problem by first calculating predicted values with cross_val_predict and then using the predicted values with y_test to get score with metrics.accuracy_score.
# Function that runs the requested algorithm and returns the accuracy metrics.
# Passing the sklearn model as an argument along with cv values and training data.
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit ``algo`` and report training and cross-validated accuracy.

    Returns:
        ``(train_pred, acc, acc_cv)``: the cross-validated predictions, the
        accuracy on the training data and the accuracy of the cross-validated
        predictions, both as percentages rounded to two decimals.
    """
    # Single pass: fit on everything and score on the same data.
    fitted = algo.fit(X_train, y_train)
    train_accuracy = fitted.score(X_train, y_train) * 100
    acc = round(train_accuracy, 2)

    # Cross-validated predictions for the same rows.
    train_pred = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )
    cv_accuracy = metrics.accuracy_score(y_train, train_pred) * 100
    acc_cv = round(cv_accuracy, 2)

    return train_pred, acc, acc_cv
I've got a dataset containing a lot of missing values (NAN). I want to use linear or multilinear regression in python and fill all the missing values. You can find the dataset here: Dataset
I have used f_regression(X_train, Y_train) to select which feature should I use.
First I converted df['country'] to dummy variables, then selected the important features and fitted a regression, but the results are not good.
I have defined following functions to select features and missing values:
def select_features(target, df):
    """Rank features against ``target`` with a univariate F-test.

    The frame is dummy-encoded, rows containing NaN are dropped, and
    ``f_regression`` is run on a 70 % training split. The 15 features with
    the smallest p-values are printed.

    Returns:
        The printed table: a DataFrame with rows ``f_values`` and
        ``p_values`` and one column per feature.
    """
    df_dummies = pd.get_dummies(df, prefix='', prefix_sep='', drop_first=True)
    df_nonan = df_dummies.dropna()
    X = pd.get_dummies(df_nonan.drop([target], axis=1))
    Y = df_nonan[target]

    # Only the training part is scored; the held-out part is not used here.
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.30, random_state=40)
    f, pval = f_regression(X_train, Y_train)

    # Ascending p-value: most significant features first.
    inds = np.argsort(pval)
    results = pd.DataFrame(
        np.vstack((f[inds], pval[inds])),
        columns=X_train.columns[inds],
        index=['f_values', 'p_values'],
    ).iloc[:, :15]
    print(results)
    return results
And I have defined following function to predict missing values.
def train(target, features, df, deg=1):
    """Fit a Lasso on ``features`` and predict the missing values of ``target``.

    Args:
        target: name of the column to impute.
        features: column names (after dummy encoding) used as predictors.
        df: the raw data frame.
        deg: degree of the polynomial feature expansion.

    Prints the R^2 score on the train, test and validation splits.

    Returns:
        ``(Y_new, X_names, index)``: the predicted values as a Series, the
        feature column names, and the index of the rows that were predicted
        (rows where ``target`` is NaN and every feature is present).
    """
    df_dummies = pd.get_dummies(df, prefix='', prefix_sep='', drop_first=True)
    df_nonan = df_dummies[[*features, target]].dropna()
    X = df_nonan.drop([target], axis=1)
    Y = df_nonan[target]
    X = X[features]
    X = pd.get_dummies(X)

    # 60 % train, 20 % test, 20 % validation.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.40, random_state=40)
    X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, test_size=0.50, random_state=40)

    # Fit the expansion on the training data only and reuse it everywhere else.
    pol = PolynomialFeatures(degree=deg)
    X_train_n = pol.fit_transform(X_train)
    reg = linear_model.Lasso()
    reg.fit(X_train_n, Y_train)

    print('train', r2_score(Y_train, reg.predict(X_train_n)))
    print('test', r2_score(Y_test, reg.predict(pol.transform(X_test))))
    print('val', r2_score(Y_val, reg.predict(pol.transform(X_val))))

    # Rows to impute: target missing, all features present. Selecting with
    # .loc on df_dummies keeps the boolean mask aligned with the frame it filters.
    X_names = X.columns.values
    target_missing = df_dummies[target].isna()
    X_new = df_dummies.loc[target_missing, X_names].dropna()
    if X_new.empty:
        # Nothing can be predicted; return an empty result instead of failing
        # inside the transformer on a zero-row input.
        return pd.Series([], index=X_new.index, dtype=float), X_names, X_new.index

    Y_new = pd.Series(reg.predict(pol.transform(X_new)), index=X_new.index)
    return Y_new, X_names, X_new.index
Then I am using these functions to fill nan for features with p_values<0.05.
But I am not sure whether this is a good approach.
With this approach, many missing values remain unpredicted.
I'm trying to make a heart disease prediction program using Naive Bayes. When I finished the classifier, the cross validation showed a mean accuracy of 80%. However, when I try to make a prediction on a given sample, the prediction is all wrong! The dataset is the heart disease dataset from the UCI repository; it contains 303 samples. There are two classes, 0: healthy and 1: ill. When I try making a prediction on a sample from the dataset, it doesn't predict its true value, except for very few samples. Here is the code:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
class Predict:
    """Heart-disease classifier built from separate, manually chained steps.

    ``PredictionMain`` scales only the train/test matrices; the ``sample`` it
    receives is handed to the classifier exactly as given.
    """

    def Read_Clean(self, dataset):
        """Load the CSV at ``dataset``, mean-impute '?' entries and cast to float."""
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        raw = pd.read_csv(dataset, names=header_row)
        raw = raw.replace('[?]', np.nan, regex=True)
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        df = pd.DataFrame(imputer.fit_transform(raw), columns=header_row)
        return df.astype(float)

    def Train_Test_Split_data(self, dataset):
        """Binarise 'Num' (> 0 becomes 1) and split 80/20 with a fixed seed."""
        Y = dataset['Num'].apply(lambda num: 1 if num > 0 else 0)
        X = dataset.drop('Num', axis=1)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.20, random_state=42
        )
        return X_train, X_test, Y_train, Y_test

    def Scaler(self, X_train, X_test):
        """Standardise both matrices with statistics fitted on ``X_train``."""
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(X_train)
        scaled_test = scaler.transform(X_test)
        return scaled_train, scaled_test

    def Cross_Validate(self, clf, X_train, Y_train, cv=5):
        """Print and return the mean F1 score over ``cv`` folds, plus all fold scores."""
        scores = cross_val_score(clf, X_train, Y_train, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))
        return score, scores

    def Fit_Score(self, clf, X_train, Y_train, X_test, Y_test, label='x'):
        """Fit ``clf``, print its train and test scores, return the test score."""
        clf.fit(X_train, Y_train)
        fit_score = clf.score(X_train, Y_train)
        pred_score = clf.score(X_test, Y_test)
        report = "%s: fit score %.5f, predict score %.5f" % (label, fit_score, pred_score)
        print(report)
        return pred_score

    def ReturnPredictionValue(self, clf, sample):
        """Return the prediction of ``clf`` for the single row ``sample``."""
        predictions = clf.predict([sample])
        return predictions[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        """Train a Gaussian naive Bayes model on the dataset and classify ``sample``."""
        data = self.Read_Clean(dataset_path)
        X_train, X_test, Y_train, Y_test = self.Train_Test_Split_data(data)
        X_train, X_test = self.Scaler(X_train, X_test)
        self.NB = GaussianNB()
        self.Fit_Score(self.NB, X_train, Y_train, X_test, Y_test, label='NB')
        self.Cross_Validate(self.NB, X_train, Y_train, 10)
        # The sample is passed through as-is (no scaling step is applied to it).
        return self.ReturnPredictionValue(self.NB, sample)
When I run:
if __name__ == '__main__':
    # One feature row from the dataset (every column except the 'Num' target).
    sample = [41.0, 0.0, 2.0, 130.0, 204.0, 0.0, 2.0, 172.0, 0.0, 1.4, 1.0, 0.0, 3.0]
    p = Predict()
    # print() call form: valid on Python 3 and prints the same text on Python 2.
    print("Prediction value: {}".format(p.PredictionMain(sample)))
The result is:
NB: fit score 0.84711, predict score 0.83607 CV scores mean: 0.8000
Prediction value: 1
I get 1 instead of 0 (this sample is already one of the dataset samples).
I did this for more than one sample from the dataset and I get wrong result most of the time, it's as if the accuracy is not 80%!
Any help would be appreciated.
Thanks in advance.
Edit:
Problem solved using Pipeline. The final code is:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
class Predict:
    """Heart-disease classifier: scaling and Gaussian naive Bayes in one Pipeline.

    Because the scaler is part of the fitted pipeline, it is applied to
    whatever is passed to ``predict``, including a single sample.
    """

    def __init__(self):
        self.X = []
        self.Y = []

    def Read_Clean(self, dataset):
        """Load the CSV at ``dataset``, mean-impute '?' entries and cast to float."""
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        df = pd.read_csv(dataset, names=header_row)
        df = df.replace('[?]', np.nan, regex=True)
        df = pd.DataFrame(Imputer(missing_values='NaN', strategy='mean', axis=0)
                          .fit_transform(df), columns=header_row)
        df = df.astype(float)
        return df

    def Split_Dataset(self, df):
        """Store the binarised 'Num' column on ``self.Y`` and the rest on ``self.X``."""
        self.Y = df['Num'].apply(lambda x: 1 if x > 0 else 0)
        self.X = df.drop('Num', axis=1)

    def Create_Pipeline(self):
        """Return an unfitted StandardScaler -> GaussianNB pipeline."""
        estimators = [
            ('standardize', StandardScaler()),
            ('bayes', GaussianNB()),
        ]
        return Pipeline(estimators)

    def Cross_Validate(self, clf, cv=5):
        """Print the mean F1 score of ``clf`` over ``cv`` folds of the stored data."""
        scores = cross_val_score(clf, self.X, self.Y, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))

    def Fit_Score(self, clf, label='x'):
        """Fit ``clf`` on the stored data and print its score on that same data."""
        clf.fit(self.X, self.Y)
        fit_score = clf.score(self.X, self.Y)
        print("%s: fit score %.5f" % (label, fit_score))

    def ReturnPredictionValue(self, clf, sample):
        """Return the prediction of ``clf`` for the single row ``sample``."""
        y = clf.predict([sample])
        return y[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        """Load the data, fit the pipeline and classify ``sample``."""
        # print() call form, consistent with the other methods of this class;
        # the bare print statement was a syntax error on Python 3.
        print("dataset: " + dataset_path)
        data = self.Read_Clean(dataset_path)
        self.Split_Dataset(data)
        self.model = self.Create_Pipeline()
        self.Fit_Score(self.model, label='NB')
        self.Cross_Validate(self.model, 10)
        return self.ReturnPredictionValue(self.model, sample)
Now making a prediction on the same sample in the question returns [0] which is the true value. Actually by running the following method:
def CheckTrue(self):
    """Count the samples whose cross-validated prediction equals the true label.

    Prints the count and returns it.
    """
    clf = self.Create_Pipeline()
    out = cross_val_predict(clf, self.X, self.Y)
    # Compare positionally over every sample, instead of looping over a
    # hard-coded 303 rows and looking labels up by index value.
    c = int(np.sum(np.asarray(out) == np.asarray(self.Y)))
    print("Samples with true values: {}".format(c))
    return c
I get 249 true samples using the pipeline code, whereas I got only 150 before.
You're not applying StandardScaler to the sample. The classifier expects scaled data because it was trained on the output of StandardScaler.transform, but the sample is not scaled the same way as the training data.
It is easy to make such mistakes when combining multiple steps (scaling, preprocessing, classification) manually. To avoid such issues it is a good idea to use scikit-learn Pipeline.
I would like to predict the probability from Logistic Regression model with cross-validation. I know you can get the cross-validation scores, but is it possible to return the values from predict_proba instead of the scores?
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import (StratifiedKFold, cross_val_score,
train_test_split)
from sklearn import datasets
# setup data
iris = datasets.load_iris()
X = iris.data
y = iris.target
# setup model
cv = StratifiedKFold(y, 10)
logreg = LogisticRegression()
# cross-validation scores
scores = cross_val_score(logreg, X, y, cv=cv)
# predict probabilities
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
logreg.fit(Xtrain, ytrain)
proba = logreg.predict_proba(Xtest)
This is now implemented as part of scikit-learn version 0.18. You can pass a 'method' string parameter to the cross_val_predict method. Documentation is here.
Example:
proba = cross_val_predict(logreg, X, y, cv=cv, method='predict_proba')
Also note that this is part of the new sklearn.model_selection package so you will need this import:
from sklearn.model_selection import cross_val_predict
An easy workaround for this is to create a wrapper class, which for your case would be
class proba_logreg(LogisticRegression):
    """LogisticRegression whose ``predict`` returns class probabilities."""

    def predict(self, X):
        """Return ``LogisticRegression.predict_proba`` for ``X``."""
        probabilities = LogisticRegression.predict_proba(self, X)
        return probabilities
and then pass an instance of it as the classifier object to cross_val_predict
# cross validation probabilities
probas = cross_val_predict(proba_logreg(), X, y, cv=cv)
There is a function cross_val_predict that gives you the predicted values, but there is no such function for "predict_proba" yet. Maybe we could make that an option.
This is easy to implement:
def _restore_sample_order(result, test_rows):
    """Reorder fold-stacked predictions so that row i belongs to sample i.

    ``test_rows`` holds the test indices of all folds, concatenated in the
    order the folds were processed. Arrays whose first dimension matches are
    reordered; a list/tuple result is handled element by element. Anything
    else, or folds that do not visit each sample exactly once, is returned
    unchanged.
    """
    order = np.argsort(test_rows, kind="stable")
    if not np.array_equal(test_rows[order], np.arange(len(test_rows))):
        return result

    def reorder(part):
        if isinstance(part, np.ndarray) and part.shape[:1] == order.shape:
            return part[order]
        return part

    if isinstance(result, list):
        return [reorder(part) for part in result]
    if isinstance(result, tuple):
        return tuple(reorder(part) for part in result)
    return reorder(result)


def my_cross_val_predict(
        m, X, y, cv=None,
        predict=lambda m, x: m.predict_proba(x),
        combine=np.vstack
):
    """Cross-validated predictions (``predict_proba`` by default).

    Args:
        m: estimator with ``fit`` and whatever ``predict`` calls.
        X: 2-D array of samples.
        y: 1-D array of targets.
        cv: splitter with ``split(X, y)``; defaults to a fresh ``KFold()``.
        predict: callable ``(model, X_test) -> predictions`` for one fold.
        combine: callable merging the per-fold predictions.

    Returns:
        The combined predictions, with rows in the same order as ``X``.
    """
    if cv is None:
        # Created per call rather than once in the signature.
        from sklearn.model_selection import KFold
        cv = KFold()

    preds = []
    test_rows = []
    # y is forwarded so that stratified splitters work as well.
    for train, test in cv.split(X, y):
        m.fit(X[train, :], y[train])
        preds.append(predict(m, X[test, :]))
        test_rows.append(np.asarray(test))

    # Folds need not be contiguous (shuffled or stratified splits), so undo
    # the fold ordering before returning.
    return _restore_sample_order(combine(preds), np.concatenate(test_rows))
This one returns predict_proba.
If you need both predict and predict_proba just change predict and combine arguments:
def stack(arrs):
    """Concatenate per-fold arrays: ``hstack`` for 1-D parts, ``vstack`` otherwise."""
    if arrs[0].ndim == 1:
        return np.hstack(arrs)
    else:
        return np.vstack(arrs)


def _restore_sample_order(result, test_rows):
    """Reorder fold-stacked predictions so that row i belongs to sample i.

    ``test_rows`` holds the test indices of all folds, concatenated in the
    order the folds were processed. Arrays whose first dimension matches are
    reordered; a list/tuple result is handled element by element. Anything
    else, or folds that do not visit each sample exactly once, is returned
    unchanged.
    """
    order = np.argsort(test_rows, kind="stable")
    if not np.array_equal(test_rows[order], np.arange(len(test_rows))):
        return result

    def reorder(part):
        if isinstance(part, np.ndarray) and part.shape[:1] == order.shape:
            return part[order]
        return part

    if isinstance(result, list):
        return [reorder(part) for part in result]
    if isinstance(result, tuple):
        return tuple(reorder(part) for part in result)
    return reorder(result)


def my_cross_val_predict(
        m, X, y, cv=None,
        predict=lambda m, x: [m.predict(x),
                              m.predict_proba(x)
                              ],
        combine=lambda preds: list(map(stack, zip(*preds)))
):
    """Cross-validated ``predict`` and ``predict_proba`` outputs.

    Args:
        m: estimator with ``fit`` and whatever ``predict`` calls.
        X: 2-D array of samples.
        y: 1-D array of targets.
        cv: splitter with ``split(X, y)``; defaults to a fresh ``KFold()``.
        predict: callable ``(model, X_test) -> list of outputs`` for one fold.
        combine: callable merging the per-fold outputs.

    Returns:
        With the defaults, ``[labels, probabilities]``, each with rows in the
        same order as ``X``.
    """
    if cv is None:
        # Created per call rather than once in the signature.
        from sklearn.model_selection import KFold
        cv = KFold()

    preds = []
    test_rows = []
    # y is forwarded so that stratified splitters work as well.
    for train, test in cv.split(X, y):
        m.fit(X[train, :], y[train])
        preds.append(predict(m, X[test, :]))
        test_rows.append(np.asarray(test))

    # Folds need not be contiguous (shuffled or stratified splits), so undo
    # the fold ordering before returning.
    return _restore_sample_order(combine(preds), np.concatenate(test_rows))