Multi-class, multi-label, ordinal classification with sklearn - python

I was wondering how to run a multi-class, multi-label, ordinal classification with sklearn. I want to predict a ranking of target groups, ranging from the one that is most prevalent at a certain location (1) to the one that is least prevalent (7).
I don't seem to be able to get it right. Could you please help me out?
# Random Forest Classification
# Import
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
# Import dataset
dataset = pd.read_excel('alle_probs_edit.v2.xlsx')
X = dataset.iloc[:,4:-1].values
Y = dataset.iloc[:,-1].values
# Split in Train and Test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42 )
# Scaling the features (puts all variables on the same scale); whether this is necessary depends on the chosen method
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Create the classifier
classifier = RandomForestClassifier(criterion = 'entropy')
# Choose some parameter combinations to try
parameters = {'bootstrap': [True, False],
              'max_depth': [50],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 3, 4],
              'min_samples_split': [9, 10, 11, 12, 13],
              'n_estimators': [500, 1000, 1500]}
# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)
# Run the grid search
grid_obj = GridSearchCV(classifier, parameters, scoring=acc_scorer, cv = 3, n_jobs = -1)
grid_obj = grid_obj.fit(X_train, Y_train)
# Set the classifier to the best combination of parameters
classifier = grid_obj.best_estimator_
# Fit the best algorithm to the data
classifier.fit(X_train, Y_train)
# Predict the test data
Y_pred = classifier.predict(X_test)
#Confusion Matrix
cm = pd.DataFrame(confusion_matrix(Y_test, Y_pred))
#Accuracy
accuracy1 = accuracy_score(Y_test, Y_pred)
print("Accuracy1: %.2f%%" % (accuracy1 * 100.0))
# k-Fold Cross-Validation
accuracies = cross_val_score(estimator = classifier, X = X_train, y = Y_train, cv = 10)
print("CV accuracy: %.2f%% (+/- %.2f%%)" % (accuracies.mean() * 100, accuracies.std() * 100))

This may not be the precise answer you're looking for, but this article outlines a technique as follows:
We can take advantage of the ordered class value by transforming a k-class ordinal regression problem into k-1 binary classification problems: we convert an ordinal attribute A* with ordinal values V1, V2, V3, ..., Vk into k-1 binary attributes, one for each of the original attribute's first k-1 values. The i-th binary attribute represents the test A* > Vi.
Essentially, aggregate multiple binary classifiers (predict target > 1, target > 2, target > 3, target > 4) to be able to predict whether a target is 1, 2, 3, 4 or 5. The author creates an OrdinalClassifier class that stores multiple binary classifiers in a Python dictionary.
import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score

class OrdinalClassifier():

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0] - 1):
                # for each of the k-1 ordinal values fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf

    def predict_proba(self, X):
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:, 1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(clfs_predict[i - 1][:, 1] - clfs_predict[i][:, 1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[i - 1][:, 1])
        return np.vstack(predicted).T

    def predict(self, X):
        # returns the positional index into unique_class, not the label itself
        return np.argmax(self.predict_proba(X), axis=1)

    def score(self, X, y, sample_weight=None):
        # map labels to positions so they are comparable with predict's output
        _, indexed_y = np.unique(y, return_inverse=True)
        return accuracy_score(indexed_y, self.predict(X), sample_weight=sample_weight)
The technique originates in A Simple Approach to Ordinal Classification (Frank and Hall, 2001).
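For the asker's 1-to-7 rankings, a minimal usage sketch (assuming the class above and the X_train/Y_train split from the question; note that predict returns positions into unique_class rather than the labels themselves):
from sklearn.ensemble import RandomForestClassifier

oc = OrdinalClassifier(RandomForestClassifier(criterion='entropy'))
oc.fit(X_train, Y_train)
idx = oc.predict(X_test)        # positions 0..6
Y_pred = oc.unique_class[idx]   # map back to the original labels 1..7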

Here is an example using KNN that should be tuneable in an sklearn pipeline or grid search.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import check_classification_targets

class KNeighborsOrdinalClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_neighbors=5, *, weights='uniform',
                 algorithm='auto', leaf_size=30, p=2,
                 metric='minkowski', metric_params=None, n_jobs=None):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        check_classification_targets(y)

        self.clf_ = KNeighborsClassifier(**self.get_params())
        self.clfs_ = {}
        self.classes_ = np.sort(np.unique(y))
        if self.classes_.shape[0] > 2:
            for i in range(self.classes_.shape[0] - 1):
                # for each of the k-1 ordinal values fit a binary classification problem
                binary_y = (y > self.classes_[i]).astype(np.uint8)
                clf = clone(self.clf_)
                clf.fit(X, binary_y)
                self.clfs_[i] = clf
        return self

    def predict_proba(self, X):
        X = check_array(X)
        check_is_fitted(self, ['classes_', 'clf_', 'clfs_'])

        # key the fitted binary classifiers by position i, not by label value,
        # so this also works when the classes are not the integers 0..k-1
        clfs_predict = {i: self.clfs_[i].predict_proba(X) for i in self.clfs_}
        predicted = []
        for i, y in enumerate(self.classes_):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:, 1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(clfs_predict[i - 1][:, 1] - clfs_predict[i][:, 1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[i - 1][:, 1])
        return np.vstack(predicted).T

    def predict(self, X):
        X = check_array(X)
        check_is_fitted(self, ['classes_', 'clf_', 'clfs_'])
        # map the argmax position back to the actual class label
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
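Because the estimator above follows the scikit-learn conventions (parameters stored in __init__, fitted attributes with trailing underscores), it can be dropped into a pipeline and tuned; a hedged sketch, reusing X_train/Y_train from the question:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()),
                 ('ord', KNeighborsOrdinalClassifier())])
param_grid = {'ord__n_neighbors': [3, 5, 7, 11],
              'ord__weights': ['uniform', 'distance']}
grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1)
grid.fit(X_train, Y_train)
print(grid.best_params_, grid.best_score_)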

Building off David Diaz's answer, the white paper, and Kartik's answer above, along with others linked on Medium and attributed in the readme, I'm working on an OrdinalClassifier that is built on the sklearn framework and works well with sklearn pipelines, scoring, and cross-validation.
The OC performs very well vs. standard non-ordinal multiclass classification and gives greater control over optimizing for precision/recall on the positive class (i.e. "high" in, for example, the low < medium < high classes of diabetes disease progression). It supports any sklearn classifier that implements predict_proba. Cross-validation scores are shown in the repo.
OrdinalClassifier based on sklearn:
https://github.com/leeprevost/OrdinalClassifier
At this time, I would not call it multi-label.

How to use t-SNE inside the pipeline

How could I use t-SNE inside my pipeline?
I have managed, without pipelining, to run t-SNE successfully and then a classification algorithm on its output.
Do I need to write a custom class that can be called in the pipeline and returns a DataFrame, or how does this work?
# How I used t-SNE
%%time
from sklearn.manifold import TSNE

X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
print(X_tsne.shape)

feature_list = []
for i in range(1, X_tsne.shape[1] + 1):
    feature_list.append("TSNE" + str(i))

df_new = pd.DataFrame(X_tsne, columns=feature_list)
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

rfc = RandomForestClassifier()
# Train the Random Forest classifier
rfc = rfc.fit(X_train, y_train)
# Predict the response for the test dataset
y_pred = rfc.predict(X_test)
How I want to use it:
# How could I use TSNE() inside the pipeline?
%%time
steps = [('standardscaler', StandardScaler()),
         ('tsne', TSNE()),
         ('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

parameters = {'rfc__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
              'rfc__criterion': ['gini', 'entropy']}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)

print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))
print(grid.best_params_)

y_pred = grid.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
[OUT] TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'TSNE()' (type <class 'sklearn.manifold._t_sne.TSNE'>) doesn't
Should I build a custom transformer? If so, what should it look like?
class TestTSNE(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass  # don't know

    def fit(self, X, y=None):
        X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
        ts = TSNE()
        self.X_tsne = ts.fit_transform(X_std)
        return self

    def transform(self, X, y=None):
        feature_list = []
        for i in range(1, self.X_tsne.shape[1] + 1):
            feature_list.append("TSNE" + str(i))
        df_new = pd.DataFrame(self.X_tsne, columns=feature_list)
        df_new['label'] = y
        #df_new.head()
        X = df_new.drop(columns=['label'])
        y = df_new['label']
        return X, y

...

steps = [('standardscaler', StandardScaler()),
         ('testTSNE', TestTSNE()),
         ('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
I think you misunderstood the use of Pipeline. From the help page:
Pipeline of transforms with a final estimator.
Sequentially apply a list of transforms and a final estimator.
Intermediate steps of the pipeline must be 'transforms', that is, they must implement fit and transform methods. The final estimator only needs to implement fit.
So this means if your pipeline is:
steps = [('standardscaler', StandardScaler()),
('tsne', TSNE()),
('rfc', RandomForestClassifier())]
You are going to apply StandardScaler to your features first, then transform the result of this with t-SNE, before passing it to the classifier. I don't think it makes much sense to train on the t-SNE output.
If you really want to latch onto the pipeline, you will need to store the result of t-SNE as an attribute and return the features unchanged, so that the classifier can train on them.
Something like
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification

class TestTSNE(BaseEstimator, TransformerMixin):
    def __init__(self, n_components, random_state=None, method='exact'):
        self.n_components = n_components
        self.method = method
        self.random_state = random_state

    def fit(self, X, y=None):
        ts = TSNE(n_components=self.n_components,
                  method=self.method, random_state=self.random_state)
        self.X_tsne = ts.fit_transform(X)
        return self

    def transform(self, X, y=None):
        # pass the features through unchanged; the embedding is only stored
        return X
Then:
steps = [('standardscaler', StandardScaler()),
         ('testTSNE', TestTSNE(2)),
         ('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X, y = make_classification()
pipeline.fit(X, y)
You can retrieve your tsne like this:
pd.DataFrame(pipeline.steps[1][1].X_tsne)
0 1
0 -38.756626 -4.693253
1 46.516308 53.633842
2 49.107910 16.482645
3 18.306377 9.432504
4 33.551056 -27.441383
.. ... ...
95 -31.337574 -16.913471
96 -57.918224 -39.959976
97 55.282658 37.582535
98 66.425125 19.717241
99 -50.692646 11.545088
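As an aside, the root cause of the error is that scikit-learn's TSNE only implements fit_transform, not transform, because t-SNE has no out-of-sample mapping. If the goal is simply dimensionality reduction in front of the classifier, a reducer with a genuine transform, such as PCA, slots straight into the pipeline; a minimal sketch (my suggestion, not part of the answer above):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

pipe = Pipeline([('standardscaler', StandardScaler()),
                 ('pca', PCA(n_components=2)),
                 ('rfc', RandomForestClassifier())])
X, y = make_classification()
pipe.fit(X, y)  # PCA.transform is reused on unseen data, so CV works correctly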

Poor accuracy score for Semi-Supervised Support Vector Machine

I am using a semi-supervised approach with a Support Vector Machine in Python for image classification on the PASCAL VOC 2007 data.
I have tried the default parameters from the libraries and also tuned them, but I get an extremely bad accuracy of only about 2%.
Below is my code:
import pandas as pd
import numpy as np
from numpy import concatenate
from sklearn import datasets, decomposition, metrics, svm
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
color_layout_features = pd.read_pickle("color_layout_descriptor.pkl")
bow_surf = pd.read_pickle("bow_surf.pkl")
color_hist_features = pd.read_pickle("hist.pkl")
labels = pd.read_pickle("labels.pkl")
# Feature scaling
def scale(X, x_min, x_max):
    nom = (X - X.min(axis=0)) * (x_max - x_min)
    denom = X.max(axis=0) - X.min(axis=0)
    denom[denom == 0] = 1
    return x_min + nom / denom

# Normalization
def normalize(x):
    return (x - np.min(x)) / (np.max(x) - np.min(x))
color_layout_features_scaled = scale(color_layout_features, 0, 1)
color_hist_features_scaled = scale(color_hist_features, 0, 1)
bow_surf_scaled = scale(bow_surf, 0, 1)
features = np.hstack([color_layout_features_scaled, color_hist_features_scaled, bow_surf_scaled])
# define dataset
X, Y = features, labels
X = normalize(X)
pca = decomposition.PCA(n_components=100)
pca.fit(X)
X = pca.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1, stratify=Y)
# split train into labeled and unlabeled
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(X_train, y_train, test_size=0.30, random_state=1, stratify=y_train)
# create the training dataset input
X_train_mixed = concatenate((X_train_lab, X_test_unlab))
# create "no label" for unlabeled data
nolabel = [-1 for _ in range(len(y_test_unlab))]
# recombine training dataset labels
y_train_mixed = concatenate((y_train_lab, nolabel))
from semisupervised import S3VM
model = S3VM(kernel="Linear", C = 1e-2, gamma = 0.5, lamU = 1.0, probability=True)
model.fit(np.vstack((X_train_lab, X_test_unlab)), np.append(y_train_lab, nolabel))
# predict
predict = model.predict(X_test)
acc = metrics.accuracy_score(y_test, predict)
# metric
print("accuracy", acc*100)
accuracy 2.6692291266282298
I am using a Transductive SVM (TSVM) from the semisupervised library, but I am not sure what I am doing wrong, since even after tweaking the parameters I still get the same result. I referred to https://github.com/rosefun/SemiSupervised/blob/master/semisupervised/TSVM.py for the implementation. Any inputs would be helpful.
Please consider that, according to the linked documentation, "The unlabeled samples should be labeled as -1".
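If the third-party library keeps misbehaving, scikit-learn's own sklearn.semi_supervised module uses the same -1 convention for unlabeled rows; a hedged sketch with SelfTrainingClassifier around an SVC (an alternative to the S3VM used above, reusing the question's variables):
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC

# y_train_mixed already marks the unlabeled rows with -1
base = SVC(kernel='linear', C=1e-2, probability=True)
model = SelfTrainingClassifier(base)
model.fit(X_train_mixed, y_train_mixed)
pred = model.predict(X_test)
print("accuracy", metrics.accuracy_score(y_test, pred) * 100)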

How to use instance weights in a custom AdaBoost implementation

In Aurélien Géron's Hands-On Machine Learning with Scikit-Learn and TensorFlow, the AdaBoost equations are described in detail, except for how to use the instance weights when training.
Below is my code using sklearn's DecisionTreeClassifier; I guess sample_weight in fit() should take the instance weights W, but accuracy is unstable when changing n_estimators. What is wrong with the code?
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# implement AdaBoost classification; TBD: score is weird depending on n_estimators and max_depth
eta = 0.5          # learning_rate
n_estimators = 10  # simple start

# initialisation
clfs = [DecisionTreeClassifier(max_depth=1),] * n_estimators  # predictors
W = np.ones((X_train.shape[0])) / X_train.shape[0]  # instance weights
R = np.zeros(n_estimators)      # weighted error rate of predictors
Alpha = np.zeros(n_estimators)  # predictor weights

# build the trees
for j in range(n_estimators):
    clf = clfs[j]
    plt.plot(W)
    clf.fit(X_train, y_train, sample_weight=W)
    y_pred_train = clf.predict(X_train)
    # Equation 7-1: weighted error rate
    R[j] = W[y_pred_train != y_train].sum() / W.sum()
    # Equation 7-2: predictor weight
    Alpha[j] = eta * np.log((1 - R[j]) / R[j])
    # Equation 7-3: update the instance weights
    W[y_pred_train != y_train] *= np.exp(Alpha[j])
    # normalize
    W /= W.sum()

# predict
K = np.zeros((y_test.shape[0], n_estimators), dtype=np.int32)
for j in range(n_estimators):
    K[:, j] = clfs[j].predict(X_test)

# find the class k with the largest sum of Alpha
V = np.zeros((y_test.shape[0], 2))
for i in range(y_test.shape[0]):
    for j in range(n_estimators):
        if K[i, j] == y_test[i]:
            V[i, y_test[i]] += Alpha[j]
y_pred = np.argmax(V, axis=1)
print(accuracy_score(y_test, y_pred))
plt.legend(range(n_estimators))
plt.show()
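Two bugs worth flagging in the snippet above, independent of the AdaBoost equations themselves. First, [DecisionTreeClassifier(max_depth=1),] * n_estimators creates a list of ten references to the same tree object, so each round's fit overwrites the previous round and the prediction loop queries ten copies of the last stump. Second, the voting loop adds Alpha[j] at V[i, y_test[i]], i.e. it credits the ground-truth label; a vote should credit the class the stump actually predicted. A sketch of both fixes:
# one independent stump per boosting round
clfs = [DecisionTreeClassifier(max_depth=1) for _ in range(n_estimators)]

# ... and at prediction time, credit each stump's *predicted* class
V = np.zeros((y_test.shape[0], 2))
for i in range(y_test.shape[0]):
    for j in range(n_estimators):
        V[i, K[i, j]] += Alpha[j]
y_pred = np.argmax(V, axis=1)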

Custom 'Precision at k' scoring object in sklearn for GridSearchCV

I am currently trying to tune hyperparameters using GridSearchCV in scikit-learn with a 'Precision at k' scoring metric, which gives me the precision achieved when I classify the top k-th percentile of my classifier's scores as the positive class. I know it is possible to create a custom scorer using make_scorer and a score function. This is what I have now:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

def precision_at_k(y_true, y_score, k):
    df = pd.DataFrame({'true': y_true, 'score': y_score}).sort('score')
    threshold = df.iloc[int(k * len(df)), 1]
    y_pred = pd.Series([1 if i >= threshold else 0 for i in df['score']])
    return metrics.precision_score(y_true, y_pred)

custom_scorer = metrics.make_scorer(precision_at_k, needs_proba=True, k=0.1)
X = np.random.randn(100, 10)
Y = np.random.binomial(1, 0.3, 100)
train_index = range(0, 70)
test_index = range(70, 100)
train_x = X[train_index]
train_Y = Y[train_index]
test_x = X[test_index]
test_Y = Y[test_index]
clf = LogisticRegression()
params = {'C': [0.01, 0.1, 1, 10]}
clf_gs = GridSearchCV(clf, params, scoring=custom_scorer)
clf_gs.fit(train_x, train_Y)
However, attempting to call fit gives me Exception: Data must be 1-dimensional and I'm not sure why. Can anyone help? Thanks in advance.
The arguments to pd.DataFrame should be lists, not numpy arrays.
So, just try converting y_true and y_score to Python lists...
df = pd.DataFrame({'true': y_true.tolist(), 'score': y_score.tolist()}).sort('score')
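For anyone running this on current versions: sklearn.grid_search was removed in favor of sklearn.model_selection, DataFrame.sort in favor of sort_values, and on scikit-learn 1.4+ make_scorer wants response_method='predict_proba' instead of needs_proba=True. So the same scorer today would look roughly like this (an untested modernized sketch; it also selects the 'score' column by name and compares against the sorted labels, avoiding the original's positional-column and ordering pitfalls):
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

def precision_at_k(y_true, y_score, k):
    # rank by score and threshold at the k-th quantile of the ranking
    df = pd.DataFrame({'true': list(y_true), 'score': list(y_score)}).sort_values('score')
    threshold = df['score'].iloc[int(k * len(df))]
    y_pred = (df['score'] >= threshold).astype(int)
    return metrics.precision_score(df['true'], y_pred)

custom_scorer = metrics.make_scorer(precision_at_k, needs_proba=True, k=0.1)
clf_gs = GridSearchCV(LogisticRegression(), {'C': [0.01, 0.1, 1, 10]},
                      scoring=custom_scorer)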

predict_proba for a cross-validated model

I would like to predict the probability from Logistic Regression model with cross-validation. I know you can get the cross-validation scores, but is it possible to return the values from predict_proba instead of the scores?
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import (StratifiedKFold, cross_val_score,
                                      train_test_split)
from sklearn import datasets
# setup data
iris = datasets.load_iris()
X = iris.data
y = iris.target
# setup model
cv = StratifiedKFold(y, 10)
logreg = LogisticRegression()
# cross-validation scores
scores = cross_val_score(logreg, X, y, cv=cv)
# predict probabilities
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
logreg.fit(Xtrain, ytrain)
proba = logreg.predict_proba(Xtest)
This is now implemented as part of scikit-learn version 0.18. You can pass a 'method' string parameter to the cross_val_predict function; see the cross_val_predict documentation.
Example:
proba = cross_val_predict(logreg, X, y, cv=cv, method='predict_proba')
Also note that this is part of the new sklearn.model_selection package so you will need this import:
from sklearn.model_selection import cross_val_predict
An easy workaround for this is to create a wrapper class, which for your case would be
class proba_logreg(LogisticRegression):
    def predict(self, X):
        return LogisticRegression.predict_proba(self, X)
and then pass an instance of it as the classifier object to cross_val_predict
# cross validation probabilities
probas = cross_val_predict(proba_logreg(), X, y, cv=cv)
There is a function cross_val_predict that gives you the predicted values, but there is no such function for "predict_proba" yet. Maybe we could make that an option.
This is easy to implement:
import numpy as np
from sklearn.model_selection import KFold

def my_cross_val_predict(
        m, X, y, cv=KFold(),
        predict=lambda m, x: m.predict_proba(x),
        combine=np.vstack
):
    preds = []
    for train, test in cv.split(X):
        m.fit(X[train, :], y[train])
        pred = predict(m, X[test, :])
        preds.append(pred)
    return combine(preds)
This one returns predict_proba.
If you need both predict and predict_proba just change predict and combine arguments:
def stack(arrs):
    if arrs[0].ndim == 1:
        return np.hstack(arrs)
    else:
        return np.vstack(arrs)

def my_cross_val_predict(
        m, X, y, cv=KFold(),
        predict=lambda m, x: [m.predict(x),
                              m.predict_proba(x)],
        combine=lambda preds: list(map(stack, zip(*preds)))
):
    preds = []
    for train, test in cv.split(X):
        m.fit(X[train, :], y[train])
        pred = predict(m, X[test, :])
        preds.append(pred)
    return combine(preds)
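A quick usage sketch of the first variant on the question's iris data (assuming the newer sklearn.model_selection imports; note that with shuffle=True the stacked rows come back in fold order, not the original sample order):
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import datasets

iris = datasets.load_iris()
probas = my_cross_val_predict(LogisticRegression(max_iter=1000),
                              iris.data, iris.target,
                              cv=KFold(n_splits=5, shuffle=True, random_state=0))
print(probas.shape)  # (150, 3): one held-out probability row per sample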
