How can I use yellowbrick on the output of non-scikit models - python

How can I use yellowbrick on the output of non-Scikit models?
I have a PyTorch multi-class classifier network and would like to use the ClassificationReport functionality on the results of applying this model to data. How can I do this?

If you use the skorch library which makes Pytorch models sci-kit learn compatible then you can use yellowbrick's Third party wrappers then you can possibly make your models work. Here is some example code
import numpy as np
from sklearn.datasets import make_classification
from torch import nn
from sklearn.model_selection import train_test_split
from skorch import NeuralNetClassifier
X, y = make_classification(1000, 20, n_informative=10, random_state=0)
X = X.astype(np.float32)
y = y.astype(np.int64)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
class MyModule(nn.Module):
def __init__(self, num_units=10, nonlin=nn.ReLU()):
super(MyModule, self).__init__()
self.dense0 = nn.Linear(20, num_units)
self.nonlin = nonlin
self.dropout = nn.Dropout(0.5)
self.dense1 = nn.Linear(num_units, num_units)
self.output = nn.Linear(num_units, 2)
self.softmax = nn.Softmax(dim=-1)
def forward(self, X, **kwargs):
X = self.nonlin(self.dense0(X))
X = self.dropout(X)
X = self.nonlin(self.dense1(X))
X = self.softmax(self.output(X))
return X
net = NeuralNetClassifier(
MyModule,
max_epochs=10,
lr=0.1,
# Shuffle training data on each epoch
iterator_train__shuffle=True,
)
# Import the wrap function and a Yellowbrick visualizer
from yellowbrick.contrib.wrapper import wrap
from yellowbrick.classifier import classification_report
# Instantiate the third party estimator and wrap it, optionally fitting it
model = wrap(net)
model.fit(X_train, y_train)
# Use the visualizer
oz = classification_report(model, X_train, y_train, X_test=X_test, y_test=y_test, support=True, is_fitted=True)

Related

TypeError: Cannot iterate over a Tensor with unknown first dimension

Getting the following error when I execute the below code:
TypeError: Cannot iterate over a Tensor with unknown first dimension.
How to solve this? The error is in the line output_gcn = gcn(input_layer)
I tried reshaping the input_layer, but it didnt work
What is the problem and how to solve it?
Please let me know the solution as early as possible, as I am doing something apart from learning and have deadlines to meet
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from stellargraph.data import UniformRandomWalk
#from stellargraph.layer import GCN
from stellargraph import StellarGraph
from tensorflow.keras import layers, Model, optimizers
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN
from stellargraph.layer import node2vec
from stellargraph import StellarGraph
#from stellargraph.draw import draw
#generator = PaddedGraphGenerator(graphs=graphs)`
pro_tweets = pprocess[0:10000]
labels = df_encoded[['label_mild', 'label_moderate', 'label_non-depressed',
'label_severe']]
np.array(labels)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(pro_tweets)
#print(vectors)
print(vectors.shape)
similarity_matrix = cosine_similarity(vectors)
adj_matrix = np.zeros(similarity_matrix.shape)
adj_matrix[similarity_matrix > 0] = similarity_matrix[similarity_matrix > 0]
#print(adj_matrix)
#print(adj_matrix.shape[0])
graph = StellarGraph(adj_matrix, node_features=vectors)
rw = UniformRandomWalk(graph)
walks = rw.run(nodes=list(range(adj_matrix.shape[0])), length=5, n=1)
gcn = GCN(layer_sizes=[32, 16], activations=["relu", "relu"], generator =
FullBatchNodeGenerator(graph, method="gcn"))
#input_layer = GCN.get_input_layer(graph)
input_layer = layers.Input(shape = (vectors.shape[1],), dtype="float32", name="input")
print(input_layer.shape)
print(input_layer)
#reshaped_input_layer = tf.reshape(input_layer, [vectors.shape[1],])
import tensorflow as tf
output_gcn = gcn(input_layer)
#input_layer = layers.Input(shape=(adj_matrix.shape[0],adj_matrix.shape[1]),
dtype="int32", name="input")
#output_layer = gcn(input_layer)
output_embedding = node2vec(output_dim=16)(output_gcn)
dense_layer = layers.Dense(16, activation="relu")(output_embedding)
output_layer = layers.Dense(4, activation="softmax")(dense_layer)
'''create the final dense layer
dense_layer = layers.Dense(16, activation="relu")(output_layer)
output_layer = layers.Dense(1, activation="sigmoid")(dense_layer)'''
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=optimizers.Adam(lr=0.01), loss="binary_crossentropy", metrics=
["acc"])
X_train, X_test, y_train, y_test = train_test_split(walks, labels, test_size=0.2,
random_state=42)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50)
test_predictions = model.predict(X_test)
test_predictions = np.round(test_predictions)
accuracy = (test_predictions == y_test).mean()
print("Accuracy: {:.4f}".format(accuracy))
train_predictions = model.predict(X_train)
train_predictions = np.round(train_predictions)
accuracy = (train_predictions == y_train).mean()
print("Accuracy: {:.4f}".format(accuracy))]

StackingCVClassifier pre-trained base models

I haven't been able to find any information on whether or not StackingCVClassifiers accept pre-trained models.
Probably not. StackedCVClassifiers and StackingClassifier currently take a list of base estimators, then apply fit and predict on them.
It's pretty straightforward to implement this though. The main idea behind stacking is to fit a "final model" using the predictions of earlier models.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
X, y = make_regression(n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y)
Here X_train is (750, 100) and X_test is (250, 100).
We'll emulate "pre-trained" three models fit on X_train, y_train and produce predictions using the training set and the test set:
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
# Emulate "pre-trained" models
models = [RidgeCV(), LassoCV(), KNeighborsRegressor(n_neighbors=5)]
X_train_new = np.zeros((X_train.shape[0], len(models))) # (750, 3)
X_test_new = np.zeros((X_test.shape[0], len(models))) # (250, 3)
for i, model in enumerate(models):
model.fit(X_train, y_train)
X_train_new[:, i] = model.predict(X_train)
X_test_new[:, i] = model.predict(X_test)
The final model is fit on X_train_new and can make predictions using (N, 3) matrices produced by our base models:
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor()
clf.fit(X_train_new, y_train)
clf.score(X_test_new, y_test)
# 0.9998247

how to predict multiple dependent columns from 1 independent column

is it possible to predict multiple dependent columns from independent columns?
Problem Statement: I have to predict 5 factors(cEXT, cNEU,cAGR, cCON, cOPN) on the basis of STATUS column, so input variable will be STATUS column only and target variables are (cEXT, cNEU,cAGR, cCON, cOPN).
here in the above data STATUS is an independent column and cEXT, cNEU,cAGR, cCON, cOPN are the dependent columns, how can I predict those?
# independent and dependent variable split
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
right now I am predicting only one column so repeating the same thing 5 times so I am creating 5 models for 5 target variables.
Code:
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
ct = ColumnTransformer([
('step1', TfidfVectorizer(), 'STATUS')
],remainder='drop')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, cohen_kappa_score
from sklearn import metrics
from sklearn.pipeline import Pipeline
# ##########
# RandomForest
# ##########
model = Pipeline([
('column_transformers', ct),
('model', RandomForestClassifier(criterion = 'gini', n_estimators=100, n_jobs = -1, class_weight = 'balanced', max_features = 'auto')),
])
# creating 5 models, can I create 1 model?
model_cEXT = model.fit(X_train, y_train['cEXT'])
model_cNEU = model.fit(X_train, y_train['cNEU'])
model_cAGR = model.fit(X_train, y_train['cAGR'])
model_cCON = model.fit(X_train, y_train['cCON'])
model_cOPN = model.fit(X_train, y_train['cOPN'])
You can use multioutput classifier from scikit-learn.
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
clf = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
clf.predict(X_test)
Reference:
Official document of MultiOutputClassifier
There is a library scikit-multilearn which is very good for these tasks. There are several ways to do multi-label classification such as PowerSet, ClassifierChain etc. These are very well covered in this library.
Below is a sample of how it will replace your current code.
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
# Rest of your code
==========================
# The new code
from skmultilearn.problem_transform import BinaryRelevance
from scipy.sparse import csr_matrix
classifier = BinaryRelevance(
classifier = RandomForestClassifier(criterion = 'gini', n_estimators=100, n_jobs = -1, class_weight = 'balanced', max_features = 'auto'),
require_dense = [False, True]
)
model = Pipeline([
('column_transformers', ct),
('classifier', classifier),
])
model.fit(X_train, y_train.values)
res = model.predict(X_test)
res = csr_matrix(res)
res.todense()
You can explore other methods here.
In TensorFlow you can do this using sigmoid activation and binaryCE loss on all the units. As below:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
tfidf_calculator = TextVectorization(
standardize = 'lower_and_strip_punctuation',
split = 'whitespace',
max_tokens = 100,
output_mode ='tf-idf',
pad_to_max_tokens=False)
tfidf_calculator.adapt(df['Status'].values)
tfids = tfidf_calculator(df['Status'])
X = tfids.numpy()
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
model = tf.keras.Sequential([
tf.keras.layers.InputLayer(input_shape=(100,)),
tf.keras.layers.Dense(10, activation='relu'),
tf.keras.layers.Dense(5, activation='sigmoid')
])
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy())
model.fit(X_train, y_train, epochs=20, batch_size=32)
The thing to take note of in TensorFlow is that you need a dense matrix as input. There might be a way to use sparse but I didn't find any.

Using StandardScaler as Preprocessor in Mlens Pipeline generates Classification Warning

I am trying to scale my data within the crossvalidation folds of a MLENs Superlearner pipeline. When I use StandardScaler in the pipeline (as demonstrated below), I receive the following warning:
/miniconda3/envs/r_env/lib/python3.7/site-packages/mlens/parallel/_base_functions.py:226: MetricWarning: [pipeline-1.mlpclassifier.0.2] Could not score pipeline-1.mlpclassifier. Details:
ValueError("Classification metrics can't handle a mix of binary and continuous-multioutput targets")
(name, inst_name, exc), MetricWarning)
Of note, when I omit the StandardScaler() the warning disappears, but the data is not scaled.
breast_cancer_data = load_breast_cancer()
X = breast_cancer_data['data']
y = breast_cancer_data['target']
from sklearn.model_selection import train_test_split
X, X_val, y, y_val = train_test_split(X, y, test_size=.3, random_state=0)
from sklearn.base import BaseEstimator
class RFBasedFeatureSelector(BaseEstimator):
def __init__(self, n_estimators):
self.n_estimators = n_estimators
self.selector = None
def fit(self, X, y):
clf = RandomForestClassifier(n_estimators=self.n_estimators, random_state = RANDOM_STATE, class_weight = 'balanced')
clf = clf.fit(X, y)
self.selector = SelectFromModel(clf, prefit=True, threshold = 0.001)
def transform(self, X):
if self.selector is None:
raise AttributeError('The selector attribute has not been assigned. You cannot call transform before first calling fit or fit_transform.')
return self.selector.transform(X)
def fit_transform(self, X, y):
self.fit(X, y)
return self.transform(X)
N_FOLDS = 5
RF_ESTIMATORS = 1000
N_ESTIMATORS = 1000
RANDOM_STATE = 42
from mlens.metrics import make_scorer
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
accuracy_scorer = make_scorer(balanced_accuracy_score, average='micro', greater_is_better=True)
from mlens.ensemble.super_learner import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
ensemble = SuperLearner(folds=N_FOLDS, shuffle=True, random_state=RANDOM_STATE, n_jobs=10, scorer=balanced_accuracy_score, backend="multiprocessing")
preprocessing1 = {'pipeline-1': [StandardScaler()]
}
preprocessing2 = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS)]
}
estimators = {'pipeline-1': [RandomForestClassifier(RF_ESTIMATORS, random_state=RANDOM_STATE, class_weight='balanced'),
MLPClassifier(hidden_layer_sizes=(10, 10, 10), activation='relu', solver='sgd',
max_iter=5000)
]
}
ensemble.add(estimators, preprocessing2, preprocessing1)
ensemble.add_meta(LogisticRegression(solver='liblinear', class_weight = 'balanced'))
ensemble.fit(X,y)
yhat = ensemble.predict(X_val)
balanced_accuracy_score(y_val, yhat)```
>Error text: /miniconda3/envs/r_env/lib/python3.7/site-packages/mlens/parallel/_base_functions.py:226: MetricWarning: [pipeline-1.mlpclassifier.0.2] Could not score pipeline-1.mlpclassifier. Details:
ValueError("Classification metrics can't handle a mix of binary and continuous-multioutput targets")
(name, inst_name, exc), MetricWarning)
You are currently passing your preprocessing steps as two separate arguments when calling the add method.
You can instead combine them as follows:
preprocessing = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS),StandardScaler()]}
Please refer to the documentation for the add method found here:
https://mlens.readthedocs.io/en/0.1.x/source/mlens.ensemble.super_learner/

How to do regression using tensorflow with series output?

I want to build a regression model with 2 output nodes using tensorflow. I search a code which can build regression model but with 1 output nodes.
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/skflow/boston.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.contrib import learn
def main(unused_argv):
# Load dataset
boston = learn.datasets.load_dataset('boston')
x, y = boston.data, boston.target
# Split dataset into train / test
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
x, y, test_size=0.2, random_state=42)
# Scale data (training set) to 0 mean and unit standard deviation.
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
# Build 2 layer fully connected DNN with 10, 10 units respectively.
feature_columns = learn.infer_real_valued_columns_from_input(x_train)
regressor = learn.DNNRegressor(
feature_columns=feature_columns, hidden_units=[10, 10])
# Fit
regressor.fit(x_train, y_train, steps=5000, batch_size=1)
# Predict and score
y_predicted = list(
regressor.predict(scaler.transform(x_test), as_iterable=True))
score = metrics.mean_squared_error(y_predicted, y_test)
print('MSE: {0:f}'.format(score))
if __name__ == '__main__':
tf.app.run()
I am new to tensorflow, so I searched for the code which has similarity to how mine works, but the output of the code is one.
In my model, the input is N*1000, and the output is N*2. I wonder are there effective and efficient code for regression. Please give me some example.
Actually, I find a workable code using DNNRegressor:
import numpy as np
from sklearn.cross_validation import train_test_split
from tensorflow.contrib import learn
import tensorflow as tf
import logging
#logging.getLogger().setLevel(logging.INFO)
#Some fake data
N=200
X=np.array(range(N),dtype=np.float32)/(N/10)
X=X[:,np.newaxis]
#Y=np.sin(X.squeeze())+np.random.normal(0, 0.5, N)
Y = np.zeros([N,2])
Y[:,0] = X.squeeze()
Y[:,1] = X.squeeze()**2
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
train_size=0.8,
test_size=0.2)
reg=learn.DNNRegressor(hidden_units=[10,10])
reg.fit(X_train,Y_train[:,0],steps=500)
But, this code will work only if the shape of Y_train is N*1, and it will fail when the shape of Y_train is N*2.
However, I want to build a regress model and the input is N*1000, the output is N*2. And I can't fix it.

Categories

Resources