Stratified train/val/test split in Pytorch - python

I have an image classification dataset with 6 categories that I'm loading using the torchvision ImageFolder class. I have written the below to split the dataset into 3 sets in a stratified manner:
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
train_indices, test_indices, _, _ = train_test_split
(
range(len(dataset)),
dataset.targets,
stratify=dataset.targets,
test_size=0.1,
random_state=1
)
train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)
train_targets = [label for _, label in train_dataset]
train_indices, val_indices, _, _ = train_test_split
(
range(len(train_dataset)),
train_targets,
stratify=train_targets,
test_size=0.111,
random_state=1
)
train_dataset = Subset(dataset, train_indices)
This correctly splits the data into a 90/10/10% split. All the classes follow the same distribution between all three sets except for the 6th class. All samples belonging to the 6th class end up in the test set.
What am I doing wrong?
Thanks.

Related

MLServer GRPC cannot reshape array of size 347 into shape (1,)

I am following the example to serve sklearn model https://github.com/SeldonIO/MLServer/blob/master/docs/examples/sklearn/README.md
I am able to train and genreate the model, and then do a REST call for the inference successfully. However, I am trying to craft a gRPC call now, and the only example I could find is this https://mlserver.readthedocs.io/en/latest/examples/custom-json/README.html?highlight=grpc#send-test-inference-request-grpc
However, this is using another model. SO I try to follow this example but replace it with inference request data from my current infer.py, please see infer-grpc.py below.
train.py
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
# The digits dataset
digits = datasets.load_digits()
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
data, digits.target, test_size=0.5, shuffle=False)
# We learn the digits on the first half of the digits
classifier.fit(X_train, y_train)
import joblib
model_file_name = "mnist-svm.joblib"
joblib.dump(classifier, model_file_name)
infer.py (http)
import requests
# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
# The digits dataset
digits = datasets.load_digits()
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
data, digits.target, test_size=0.5, shuffle=False)
# We learn the digits on the first half of the digits
classifier.fit(X_train, y_train)
x_0 = X_test[0:1]
inference_request = {
"inputs": [
{
"name": "predict",
"shape": x_0.shape,
"datatype": "FP32",
"data": x_0.tolist()
}
]
}
endpoint = "http://localhost:8089/v2/models/mnist-svm/versions/v0.1.0/infer"
response = requests.post(endpoint, json=inference_request)
print(response.json())
infer-grpc.py
import mlserver.types
import requests
import json
import grpc
import mlserver.grpc.converters as converters
import mlserver.grpc.dataplane_pb2_grpc as dataplane
import mlserver.types as types
import requests
# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
# The digits dataset
digits = datasets.load_digits()
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
data, digits.target, test_size=0.5, shuffle=False)
# We learn the digits on the first half of the digits
classifier.fit(X_train, y_train)
x_0 = X_test[0:1]
model_name = "mnist-svm"
ip = {"input": x_0.tolist()}
inputs_bytes = json.dumps(x_0.tolist()).encode("UTF-8")
print([len(inputs_bytes)])
print(inputs_bytes)
inference_request = types.InferenceRequest(
inputs=[
types.RequestInput(
name="predict",
shape=[len(inputs_bytes)],
datatype="BYTES",
data=[inputs_bytes],
)
]
)
inference_request_g = converters.ModelInferRequestConverter.from_types(
inference_request,
model_name=model_name,
model_version=None
)
grpc_channel = grpc.insecure_channel("localhost:8081")
grpc_stub = dataplane.GRPCInferenceServiceStub(grpc_channel)
response = grpc_stub.ModelInfer(inference_request_g)
response
throws an error:
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "Unexpected <class 'ValueError'>: cannot reshape array of size 347 into shape (1,)"
The V2 Inference Protocol lets you send tensors directly. Therefore, since you are trying to send a Numpy tensor, you should be able to encode it as (i.e. without the intermediate json.dumps step):
inference_request = types.InferenceRequest(
inputs=[
types.RequestInput(
name="predict",
shape=x_0.shape,
datatype="BYTES",
data=x_0.tolist()
)
]
)
Note that, since MLServer 1.1.0, you can also use codecs to encode your payloads. Therefore, you should also be able to do the following:
from mlserver.codecs import NumpyRequestCodec
inference_request = NumpyRequestCodec.encode_request(x_0)

how to predict multiple dependent columns from 1 independent column

is it possible to predict multiple dependent columns from independent columns?
Problem Statement: I have to predict 5 factors(cEXT, cNEU,cAGR, cCON, cOPN) on the basis of STATUS column, so input variable will be STATUS column only and target variables are (cEXT, cNEU,cAGR, cCON, cOPN).
here in the above data STATUS is an independent column and cEXT, cNEU,cAGR, cCON, cOPN are the dependent columns, how can I predict those?
# independent and dependent variable split
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
right now I am predicting only one column so repeating the same thing 5 times so I am creating 5 models for 5 target variables.
Code:
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
ct = ColumnTransformer([
('step1', TfidfVectorizer(), 'STATUS')
],remainder='drop')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, cohen_kappa_score
from sklearn import metrics
from sklearn.pipeline import Pipeline
# ##########
# RandomForest
# ##########
model = Pipeline([
('column_transformers', ct),
('model', RandomForestClassifier(criterion = 'gini', n_estimators=100, n_jobs = -1, class_weight = 'balanced', max_features = 'auto')),
])
# creating 5 models, can I create 1 model?
model_cEXT = model.fit(X_train, y_train['cEXT'])
model_cNEU = model.fit(X_train, y_train['cNEU'])
model_cAGR = model.fit(X_train, y_train['cAGR'])
model_cCON = model.fit(X_train, y_train['cCON'])
model_cOPN = model.fit(X_train, y_train['cOPN'])
You can use multioutput classifier from scikit-learn.
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
clf = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
clf.predict(X_test)
Reference:
Official document of MultiOutputClassifier
There is a library scikit-multilearn which is very good for these tasks. There are several ways to do multi-label classification such as PowerSet, ClassifierChain etc. These are very well covered in this library.
Below is a sample of how it will replace your current code.
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
# Rest of your code
==========================
# The new code
from skmultilearn.problem_transform import BinaryRelevance
from scipy.sparse import csr_matrix
classifier = BinaryRelevance(
classifier = RandomForestClassifier(criterion = 'gini', n_estimators=100, n_jobs = -1, class_weight = 'balanced', max_features = 'auto'),
require_dense = [False, True]
)
model = Pipeline([
('column_transformers', ct),
('classifier', classifier),
])
model.fit(X_train, y_train.values)
res = model.predict(X_test)
res = csr_matrix(res)
res.todense()
You can explore other methods here.
In TensorFlow you can do this using sigmoid activation and binaryCE loss on all the units. As below:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
tfidf_calculator = TextVectorization(
standardize = 'lower_and_strip_punctuation',
split = 'whitespace',
max_tokens = 100,
output_mode ='tf-idf',
pad_to_max_tokens=False)
tfidf_calculator.adapt(df['Status'].values)
tfids = tfidf_calculator(df['Status'])
X = tfids.numpy()
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
model = tf.keras.Sequential([
tf.keras.layers.InputLayer(input_shape=(100,)),
tf.keras.layers.Dense(10, activation='relu'),
tf.keras.layers.Dense(5, activation='sigmoid')
])
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy())
model.fit(X_train, y_train, epochs=20, batch_size=32)
The thing to take note of in TensorFlow is that you need a dense matrix as input. There might be a way to use sparse but I didn't find any.

how to use cross-validation with ktrain?

I am using the ktrain package to perform multiclass text classification. The example on the official ktrain website works great (https://github.com/amaiya/ktrain)
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test_b = fetch_20newsgroups(subset='test',categories=categories, shuffle=True)
(x_train, y_train) = (train_b.data, train_b.target)
(x_test, y_test) = (test_b.data, test_b.target)
# build, train, and validate model (Transformer is wrapper around transformers library)
import ktrain
from ktrain import text
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(5e-5, 4)
learner.validate(class_names=t.get_classes())
Accuracy is pretty high.
However, I am comparing this model with other models trained with scikit-learn and, in particular, the other models' accuracy is assessed using cross validation
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
How can I adapt the code above to make sure the transformer model used with ktrain is also evaluated with the same cross validation methodology?
You can try something like this:
from ktrain import text
import ktrain
import pandas as pd
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
# load text data
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test_b = fetch_20newsgroups(subset='test',categories=categories, shuffle=True)
(x_train, y_train) = (train_b.data, train_b.target)
(x_test, y_test) = (test_b.data, test_b.target)
df = pd.DataFrame({'text':x_train, 'target': [train_b.target_names[y] for y in y_train]})
# CV with transformers
N_FOLDS = 2
EPOCHS = 3
LR = 5e-5
def transformer_cv(MODEL_NAME):
predictions,accs=[],[]
data = df[['text', 'target']]
for train_index, val_index in KFold(N_FOLDS).split(data):
preproc = text.Transformer(MODEL_NAME, maxlen=500)
train,val=data.iloc[train_index],data.iloc[val_index]
x_train=train.text.values
x_val=val.text.values
y_train=train.target.values
y_val=val.target.values
trn = preproc.preprocess_train(x_train, y_train)
model = preproc.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, batch_size=16)
learner.fit_onecycle(LR, EPOCHS)
predictor = ktrain.get_predictor(learner.model, preproc)
pred=predictor.predict(x_val)
acc=accuracy_score(y_val,pred)
print('acc',acc)
accs.append(acc)
return accs
print( transformer_cv('distilbert-base-uncased') )
# output:
# [0.9627989371124889, 0.9689716312056738]
REFERENCE: See this Kaggle notebook for a regression example.

How to split test and train data in a dataset based on number of targets in each category

I have an imageFolder in PyTorch which holds my categorized data images. Each folder is the name of the category and in the folder are images of that category.
I've loaded data and split train and test data via a sampler with random train_test_split. But the problem is my data distribution isn't good and some classes have lots of images and some classes have fewer.
So for solving this problem I want to choose 20% of each class as my test data and the rest would be the train data
ds = ImageFolder(filePath, transform=transform)
batch_size = 64
validation_split = 0.2
indices = list(range(len(ds))) # indices of the dataset
# TODO: fix spliting
train_indices,test_indices = train_test_split(indices,test_size=0.2)
# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
train_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, sampler=train_sampler, num_workers=16)
test_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, sampler=test_sampler, num_workers=16)
Any idea of how I should fix it?
Use the stratify argument in train_test_split according to the docs. If your label indices is an array-like called y, do:
train_indices,test_indices = train_test_split(indices, test_size=0.2, stratify=y)
Try using StratifiedKFold or StratifiedShuffleSplit.
According to the docs:
This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.
In your case you can try:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
for train_index, test_index in sss.split(ds):
train = torch.utils.data.Subset(dataset, train_index)
test = torch.utils.data.Subset(dataset, test_index)
trainloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)
testloader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)

Get panda Series from csv

I am totally new to machine learning, I am currently playing with MNIST machine learning, using RandomForestClassifier.
I use sklearn and panda.
I have a training CSV data set.
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
train = pd.read_csv("train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
user_train = pd.read_csv("input.csv")
user_features = user_train.columns[1:]
y_train = user_train[user_features]
user_y = user_train['label']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X/255.,y,test_size=1,random_state=0)
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("pred : ", y_pred_rf)
print("random forest accuracy: ",acc_rf)
I have the current code, which works well. It takes the training set, split and take one element for testing, and does the prediction.
What I want now is to use the testing data from an input, I have a new csv called "input.csv", and I want to predict the value inside this csv.
How can I replace the model_selection.train_test_split with my input data ?
I am sure the response is very obvious, and I didn't find anything.
The following part of your code is unused
user_train = pd.read_csv("input.csv")
user_features = user_train.columns[1:]
y_train = user_train[user_features]
user_y = user_train['label']
If input.csv has the same structure of train.csv you may want to:
train a classifier and test it on a split of the input.csv dataset: (please refer to http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html to know how to set the test size)
input_train = pd.read_csv("input.csv")
input_features = user_train.columns[1:]
input_data = user_train[input_features]
input_labels = user_train['label']
data_train, data_test, labels_train, labels_test = model_selection.train_test_split(input_data/255.,input_labels,test_size=1,random_state=0)
clf_rf = RandomForestClassifier()
clf_rf.fit(data_train, labels_train)
labels_pred_rf = clf_rf.predict(data_test)
acc_rf = accuracy_score(labels_test, labels_pred_rf)
test the previously trained classifier on the whole input.csv file
input_train = pd.read_csv("input.csv")
input_features = user_train.columns[1:]
input_data = user_train[input_features]
input_labels = user_train['label']
labels_pred_rf = clf_rf.predict(input_data)
acc_rf = accuracy_score(input_labels, labels_pred_rf)

Categories

Resources