How to use cross-validation with ktrain? (Python)

I am using the ktrain package to perform multiclass text classification. The example on the official ktrain website (https://github.com/amaiya/ktrain) works great:
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test_b = fetch_20newsgroups(subset='test',categories=categories, shuffle=True)
(x_train, y_train) = (train_b.data, train_b.target)
(x_test, y_test) = (test_b.data, test_b.target)
# build, train, and validate model (Transformer is wrapper around transformers library)
import ktrain
from ktrain import text
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(5e-5, 4)
learner.validate(class_names=t.get_classes())
Accuracy is pretty high.
However, I am comparing this model with other models trained with scikit-learn, and in particular the other models' accuracy is assessed using cross-validation:
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
How can I adapt the code above so that the transformer model used with ktrain is also evaluated with the same cross-validation methodology?

You can try something like this:
from ktrain import text
import ktrain
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
# load text data
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test_b = fetch_20newsgroups(subset='test',categories=categories, shuffle=True)
(x_train, y_train) = (train_b.data, train_b.target)
(x_test, y_test) = (test_b.data, test_b.target)
df = pd.DataFrame({'text':x_train, 'target': [train_b.target_names[y] for y in y_train]})
# CV with transformers
N_FOLDS = 2
EPOCHS = 3
LR = 5e-5
def transformer_cv(MODEL_NAME):
    accs = []
    data = df[['text', 'target']]
    for train_index, val_index in KFold(N_FOLDS).split(data):
        preproc = text.Transformer(MODEL_NAME, maxlen=500)
        train, val = data.iloc[train_index], data.iloc[val_index]
        x_train = train.text.values
        x_val = val.text.values
        y_train = train.target.values
        y_val = val.target.values
        trn = preproc.preprocess_train(x_train, y_train)
        model = preproc.get_classifier()
        learner = ktrain.get_learner(model, train_data=trn, batch_size=16)
        learner.fit_onecycle(LR, EPOCHS)
        predictor = ktrain.get_predictor(learner.model, preproc)
        pred = predictor.predict(x_val)
        acc = accuracy_score(y_val, pred)
        print('fold accuracy:', acc)
        accs.append(acc)
    return accs
print(transformer_cv('distilbert-base-uncased'))
# output:
# [0.9627989371124889, 0.9689716312056738]
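To get a single figure comparable to cross_val_score(...).mean() in scikit-learn, you can average the per-fold accuracies; a minimal sketch with NumPy:
import numpy as np

accs = transformer_cv('distilbert-base-uncased')
# Mean and spread across the K folds, as cross_val_score users typically report.
print('mean accuracy: %.4f (+/- %.4f)' % (np.mean(accs), np.std(accs)))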
Reference: see this Kaggle notebook for a regression example.

Related

How to predict multiple dependent columns from 1 independent column

Is it possible to predict multiple dependent columns from independent columns?
Problem statement: I have to predict 5 factors (cEXT, cNEU, cAGR, cCON, cOPN) on the basis of the STATUS column, so the input variable will be the STATUS column only, and the target variables are (cEXT, cNEU, cAGR, cCON, cOPN).
In the above data, STATUS is the independent column and cEXT, cNEU, cAGR, cCON, cOPN are the dependent columns; how can I predict those?
# independent and dependent variable split
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
Right now I am predicting only one column at a time, repeating the same thing 5 times, so I am creating 5 models for the 5 target variables.
Code:
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
ct = ColumnTransformer([
    ('step1', TfidfVectorizer(), 'STATUS')
], remainder='drop')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, cohen_kappa_score
from sklearn import metrics
from sklearn.pipeline import Pipeline
# ##########
# RandomForest
# ##########
model = Pipeline([
    ('column_transformers', ct),
    ('model', RandomForestClassifier(criterion='gini', n_estimators=100, n_jobs=-1, class_weight='balanced', max_features='auto')),
])
# creating 5 models, can I create 1 model?
model_cEXT = model.fit(X_train, y_train['cEXT'])
model_cNEU = model.fit(X_train, y_train['cNEU'])
model_cAGR = model.fit(X_train, y_train['cAGR'])
model_cCON = model.fit(X_train, y_train['cCON'])
model_cOPN = model.fit(X_train, y_train['cOPN'])
You can use MultiOutputClassifier from scikit-learn.
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
clf = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
clf.predict(X_test)
Reference:
Official document of MultiOutputClassifier
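Note that MultiOutputClassifier still expects numeric features, while your X_train holds raw text, so in your setup it would go behind the TF-IDF ColumnTransformer you already built; a minimal sketch reusing the ct, X_train, and y_train from the question:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Reuse the TF-IDF ColumnTransformer (ct) from the question, then fit
# one pipeline that predicts all five targets at once.
model = Pipeline([
    ('column_transformers', ct),
    ('model', MultiOutputClassifier(RandomForestClassifier())),
])
model.fit(X_train, y_train)
preds = model.predict(X_test)  # shape: (n_samples, 5)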
There is a library, scikit-multilearn, which is very good for these tasks. There are several ways to do multi-label classification, such as Binary Relevance, Classifier Chains, Label Powerset, etc., and these are very well covered in this library.
Below is a sample of how it will replace your current code.
X = df[['STATUS']]
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
# Rest of your code
# ==========================
# The new code
from skmultilearn.problem_transform import BinaryRelevance
from scipy.sparse import csr_matrix
classifier = BinaryRelevance(
    classifier=RandomForestClassifier(criterion='gini', n_estimators=100, n_jobs=-1, class_weight='balanced', max_features='auto'),
    require_dense=[False, True]
)
model = Pipeline([
    ('column_transformers', ct),
    ('classifier', classifier),
])
model.fit(X_train, y_train.values)
res = model.predict(X_test)
res = csr_matrix(res)
res.todense()
You can explore the other methods in the scikit-multilearn documentation.
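For example, the Classifier Chains method mentioned above is a drop-in replacement for BinaryRelevance; a minimal sketch, assuming the same ct, X_train, and y_train as before:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Each label's prediction is fed as an extra feature to the next
# classifier in the chain, so label correlations can be exploited.
chain = ClassifierChain(
    classifier=RandomForestClassifier(),
    require_dense=[False, True]
)
model = Pipeline([
    ('column_transformers', ct),
    ('classifier', chain),
])
model.fit(X_train, y_train.values)
res = model.predict(X_test)  # sparse matrix of shape (n_samples, 5)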
In TensorFlow you can do this using a sigmoid activation and binary cross-entropy loss on all the output units, as below:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
tfidf_calculator = TextVectorization(
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    max_tokens=100,
    output_mode='tf-idf',
    pad_to_max_tokens=False)
tfidf_calculator.adapt(df['STATUS'].values)
tfidfs = tfidf_calculator(df['STATUS'])
X = tfidfs.numpy()
y = df[["cEXT","cNEU","cAGR","cCON","cOPN"]].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(100,)),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(5, activation='sigmoid')
])
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy())
model.fit(X_train, y_train, epochs=20, batch_size=32)
One thing to take note of in TensorFlow is that it needs a dense matrix as input. There might be a way to use sparse input, but I didn't find one.
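Since each sigmoid unit outputs an independent probability, you still need to threshold the predictions to get 0/1 labels; a minimal sketch, assuming the usual 0.5 cut-off:
import numpy as np

# model.predict returns per-label probabilities of shape (n_samples, 5);
# threshold at 0.5 to get binary predictions for the five traits.
probs = model.predict(X_test)
preds = (probs > 0.5).astype(int)
print(preds[:5])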

Newbie: How to evaluate a model to increase classification accuracy

My data: [screenshot omitted]
How do I increase the accuracy of the model? Some of my models, when run, produce results like the ones below.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Accuracy: 0.6780893042575286
Random Forest Classifier: Accuracy: 0.6780893042575286
There are several ways to achieve this:
Look at the data. Is it in the best shape for the algorithm, regarding NaNs, covariance, and so on? Is it normalized, and are the categorical features encoded well? This is a question too far-reaching for a forum.
Look at the problem and the different algorithms suitable for it. Maybe:
Logistic Regression
SVM
XGBoost
....
Try hyperparameter tuning with RandomizedSearchCV or GridSearchCV.
This is quite high-level.
In terms of model selection, you can use a function like the one below to find a good model that suits the problem.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def multi_model(X_train, y_train, X_test, y_test):
    """Function to determine the best model architecture."""
    dfs = []
    models = [
        ('LogReg', LogisticRegression()),
        ('RF', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC()),
        ('GNB', GaussianNB()),
        ('XGB', XGBClassifier(eval_metric="error"))
    ]
    results = []
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    target_names = ['App_Status_1', 'App_Status_2']
    for name, model in models:
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        results.append(cv_results)
        names.append(name)
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
    final = pd.concat(dfs, ignore_index=True)
    return final
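To compare the models, you can call the function and average each cross-validation metric per model; for example:
final = multi_model(X_train, y_train, X_test, y_test)
# Mean of each CV metric (fit time, accuracy, weighted F1, ...) per model.
print(final.groupby('model').mean())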
After model selection, you can do hyperparameter tuning, which will further increase the model's performance (a sketch follows below).
If you want to improve the model further, you can implement techniques like data augmentation and also revisit the cleaning phase of your data.
If after all that it still doesn't improve, you could try collecting more data or refocusing the problem statement.
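As a sketch of that hyperparameter tuning step, here is a minimal GridSearchCV example (the parameter grid is purely illustrative, not tuned for your data):
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Illustrative grid; adjust the ranges to your data and model.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 30],
}
search = GridSearchCV(RandomForestClassifier(), param_grid,
                      cv=5, scoring='accuracy', n_jobs=-1)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)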

Predicting with a trained model

I used logistic regression to create a model and later saved the model using joblib. I then tried loading that model and predicting the labels in my test.csv. Whenever I try this, I get an error saying "X has 1433445 features per sample; expecting 3797015".
This is my initial code:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
#reading data
train=pd.read_csv('train_yesindia.csv')
test=pd.read_csv('test_yesindia.csv')
train=train.iloc[:,1:]
test=test.iloc[:,1:]
test.info()
train.info()
test['label']='t'
test=test.fillna(' ')
train=train.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
train['total']=train['title']+' '+train['author']+train['text']
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
print('Accuracy of Lasso classifier on training set: {:.2f}'
      .format(logreg.score(X_train, y_train)))
print('Accuracy of Lasso classifier on test set: {:.2f}'
      .format(logreg.score(X_test, y_test)))
targets = train['label'].values
logreg = LogisticRegression()
logreg.fit(counts, targets)
example_counts = count_vectorizer.transform(test['total'].values)
predictions = logreg.predict(example_counts)
pred=pd.DataFrame(predictions,columns=['label'])
pred['id']=test['id']
pred.groupby('label').count()
#dumping models
from joblib import dump, load
dump(logreg,'mypredmodel1.joblib')
Later I loaded the model in a different script:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
test=pd.read_csv('test_yesindia.csv')
test=test.iloc[:,1:]
test['label']='t'
test=test.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
#check
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
test_counts = count_vectorizer.fit_transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#check
#load_model
logreg = load('mypredmodel1.joblib')
example_counts = count_vectorizer.fit_transform(test['total'].values)
predictions = logreg.predict(example_counts)
When I run it, I get the error:
predictions = logreg.predict(example_counts)
Traceback (most recent call last):
  File "<ipython-input-58-f28afd294d38>", line 1, in <module>
    predictions = logreg.predict(example_counts)
  File "C:\Users\adars\Anaconda3\lib\site-packages\sklearn\linear_model\base.py", line 289, in predict
    scores = self.decision_function(X)
  File "C:\Users\adars\Anaconda3\lib\site-packages\sklearn\linear_model\base.py", line 270, in decision_function
    % (X.shape[1], n_features))
ValueError: X has 1433445 features per sample; expecting 3797015
Most probably, this is because you are re-fitting your transformers on the test set. This must not be done; you should save them fitted on your training set, and use the test (or any other future) set only for transforming data.
This is easier done with pipelines.
So, remove the following code from your first block:
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
and replace it with:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('tf-idf', TfidfTransformer(smooth_idf=False))
])
pipeline.fit(train['total'].values)
tfidf = pipeline.transform(train['total'].values)
targets = train['label'].values
test_tfidf = pipeline.transform(test['total'].values)
dump(pipeline, 'transform_predict.joblib')
Now, in your second code block, remove this part:
#check
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
test_counts = count_vectorizer.fit_transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#check
and replace it with:
pipeline = load('transform_predict.joblib')
test_tfidf = pipeline.transform(test['total'].values)
And you should be fine, provided that you predict on the test_tfidf variable, and not on example_counts, which is not transformed by TF-IDF:
predictions = logreg.predict(test_tfidf)
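Going one step further, you could also put the classifier itself into the pipeline, so a single dump/load covers both transformation and prediction; a minimal sketch along the same lines (the file name full_model.joblib is illustrative):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from joblib import dump, load

# One pipeline: vectorizer + tf-idf + classifier, fitted once on the training set.
full_pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('tf-idf', TfidfTransformer(smooth_idf=False)),
    ('clf', LogisticRegression()),
])
full_pipeline.fit(train['total'].values, train['label'].values)
dump(full_pipeline, 'full_model.joblib')

# Later, in the prediction script:
full_pipeline = load('full_model.joblib')
predictions = full_pipeline.predict(test['total'].values)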

Get pandas Series from CSV

I am totally new to machine learning and am currently playing with MNIST classification, using RandomForestClassifier.
I use sklearn and pandas.
I have a training CSV data set.
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
train = pd.read_csv("train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
user_train = pd.read_csv("input.csv")
user_features = user_train.columns[1:]
y_train = user_train[user_features]
user_y = user_train['label']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X/255.,y,test_size=1,random_state=0)
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("pred : ", y_pred_rf)
print("random forest accuracy: ",acc_rf)
I have the current code, which works well. It takes the training set, splits off one element for testing, and does the prediction.
What I want now is to use testing data from an input: I have a new CSV called "input.csv", and I want to predict the values inside it.
How can I replace model_selection.train_test_split with my input data?
I am sure the answer is very obvious, but I didn't find anything.
The following part of your code is unused:
user_train = pd.read_csv("input.csv")
user_features = user_train.columns[1:]
y_train = user_train[user_features]
user_y = user_train['label']
If input.csv has the same structure as train.csv, you may want to either:
1. Train a classifier and test it on a split of the input.csv dataset (please refer to http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html for how to set the test size):
input_train = pd.read_csv("input.csv")
input_features = input_train.columns[1:]
input_data = input_train[input_features]
input_labels = input_train['label']
data_train, data_test, labels_train, labels_test = model_selection.train_test_split(input_data/255., input_labels, test_size=1, random_state=0)
clf_rf = RandomForestClassifier()
clf_rf.fit(data_train, labels_train)
labels_pred_rf = clf_rf.predict(data_test)
acc_rf = accuracy_score(labels_test, labels_pred_rf)
2. Test the previously trained classifier on the whole input.csv file (note the same /255. scaling used at training time):
input_train = pd.read_csv("input.csv")
input_features = input_train.columns[1:]
input_data = input_train[input_features]
input_labels = input_train['label']
labels_pred_rf = clf_rf.predict(input_data/255.)
acc_rf = accuracy_score(input_labels, labels_pred_rf)

How to implement GRNN Algorithm with MNIST Dataset (Python)

I'm trying to implement GRNN with the MNIST handwritten digit dataset using Python.
Here is my code; I'm getting the predicted values as NaN:
import numpy as np
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from neupy import algorithms
#import sys
print('\nLoading...')
traindata = np.genfromtxt('./MNIST_Dataset_Loader/dataset/mnist_train.csv', skip_header=55000,delimiter=',')
#testdata=np.genfromtxt('./MNIST_Dataset_Loader/dataset/mnist_test.csv',skip_header=9000, delimiter=',')
# Load MNIST Data
print('\nLoading MNIST Data...')
x_train = traindata[:,1:]
y_train = traindata[:,0]
print('\nLoading Testing Data...')
#x_test = testdata[:,1:]
#y_test = testdata[:,0]
x_train, x_test, y_train, y_test = train_test_split(preprocessing.minmax_scale(x_train),preprocessing.minmax_scale(y_train),test_size=0.3)
print("training")
nw = algorithms.GRNN(std=0.1)
nw.train(x_train, y_train)
#nw.fit(x_train, y_train)
print("Predicting")
y_predicted = nw.predict(x_test)
print(y_predicted)
mse = np.mean((y_predicted - y_test) ** 2)
#print(mse)
