DMatrix in MultiOutput XGBoost Regressor - python

I am trying to convert a hyperparameter tuning routine to a multi-output regression setup. Can someone please help me create the DMatrix for it? Here is the code for reference:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
If any further clarification is required, please comment. TIA!
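For reference, a minimal sketch of the multi-output setup (not a verified answer): targets below is a hypothetical list of target column names, and the 2-D label path assumes a fairly recent XGBoost release (roughly 1.6+), which supports multi-output regression; the MultiOutputRegressor wrapper also works on older versions.
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor

X = dtrain[predictors].values
Y = dtrain[targets].values  # shape (n_samples, n_targets); `targets` is hypothetical

# Option 1: recent XGBoost accepts a 2-D label array directly
xgtrain = xgb.DMatrix(X, label=Y)

# Option 2: wrap the sklearn estimator so each target gets its own booster
multi_reg = MultiOutputRegressor(xgb.XGBRegressor(objective="reg:squarederror"))
multi_reg.fit(X, Y)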

Related

How to use cross-validation to select a model?

I'm trying to use cross validation to select the best model. This is my code:
models = []
scaler = StandardScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)
models.append(('LogisticRegression', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RandomForest', RandomForestClassifier()))
models.append(('GaussianNB', GaussianNB()))
models.append(('SVM', svm.SVC()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
for name, model in models:
    kfold = KFold(n_splits=n_folds)
    cv_results = cross_val_score(model, scaled_x_train, y_train, cv=kfold, scoring='accuracy')
    print("%6s %.3f %.3f " % (name, cv_results.mean(), cv_results.std()))
Now I want to add a CNN model that I defined. How can I add it to the cross-validation?
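One option is to wrap the Keras model in a scikit-learn-compatible estimator so it goes through cross_val_score like the others. This is only a sketch: it assumes the scikeras package is installed, and the architecture inside build_cnn is a placeholder for the CNN you defined.
import numpy as np
from scikeras.wrappers import KerasClassifier  # pip install scikeras
from tensorflow import keras

def build_cnn():
    # Placeholder architecture - replace with your own CNN;
    # the input/reshape must match the shape of scaled_x_train.
    n_features = scaled_x_train.shape[1]
    n_classes = len(np.unique(y_train))
    model = keras.Sequential([
        keras.layers.Input(shape=(n_features,)),
        keras.layers.Reshape((n_features, 1)),
        keras.layers.Conv1D(32, kernel_size=3, activation="relu"),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.Dense(n_classes, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model

models.append(('CNN', KerasClassifier(model=build_cnn, epochs=10,
                                      batch_size=32, verbose=0)))
The existing for name, model in models: loop then evaluates it with the same cross_val_score call.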

Merge these scripts together to iterate over more algorithms

I have 4 types of data.
Each one has been pre-processed using:
x1,y1=Standardisation
x2,y2=Normalisation
x3,y3=Rescale
and one is completely unprocessed (x,y).
I have applied logistic regression to each like this:
#Building Logistic Regression model on the UNPROCESSED DATA
from sklearn.metrics import accuracy_score
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
lr_predict = lr_model.predict(x_test)
print('Logistic Regression - ',accuracy_score(lr_predict,y_test))
#Building Logistic Regression model on the NORMALISED DATA
from sklearn.linear_model import LogisticRegression
lr_norm = LogisticRegression()
lr_norm.fit(x1_train, y1_train)
y_pred = lr_norm.predict(x1_test)
print("Accuracy of logistic regression on test set with Rescaled features: {:.2f}".format(lr_norm.score(x1_test, y1_test)))
and so on...
I want to make one graph (I'm not sure which kind) that best represents the performance, via accuracy score or whatever else there may be, of the other models I wish to test below:
svm_model = SVC(kernel='linear')
svm_model.fit(x_train,y_train)
svc_predict = svm_model.predict(x_test)
print('SVM - ',accuracy_score(svc_predict,y_test))
print('\t\t\t\tTRAIN DATA\n')
print(classification_report(y_train, svm_model.predict(x_train), target_names=encoder.inverse_transform([0,1,2])))
print('\n')
print('\t\t\t\tTEST DATA\n')
print(classification_report(y_test, svm_model.predict(x_test), target_names=encoder.inverse_transform([0,1,2])))
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
nb_predict = nb_model.predict(x_test)
print('Naive bayes - ',accuracy_score(nb_predict,y_test))
dt_model = DecisionTreeClassifier(max_leaf_nodes=3)
dt_model.fit(x_train,y_train)
dt_predict = dt_model.predict(x_test)
print('Decision Tree - ',accuracy_score(dt_predict,y_test))
rfc_model = RandomForestClassifier(max_depth=3)
rfc_model.fit(x_train,y_train)
rfc_predict = rfc_model.predict(x_test)
print('Random Forest - ',accuracy_score(rfc_predict,y_test))
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train,y_train)
knn_predict = knn_model.predict(x_test)
print('knn - ',accuracy_score(knn_predict,y_test))
Hope this makes sense.
# prepare data
pre_processing = [('NOT PROCESSED', None)]
pre_processing.append(('RESCALED', MinMaxScaler(feature_range=(0, 1))))
pre_processing.append(('STANDARDIZED', StandardScaler()))
pre_processing.append(('NORMALIZED', Normalizer()))

# prepare models
models = []
models.append(('LR', LogisticRegression(max_iter=10000)))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(probability=True)))

results = []
names = []
higher_acc = 0
standard = 0
best_model = ''

for process in pre_processing:
    globals()['df_' + process[0]] = pd.DataFrame(index=None, columns=None)
    for algo in models:
        estimators = [process, algo]
        model = Pipeline(estimators)
        ss = ShuffleSplit(n_splits=10, test_size=test_size, random_state=seed)
        names.append(algo[0])
        for scoring in performance_metrix:
            cv_results = cross_val_score(model, X_train, Y_train, cv=ss, scoring=scoring)
            globals()['df_' + process[0]].loc[algo[0], scoring] = '%s\u00B1%s' % (
                round(cv_results.mean() * 100.0, 2), round(cv_results.std() * 100.0, 2))
            if performance_metrix.index(scoring) == 0:
                results.append(cv_results)
                if cv_results.mean() * 100.0 > higher_acc:
                    higher_acc = cv_results.mean() * 100.0
                    standard = cv_results.std() * 100.0
                    best_model = process[0], algo[0]
                elif cv_results.mean() * 100.0 == higher_acc:
                    if cv_results.std() * 100.0 < standard:
                        higher_acc = cv_results.mean() * 100.0
                        best_model = process[0], algo[0]
    print('For %s data we produced:\n\n' % (process[0]), globals()['df_' + process[0]], '\n\n')

    # boxplot algorithm comparison
    fig = pyplot.figure()
    fig.suptitle('Algorithms accuracy comparison for %s data' % (process[0]))
    ax = fig.add_subplot(111)
    pyplot.boxplot(results[:len(models)])
    ax.set_xticklabels(names)
    pyplot.show()

# Create a pipeline that standardizes the data then creates a model
print("The overall best performance was the one obtained with %s data, using %s algorithm. \n"
      "Its accuracy resulted to be %s with a standard deviation of %s"
      % (best_model[0], best_model[1], round(higher_acc, 2), round(standard, 2)))
datasets = {
    "Unprocessed": (x_train, x_test, y_train, y_test),
    "Standardisation": (x1_train, x1_test, y1_train, y1_test),
    "Normalisation": (x2_train, x2_test, y2_train, y2_test),
    "Rescale": (x3_train, x3_test, y3_train, y3_test),
}

models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(max_leaf_nodes=3),
    "Random Forest": RandomForestClassifier(max_depth=3),
}

def evaluate_model(model, dataset):
    x_train, x_test, y_train, y_test = dataset
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    return accuracy_score(y_test, pred)

model_scores_for_datasets = {}
for dataset_name, dataset in datasets.items():
    dataset_scores = {}
    for model_name, model in models.items():
        model_score = evaluate_model(model, dataset)
        dataset_scores[model_name] = model_score
    model_scores_for_datasets[dataset_name] = dataset_scores
Here, model_scores_for_datasets will contain the accuracy results for every dataset for each model and will look something like:
{
    "Unprocessed": {
        "Logistic Regression": 10,
        "Decision Tree": 5,
        "Random Forest": 20
    },
    "Standardisation": {
        "Logistic Regression": 10,
        "Decision Tree": 5,
        "Random Forest": 20
    },
    "Normalisation": {
        "Logistic Regression": 10,
        "Decision Tree": 5,
        "Random Forest": 20
    },
    "Rescale": {
        "Logistic Regression": 10,
        "Decision Tree": 5,
        "Random Forest": 20
    },
}
You now have the results for each dataset and can create your required plots. Something along these lines:
for dataset_name, scores in model_scores_for_datasets.items():
    # For example:
    # dataset_name will be "Unprocessed"
    # scores will be a dict like so:
    # {
    #     "Logistic Regression": 10,
    #     "Decision Tree": 5,
    #     "Random Forest": 20
    # }
    generate_plot(dataset_name, scores)
Of course, you need to figure out the generate_plot function. Hope this helps and gives you some idea.
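For illustration only, a rough sketch of such a generate_plot function (a simple bar chart of the per-model accuracies for one dataset):
import matplotlib.pyplot as plt

def generate_plot(dataset_name, scores):
    # scores is a dict of {model_name: accuracy} for one dataset
    plt.figure()
    plt.bar(list(scores.keys()), list(scores.values()))
    plt.title(f"Model accuracy on {dataset_name} data")
    plt.ylabel("Accuracy")
    plt.show()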

Keras - Multiclass classification and transfer learning, bad validation and test accuracy

I am building a multiclass classification model that should be able to recognize 4 different insects. I am using ResNet50 (weights=imagenet).
The dataset is small: on average 100 photos per class (more than 400 in total).
Depending on the model, I usually get val_accuracy above 90% (200 epochs) and test accuracy around 80-85%, but when I print the confusion matrix or plot actual vs. predicted labels for given photos, the results are terrible (usually around 25%).
I have tried different models (ResNet18, ResNet50V2, Xception), freezing model layers, different data augmentation parameters, and different model parameters (such as Dropout(0.5, 0.2) and kernel_regularizer='l2', because I read that helps reduce overfitting).
I think the problem is in how the images are generated, but I don't know what else to change there; I tried the val_generator with shuffle=False/True and the train_generator with seed=1/off, but the final results are similar.
I am attaching images of the confusion matrix, accuracy and plotted photos.
I am using a Jupyter notebook.
Thank you!
directory_train = "keras_splited/train"
directory_test = "keras_splited/test"
directory_val = "keras_splited/val"
BATCH_SIZE = 32
IMG_SIZE = 224
def make_DataImageGenerator(validation_split=None):
    image_generator = ImageDataGenerator(
        rescale=(1.0/255),
        rotation_range=40,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        validation_split=validation_split
    )
    return image_generator
train_img_generator = make_DataImageGenerator(validation_split=None)
val_img_generator = make_DataImageGenerator(validation_split=None)
test_img_generator = make_DataImageGenerator(validation_split=None)
def get_generator(img_generator, directory, train_valid=None, seed=None, shuffle=True):
    train_generator = img_generator.flow_from_directory(
        directory,
        batch_size=BATCH_SIZE,
        target_size=(IMG_SIZE, IMG_SIZE),
        subset=train_valid,
        seed=seed,
        shuffle=shuffle
    )
    return train_generator
train_generator = get_generator(train_img_generator, directory_train)
val_generator = get_generator(val_img_generator, directory_val)
test_generator = get_generator(test_img_generator, directory_test)
target_labels = next(os.walk(directory_train))[1]
target_labels.sort()
num_classes = len(target_labels)
model_feature_extraction = tf.keras.applications.ResNet50(weights="imagenet", include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
x = model_feature_extraction.output
x = layers.GlobalAveragePooling2D()(x)
x = Dense(1024, activation="relu")(x)
myModelOut = Dense(4, activation="softmax")(x)
model = Model(inputs=model_feature_extraction.input, outputs=myModelOut)
optimizer = "adam"
loss = "categorical_crossentropy"
def freeze_pretrained_weights(model):
    #model.layers[0].trainable=False #wanted to freeze the model but didn't work good
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=["accuracy"]
    )
    return model
frozen_new_model = freeze_pretrained_weights(model)
my_callbacks = [
    tf.keras.callbacks.ModelCheckpoint("testno/best_model/", save_best_only=True, monitor="accuracy", save_weights_only=False, mode="max"),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="loss", factor=0.2, patience=25, min_lr=0.001)
]
def train_model(model, train_gen, valid_gen, epochs):
    train_steps_per_epoch = train_gen.n // train_gen.batch_size
    history = model.fit(
        train_gen,
        steps_per_epoch=train_steps_per_epoch,
        epochs=epochs,
        callbacks=my_callbacks,
        validation_data=valid_gen,
    )
    return history
history_frozen_model = train_model(frozen_new_model, train_generator, val_generator, epochs=150)
plt.figure(figsize=(15,5))
plt.subplot(121)
plt.plot(history_frozen_model.history['accuracy'])
plt.plot(history_frozen_model.history['val_accuracy'])
plt.title('Accuracy vs. epochs')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='lower right')
plt.subplot(122)
plt.plot(history_frozen_model.history['loss'])
plt.plot(history_frozen_model.history['val_loss'])
plt.title('Loss vs. epochs')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()
test_steps = test_generator.n // test_generator.batch_size
test_generator.reset()
new_model_test_loss, new_model_test_acc = frozen_new_model.evaluate(test_generator)
print('\nTest dataset')
print(f"Loss: {new_model_test_loss}")
print(f"Accuracy: {new_model_test_acc}")
pred = frozen_new_model.predict(test_generator, steps=test_steps, verbose=1)
batch = next(test_generator)
batch_images = np.array(batch[0])
batch_labels = np.array(batch[1])
target_labels = np.asarray(target_labels)
print(target_labels)
plt.figure(figsize=(15,15))
for n, i in enumerate(np.arange(6)):
    actual = target_labels[np.argmax(batch_labels[i])]
    predicted = target_labels[np.argmax(pred[i])]
    confidence = round(100*(np.max(pred[i])), 2)
    ax = plt.subplot(3, 3, n+1)
    plt.imshow(batch_images[i])
    plt.title(f"Actual: {actual},\n Predicted: {predicted},\n Confidence: {confidence}")
    plt.axis('off')
from sklearn.metrics import ConfusionMatrixDisplay
y_true_lista = []
y_pred_lista = []
for i, img in enumerate(batch_labels):
    y_true = np.argmax(batch_labels[i]).reshape(-1)
    for i in y_true:
        y_true_lista.append(i)
    y_pred = np.argmax(pred[i]).reshape(-1)
    for i in y_pred:
        y_pred_lista.append(i)
print("y_true: ", y_true_lista)
print("y_pred: ", y_pred_lista)
matrix = confusion_matrix(y_true, y_pred)
#print(matrix.shape)
labels = target_labels
cm = confusion_matrix(y_true_lista, y_pred_lista)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Blues, xticks_rotation = 'vertical')
plt.show()
I don't know what to change to get the right results in the plots and the confusion matrix.
Can someone point me in the right direction? What did I do wrong with this model, or is something wrong with the plotting?
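For reference, one mismatch visible in the snippet above is that pred is computed for the whole test set while batch_labels come from a single shuffled batch, so the two are not aligned. A minimal sketch of evaluating the entire test set in a fixed order, reusing the generators and names defined above, could be:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

# shuffle=False keeps the file order, so predictions line up with .classes
eval_generator = test_img_generator.flow_from_directory(
    directory_test,
    batch_size=BATCH_SIZE,
    target_size=(IMG_SIZE, IMG_SIZE),
    shuffle=False,
)

pred = frozen_new_model.predict(eval_generator, verbose=1)
y_pred = np.argmax(pred, axis=1)
y_true = eval_generator.classes  # integer labels in the same (unshuffled) order

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_labels)
disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')
plt.show()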

Getting predict_proba from a BERT classifier

I have a classifier on top of BERT, and I would like to see the predicted probabilities in order to create the ROC curve. How do I get predict_proba? The predicted probabilities will be used to calculate the TPR, FPR, and thresholds for the ROC curve.
Here is the code:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        prediction = self.sigmoid(linear_output)
        return prediction
# Config setting
BATCH_SIZE = 4
EPOCHS = 5
# Making dataloaders
train_dataset = torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = torch.utils.data.RandomSampler(train_dataset)
train_dataloader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset = torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = torch.utils.data.SequentialSampler(test_dataset)
test_dataloader = torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.cuda()
#wandb.watch(bert_clf)
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
# training
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        token_ids, masks, labels = token_ids.to(device), masks.to(device), labels.to(device)
        preds = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(preds, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        #wandb.log({"Training loss": train_loss})
    print('Epoch: ', epoch_num + 1)
    print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

# evaluating on test
bert_clf.eval()
bert_predicted = []
all_logits = []
probs = []
with torch.no_grad():
    test_loss = 0
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        token_ids, masks, labels = token_ids.to(device), masks.to(device), labels.to(device)
        logits = bert_clf(token_ids, masks)
        pr = logits.ravel()
        probs += pr
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        test_loss += loss.item()
        numpy_logits = logits.cpu().detach().numpy()
        #print(numpy_logits)
        #wandb.log({"Testing loss": test_loss})
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
I am able to get the prediction scores to calculate accuracy or F1, but not the probabilities for creating the ROC curve.
Thanks
In your forward, you already return probabilities:
def forward(self, tokens, masks=None):
    _, pooled_output = self.bert(...)             # Get output of BERT
    dropout_output = self.dropout(pooled_output)
    linear_output = self.linear(dropout_output)   # Take linear combination of outputs
                                                  # (unconstrained score - "logits")
    prediction = self.sigmoid(linear_output)      # Normalise scores
                                                  # (constrained between [0,1] - "probabilities")
    return prediction
Hence the result of calling your model can be supplied directly to calculate the false positive and true positive rates, e.g.:
from sklearn import metrics
...
test_probs = bert_clf(token_ids, masks)
fpr, tpr, thresholds = metrics.roc_curve(labels, test_probs)
roc_auc = metrics.auc(fpr, tpr)
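One practical detail (a sketch reusing the test loop above): the probabilities need to be collected over all test batches and moved to the CPU before calling roc_curve, e.g.:
import torch
from sklearn import metrics

all_probs, all_labels = [], []
with torch.no_grad():
    for token_ids, masks, labels in test_dataloader:
        token_ids, masks = token_ids.to(device), masks.to(device)
        probs = bert_clf(token_ids, masks)        # sigmoid outputs in [0, 1]
        all_probs.extend(probs.cpu().numpy().ravel())
        all_labels.extend(labels.numpy().ravel())

fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_probs)
roc_auc = metrics.auc(fpr, tpr)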

PyTorch convnet isn't learning

I'm new to PyTorch and I'm trying to build a model for a Kaggle competition. I used a pretrained ResNet, but the training and validation losses don't decrease. I suspect I did something wrong in my implementation:
#================================================================================
class TransferResnet(nn.Module):
    def __init__(self, classes=4):
        super().__init__()
        # Use a pretrained model
        self.network = models.resnet34(pretrained=True)
        # Replace last layer
        num_ftrs = self.network.fc.in_features
        self.network.fc = nn.Sequential(nn.Linear(num_ftrs, 128),
                                        nn.ReLU(),
                                        nn.Dropout(0.50),
                                        nn.Linear(128, classes))

    def forward(self, xb):
        out = self.network(xb)
        return out

    def feed_to_network(self, batch):
        images, labels = batch
        out = self(images)
        out = F.softmax(out, dim=1)
        loss = F.cross_entropy(out, labels)
        return loss, out
#======================================================
def get_scores(labels, prediction, loss=None):
    "Return classification scores"
    accuracy = accuracy_score(labels, prediction)
    f1 = f1_score(labels, prediction,
                  average='weighted', zero_division=0)
    precision = precision_score(labels, prediction,
                                average='weighted', zero_division=0)
    recall = recall_score(labels, prediction,
                          average='weighted', zero_division=0)
    if loss:
        return [accuracy, f1, precision, recall, loss]
    else:
        return [accuracy, f1, precision, recall]

def get_predictions(model, loader):
    """This function takes a model and a data loader,
    returns the list of losses, the predictions and the labels"""
    with torch.no_grad():
        model.eval()
        losses = []
        predictions = []
        labels = []
        for batch in loader:
            loss, out = model.feed_to_network(batch)
            predictions += torch.max(out, dim=1)[1].tolist()
            labels += batch[1].tolist()
            losses.append(loss.item())
    return labels, predictions, sum(losses)/len(losses)
#=================================================================
def fit(epochs, model, train_loader, val_loader,
        opt_func=torch.optim.Adam, lr=3e-4, step_size=100):

    def get_parameter(optimizer, parameter="lr"):
        """Retrieve learning rate or parameter"""
        if parameter == 'lr':
            for param_group in optimizer.param_groups:
                return param_group['lr']

    torch.cuda.empty_cache()
    model.train()

    # Dataframe that will store the metrics
    train_metrics_df = pd.DataFrame(columns=['accuracy', 'f1', 'precision',
                                             'recall', 'loss'])
    valid_metrics_df = pd.DataFrame(columns=['accuracy', 'f1', 'precision',
                                             'recall', 'loss'])
    momentum_list = []
    lr_list = []
    optimizer = opt_func([{"params": model.network.fc.parameters(), "lr": lr},
                          {"params": model.network.layer4.parameters(), "lr": lr/2},
                          {"params": model.network.layer3.parameters(), "lr": lr/4},
                          {"params": model.network.layer2.parameters(), "lr": lr/6},
                          {"params": model.network.layer1.parameters(), "lr": lr/8}], lr)

    for epoch in range(epochs):
        # Training phase
        train_label = []
        train_prediction = []
        train_losses = []
        for batch in tqdm(train_loader):
            loss, out = model.feed_to_network(batch)
            loss.backward()
            #momentum_list.append(get_parameter(optimizer, parameter="momentum"))
            lr_list.append(get_parameter(optimizer, parameter="lr"))
            optimizer.step()
            optimizer.zero_grad()

            # Extract labels, predictions and loss of the training set
            train_prediction += torch.max(out, dim=1)[1].tolist()
            train_label += batch[1].tolist()
            train_losses.append(loss.item())

        # Evaluation phase
        val_labels, val_predictions, val_loss = get_predictions(model, val_loader)
        train_metrics_df.loc[epoch] = get_scores(train_label, train_prediction,
                                                 loss=sum(train_losses)/len(train_losses))
        valid_metrics_df.loc[epoch] = get_scores(val_labels, val_predictions,
                                                 loss=val_loss)

        print_epoch_trainLoss = train_metrics_df.iloc[epoch]["loss"]
        print_epoch_validLoss = valid_metrics_df.iloc[epoch]["loss"]
        print_epoch_validAccu = valid_metrics_df.iloc[epoch]["accuracy"]
        print_epoch_trainAccu = train_metrics_df.iloc[epoch]["accuracy"]
        print(f"Epoch: {epoch+1}, train loss: {print_epoch_trainLoss:.2f}, "
              f"validation loss: {print_epoch_validLoss:.2f}, "
              f"validation accuracy: {print_epoch_validAccu:.2f}, "
              f"training accuracy: {print_epoch_trainAccu:.2f}, ")

    return train_metrics_df, valid_metrics_df, (momentum_list, lr_list)
All the images are normalized and cropped to the proper dimensions (490x490), and some data augmentation is performed (random flips, rotations, etc.). All this code is executed on a GPU using Kaggle notebooks (my own GPU is not enough for this dataset). This is my first implementation of a CNN and I do not know what I did wrong. I also tried to train the classifier with a learning rate of 0.1, but the loss does not decrease.
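Not necessarily the whole story, but one detail worth checking: in PyTorch, F.cross_entropy applies log-softmax internally and expects raw logits, so passing softmax outputs into it (as feed_to_network does above) weakens the gradients. A sketch of that method computing the loss on logits and using softmax only for reporting:
def feed_to_network(self, batch):
    # drop-in variant of TransferResnet.feed_to_network (sketch, not a guaranteed fix)
    images, labels = batch
    logits = self(images)                    # raw, unnormalised scores
    loss = F.cross_entropy(logits, labels)   # softmax/log-softmax handled internally
    probs = F.softmax(logits, dim=1)         # only for predictions/reporting
    return loss, probs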
