Running Neptune.ai in a loop

Running Neptune.ai in a loop - python

so i created a for loop so I can run various batch sizes, where each loop will open and close a neptune run. The first time runs fine, but the following runs, the accuracy doesn't record into neptune, and python does not throw an error? Can anyone think what the problem may be?
for i in range(len(percentage)):
run = neptune.init(
project="xxx",
api_token="xxx",
)
epochs = 600
batch_perc = percentage[i]
lr = 0.001
sb = 64 #round((43249*batch_perc)*0.00185)
params = {
'lr': lr,
'bs': sb,
'epochs': epochs,
'batch %': batch_perc
}
run['parameters'] = params
torch.manual_seed(12345)
td = 43249 * batch_perc
vd = 0.1*(43249 - td) + td
train_dataset = dataset[:round(td)]
val_dataset = dataset[round(td):round(vd)]
test_dataset = dataset[round(vd):]
print(f'Number of training graphs: {len(train_dataset)}')
run['train'] = len(train_dataset)
print(f'Number of validation graphs: {len(val_dataset)}')
run['val'] = len(val_dataset)
print(f'Number of test graphs: {len(test_dataset)}')
run['test'] = len(test_dataset)
train_loader = DataLoader(train_dataset, batch_size=sb, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=sb, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
model = GCN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()
for epoch in range(1, epochs):
train()
train_acc = test(train_loader)
run['training/batch/acc'].log(train_acc)
val_acc = test(val_loader)
run['training/batch/val'].log(val_acc)

Prince here,
Try using the stop() method to kill the previous run, because currently, you are creating new run objects without killing them, and that might cause some problems.
for i in range(len(percentage)):
run = neptune.init(
project="xxx",
api_token="xxx",
)
run['parameters'] = params
run['train'] = len(train_dataset)
run['val'] = len(val_dataset)
run['test'] = len(test_dataset)
...
for epoch in range(1, epochs):
...
run['training/batch/acc'].log(train_acc)
run['training/batch/val'].log(val_acc)
run.stop()
Docs: https://docs.neptune.ai/api-reference/run#.stop

Related

Using ray tune `tune.run` with pytorch returns different optimal hyperparameters combination

I've initialized two identical ANN with PyTorch (both as structure and initial parameters), and I've noticed that the hyperparameters setting with Ray Tune, returns different results for the two ANN, even if I didn't have any random initialization.
Someone could explain what I'm doing wrong? I'll attach the code:
ANN Initialization:
class Featrues_model(nn.Module):
def __init__(self, n_inputs, dim_hidden, n_outputs):
super().__init__()
self.fc1 = nn.Linear(n_inputs, dim_hidden)
self.fc2 = nn.Linear(dim_hidden, n_outputs)
def forward(self, X):
X = self.fc1(X)
X = self.fc2(X)
return X
features_model_v1 = Featrues_model(len(list_input_variables),5,6)
features_model_v2 = Featrues_model(len(list_input_variables),5,6)
features_model_v2.load_state_dict(features_model_v1.state_dict())
Hyperpamameters setting
config = {
"lr": tune.choice([1e-2, 1e-5]),
"weight_decay": tune.choice([1e-2, 1e-5]),
"batch_size": tune.choice([16,64]),
"epochs": tune.choice([10,50])
}
Train & Validation Dataframe
trainset = df_final.copy()
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(
trainset, [test_abs, len(trainset) - test_abs]
)
df_train = df_final.iloc[train_subset.indices]
df_val = df_final.iloc[val_subset.indices]
Train function design
def setting_model(config, df_train, df_val, model):
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
BATCH_SIZE = config["batch_size"]
for epoch in range(config["epochs"]):
train_epoch_loss = 0
train_epoch_acc = 0
step = 0
for i in tqdm(range(0, df_train.shape[0], BATCH_SIZE)):
batch_X = np.array(
df_train[list_input_variables].iloc[i:i+BATCH_SIZE]
)
batch_X = torch.Tensor([x for x in batch_X])
batch_Y = np.array(
df_train[list_output_variables].iloc[i:i+BATCH_SIZE]
)
batch_Y = torch.Tensor([int(y) for y in batch_Y])
batch_Y = batch_Y.type(torch.int64)
optimizer.zero_grad()
outputs = model.forward(batch_X)
train_loss = criterion(outputs, batch_Y)
train_acc = multi_acc(outputs, batch_Y)
train_loss.backward()
optimizer.step()
train_epoch_loss += train_loss.item()
train_epoch_acc += train_acc.item()
step += 1
# print statistics
print(f"Epochs: {epoch}")
print(f"Train Loss: {train_epoch_loss/len(df_train)}")
print(f"Train Acc: {train_epoch_acc/step}")
print("\n")
# Validation loss
with torch.no_grad():
X_val = np.array(
df_val[list_input_variables]
)
X_val = torch.Tensor([x for x in X_val])
Y_val = np.array(
df_val[list_output_variables]
)
Y_val = torch.Tensor([int(y) for y in Y_val])
Y_val = Y_val.type(torch.int64)
outputs = model.forward(X_val)
_, predicted = torch.max(outputs.data, 1)
total = Y_val.size(0)
correct = (predicted == Y_val).sum().item()
loss = criterion(outputs, Y_val)
tune.report(loss=(loss.numpy()), accuracy=correct / total)
print(f"Validation Loss: {loss.numpy()/len(df_val)}")
print(f"Validation Acc: {correct / total:.3f}")
print("Finished Training")
Hyperparameters Tune
result_v1 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v1),
config=config,
fail_fast="raise",
)
result_v2 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v2),
config=config,
fail_fast="raise"
)
Output
result_v1.get_best_config()
{'lr': 1e-05, 'weight_decay': 1e-05, 'epochs': 1}
result_v2.get_best_config()
{'lr': 0.01, 'weight_decay': 1e-05, 'epochs': 1}

The issue is the use of torch.random under the hood. Since you are not directly providing a weight matrix for your layers, pytorch initializes it for you. Luckily, you can have a reproducible experiment by setting
torch.manual_seed(x) # where x is an integer
One should use only a few random seeds, otherwise you might overfit on the random seed. See lottery ticket hypothesis at https://arxiv.org/abs/1803.03635)

How to fix value error 'too many values to unpack (expected 2)' that makes in traing CIFAR10 dataset using Torch framework

I am going to train CIFAR10 dataset in the Torch framework. First I download this dataset and load it with two first functions. Then I train using the Pytorch framework. Eventually, I receive this error. It is my appreciate if you help to fix it. My code is long, so I put the summary of functions using in train.
too many values to unpack (expected 2)
def load_cifar10_batch(filename):
""" Load a single batch from CIFAR10 """
with open(filename, 'rb') as f:
datadict = pickle.load(f, encoding='bytes')
X=datadict[b'data']
Y = datadict[b'labels']
X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
Y = np.array(Y)
return X, Y
def load_cifar10(dir):
""" Load all batches of CIFAR10 """
# load train batch file
xs = []
ys = []
for i in range(1, 6):
filename = os.path.join(dir, 'data_batch_%d' % i)
X, Y = load_cifar10_batch(filename)
xs.append(X)
ys.append(Y)
Xtr = np.concatenate(xs)
Ytr = np.concatenate(ys)
del X, Y
# load test batch
Xte, Yte = load_cifar10_batch(os.path.join(dir, 'test_batch'))
return Xtr, Ytr, Xte, Yte
X_train, y_train, X_test, y_test = load_cifar10('cifar-10-batches-py')
'''we used just test set, because of the train set is so big file for train '''
from torch.utils.data import random_split
val_size = 3000
train_size = len(X_test) - val_size
train_ds, val_ds = random_split(X_test, [train_size, val_size])
len(train_ds), len(val_ds)
'''loading data '''
from torch.utils.data.dataloader import DataLoader
batch_size=16
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size, num_workers=4, pin_memory=True)
'''our model '''
class Cifar10CnnModel(ImageClassificationBase):
def __init__(self):
def forward(self, xb):
return self.network(xb)
'''ImageClassificationBase'''
class ImageClassificationBase(nn.Module):
def training_step(self, batch):
images, labels = batch
out = self(images) # Generate predictions
loss = F.cross_entropy(out, labels) # Calculate loss
accu = accuracy(out,labels)
return loss,accu
def fit(model, train_loader, val_loader,epochs=2,learning_rate=0.001):
best_valid = None
history = []
optimizer = torch.optim.Adam(model.parameters(), learning_rate,weight_decay=0.0005)
for epoch in range(epochs):
# Training Phase
model.train()
train_losses = []
train_accuracy = []
for batch in tqdm(train_loader):
loss,accu = model.training_step(batch)
train_losses.append(loss)
train_accuracy.append(accu)
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Validation phase
result = evaluate(model, val_loader)
result['train_loss'] = torch.stack(train_losses).mean().item()
result['train_accuracy'] = torch.stack(train_accuracy).mean().item()
model.epoch_end(epoch, result)
if(best_valid == None or best_valid<result['Accuracy']):
best_valid=result['Accuracy']
torch.save(model.state_dict(), 'cifar10-cnn.pth')
history.append(result)
return history
'''But the call to this function'''
''' train dataset '''
history = fit(model, train_dl, val_dl)
'''gives this error'''
0%| | 0/438 [00:31<?, ?it/s]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [44], in <cell line: 1>()
----> 1 history = fit(model, train_dl, val_dl)
Input In [43], in fit(model, train_loader, val_loader, epochs, learning_rate)
9 train_accuracy = []
10 for batch in tqdm(train_loader):
---> 11 loss,accu = model.training_step(batch)
12 train_losses.append(loss)
13 train_accuracy.append(accu)
Input In [27], in ImageClassificationBase.training_step(self, batch)
7 def training_step(self, batch):
----> 8 images, labels = batch
9 out = self(images) # Generate predictions
10 loss = F.cross_entropy(out, labels) # Calculate loss
ValueError: too many values to unpack (expected 2)

You perform the split on X_test only, losing the labels this way.
Try something like
dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
train_ds, val_ds = random_split(dataset, [train_size, val_size])

Getting a fixed accuracy: 0.5000 and sometimes 0.0000e+00 in Keras model using Google Colab

I am training a CNN model using Keras on Google Colab for binary image classification, the problem is when i use Sigmoid function i get accuracy fixed on 0.5000, and when i change metrics to 'acc' i get 0.000e+00 as accuracy. Also, when i change the activation function to 'Softmax' my model start learning.
Ps: i am using google colab where Tensorflow version is 2.5.0
My code:
def define_model(input_shape, num_classes):
model=ResNet50(include_top = False, weights = 'imagenet', input_shape = input_shape)
x = model.output
x = GlobalAveragePooling2D()(x)
preds = Dense(num_classes,activation='sigmoid')(x)
model = Model(inputs=model.input,outputs=preds)
return model
def train(epochs):
train_generator = ImageDataGenerator(rescale=1.0/255.0,vertical_flip=True, horizontal_flip=True)
test_generator = ImageDataGenerator(rescale=1.0/255.0)
train_generator = train_generator.flow_from_directory(
'trainset/',
target_size=(image_size, image_size),
batch_size=BATCH_SIZE_TRAINING,
seed = 7)
validation_generator = test_generator.flow_from_directory(
'testset/',
target_size=(image_size, image_size),
batch_size=BATCH_SIZE_VALIDATION,
seed = 7)
input_shape = (CHANNELS, image_size, image_size) if K.image_data_format() == 'channels_first' \
else (image_size, image_size, CHANNELS)
model = define_model(input_shape, NUM_CLASSES)
opt = optimizers.Adam(learning_rate=1e-6, beta_1=0.9, beta_2=0.99, amsgrad=False)
model.summary()
model.compile(loss='binary_crossentropy',
optimizer=opt,
metrics=['acc'])
filepath=path+"weights-improvement-{epoch:02d}-vacc:{val_accuracy:.2f}-tacc:{accuracy:.2f}.hdf5"
'''cb_early_stopper = EarlyStopping(monitor = 'val_accuracy', mode='min', verbose=1, patience = EARLY_STOP_PATIENCE)
cb_checkpointer = ModelCheckpoint(filepath = filepath, monitor = 'val_accuracy', save_best_only = True, mode = 'auto')
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.25, patience=5, min_lr=1e-7)'''
fit_history = model.fit(train_generator,
epochs = NUM_EPOCHS,
validation_data=validation_generator,
verbose=1,
class_weight=class_weights)
# callbacks = [cb_checkpointer, cb_early_stopper, reduce_lr],
return model, fit_history
def main():
start_time = time()
model, fit_history = train(epochs=NUM_EPOCHS)
end_time = time()
seconds_elapsed = end_time - start_time
print('token time: ', seconds_elapsed)
hours, rest = divmod(seconds_elapsed, 3600)
minutes, seconds = divmod(rest, 60)
if __name__ == "__main__":
main()

The problem solved by adding this code to the .flow_from_directory() function:
class_mode='binary',
Thanks to this thread on github:
https://github.com/keras-team/keras/issues/13006

Overfitting when fine-tuning BERT sentiment analysis

I am newbie to Machine Learning in general. I am currently trying to follow a tutorial on sentiment analysis using BERT and Transformers https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
However when I train the model it has appeared that the model is overfitting
I do not know how to fix this. I have tried lowering amount of epochs, increasing batch size , shuffling my data (which is ordered) and increasing the validation split. So far nothing has worked. I have even tried changing different learning rate but the one I am using now is the smallest.
Below is my code:
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
MAX_LEN = 40
#Make a PyTorch dataset
class FIDataset(Dataset):
def __init__(self, texts, targets, tokenizer, max_len):
self.texts = texts
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, item):
text = str(self.texts[item])
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=self.max_len,
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
return_tensors='pt',
)
return {
'text': text,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
#split test and train
df_train, df_test = train_test_split(
df,
test_size=0.1,
random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
df_test,
test_size=0.5,
random_state=RANDOM_SEED
)
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
ds = FIDataset(
texts=df.content.to_numpy(),
targets=df.sentiment.to_numpy(),
tokenizer=tokenizer,
max_len=max_len
)
return DataLoader(
ds,
batch_size=batch_size,
num_workers=4
)
BATCH_SIZE = 32
#Load data into train, test, val
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
#Bert model loading
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1)
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def forward(self, input_ids, attention_mask):
returned = self.bert(
input_ids=input_ids,
attention_mask=attention_mask
)
pooled_output = returned["pooler_output"]
output = self.drop(pooled_output)
return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 6
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
model,
data_loader,
loss_fn,
optimizer,
device,
scheduler,
n_examples
):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
import torch
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(
model,
train_data_loader,
loss_fn,
optimizer,
device,
scheduler,
len(df_train)
)
print(f'Train loss {train_loss} accuracy {train_acc}')
val_acc, val_loss = eval_model(
model,
val_data_loader,
loss_fn,
device,
len(df_val)
)
print(f'Val loss {val_loss} accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_accuracy:
torch.save(model.state_dict(), 'best_model_state.bin')
best_accuracy = val_acc

Broadly speaking, to reduce overfitting, you can:
increase regularization
reduce model complexity
perform early stopping
increase training data
From what you've written, you've already tried 3 and 4. In the case of neural networks, you can increase regularization by increasing dropout. You already have the code for it.
# NOTE: You don't need bert_model here since you're creating one inside
# of SentimentClassifier.
#bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1) # <-- INCREASE THIS VALUE
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
I'd recommend trying higher values of the Dropout probability, as I noted in your code above ("INCREASE THIS VALUE"). Keep track of the Dropout probability and the resulting observed overfitting. Try probability values of 0.1, 0.2, 0.3, 0.4, 0.5.
Usually, I've found that dropout over 0.5 doesn't do much good.

Increase of GPU memory usage during training

I was training the network on usual MNIST dataset, and encountered the next problem:
when i start to add valid_metrics to a loss_list and accuracy_list, amount of GPU memory that is being used starts increasing with every 1 or 2 epochs.
This is the code of train_loop:
def train_model(model: torch.nn.Module,
train_dataset: torch.utils.data.Dataset,
valid_dataset: torch.utils.data.Dataset,
loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
optimizer_class: Type[torch.optim.Optimizer] = torch.optim,
optimizer_params: Dict = {},
initial_lr = 0.01,
lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
lr_scheduler_params: Dict = {},
batch_size = 64,
max_epochs = 1000,
early_stopping_patience = 20):
optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr, **optimizer_params)
lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
best_valid_loss = None
best_epoch = None
loss_list = list()
accuracy_list = list()
for epoch in range(max_epochs):
print(f'Epoch {epoch}')
start = timer()
train_single_epoch(model, optimizer, loss_function, train_loader)
valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
loss_list.append(valid_metrics['loss'])
accuracy_list.append(valid_metrics['accuracy'])
print('time:', timer() - start)
print(f'Validation metrics: \n{valid_metrics}')
lr_scheduler.step(valid_metrics['loss'])
if best_valid_loss is None or best_valid_loss > valid_metrics['loss']:
print(f'Best model yet, saving')
best_valid_loss = valid_metrics['loss']
best_epoch = epoch
torch.save(model, './best_model.pth')
if epoch - best_epoch > early_stopping_patience:
print('Early stopping triggered')
return loss_list, accuracy_list
and the code of validate_single_epoch:
def validate_single_epoch(model: torch.nn.Module,
loss_function: torch.nn.Module,
data_loader: torch.utils.data.DataLoader):
loss_total = 0
accuracy_total = 0
for data in data_loader:
X, y = data
X, y = X.view(-1, 784), y.to(device)
X = X.to(device)
output = model(X)
loss = loss_function(output, y)
loss_total += loss
y_pred = output.argmax(dim = 1, keepdim=True).to(device)
accuracy_total += y_pred.eq(y.view_as(y_pred)).sum().item()
loss_avg = loss_total / len(data_loader.dataset)
accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)
return {'loss' : loss_avg, 'accuracy' : accuracy_avg}
I use GeForce MX250 as GPU

The problem is likely because the gradients are being computed and stored in the validation loop. To solve that, perhaps the easiest way is to wrap the validation call in a no_grad context:
with torch.no_grad():
valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
If you prefer, you can also decorate the validate_single_epoch(...) with #torch.no_grad():
#torch.no_grad()
def validate_single_epoch(...):
# ...
Not related to your problem, but pay attention that you're using a model in training mode during validation, which may not be what you want. Perhaps there is a missing call to model.eval() in the validation function.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Running Neptune.ai in a loop - python

Related

Using ray tune `tune.run` with pytorch returns different optimal hyperparameters combination

How to fix value error 'too many values to unpack (expected 2)' that makes in traing CIFAR10 dataset using Torch framework

Getting a fixed accuracy: 0.5000 and sometimes 0.0000e+00 in Keras model using Google Colab

Overfitting when fine-tuning BERT sentiment analysis

Increase of GPU memory usage during training

Categories

Resources