I have spent all day reading about how to implement K-Fold cross-validation in PyTorch, but I can't get it to work. I have a custom Subset written for my dataset, but that does not work either. The code having issues is this:
for epoch in range(training_configuration.epochs_count):
    mean = [0.5768, 0.4622, 0.3460]
    std = [0.2383, 0.2464, 0.2465]
    if data_augmentation:
        train_transforms = data_augmentation_transforms(mean, std)
        print('Data Augmentation: On')
    else:
        train_transforms = image_common_transforms(mean, std)
        print('Data Augmentation: Off')
    test_transforms = image_common_transforms(mean, std)

    dataset = KenyanFood13Dataset(
        root_dir=train_config.root_dir,
        img_path=train_config.img_path,
        csv_file=train_config.csv_path,
        transform=train_transforms)

    kf = KFold(n_splits=5, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(dataset)):
        train = torch.utils.data.Subset(dataset, train_index)
        test = torch.utils.data.Subset(dataset, test_index)
        train_loader = DataLoader(train, batch_size=train_config.batch_size, shuffle=True, num_workers=train_config.num_workers)
        test_loader = DataLoader(test, batch_size=train_config.batch_size, shuffle=False, num_workers=train_config.num_workers)
        print('Data: New fold successfully')

        init_val_loss, init_val_accuracy = validate(training_configuration, model, test_loader)
        print(f'Initial Validation Loss: {init_val_loss:.6f}, Initial Validation Accuracy: {init_val_accuracy*100:.3f}%\n')

        # --------- THIS IS WHERE THE ERROR IS -------------
        train_loss, train_acc = train(
            train_config=training_configuration,
            model=model,
            optimizer=optimizer,
            train_loader=train_loader,
            epoch_idx=epoch)
Traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-c606f47fba53> in <module>
34 tb_writer,
35 scheduler=scheduler,
---> 36 data_augmentation=True)
<ipython-input-18-05bc5c268921> in main(model, optimizer, tb_writer, scheduler, training_configuration, data_augmentation)
65 optimizer=optimizer,
66 train_loader=train_loader,
---> 67 epoch_idx=epoch)
68
69 epoch_train_loss = np.append(epoch_train_loss, [train_loss])
TypeError: 'Subset' object is not callable
Any thoughts on how I can implement this?
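From the traceback, the likely culprit is a name collision rather than the K-Fold logic itself: inside the fold loop the variable train is bound to a Subset, shadowing the train() function, so the later call invokes the Subset object. A minimal sketch of the fix (train_subset and test_subset are illustrative names, not from the original code):

    for i, (train_index, test_index) in enumerate(kf.split(dataset)):
        # Use names that do not shadow the train() function
        train_subset = torch.utils.data.Subset(dataset, train_index)
        test_subset = torch.utils.data.Subset(dataset, test_index)
        train_loader = DataLoader(train_subset, batch_size=train_config.batch_size,
                                  shuffle=True, num_workers=train_config.num_workers)
        test_loader = DataLoader(test_subset, batch_size=train_config.batch_size,
                                 shuffle=False, num_workers=train_config.num_workers)

        # train now resolves to the training function again
        train_loss, train_acc = train(
            train_config=training_configuration,
            model=model,
            optimizer=optimizer,
            train_loader=train_loader,
            epoch_idx=epoch)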
Related
I created a for loop so I can run various batch sizes, where each iteration opens and closes a Neptune run. The first run works fine, but on the following runs the accuracy is not recorded into Neptune, and Python does not throw an error. Can anyone think what the problem may be?
for i in range(len(percentage)):
    run = neptune.init(
        project="xxx",
        api_token="xxx",
    )
    epochs = 600
    batch_perc = percentage[i]
    lr = 0.001
    sb = 64  # round((43249*batch_perc)*0.00185)
    params = {
        'lr': lr,
        'bs': sb,
        'epochs': epochs,
        'batch %': batch_perc
    }
    run['parameters'] = params

    torch.manual_seed(12345)
    td = 43249 * batch_perc
    vd = 0.1 * (43249 - td) + td
    train_dataset = dataset[:round(td)]
    val_dataset = dataset[round(td):round(vd)]
    test_dataset = dataset[round(vd):]

    print(f'Number of training graphs: {len(train_dataset)}')
    run['train'] = len(train_dataset)
    print(f'Number of validation graphs: {len(val_dataset)}')
    run['val'] = len(val_dataset)
    print(f'Number of test graphs: {len(test_dataset)}')
    run['test'] = len(test_dataset)

    train_loader = DataLoader(train_dataset, batch_size=sb, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=sb, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    model = GCN(hidden_channels=64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(1, epochs):
        train()
        train_acc = test(train_loader)
        run['training/batch/acc'].log(train_acc)
        val_acc = test(val_loader)
        run['training/batch/val'].log(val_acc)
Prince here.
Try using the stop() method to end the previous run: currently you are creating new run objects without stopping them, and that can cause problems like this.
for i in range(len(percentage)):
    run = neptune.init(
        project="xxx",
        api_token="xxx",
    )
    run['parameters'] = params
    run['train'] = len(train_dataset)
    run['val'] = len(val_dataset)
    run['test'] = len(test_dataset)
    ...
    for epoch in range(1, epochs):
        ...
        run['training/batch/acc'].log(train_acc)
        run['training/batch/val'].log(val_acc)
    run.stop()
Docs: https://docs.neptune.ai/api-reference/run#.stop
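A more defensive variant (a sketch, not from the original answer) wraps each iteration in try/finally so the run is stopped even if training raises partway through:

    for i in range(len(percentage)):
        run = neptune.init(
            project="xxx",
            api_token="xxx",
        )
        try:
            ...  # logging and training as above
        finally:
            run.stop()  # always release the run, even on errors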
I am trying to train on the CIFAR10 dataset in PyTorch. First I download the dataset and load it with the first two functions below, then I train with PyTorch. Eventually I receive this error, and I would appreciate any help fixing it. My code is long, so I have included a summary of the functions used in training.
too many values to unpack (expected 2)
def load_cifar10_batch(filename):
    """ Load a single batch from CIFAR10 """
    with open(filename, 'rb') as f:
        datadict = pickle.load(f, encoding='bytes')
        X = datadict[b'data']
        Y = datadict[b'labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
        Y = np.array(Y)
    return X, Y

def load_cifar10(dir):
    """ Load all batches of CIFAR10 """
    # load train batch files
    xs = []
    ys = []
    for i in range(1, 6):
        filename = os.path.join(dir, 'data_batch_%d' % i)
        X, Y = load_cifar10_batch(filename)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    # load test batch
    Xte, Yte = load_cifar10_batch(os.path.join(dir, 'test_batch'))
    return Xtr, Ytr, Xte, Yte

X_train, y_train, X_test, y_test = load_cifar10('cifar-10-batches-py')

'''we use just the test set, because the train set is too big a file to train on'''
from torch.utils.data import random_split
val_size = 3000
train_size = len(X_test) - val_size
train_ds, val_ds = random_split(X_test, [train_size, val_size])
len(train_ds), len(val_ds)

'''loading data'''
from torch.utils.data.dataloader import DataLoader
batch_size = 16
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size, num_workers=4, pin_memory=True)
'''our model'''
class Cifar10CnnModel(ImageClassificationBase):
    def __init__(self):
        ...  # network definition omitted in this summary

    def forward(self, xb):
        return self.network(xb)

'''ImageClassificationBase'''
class ImageClassificationBase(nn.Module):
    def training_step(self, batch):
        images, labels = batch
        out = self(images)                   # Generate predictions
        loss = F.cross_entropy(out, labels)  # Calculate loss
        accu = accuracy(out, labels)
        return loss, accu
def fit(model, train_loader, val_loader, epochs=2, learning_rate=0.001):
    best_valid = None
    history = []
    optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=0.0005)
    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_losses = []
        train_accuracy = []
        for batch in tqdm(train_loader):
            loss, accu = model.training_step(batch)
            train_losses.append(loss)
            train_accuracy.append(accu)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['train_accuracy'] = torch.stack(train_accuracy).mean().item()
        model.epoch_end(epoch, result)
        if best_valid is None or best_valid < result['Accuracy']:
            best_valid = result['Accuracy']
            torch.save(model.state_dict(), 'cifar10-cnn.pth')
        history.append(result)
    return history
'''But the call to this function to train on the dataset'''
history = fit(model, train_dl, val_dl)
'''gives this error'''
0%| | 0/438 [00:31<?, ?it/s]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [44], in <cell line: 1>()
----> 1 history = fit(model, train_dl, val_dl)
Input In [43], in fit(model, train_loader, val_loader, epochs, learning_rate)
9 train_accuracy = []
10 for batch in tqdm(train_loader):
---> 11 loss,accu = model.training_step(batch)
12 train_losses.append(loss)
13 train_accuracy.append(accu)
Input In [27], in ImageClassificationBase.training_step(self, batch)
7 def training_step(self, batch):
----> 8 images, labels = batch
9 out = self(images) # Generate predictions
10 loss = F.cross_entropy(out, labels) # Calculate loss
ValueError: too many values to unpack (expected 2)
You perform the split on X_test only, losing the labels this way: each batch then contains only images, so images, labels = batch tries to unpack all the images in the batch into two variables. Try something like:
dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
train_ds, val_ds = random_split(dataset, [train_size, val_size])
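A slightly fuller sketch of that wiring, assuming X_test is the (N, 32, 32, 3) float array and y_test the label vector returned by load_cifar10; the permute and dtype casts are additions (conv layers expect channels-first input, and F.cross_entropy expects int64 targets):

    import torch
    from torch.utils.data import TensorDataset, random_split, DataLoader

    # Pair images with labels so each batch unpacks into (images, labels)
    images = torch.from_numpy(X_test).permute(0, 3, 1, 2).float()  # NHWC -> NCHW
    labels = torch.from_numpy(y_test).long()                       # int64 for cross_entropy
    dataset = TensorDataset(images, labels)

    val_size = 3000
    train_size = len(dataset) - val_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])

    train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=16)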
I have written the following code to train a BERT model on my dataset, but when I execute it I get an error at the part where I iterate over the loader with tqdm. The entire training code is below, with a full description of the error. How do I fix this?
Code
Model
TRANSFORMERS = {
    "bert-multi-cased": (BertModel, BertTokenizer, "bert-base-uncased"),
}

class Transformer(nn.Module):
    def __init__(self, model, num_classes=1):
        """
        Constructor

        Arguments:
            model {string} -- Transformer to build the model on. Expects "camembert-base".
            num_classes {int} -- Number of classes (default: {1})
        """
        super().__init__()
        self.name = model

        model_class, tokenizer_class, pretrained_weights = TRANSFORMERS[model]

        bert_config = BertConfig.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        bert_config.output_hidden_states = True

        self.transformer = BertModel(bert_config)
        self.nb_features = self.transformer.pooler.dense.out_features

        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features),
            nn.Tanh(),
        )
        self.logit = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens

        Returns:
            torch tensor -- Class logits
        """
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long()
        )
        hidden_states = hidden_states[-1][:, 0]  # Use the representation of the first token of the last layer
        ft = self.pooler(hidden_states)
        return self.logit(ft)
Training
def fit(model, train_dataset, val_dataset, epochs=1, batch_size=8, warmup_prop=0, lr=5e-4):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=lr)
    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = epochs * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    loss_fct = nn.BCEWithLogitsLoss(reduction='mean').cuda()

    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        optimizer.zero_grad()
        avg_loss = 0

        for step, (x, y_batch) in tqdm(enumerate(train_loader), total=len(train_loader)):
            y_pred = model(x.to(device))
            loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            xm.optimizer_step(optimizer, barrier=True)
            # optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()

        model.eval()
        preds = []
        truths = []
        avg_val_loss = 0.

        with torch.no_grad():
            for x, y_batch in tqdm(val_loader):
                y_pred = model(x.to(device))
                loss = loss_fct(y_pred.detach().view(-1).float(), y_batch.float().to(device))
                avg_val_loss += loss.item() / len(val_loader)

                probs = torch.sigmoid(y_pred).detach().cpu().numpy()
                preds += list(probs.flatten())
                truths += list(y_batch.numpy().flatten())

        score = roc_auc_score(truths, preds)
        dt = time.time() - start_time
        lr = scheduler.get_last_lr()[0]
        print(f'Epoch {epoch + 1}/{epochs} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')
Error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<timed eval> in <module>
<ipython-input-19-e47eae808597> in fit(model, train_dataset, val_dataset, epochs, batch_size, warmup_prop, lr)
22 for step, (x, y_batch) in tqdm(enumerate(train_loader), total=len(train_loader)):
23
---> 24 y_pred = model(x.to(device))
25
26 loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
724 result = self._slow_forward(*input, **kwargs)
725 else:
--> 726 result = self.forward(*input, **kwargs)
727 for hook in itertools.chain(
728 _global_forward_hooks.values(),
<ipython-input-11-2002cc7ec843> in forward(self, tokens)
41 )
42
---> 43 hidden_states = hidden_states[-1][:, 0] # Use the representation of the first token of the last layer
44
45 ft = self.pooler(hidden_states)
TypeError: string indices must be integers
Your code is designed for an older version of the transformers library; this is the same underlying issue as in:
AttributeError: 'str' object has no attribute 'dim' in pytorch
As such, you will need to either downgrade to version 3.0.0 or adapt the code to deal with the new-format output of BERT.
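A minimal sketch of the second option, assuming transformers 4.x, where the model returns a ModelOutput object instead of a tuple (tuple-unpacking it yields its string keys, which is why the forward pass ends up indexing into a string):

    def forward(self, tokens):
        outputs = self.transformer(
            tokens, attention_mask=(tokens > 0).long()
        )
        # hidden_states is populated because output_hidden_states = True
        hidden_states = outputs.hidden_states  # tuple with one tensor per layer
        cls_token = hidden_states[-1][:, 0]    # first token of the last layer
        ft = self.pooler(cls_token)
        return self.logit(ft)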
I am currently training my neural network, but unfortunately I forgot to reserve some samples for validation. How can I incorporate this?
I have a dok matrix that creates a sparse matrix, and I then convert the data with get_train_samples(). How can I now incorporate the validation samples into my code?
Example from https://www.tensorflow.org/guide/keras/train_and_evaluate:
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]
.
.
.
print("Fit model on training data")
history = model.fit(
x_train,
y_train,
batch_size=64,
epochs=2,
# We pass some validation for
# monitoring validation loss and metrics
# at the end of each epoch
validation_data=(x_val, y_val),
)
My Code:
def get_train_samples(train_mat, num_negatives):
    user_input, item_input, labels = [], [], []
    num_user, num_item = train_mat.shape
    for (u, i) in train_mat.keys():
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_item)
            while (u, j) in train_mat.keys():
                j = np.random.randint(num_item)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels
.
.
.
train_mat = sp.load_npz('matrix.npz')
num_users, num_items = train_mat.shape
.
.
.
model = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf[0])
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)
hist = model.fit([np.array(user_input), np.array(item_input)], np.array(labels),
                 epochs=epochs, verbose=verbose, shuffle=True, batch_size=batch_size)
Edit
Try this:
train_mat = sp.load_npz('matrix.npz')
val_mat = train_mat[-10000:]
train_mat = train_mat[:-10000]
num_users, num_items = train_mat.shape
.
.
.
model = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf[0])
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)
val_user_input, val_item_input, val_labels = get_train_samples(val_mat, num_negatives)
hist = model.fit([np.array(user_input), np.array(item_input)],
                 np.array(labels),
                 epochs=epochs,
                 verbose=verbose,
                 validation_data=([np.array(val_user_input), np.array(val_item_input)],
                                  np.array(val_labels)),
                 shuffle=True, batch_size=batch_size)
You have to restart training. Otherwise your validation loss may not show overfitting.
I am a beginner working on a CNN for image classification, and I have a callback function as below:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_kappas = []

    def on_epoch_end(self, epoch, logs={}):
        X_val, y_val = self.validation_data[:2]
        y_val = y_val.sum(axis=1) - 1

        y_pred = self.model.predict(X_val) > 0.5
        y_pred = y_pred.astype(int).sum(axis=1) - 1

        _val_kappa = cohen_kappa_score(
            y_val,
            y_pred,
            weights='quadratic'
        )
        self.val_kappas.append(_val_kappa)
        print(f"val_kappa: {_val_kappa:.4f}")

        if _val_kappa == max(self.val_kappas):
            print("Validation Kappa has improved. Saving model.")
            self.model.save('/path_to/model.h5')
        return
When I trained the model:
kappa_metrics = Metrics()
history = model.fit(
    data_generator,
    steps_per_epoch=x_train.shape[0] / BATCH_SIZE,
    epochs=15,
    validation_data=(x_val, y_val),
    callbacks=[kappa_metrics]
)
I get an error. Unfortunately I don't understand what mistake I am making. Please note that I am a beginner with CNNs and Python.
I solved the problem using the following link. I am posting it here in case someone else is interested:
https://github.com/keras-team/keras/issues/10472
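For context, that issue discusses how newer Keras versions no longer populate self.validation_data on custom callbacks, so indexing it fails. A minimal sketch of the workaround under that assumption, passing the validation arrays to the callback explicitly (the constructor argument is my own addition, not from the original code):

    class Metrics(Callback):
        def __init__(self, validation_data):
            # Newer Keras no longer fills in self.validation_data for callbacks,
            # so store the validation arrays ourselves.
            super().__init__()
            self.validation_data = validation_data

        def on_train_begin(self, logs={}):
            self.val_kappas = []

        def on_epoch_end(self, epoch, logs={}):
            X_val, y_val = self.validation_data[:2]
            ...  # kappa computation as in the original callback

    kappa_metrics = Metrics(validation_data=(x_val, y_val))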