The code below does run, but it's very slow because it uses for loops. At my university, servers with GPU resources are available, so I'd also like to understand how to use batches to train the model more efficiently.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_items=len(movie_ids), n_factors=300):
        super().__init__()
        self.vectors = nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, i, j):
        feat_i = self.vectors(i)
        feat_j = self.vectors(j)
        result = (feat_i * feat_j).sum(-1)
        return result

model = MatrixFactorization(n_items=len(movie_ids), n_factors=300)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
epochs = 100

for epoch in range(epochs):
    loss = 0
    for r, c in zip(r_index, c_index):
        i = torch.LongTensor([int(r)])
        j = torch.LongTensor([int(c)])
        rating = torch.FloatTensor([Xij[i, j]])

        # predict
        prediction = model(i, j)
        loss += loss_fn(prediction, rating)

    # Reset the gradients to 0
    optimizer.zero_grad()

    # backpropagate
    loss.backward()

    # update weights
    optimizer.step()

    print(loss)
I've tried the alteration below, but it produced a warning. I'm not sure why my target sizes are mismatched, but that appears to be the cause of the issue.
epochs = 50

for epoch in range(epochs):
    loss = 0

    # predict
    i = torch.LongTensor(r_index)
    j = torch.LongTensor(c_index)
    ratings = Xij[i, j]
    prediction = model(i, j)
    loss += loss_fn(prediction, rating)

    # Reset the gradients to 0
    optimizer.zero_grad()

    # backpropagate
    loss.backward()

    # update weights
    optimizer.step()

    print(loss)
And the warning (not sure where I went wrong):
/anaconda3/lib/python3.6/site-packages/torch/nn/modules/loss.py:431: UserWarning: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([5931640])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
There is a typo in your second code snippet: the loss is computed against rating (presumably the single-element tensor left over from the first loop) instead of the new ratings tensor, which is why the target has size 1 while the prediction covers every rating. It should be
loss += loss_fn(prediction, ratings) # instead of rating
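For the batching itself (and for using a GPU), one common pattern is to put all (row, column, rating) triples into a TensorDataset and iterate over mini-batches with a DataLoader. This is only a sketch, assuming r_index, c_index, movie_ids and Xij as in the question and that Xij is a dense NumPy array; the batch size is an arbitrary choice. Note that plain Adam may reject the sparse gradients produced by nn.Embedding(..., sparse=True), so the sketch uses SparseAdam:

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MatrixFactorization(n_items=len(movie_ids), n_factors=300).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01)

rows = torch.LongTensor(r_index)
cols = torch.LongTensor(c_index)
ratings = torch.FloatTensor(Xij[r_index, c_index])  # assumes Xij is a dense NumPy array

loader = DataLoader(TensorDataset(rows, cols, ratings), batch_size=4096, shuffle=True)

for epoch in range(100):
    epoch_loss = 0.0
    for i, j, r in loader:
        i, j, r = i.to(device), j.to(device), r.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(i, j), r)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(epoch, epoch_loss / len(loader))

Each optimizer step now sees one mini-batch instead of either a single rating or the whole matrix, which keeps memory bounded while still using vectorized operations.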
I am trying to implement Optuna hyperparameter optimization for a PyTorch LSTM, but I do not know how to define my model correctly.
When I just use nn.Linear everything works fine, but when I use nn.LSTMCell I get the following error:
AttributeError: 'tuple' object has no attribute 'dim'
The error is raised because the LSTM cell returns a tuple, not a tensor. But I do not know how to fix it and cannot find an example of a PyTorch LSTM with Optuna optimization online.
Here is the model definition:
def build_model_custom(trail):
    # Suggest the number of layers of neural network model
    n_layers = trail.suggest_int("n_layers", 1, 3)
    layers = []
    in_features = 20

    for i in range(n_layers):
        # Suggest the number of units in each layer
        out_features = trail.suggest_int("n_units_l{}".format(i), 4, 18)
        layers.append(nn.LSTMCell(in_features, out_features))
        in_features = out_features

    layers.append(nn.Linear(in_features, 2))

    return nn.Sequential(*layers)
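One way to avoid the tuple error is to wrap nn.LSTMCell in a small module that returns only the hidden state, so it can sit inside nn.Sequential. This is only a sketch; the LSTMCellWrapper name is hypothetical and not part of the original code:

import torch.nn as nn

class LSTMCellWrapper(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.cell = nn.LSTMCell(in_features, out_features)

    def forward(self, x):
        # nn.LSTMCell returns a (hidden_state, cell_state) tuple;
        # pass only the hidden state on to the next layer
        h, c = self.cell(x)
        return h

With layers.append(LSTMCellWrapper(in_features, out_features)) in place of the bare nn.LSTMCell, the next layer receives a plain tensor again.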
I have implemented an example of Optuna optimizing an LSTM before; I hope it will help you:
def get_best_parameters(args, Dtr, Val):
    def objective(trial):
        model = TransformerModel(args).to(args.device)
        loss_function = nn.MSELoss().to(args.device)
        optimizer = trial.suggest_categorical('optimizer',
                                              [torch.optim.SGD,
                                               torch.optim.RMSprop,
                                               torch.optim.Adam])(
            model.parameters(), lr=trial.suggest_loguniform('lr', 5e-4, 1e-2))
        print('training...')
        epochs = 10
        val_loss = 0
        for epoch in range(epochs):
            train_loss = []
            for batch_idx, (seq, target) in enumerate(Dtr, 0):
                seq, target = seq.to(args.device), target.to(args.device)
                optimizer.zero_grad()
                y_pred = model(seq)
                loss = loss_function(y_pred, target)
                train_loss.append(loss.item())
                loss.backward()
                optimizer.step()
            # validation
            val_loss = get_val_loss(args, model, Val)
            print('epoch {:03d} train_loss {:.8f} val_loss {:.8f}'.format(epoch, np.mean(train_loss), val_loss))
            model.train()

        return val_loss

    sampler = optuna.samplers.TPESampler()
    study = optuna.create_study(sampler=sampler, direction='minimize')
    study.optimize(func=objective, n_trials=5)

    pruned_trials = study.get_trials(deepcopy=False,
                                     states=tuple([TrialState.PRUNED]))
    complete_trials = study.get_trials(deepcopy=False,
                                       states=tuple([TrialState.COMPLETE]))
    best_trial = study.best_trial
    print('val_loss = ', best_trial.value)
    for key, value in best_trial.params.items():
        print("{}: {}".format(key, value))
I implemented a solution myself. I am not sure if it's the most Pythonic, but it works.
Suggestions for improvement are welcome.
def train_and_evaluate(param, model, trail):
    # Load Data
    train_dataloader = torch.utils.data.DataLoader(Train_Dataset, batch_size=batch_size)
    Test_dataloader = torch.utils.data.DataLoader(Test_Dataset, batch_size=batch_size)

    criterion = nn.MSELoss()
    optimizer = getattr(optim, param['optimizer'])(model.parameters(), lr=param['learning_rate'])
    acc = nn.L1Loss()

    # Training Loop
    for epoch_num in range(EPOCHS):
        # Training
        total_loss_train = 0
        for train_input, train_target in train_dataloader:
            output = model.forward(train_input.float())
            batch_loss = criterion(output, train_target.float())
            total_loss_train += batch_loss.item()

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Evaluation
        total_loss_val = 0
        total_mae = 0
        with torch.no_grad():
            for test_input, test_target in Test_dataloader:
                output = model(test_input.float())
                batch_loss = criterion(output, test_target)
                total_loss_val += batch_loss.item()
                batch_mae = acc(output, test_target)
                total_mae += batch_mae.item()

        accuracy = total_mae / len(Test_Dataset)

        # Add prune mechanism
        trail.report(accuracy, epoch_num)
        if trail.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy
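For completeness, a hedged sketch of how build_model_custom and train_and_evaluate might be wired into an Optuna study; the search-space values and the number of trials are assumptions, not part of the original code:

import optuna

def objective(trial):
    # hypothetical search space; the keys mirror those read by train_and_evaluate
    param = {
        'optimizer': trial.suggest_categorical('optimizer', ['Adam', 'SGD', 'RMSprop']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
    }
    model = build_model_custom(trial)
    # train_and_evaluate reports to the trial and returns the validation MAE
    return train_and_evaluate(param, model, trial)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)
print(study.best_trial.params)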
After some days spent with PyTorch I ended up with a neural network that, despite being quite a good predictor, is extremely slow to learn. It is an MLP with 54 input neurons, 27 hidden neurons with a sigmoid activation function, and one linear output neuron. Currently, running the NN for 20,000 epochs takes around 20 minutes. I had some experience with a PyTorch MLP of the same architecture, but 'created from scratch' - without bias - which was worse in terms of predictive capability, but whose whole training took less than 30 s.
The reason I created the new NN is that my model is now much more flexible (changing the number of neurons, the number of layers, or the activation functions takes seconds). Also, I tried to use as many built-in tools as possible, so there was no problem with e.g. introducing bias to the neurons.
The code follows (I skipped the imports):
Hyperparameters:
hyperparam_input_neurons = 54
hyperparam_hidden_neurons_1 = 27
hyperparam_output_neurons = 1
param_learning_rate = 0.01
param_weight_decay = 1e-6
param_momentum = 0.9
param_epochs = 2000
param_test_data_fraction=0.5
loss_function = nn.MSELoss()
training data:
train = pd.read_csv('input.csv')
Xf = torch.tensor(train.values,dtype=torch.float)
res=pd.read_csv('output.csv')
yf=torch.tensor(res.values,dtype=torch.float)
ntrainingelems=int((len(yf)+1)*param_test_data_fraction)
Xt=Xf[:ntrainingelems]
yt=yf[:ntrainingelems]
Xv=Xf[ntrainingelems:]
yv=yf[ntrainingelems:]
traintensor = TensorDataset(Xt, yt)
validtensor = TensorDataset(Xv, yv)
trainloader = DataLoader(traintensor, batch_size=ntrainingelems, shuffle=False)
validloader = DataLoader(validtensor, batch_size=(len(yf)-ntrainingelems), shuffle=False)
NN definition:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(hyperparam_input_neurons, hyperparam_hidden_neurons_1)
        self.output = nn.Linear(hyperparam_hidden_neurons_1, hyperparam_output_neurons)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()
learning:
epoch_number = []
mse_loss_t = []
mse_loss_v = []

optimizer = optim.SGD(model.parameters(), lr=param_learning_rate, weight_decay=param_weight_decay, momentum=param_momentum, nesterov=True)

for epoch in range(1, param_epochs+1):
    train_loss, valid_loss = [], []
    epoch_number.append(int(epoch))

    model.train()
    for data, target in trainloader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    mse_loss_t.append(np.mean(train_loss))

    model.eval()
    for data, target in validloader:
        output = model(data)
        loss = loss_function(output, target)
        valid_loss.append(loss.item())
    mse_loss_v.append(np.mean(valid_loss))

    if epoch == 1 or epoch % 100 == 0:
        print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))
Do you have any idea what is wrong here, or how to make the learning quicker (at least 30x faster)?
FYI, the very quick-to-learn 'scratch-based' NN I mentioned earlier used a different definition of the forward method, so I guessed this might be the reason (see the code below)... but if I understand the documentation correctly, nn.Linear can also do fast parallel computation.
def forward(self, X):
    self.z = torch.matmul(X, self.W1)
    self.z2 = self.sigmoid(self.z)
    self.z3 = torch.matmul(self.z2, self.W2)
I think I finally found the issue - but your comments are welcome.
After fixing the torch tensor sizes (to avoid an (N, N) loss), I removed the DataLoader and fed the data in directly.
First change - in the NN object, flatten the output so it has size (N) instead of (N, 1):
def forward(self, x):
    x = self.hidden(x)
    x = torch.sigmoid(x)
    x = (self.output(x)).flatten()
    return x
Second change - changing the shape of the tensors:
traintensor_X = torch.squeeze(Xt,1)
traintensor_y = torch.squeeze(yt,1)
validtensor_X = torch.squeeze(Xv,1)
validtensor_y = torch.squeeze(yv,1)
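The reason these shape changes matter: with an (N, 1) prediction against an (N,) target, MSELoss broadcasts the difference to an (N, N) matrix, which is both wrong and slow. A tiny standalone illustration:

import torch
import torch.nn as nn

loss_fn = nn.MSELoss()
pred = torch.randn(4, 1)   # shape (N, 1), like the unflattened network output
target = torch.randn(4)    # shape (N,)

print((pred - target).shape)            # torch.Size([4, 4]) - broadcast blow-up
print(loss_fn(pred.flatten(), target))  # correct scalar loss on matching shapes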
Then the last, third change - removing the old DataLoaders and the "for data, target in trainloader" sub-loop. Instead, only one "main" loop is used (over epochs):
for epoch in range(1, param_epochs+1):  ## run the model for x epochs
    train_loss, valid_loss = [], []
    epoch_number.append(int(epoch))

    ## training part
    model.train()
    optimizer.zero_grad()
    ## 1. forward propagation
    output = model(traintensor_X)
    ## 2. loss calculation
    loss = loss_function(output, traintensor_y)
    ## 3. backward propagation
    loss.backward()
    ## 4. weight optimization
    optimizer.step()
    train_loss.append(loss.item())

    if epoch == param_epochs:
        print("T train size: ", traintensor_X.size())
        print("T target size: ", traintensor_y.size())
        print("T output size: ", output.size())
        print("T loss size: ", loss.size())

    ## loss at each epoch (training set)
    mse_loss_t.append(np.mean(train_loss))

    ## evaluation part
    model.eval()
    output = model(validtensor_X)
    loss = loss_function(output, validtensor_y)
    valid_loss.append(loss.item())
    ## loss at each epoch (validation set)
    mse_loss_v.append(np.mean(valid_loss))

    if epoch == param_epochs:
        print("V train size: ", validtensor_X.size())
        print("V target size: ", validtensor_y.size())
        print("V output size: ", output.size())
        print("V loss size: ", loss.size())

    if epoch == 1 or epoch % 100 == 0:
        print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))
I checked the dimensions of the tensors and they seem to be OK:
T train size: torch.Size([4289, 54])
T target size: torch.Size([4289])
T output size: torch.Size([4289])
V train size: torch.Size([2209, 54])
V target size: torch.Size([2209])
V output size: torch.Size([2209])
If you have any feedback, I'd really appreciate it. Maybe I've made some silly mistake? The RMSE values are really similar to the ones from the NN with DataLoaders.
I am creating a model to identify names of items with an RNN (LSTM).
I get the data, transform it, create batches, create the model, and create the train function correctly, but the training stops here (it doesn't work).
This is my code:
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    for x, y in get_batches(data, batch_size, seq_length):
        print ("the login the loob get_batches is succressfuly")
        counter += 1

        # One-hot encode our data and make them Torch tensors
        x = one_hot_encode(x, n_chars)
        inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

        if(train_on_gpu):
            inputs, targets = inputs.cuda(), targets.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output, targets.view(batch_size*seq_length))
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        opt.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for x, y in get_batches(val_data, batch_size, seq_length):
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_chars)
                x, y = torch.from_numpy(x), torch.from_numpy(y)

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, targets = x, y
                if(train_on_gpu):
                    inputs, targets = inputs.cuda(), targets.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output, targets.view(batch_size*seq_length))

                val_losses.append(val_loss.item())

            net.train()  # reset to train mode after iterating through validation data

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))
I don't know why.
Sometimes I get this error after many trials.
Please help me if you can.
You can find the notebook file here.
You duplicate this step; remove it and then try again.
I am trying to implement gradient accumulation for a Twitter sentiment analysis model using HuggingFace's BERT model. However, when I implement gradient accumulation with a batch size of 64, I get the dreaded "OOM" error. Oddly enough, when I run the same model with a batch size of 64 and no gradient accumulation, it trains right through. Does anyone know why this is and/or if my code is wrong?
batch_size = 32

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
vocabulary = tokenizer.get_vocab()
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# data preprocessing
tweets_pos = pd.read_csv('C:/1_Tweets.csv', sep=',', names=['Tweet', 'Sentiment'])
tweets_neg = pd.read_csv('C:/0_Tweets.csv', sep=',', names=['Tweet', 'Sentiment'])
data = pd.concat([tweets_pos, tweets_neg], axis=0)
data = data.sample(frac=1)

all_tweets = data['Tweet'].to_list()
all_sentiment = data['Sentiment'].to_list()
training_tweets = all_tweets[0:512]
training_labels = all_sentiment[0:512]

# create dataset
def create_dataset(tweets, labels):
    inputs_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for i in range(len(tweets)):
        encoded = tokenizer.encode_plus(tweets[i], max_length=512, pad_to_max_length=True, return_attention_mask=True, add_special_tokens=True)
        inputs_ids_list.append(encoded['input_ids'])
        token_type_ids_list.append(encoded['token_type_ids'])
        attention_mask_list.append(encoded['attention_mask'])
        label_list.append([labels[i]])

    ids_and_mask = {'input_ids': inputs_ids_list, 'token_type_ids': token_type_ids_list, 'attention_mask': attention_mask_list}
    return tf.data.Dataset.from_tensor_slices((ids_and_mask, label_list))

# create dataset of batch_size = 32
train_dataset = create_dataset(training_tweets, training_labels).batch(batch_size)

# Accumulate Gradients
num_epochs = 1
for i in range(num_epochs):
    print(f'Epoch: {i + 1}')
    total_loss = 0

    # get trainable variables
    train_vars = model.trainable_variables
    accum_gradient = [tf.zeros_like(this_var) for this_var in train_vars]

    for (batch, (tweets, labels)) in enumerate(train_dataset):
        labels = tf.dtypes.cast(labels, tf.float32)
        with tf.GradientTape() as tape:
            prediction = model(tweets, training=True)
            prediction = tf.dtypes.cast(prediction, tf.float32)
            loss_value = loss(y_true=labels, y_pred=prediction)
            total_loss += loss_value

        # get gradients of this tape
        gradients = tape.gradient(loss_value, train_vars)
        # Accumulate the gradients
        accum_gradient = [(acum_grad + grad) for acum_grad, grad in zip(accum_gradient, gradients)]

    # average gradients and apply the optimization step
    accum_gradient = [this_grad / batch_size for this_grad in accum_gradient]
    optimizer.apply_gradients(zip(accum_gradient, train_vars))

    epoch_loss = total_loss / batch_size
    print(f'Epoch loss: {epoch_loss}')
I know I'm a bit late to this, but your question almost answers itself.
When accumulating gradients, TensorFlow keeps the graphs in memory so it can calculate the gradients correctly; in other words, while accumulating you still have every forward pass you have already done in memory. When not accumulating and using a batch size of 64, TensorFlow flushes the graph after backpropagating through it.
This is why you can train with a batch size of 64 but not accumulate with 64. I'm not sure why you need to accumulate here at all, but if you do, consider downsizing your batch size a little.
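If the goal is an effective batch of 64 without holding that many forward passes at once, a common pattern is to accumulate gradients over several smaller micro-batches and apply the update once per group. A hedged sketch reusing the model, loss, optimizer and create_dataset from the question (accum_steps and the micro-batch size of 16 are assumptions):

accum_steps = 4   # 4 micro-batches of 16 ~ one effective batch of 64
micro_dataset = create_dataset(training_tweets, training_labels).batch(16)

train_vars = model.trainable_variables
accum_gradient = [tf.zeros_like(v) for v in train_vars]

for step, (tweets, labels) in enumerate(micro_dataset):
    labels = tf.dtypes.cast(labels, tf.float32)
    with tf.GradientTape() as tape:
        prediction = model(tweets, training=True)
        loss_value = loss(y_true=labels, y_pred=prediction)

    gradients = tape.gradient(loss_value, train_vars)
    accum_gradient = [a + g for a, g in zip(accum_gradient, gradients)]

    # apply once per group of micro-batches, then reset the accumulator
    if (step + 1) % accum_steps == 0:
        averaged = [a / accum_steps for a in accum_gradient]
        optimizer.apply_gradients(zip(averaged, train_vars))
        accum_gradient = [tf.zeros_like(v) for v in train_vars]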
I am new to PyTorch. I was trying to build a binary classifier on the Kepler dataset. The following is my dataset class.
class KeplerDataset(Dataset):
    def __init__(self, test=False):
        self.dataframe_orig = pd.read_csv(koi_cumm_path)

        if (test == False):
            self.data = df_numeric[( df_numeric.koi_disposition == 1 ) | ( df_numeric.koi_disposition == 0 )].values
        else:
            self.data = df_numeric[~(( df_numeric.koi_disposition == 1 ) | ( df_numeric.koi_disposition == 0 ))].values

        self.X_data = torch.FloatTensor(self.data[:, 1:])
        self.y_data = torch.FloatTensor(self.data[:, 0])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
Here, I created a custom classifier class with one hidden layer and a single output unit that produces the sigmoid probability of being in class 1 (planet).
class KOIClassifier(nn.Module):
    def __init__(self, input_dim, out_dim):
        super(KOIClassifier, self).__init__()
        self.linear1 = nn.Linear(input_dim, 32)
        self.linear2 = nn.Linear(32, 32)
        self.linear3 = nn.Linear(32, out_dim)

    def forward(self, xb):
        out = self.linear1(xb)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.relu(out)
        out = self.linear3(out)
        out = torch.sigmoid(out)
        return out
I then created a train_model function to optimize the loss using SGD.
def train_model(X, y):
    criterion = nn.BCELoss()
    optim = torch.optim.SGD(model.parameters(), lr=0.001)
    n_epochs = 100
    losses = []

    for epoch in range(n_epochs):
        y_pred = model.forward(X)
        loss = criterion(y_pred, y)
        losses.append(loss.item())
        optim.zero_grad()
        loss.backward()
        optim.step()

losses = []
for X, y in train_loader:
    losses.append(train_model(X, y))
But after performing the optimization over the train_loader, when I try predicting on the train_loader itself, the prediction values are much worse.
for features, y in train_loader:
    y_pred = model.predict(features)
    break

y_pred

> tensor([[4.5436e-02],
          [1.5024e-02],
          [2.2579e-01],
          [4.2279e-01],
          [6.0811e-02],
          .....
Why is my model not working properly? Is it a problem with the dataset, or am I doing something wrong in implementing the neural net? I will link my Kaggle notebook, since more context might be helpful. Please help.
You are optimizing many times (100 steps) on the first batch (the first samples) before moving on to the next samples, which means your model will overfit those few samples before it ever sees the next batch. Your training will then be very non-smooth, may diverge, and may end up far from the global optimum.
Usually, in a training loop you should:
1. go over all samples (this is one epoch)
2. shuffle your dataset so that you visit the samples in a different order (set your PyTorch training loader accordingly)
3. go back to 1. until you reach the max number of epochs
Also, you should not redefine your optimizer (or your criterion) each time.
Your training loop should look like this:
criterion = nn.BCELoss()
optim = torch.optim.SGD(model.parameters(), lr=0.001)
n_epochs = 100

def train_model():
    for X, y in train_loader:
        optim.zero_grad()
        y_pred = model.forward(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optim.step()

for epoch in range(n_epochs):
    train_model()
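To get the shuffling recommended above, the training DataLoader itself can reshuffle the samples every epoch; a small sketch (the batch size here is an assumption, since the question does not show how train_loader was built):

train_loader = torch.utils.data.DataLoader(KeplerDataset(test=False), batch_size=64, shuffle=True)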