I am attempting to retrieve the mean squared error of my training. within the original code based in TensorFlow, I am moving this code over to PyTorch (for research reasons).
the original TensorFlow code:
print("Calculating threshold")
x_opt_predictions = model.predict(x_opt)
print("Calculating MSE on optimization set...")
mse = np.mean(np.power(x_opt - x_opt_predictions, 2), axis=1)
print("mean is %.5f" % mse.mean())
print("min is %.5f" % mse.min())
print("max is %.5f" % mse.max())
print("std is %.5f" % mse.std())
tr = mse.mean() + mse.std()
the training method of pytorch:
def train(net, x_train, x_opt, BATCH_SIZE, EPOCHS, input_dim):
outputs = 0
mse = 0
optimizer = optim.SGD(net.parameters(), lr=0.001)
loss_function = nn.MSELoss()
loss = 0
for epoch in range(EPOCHS):
for i in tqdm(range(0, len(x_train), BATCH_SIZE)):
batch_y = x_opt[i:i + BATCH_SIZE]
outputs = net(batch_y)
loss = loss_function(outputs, batch_y)
print(f"Epoch: {epoch}. Loss: {loss}")
print("opt", x_opt.size(), "output", outputs.__sizeof__())
return np.mean(np.power(x_opt - outputs, 2), axis=1)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
as seen above the line "outputs" is not a numpy array of predictions, and acquiring this equivalent to generate the threshold
If there are any other (improved or missing) ways to acquire this value, appreciation in advance.
The variable output is a pytorch tensor to convert it to numpy all you have to change is change this line of code return np.mean(np.power(x_opt - outputs, 2), axis=1) to this return np.mean(np.power(x_opt - outputs.cpu().data.numpy(), 2), axis=1) That will convert the tensor to a numpy array. If you are not using cuda with your network you do not need the .cpu() part.
I'm trying to implement word2vec with negative sampling in python almost from scratch and quite new in neural networks and faced some issues. Would be very appreciate for any help.
So, I wrote simple nn with a forward pass. I didn't get which element have to have grad_fn, I'd been getting error like 'tensor have no grad_fn' until I add requires_grad_() on the returning value. Is that correct?
dataset = Word2VecNegativeSampling(data, num_negative_samples, 30000)
wordvec_dim = 10
class Word2VecNegativeSamples(nn.Module):
def __init__(self, num_tokens):
super(Word2VecNegativeSamples, self).__init__()
self.input = nn.Linear(num_tokens, 10, bias=False)
self.output = nn.Linear(10, num_tokens, bias=False)
self.num_tokens = num_tokens
def forward(self, input_index_batch, output_indices_batch):
Implements forward pass with negative sampling
input_index_batch - Tensor of ints, shape: (batch_size, ), indices of input words in the batch
output_indices_batch - Tensor if ints, shape: (batch_size, num_negative_samples+1),
indices of the target words for every sample
predictions - Tensor of floats, shape: (batch_size, num_negative_samples+1)
results = []
batch_size = len(input_index_batch)
for i in range(batch_size):
input_one_hot = torch.zeros(self.num_tokens)
input_one_hot[input_index_batch[i]] = 1
forward_result = self.output(self.input(input_one_hot))
results.append(torch.tensor([forward_result[out_index] for out_index in output_indices_batch[i]]))
return torch.stack(results).requires_grad_()
nn_model = Word2VecNegativeSamples(data.num_tokens())
After all i'm trying to train the model, but neither loss nor accuracy changing. Is the code for model prediction correct as well?
Here is training code:
def train_neg_sample(model, dataset, train_loader, optimizer, scheduler, num_epochs):
loss = nn.BCEWithLogitsLoss().type(torch.FloatTensor)
loss_history = []
train_history = []
for epoch in range(num_epochs):
model.train() # Enter train mode
loss_accum = 0
correct_samples = 0
total_samples = 0
for i_step, (inp, out, lab) in enumerate(train_loader):
prediction = model(inp, out)
loss_value = loss(prediction, lab)
_, indices = torch.max(prediction, 1)
correct_samples += torch.sum(indices == 0)
total_samples += lab.shape[0]
loss_accum += loss_value
ave_loss = loss_accum / i_step
train_accuracy = float(correct_samples) / total_samples
print("Epoch#: %i, Average loss: %f, Train accuracy: %f" % (epoch, ave_loss, train_accuracy))
return loss_history, train_history
If your loss function is not changing, it's highly probable that you register the wrong set of parameters to the optimizer. Can you post the code snippet where you initialize your model and optimizer? It is supposed to look like this:
nn_model = Word2VecNegativeSamples(data.num_tokens())
optimizer = optim.SGD(nn_model.parameters(), lr=0.001, momentum=0.9)
Why the loss function is always printing zero after the first epoch?
I suspect it's because of loss = loss_fn(outputs, torch.max(labels, 1)[1]).
But if I use loss = loss_fn(outputs, labels), I will get the error
RuntimeError: 0D or 1D target tensor expected, multi-target not supported
nepochs = 5
losses = np.zeros(nepochs)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(modell.parameters(), lr = 0.001)
for epoch in range(nepochs):
running_loss = 0.0
n = 0
for data in train_loader:
#single batch
if(n == 1):
inputs, labels = data
outputs = modell(inputs)
#loss = loss_fn(outputs, labels)
loss = loss_fn(outputs, torch.max(labels, 1)[1])
running_loss += loss.item()
n += 1
losses[epoch] = running_loss / n
print(f"epoch: {epoch+1} loss: {losses[epoch] : .3f}")
The model is:
def __init__(self, labels=10):
super(Classifier, self).__init__()
self.fc = nn.Linear(3 * 64 * 64, labels)
def forward(self, x):
out = x.reshape(x.size(0), -1)
out = self.fc (out)
return out
Any idea?
The labels are a 64 elements tensor like this:
tensor([[7],[1],[ 2],[3],[ 2],[9],[9],[8],[9],[8],[ 1],[7],[9],[2],[ 5],[1],[3],[3],[8],[3],[7],[1],[7],[9],[8],[ 8],[3],[7],[ 5],[ 1],[7],[3],[2],[1],[ 3],[3],[2],[0],[3],[4],[0],[7],[1],[ 8],[4],[1],[ 5],[ 3],[4],[3],[ 4],[8],[4],[1],[ 9],[7],[3],[ 2],[ 6],[4],[ 8],[3],[ 7],[3]])
Usually loss calculation is loss = loss_fn(outputs, labels) and here outputs is as following:
_ , outputs = torch.max(model(input), 1)
outputs = torch.max(predictions, 1)[0]
Common practice is modifying outputs instead of labels:
torch.max() returns a namedtuple (values, indices) where values is
the maximum value of each row of the input tensor in the given
dimension dim. And indices is the index location of each maximum value found (argmax).
In your code snippet the labels is not indices of the labels, so when you calculate the loss, the function should look like this:
loss = loss_fn(torch.max(outputs, 1)[0], labels)
I am doing this project in which i applied minibatches in neural network and calculating epoch cost:-
def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.0001, num_epochs = 1500,
minibatch_size = 32, print_cost = True):
seed = 3
costs = []
(n_x, m) = X_train.shape
n_y = Y_train.shape[0]
#create placeholder
X, Y = create_placeholder(n_x, n_y)
# init parameter
parameters = init_parameter()
# forward prop
Z3 = forward_prop(X, parameters)
# compute cost
cost = compute_cost(Z3, Y)
# optimizer
optimizer = tf.train.AdamOptimizer(learning_rate= learning_rate).minimize(cost)
# Initialize all variables
init = tf.global_variables_initializer()
with tf.Session() as sess:
for epoch in range(num_epochs):
epoch_cost = 0
num_minibatche = int(m/ minibatch_size)
seed = seed + 1
minibatches = random_mini_batches(X_train, Y_train,
minibatch_size, seed)
for minibatch in minibatches:
(minibatch_X, minibatch_Y) = minibatch
_, minibatch_cost = sess.run([optimizer, cost], feed_dict
= {X: minibatch_X, Y: minibatch_Y})
epoch_cost += minibatch_cost / num_minibatche
# Print the cost every epoch
if print_cost == True and epoch % 100 == 0:
print ("Cost after epoch " ,epoch, np.mean(epoch_cost))
if print_cost == True and epoch % 5 == 0:
# plot the cost
plt.xlabel('iterations (per tens)')
plt.title("Learning rate =" + str(learning_rate))
# save the parameters
parameters = sess.run(parameters)
print ("Parameters have been trained!")
# Calculate the correct predictions
correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y))
# Calculate accuracy on the test set
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test}))
return parameters
SO when i run this code i am getting this error on line:-
---->epoch_cost += minibatch_cost / num_minibatche
---->ValueError: operands could not be broadcast together with shapes (32,) (5,) (32,)
I took minibatche_size = 32 and number of training examples = 1381
But i am totally confused why i am getting this error.
The code you've posted is missing a lot of parts, like the whole model() function, so it's difficult to debug. But based on just what we have, some things here that are supposed to be scalars are in fact, arrays.
epoch_cost starts out as a scalar zero with epoch_cost = 0. Then you add some value to it, then try to print np.mean( epoch_cost ). Why do you take the mean of a scalar? Looks like the code was different earlier, and the migration to a scalar epoch_cost was not successful.
It is easy to imagine that minibatch_cost is returned as an array from TensorFlow - one cost value for each member of the batch. In that case you would need to apply np.mean() right there, like
epoch_cost += np.mean( minibatch_cost ) / num_minibatche
Maybe even num_minibatche somehow became a vector. It comes from
num_minibatche = int(m/ minibatch_size)
and minibatch_size is supposedly 32, so that's all right. But m comes from
(n_x, m) = X_train.shape
and we know nothing of X_train. Maybe m somehow became a vector, and in turn num_minibatche too. You will need to print the value for num_minibatche once calculated and make sure it's what it's supposed to be.
Hope this helps. If you post the whole code, I can help you more.
I'm training a neural network to recognize characters using the notMNIST dataset, but once I run it its accuracy stays relatively constant after each iteration.
I've tried lower the learning rate but it wasn't any different. What might be the problem?
I think the problem might be in the implementation of the tf.nn.relu() method, and how I calculate the predictions, since I'm fairly new at Tensor Flow and neural networks
Here is a screenshot of my program running and you can see that the accuracy on the training set, validation set, and test set are all pretty bad
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions,1) == np.argmax(labels,1))
/ predictions.shape[0])
with tf.Session(graph=graph) as session:
#this is a one-time operation which ensure the parameters get initialized
#we described in the graph: random weights for the matrix, zeros for the
for step in range(num_steps):
#run the computations. we tell .run() that we want to run the optimizer,
#and get the loss value and the training predictions returned as numpy
_, l, predictions = session.run([optimizer,loss, train_prediction])
if (step % 100 ==0):
print("loss at step %d: %f" % (step,l))
print("Training accuracy: %.1f%%" % accuracy(
predictions, train_labels[:train_subset,:]))
#calling .eval() on valid_prediction is basically like calling run(), but
#just to get that one numpy array. Note that it recomputes all its graph
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("test accuracy: %.1f%%" % accuracy(test_prediction.eval(),test_labels))
batch_size = 128
hidden_nodes = 1024
graph = tf.Graph()
with graph.as_default():
#input data. For the training data, we use a placeholder that will be fed
#at run time with a training minibatch
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size*image_size), name="td")
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels), name="tl")
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
weights1 = tf.Variable(
tf.truncated_normal([image_size*image_size, hidden_nodes]))
biases1 = tf.Variable(tf.zeros([hidden_nodes]))
weights2 =tf.Variable(
tf.truncated_normal([hidden_nodes, num_labels]))
biases2 = tf.Variable(tf.zeros([num_labels]))
#training computation.
relu1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
relu_out= tf.nn.relu(tf.matmul(relu1, weights2) + biases2)
loss = tf.reduce_mean(
optimizer = tf.train.GradientDescentOptimizer(0.25).minimize(loss)
#predictions for the training, validation, and test data
train_prediction = relu_out
valid_prediction = tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
test_prediction = tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)
num_steps = 3001
with tf.Session(graph=graph) as session:
for step in range(num_steps):
#pick an offset within the training data, which has been randomized.
#note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
#generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
#prepare a dictionary telling the session where to feed the minibatch.
#the key of the dictionary is the placeholder node of the graph to be fed,
#and the value is the numpy array to feed to it
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("minibatch loss at step %d: %f" % (step,l))
print("minibatch accuracy: %.1f%%" % accuracy(predictions,batch_labels))
print("validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
As I thought, the problem was the implementation of the relu() method.
In the computation segment I was using relu() 2 times where I should've been using it only once. After the change it ended up looking like this.
logits_1 = tf.matmul(tf_train_dataset, weights1) + biases1
relu1 = tf.nn.relu(logits_1)
logits_2 = tf.matmul(relu1, weights2) + biases2
And I changed the parameter logits in the loss variable from relu_out to logits_2.
loss = tf.reduce_mean(
And finally I changed the prediction variables in order to compute using logits_2 and not relu_out.
train_prediction = tf.nn.softmax(logits_2)
valid_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset,weights1) +biases1), weights2) + biases2)
test_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)
As you can see the accuracy gets up around 90%
Although I'm still not sure why implementing the relu() method twice was a problem. If I'm not wrong the relu() method returns either 0 or the value of the parameter it's given, so shouldnn't it be the same?
If anyone knows feel free to answer
Hi I am trying to modify the mnist example to match it to my dataset. I only try to use the mlp example and it gives a strange error.
Tha dataset is a matrix with 2100 rows and 17 columns, and the output should be one of the 16 possible classes. The error seems happening in the secon phase of the training. The model is build correctly (log info confirmed).
Here is the error log:
ValueError: y_i value out of bounds
Apply node that caused the error:
CrossentropySoftmaxArgmax1HotWithBias(Dot22.0, b, targets)
Toposort index: 33
Inputs types: [TensorType(float64, matrix), TensorType(float64, vector), >TensorType(int32, vector)]
Inputs shapes: [(100, 17), (17,), (100,)]
Inputs strides: [(136, 8), (8,), (4,)]
Inputs values: ['not shown', 'not shown', 'not shown']
Outputs clients: [[Sum{acc_dtype=float64}(CrossentropySoftmaxArgmax1HotWithBias.0)], [CrossentropySoftmax1HotWithBiasDx(Assert{msg='sm and dy do not have the same shape.'}.0, CrossentropySoftmaxArgmax1HotWithBias.1, targets)], []]
HINT: Re-running with most Theano optimization disabled could give you a >back-trace of when this node was created. This can be done with by >setting the Theano flag 'optimizer=fast_compile'. If that does not work, >Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Here is the code:
def build_mlp(input_var=None):
l_in = lasagne.layers.InputLayer(shape=(None, 16),
# Apply 20% dropout to the input data:
l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)
# Add a fully-connected layer of 800 units, using the linear rectifier, and
# initializing weights with Glorot's scheme (which is the default anyway):
l_hid1 = lasagne.layers.DenseLayer(
l_in_drop, num_units=10,
# We'll now add dropout of 50%:
l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=0.5)
# Another 800-unit layer:
l_hid2 = lasagne.layers.DenseLayer(
l_hid1_drop, num_units=10,
# 50% dropout again:
l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=0.5)
# Finally, we'll add the fully-connected output layer, of 10 softmax units:
l_out = lasagne.layers.DenseLayer(
l_hid2_drop, num_units=17,
# Each layer is linked to its incoming layer(s), so we only need to pass
# the output layer to give access to a network in Lasagne:
return l_out
def main(model='mlp', num_epochs=300):
# Load the dataset
print("Loading data...")
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
# Prepare Theano variables for inputs and targets
input_var = T.matrix('inputs')
target_var = T.ivector('targets')
# Create neural network model (depending on first command line parameter)
print("Building model and compiling functions...")
if model == 'cnn':
network = build_cnn(input_var)
elif model == 'mlp':
network = build_mlp(input_var)
elif model == 'lstm':
network = build_lstm(input_var)
print("Unrecognized model type %r." % model)
# Create a loss expression for training, i.e., a scalar objective we want
# to minimize (for our multi-class problem, it is the cross-entropy loss):
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
# We could add some weight decay as well here, see lasagne.regularization.
# Create update expressions for training, i.e., how to modify the
# parameters at each training step. Here, we'll use Stochastic Gradient
# Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(
loss, params, learning_rate=0.01, momentum=0.9)
# Create a loss expression for validation/testing. The crucial difference
# here is that we do a deterministic forward pass through the network,
# disabling dropout layers.
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
test_loss = test_loss.mean()
# As a bonus, also create an expression for the classification accuracy:
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
# Compile a function performing a training step on a mini-batch (by giving
# the updates dictionary) and returning the corresponding training loss:
train_fn = theano.function([input_var, target_var], loss, updates=updates)
# Compile a second function computing the validation loss and accuracy:
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
# Finally, launch the training loop.
print("Starting training...")
# We iterate over epochs:
for epoch in range(num_epochs):
# In each epoch, we do a full pass over the training data:
train_err = 0
train_batches = 0
start_time = time.time()
for batch in iterate_minibatches(X_train, y_train, 100, shuffle=True):
inputs, targets = batch
train_err += train_fn(inputs, targets)
train_batches += 1
# And a full pass over the validation data:
val_err = 0
val_acc = 0
val_batches = 0
for batch in iterate_minibatches(X_val, y_val, 100, shuffle=False):
inputs, targets = batch
err, acc = val_fn(inputs, targets)
val_err += err
val_acc += acc
val_batches += 1
# Then we print the results for this epoch:
print("Epoch {} of {} took {:.3f}s".format(
epoch + 1, num_epochs, time.time() - start_time))
print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
print(" validation accuracy:\t\t{:.2f} %".format(
val_acc / val_batches * 100))
# After training, we compute and print the test error:
test_err = 0
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 100, shuffle=False):
inputs, targets = batch
err, acc = val_fn(inputs, targets)
test_err += err
test_acc += acc
test_batches += 1
print("Final results:")
print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
print(" test accuracy:\t\t{:.2f} %".format(
test_acc / test_batches * 100))
I Figured out the problem:
my dataset does not have an output for every target, becouse it is too small! There are 17 target outputs but my dataset has only 16 different outputs, and it is missing examples of the 17th output.
In order to resolve this problem, just change the softmax with rectify,
from this:
l_out = lasagne.layers.DenseLayer(
l_hid2_drop, num_units=17,
to this:
l_out = lasagne.layers.DenseLayer(
l_hid2_drop, num_units=17,