I had a curious experience with Keras.
Info: input dataset shapes
16 features, 5000 observations
target variable: 1 dimension
Problem: Regression
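The data and targets arrays themselves are not included in the post; a synthetic stand-in with the stated shapes (purely an illustrative assumption) could be:
import numpy as np

# Hypothetical stand-in for the dataset described above:
# 5000 observations x 16 features, with a 1-dimensional regression target.
np.random.seed(0)
data = np.random.rand(5000, 16)               # observations x features
targets = data.sum(axis=1, keepdims=True)     # shape (5000, 1), arbitrary target
number_of_x_points = data.shape[1]            # 16, used as nx in the snippet below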
While writing code for students, I developed a toy network in plain TensorFlow using the following code (I know it is not a complete example, but I hope it gives you enough information):
import numpy as np
import tensorflow as tf

n1 = 15 # Number of neurons in layer 1
n2 = 15 # Number of neurons in layer 2
n3 = 15
nx = number_of_x_points
n_dim = nx
n4 = 1
stddev_f = 0.1
tf.set_random_seed(5)
W1 = tf.Variable(tf.random_normal([n1, n_dim], stddev=stddev_f))
b1 = tf.Variable(tf.constant(0.0, shape = [n1,1]) )
W2 = tf.Variable(tf.random_normal([n2, n1], stddev=stddev_f))
b2 = tf.Variable(tf.constant(0.0, shape = [n2,1]))
W3 = tf.Variable(tf.random_normal([n3,n2], stddev = stddev_f))
b3 = tf.Variable(tf.constant(0.0, shape = [n3,1]))
W4 = tf.Variable(tf.random_normal([n4,n3], stddev = stddev_f))
b4 = tf.Variable(tf.constant(0.0, shape = [n4,1]))
X = tf.placeholder(tf.float32, [nx, None]) # Inputs
Y = tf.placeholder(tf.float32, [1, None]) # Labels
Z1 = tf.nn.sigmoid(tf.matmul(W1, X) + b1) # n1 x n_dim * n_dim x n_obs = n1 x n_obs
Z2 = tf.nn.sigmoid(tf.matmul(W2, Z1) + b2) # n2 x n1 * n1 x n_obs = n2 x n_obs
Z3 = tf.nn.sigmoid(tf.matmul(W3, Z2) + b3)
Z4 = tf.matmul(W4, Z3) + b4
y_ = tf.sigmoid(Z4)
cost = tf.reduce_mean(tf.square(y_-Y))
learning_rate = 0.005
training_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
training_epochs = 1000
cost_history = []
train_x = np.transpose(data)
train_y = np.transpose(targets)
batch_size = 100  # not defined in the original snippet; 100 is assumed to match the Keras call below
for epoch in range(training_epochs+1):
    for i in range(0, train_x.shape[0], batch_size):
        x_batch = train_x[i:i + batch_size,:]
        y_batch = train_y[i:i + batch_size,:]
        sess.run(training_step, feed_dict = {X: x_batch, Y: y_batch})
    cost_ = sess.run(cost, feed_dict={X: train_x, Y: train_y})
    cost_history = np.append(cost_history, cost_)
    if (epoch % 5000 == 0):
        print("Reached epoch", epoch, "cost J =", cost_)
This code works quite well and takes about 5 seconds for 1000 epochs on my laptop. I then built the same network with Keras:
from tensorflow.keras import layers  # needed for layers.Dense below

model = tf.keras.Sequential()
model.add(layers.Dense(15, input_dim=16, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=tf.train.AdamOptimizer(0.005),
              loss='mse',
              metrics=['mae'])
# Training Phase
model.fit(train_x.transpose(), train_y.transpose()/100.0, epochs=1000, batch_size=100,verbose = 0)
This code takes 43 seconds. Does anyone have any idea why this is the case? I expected Keras to be slower, but not that much slower. What am I missing?
Thanks, Umberto
OK, I found the reason... It was my mistake. Due to a series of errors made while programming after midnight (...), I realized I was comparing batch gradient descent with mini-batch gradient descent. My apologies to everyone, and thanks to the commenter who noticed my mistake today...
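For reference, with the column-major layout used above (X shaped [features, None]), mini-batches have to be sliced along the observation axis; a minimal sketch of the corrected loop, reusing the names from the snippet above:
for epoch in range(training_epochs + 1):
    for i in range(0, train_x.shape[1], batch_size):     # iterate over observations (axis 1)
        x_batch = train_x[:, i:i + batch_size]
        y_batch = train_y[:, i:i + batch_size]
        sess.run(training_step, feed_dict={X: x_batch, Y: y_batch})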
If someone thinks this should be deleted, that is fine with me.
Now Keras and plain TF take exactly the same time. Thanks everyone for reading.
Best, Umberto
I have a vector x and want to compute a vector y such that y[j] = x[j]**2 using the neural network specified in TensorFlow below. It doesn't work very well; the error is high.
Am I doing something wrong?
Any help would be appreciated.
The code first generates data in Xtrain, Ytrain, Xtest, and Ytest, and then creates placeholder variables to get TensorFlow going.
It then specifies three hidden layers and one output layer. Then it trains, and Ypred, the prediction for Ytest, is computed using a feed dictionary.
import numpy as np
import tensorflow as tf
n = 10
k = 1000
n_hidden = 10
learning_rate = .01
training_epochs = 100000
Xtrain = []
Ytrain = []
Xtest = []
Ytest = []
for i in range(0,k,1):
    X = np.random.randn(1,n)[0]
    Xtrain += [X]
    Ytrain += [Xtrain[-1]**2]
    X = np.random.randn(1,n)[0]
    Xtest += [X]
    Ytest += [Xtest[-1]**2]
x = tf.placeholder(tf.float64,shape = (k,n))
y = tf.placeholder(tf.float64,shape = (k,n))
W1 = tf.Variable(tf.random_normal((n,n_hidden),dtype = tf.float64))
b1 = tf.Variable(tf.random_normal((n_hidden,),dtype = tf.float64))
x_hidden1 = tf.nn.sigmoid(tf.matmul(x,W1) + b1)
W2 = tf.Variable(tf.random_normal((n,n_hidden),dtype = tf.float64))
b2 = tf.Variable(tf.random_normal((n_hidden,),dtype = tf.float64))
x_hidden2 = tf.nn.sigmoid(tf.matmul(x_hidden1,W2) + b2)
W3 = tf.Variable(tf.random_normal((n,n_hidden),dtype = tf.float64))
b3 = tf.Variable(tf.random_normal((n_hidden,),dtype = tf.float64))
x_hidden3 = tf.nn.sigmoid(tf.matmul(x_hidden1,W3) + b3)  # note: this feeds x_hidden1, so x_hidden2 is never used
W4 = tf.Variable(tf.random_normal((n,n_hidden),dtype = tf.float64))
b4 = tf.Variable(tf.random_normal((n_hidden,),dtype = tf.float64))
y_pred = tf.matmul(x_hidden3,W4) + b4
penalty = tf.reduce_sum(tf.abs((y - y_pred)))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(penalty)
model = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(model)
    for i in range(0,training_epochs):
        sess.run(train_op,{x: Xtrain,y: Ytrain})
    Ypred = y_pred.eval(feed_dict = {x: Xtest})
Here is a simple modification of your code:
import numpy as np
import tensorflow as tf
n = 10
k = 1000
learning_rate = 1e-3
training_epochs = 100000
# It would be better to use PEP8 style throughout
# None here allows you to feed data with ANY batch size k
x = tf.placeholder(tf.float64, shape=(None, n))
y = tf.placeholder(tf.float64, shape=(None, n))
# Use the built-in layer constructors;
# note they use a different random initializer than your implementation
out = tf.layers.dense(x, 100)
out = tf.layers.batch_normalization(out)
# ReLU is better than sigmoid, there are a lot of articles about it
out = tf.nn.relu(out)
out = tf.layers.dense(out, 200)
out = tf.layers.batch_normalization(out)
out = tf.nn.relu(out)
out = tf.layers.dense(out, n)
# total loss = mean L1 over samples
# each sample is a vector of 10 values, so first sum along axis 1,
# then take the mean of the per-sample sums
l1 = tf.reduce_mean(tf.reduce_sum(tf.abs(y - out), axis=1))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(l1)
model = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(model)
    for i in range(training_epochs):
        xs = np.random.randn(k, n)
        ys = xs ** 2
        _, l1_value = sess.run(
            [train_op, l1],
            feed_dict={x: xs, y: ys})
        if (i + 1) % 10000 == 0 or i == 0:
            print('Current l1({}/{}) = {}'.format(
                i + 1, training_epochs, l1_value))
    xs = np.random.randn(k, n)
    ys = xs ** 2
    test_l1 = sess.run(l1, feed_dict={x: xs, y: ys})
    print('Total l1 at test = {}'.format(test_l1))
Output:
Current l1(1/100000) = 11.0853215657
Current l1(10000/100000) = 0.126037403282
Current l1(20000/100000) = 0.096445475666
Current l1(30000/100000) = 0.0719392853473
Current l1(40000/100000) = 0.0690671103719
Current l1(50000/100000) = 0.07661241544
Current l1(60000/100000) = 0.0743827124406
Current l1(70000/100000) = 0.0656016587469
Current l1(80000/100000) = 0.0675546809828
Current l1(90000/100000) = 0.0649035400487
Current l1(100000/100000) = 0.0583308788607
Total l1 at test = 0.0613149096968
The total penalty can be reduced further with a different architecture, learning rate, batch size, number of epochs, loss function, etc.
It looks like the architecture could be enlarged; training for a longer period of time should then get you down to around 1e-3.
You can find more information about how this works and how to tune it in the CS231n course.
P.S. A note about data feeding: some of the data I tested on may also have been seen during training. Because the task is simple this is OK here, but it is better to guarantee that no training sample ends up in the test set.
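A minimal sketch of one way to do that, assuming the same k and n as above: draw a single pool of inputs and split it.
import numpy as np

# Draw one pool and split it, so no training sample can reappear in the test set.
pool = np.random.randn(2 * k, n)
train_xs, test_xs = pool[:k], pool[k:]
train_ys, test_ys = train_xs ** 2, test_xs ** 2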
This code does a lot better. Anyone want to make any further improvements?
import numpy as np
import tensorflow as tf
n = 10
k = 1000
n_hidden = 50
learning_rate = .001
training_epochs = 100000
Xtrain = []
Ytrain = []
Xtest = []
Ytest = []
for i in range(0,k,1):
    X = np.random.randn(1,n)[0]
    Xtrain += [X]
    Ytrain += [Xtrain[-1]**2]
    X = np.random.randn(1,n)[0]
    Xtest += [X]
    Ytest += [Xtest[-1]**2]
x = tf.placeholder(tf.float64,shape = (k,n))
y = tf.placeholder(tf.float64,shape = (k,n))
W1 = tf.Variable(tf.random_normal((n,n_hidden),dtype = tf.float64))
b1 = tf.Variable(tf.random_normal((n_hidden,),dtype = tf.float64))
x_hidden1 = tf.nn.sigmoid(tf.matmul(x,W1) + b1)
W2 = tf.Variable(tf.random_normal((n_hidden,n_hidden),dtype = tf.float64))
b2 = tf.Variable(tf.random_normal((n_hidden,),dtype = tf.float64))
x_hidden2 = tf.nn.sigmoid(tf.matmul(x_hidden1,W2) + b2)
W3 = tf.Variable(tf.random_normal((n_hidden,n),dtype = tf.float64))
b3 = tf.Variable(tf.random_normal((n,),dtype = tf.float64))
y_pred = tf.matmul(x_hidden2,W3) + b3
penalty = tf.reduce_sum((y - y_pred)**2)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(penalty)
model = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(model)
    for i in range(0,training_epochs):
        sess.run(train_op,{x: Xtrain,y: Ytrain})
    Ypred = y_pred.eval(feed_dict = {x: Xtest})
I am trying to build a model that classifies my data, and I want to monitor the LOSS function:
loss = tf.reduce_mean(-(y_ * tf.log(y) + (1 - y_) * tf.log(1 - y)))
But as of now I only get NaN for the prediction, and the LOSS function also prints NaN.
np_labels = np.array(labels)
np_labels = np_labels.reshape([np_labels.shape[0], 1])
features = 910
hidden_layer_nodes = 100
x = tf.placeholder(tf.float32, [None, features])
y_ = tf.placeholder(tf.float32, [None, 1])
W1 = tf.Variable(tf.truncated_normal([features,hidden_layer_nodes], stddev=0.1))
b1 = tf.Variable(tf.constant(0.1, shape=[hidden_layer_nodes]))
z1 = tf.add(tf.matmul(x,W1),b1)
a1 = tf.nn.relu(z1)
W2 = tf.Variable(tf.truncated_normal([hidden_layer_nodes,1], stddev=0.1))
b2 = tf.Variable(0.)
z2 = tf.matmul(a1,W2) + b2
y = 1 / (1.0 + tf.exp(-z2))
loss =tf.reduce_mean(-(y_ * tf.log(y)+(1- y_)* tf.log (1-y)))
update = tf.train.AdamOptimizer(0.01).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(0,50):
    sess.run(update, feed_dict = {x:fvecs, y_:np_labels})
    print(sess.run(loss, feed_dict={x: fvecs, y_: np_labels}))
    # sess.run(update, feed_dict = {x:data_x, y_:data_y})
    # print(sess.run(loss, feed_dict={x: data_x, y_: data_y}))

print('prediction: ', y.eval(session=sess, feed_dict = {x:[[493.9, 702.6, .....
I want to print the loss.
Thanks
This is not a TensorFlow issue. It results from the very bad idea of implementing the loss function yourself.
import tensorflow as tf
z2 = tf.random_normal([8, 10]) * 20
y_ = tf.random_uniform([8, 1], minval=0, maxval=10, dtype=tf.float32)
y = 1 / (1.0 + tf.exp(-z2))
loss = tf.reduce_mean(-(y_ * tf.log(y)+(1- y_)* tf.log (1-y)))
with tf.Session() as sess:
    print(sess.run(loss))  # will fail with high probability
This will give Inf simply because the log-sum-exp trick is missing, which causes your implementation to fail due to numerical instability (a folklore example that produces overflows). Just run this code several times and you will get either NaN or Inf.
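To see the instability in isolation (a separate sketch, not part of the original answer): a naive log(sigmoid(z)) underflows to log(0) for strongly negative logits, while the mathematically equivalent softplus form stays finite.
import tensorflow as tf

z = tf.constant([-120.0, 0.0, 120.0])
naive = tf.log(tf.sigmoid(z))   # sigmoid underflows to 0 for z = -120, so the log returns -inf
stable = -tf.nn.softplus(-z)    # same quantity, computed without ever forming sigmoid(z)

with tf.Session() as sess:
    print(sess.run([naive, stable]))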
The solution would be:
replace y = 1 / (1.0 + tf.exp(-z2)) with y = tf.identity(z2) to just get the untransformed logits
replace loss = ... with loss = tf.nn.sigmoid_cross_entropy_with_logits(...) to use the numerically stable formulation
See the docs of sigmoid_cross_entropy_with_logits, which explicitly describe this issue.
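Applied to the toy snippet above, a sketch of the stable variant might look like this (shapes and label range adjusted so that labels and logits match, which the op requires):
import tensorflow as tf

z2 = tf.random_normal([8, 10]) * 20                                     # raw logits, possibly large
y_ = tf.random_uniform([8, 10], minval=0, maxval=1, dtype=tf.float32)   # targets in [0, 1]

# No explicit sigmoid here; the op applies it internally in a numerically stable form.
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=z2))

with tf.Session() as sess:
    print(sess.run(loss))  # finite, even for large logits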
I am trying to build a deep network using TF, with Martin Gorner's video as a reference. I had some success with the shallow network example; however, the deep network's accuracy collapses after reaching around 98% for some reason.
The network is used to recognise MNIST numerical characters using five layers. I am training with batches of 100 for 10000 iterations. The accuracy steadily increases until it reaches around 98%, then collapses completely to 9.8%.
Any ideas please?
"""Tensor flow character recognition of Numerals"""
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
# the first layer (K) will have 200 neurons, and so on
K = 200
L = 100
M = 60
N = 30
# ----- Initialization -----
# the None will become the batch size of 100
# 28 by 28 grayscale images described by a single byte
X = tf.placeholder(tf.float32, [None, 784])
# training will require computing variables W and b
W1 = tf.Variable(tf.truncated_normal([28*28, K], stddev=0.1))
B1 = tf.Variable(tf.zeros([K]))
W2 = tf.Variable(tf.truncated_normal([K, L], stddev=0.1))
B2 = tf.Variable(tf.zeros([L]))
W3 = tf.Variable(tf.truncated_normal([L, M], stddev=0.1))
B3 = tf.Variable(tf.zeros([M]))
W4 = tf.Variable(tf.truncated_normal([M, N], stddev=0.1))
B4 = tf.Variable(tf.zeros([N]))
W5 = tf.Variable(tf.truncated_normal([N, 10], stddev=0.1))
B5 = tf.Variable(tf.zeros([10]))
init = tf.global_variables_initializer()
# ----- Model -----
# the model Y = WX+b
# reshape is used to flatten the image into a 1D array of 784 locations
# -1 tells TensorFlow to infer that dimension, since there is only one solution
#Y = tf.nn.softmax(tf.matmul(tf.reshape(X, [-1, 784]), W) + b)
Y1 = tf.nn.relu(tf.matmul(X, W1) + B1)
Y2 = tf.nn.relu(tf.matmul(Y1, W2) + B2)
Y3 = tf.nn.relu(tf.matmul(Y2, W3) + B3)
Y4 = tf.nn.relu(tf.matmul(Y3, W4) + B4)
Y5 = tf.nn.softmax(tf.matmul(Y4, W5) + B5)
# placeholder for correct answers
# e.g. correct answer for 2 will be [0 0 1 0 0 0 0 0 0 0 ]
Y_ = tf.placeholder(tf.float32, [None, 10])
# the loss function
cross_entropy = tf.reduce_sum(Y_ * tf.log(Y5)) * -1
# ----- Success Metrics -----
# calculate the % of correct answers found in batch
is_correct = tf.equal(tf.argmax(Y5, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
# ----- Training Step -----
# pick an optimizer and tell it to minimize the cross entropy loss function
optimizer = tf.train.GradientDescentOptimizer(0.003)
train_step = optimizer.minimize(cross_entropy)
# create the execution session
sess = tf.Session()
sess.run(init)
for i in range(10000):
    # load a batch of images from mnist
    batch_X, batch_Y = mnist.train.next_batch(100)
    train_data = {X: batch_X, Y_: batch_Y}

    # ----- Execution -----
    # train
    sess.run(train_step, feed_dict=train_data)

    # test for success
    a, c = sess.run([accuracy, cross_entropy], feed_dict=train_data)

    # this is only to display information
    if i % 100 == 0:
        # check for success on whole data set
        test_data = {X: mnist.test.images, Y_: mnist.test.labels}
        a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
        print(a)
It is the accuracy on the validation set that collapses, right?
If so, you may be dramatically overfitting.
98% is possibly the best you can achieve with a network of this capacity/structure.
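One quick check, as a sketch reusing the variables from the code above: log training and test accuracy side by side and watch whether the gap between them grows.
# Inside the training loop, every 100 steps: compare accuracy on the current batch vs. the test set.
if i % 100 == 0:
    train_acc = sess.run(accuracy, feed_dict=train_data)
    test_acc = sess.run(accuracy, feed_dict={X: mnist.test.images, Y_: mnist.test.labels})
    print('step {:5d}  train acc {:.4f}  test acc {:.4f}'.format(i, train_acc, test_acc))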
I've been struggling with this pet problem for a while now, so any help would be appreciated!
I have a csv file, with a few random columns, and a final column that's based on the sum of the last few values from the first column. I'm trying to use an LSTM model to capture this structure, i.e. to predict the last column given the first few.
Here's the model I've been using:
# Generate test data
train_input = train_input.reshape(m, n_input, 1) # m is the number of rows, n_input is the number of input columns
NUM_EXAMPLES = int(m * training_size)
test_input = train_input[NUM_EXAMPLES:]
test_output = train_output[NUM_EXAMPLES:]
train_input = train_input[:NUM_EXAMPLES]
train_output = train_output[:NUM_EXAMPLES]
# Design model
data = tf.placeholder(tf.float32, [None, n_input, 1])
target = tf.placeholder(tf.float32, [None, n_classes])
num_hidden = 24
cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2])
last = tf.gather(val, int(val.get_shape()[0]) - 1)
weight = tf.Variable(tf.truncated_normal([num_hidden, int(target.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
cross_entropy = -tf.reduce_sum(target * tf.log(tf.clip_by_value(prediction,1e-10,1.0)))
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cross_entropy)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
no_of_batches = int(len(train_input)/batch_size)
for i in range(epoch):
    ptr = 0
    for j in range(no_of_batches):
        inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
        ptr += batch_size
        sess.run(minimize, {data: inp, target: out})
    print("Epoch - {}".format(i))
    incorrect = sess.run(error, {data: test_input, target: test_output})
    print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
sess.close()
I've tried several spreadsheets with random numbers, and I'm consistently getting around 83% error rate. On the other hand, this algorithm CAN learn if the target column is not sequential.
Thanks in advance!
I can't quite get your point. Do you mean you have a CSV file like this?
x1 x2 x3 x4 ... xn
v11 v21 v31 v41 ... vn1
v12 v22 v32 v42 ... vn2
...
v1n v2n v3n v4n ... vnn
y1 y2 y3 y4 ... yn
And yn is based on sum(vn1 + ... + vnn)? Something like a * sum(V) + b?
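If that is the structure, a purely hypothetical way to generate such a file for testing (my assumption about your setup, using a window of 3 over the first column) would be:
import numpy as np

# Hypothetical toy CSV: four random columns, plus a final column equal to the
# sum of the last `window` values of the first column.
n_rows, window = 200, 3
cols = np.random.rand(n_rows, 4)
y = np.array([cols[max(0, i - window + 1):i + 1, 0].sum() for i in range(n_rows)])
table = np.column_stack([cols, y])
np.savetxt('toy.csv', table, delimiter=',', header='x1,x2,x3,x4,y', comments='')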
I wrote a neural network in TensorFlow for the XOR input. I used one hidden layer with 2 units and softmax classification. The input is of the form <1, x_1, x_2, zero, one>, where
1 is the bias
x_1 and x_2 are values for each of the combinations {00, 01, 10, 11}, drawn from normal distributions centered at 0 or 1 (see the example rows after this list)
zero: is 1 if the output is zero
one: is 1 if the output is one
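For clarity, example rows in this format might look like the following (my own illustration, not taken from the code):
import numpy as np

# One illustrative row per XOR case, in the format <1, x_1, x_2, zero, one>:
# bias, two noisy inputs near 0/1, then the one-hot label.
rows = np.array([
    # bias, x_1,  x_2,  zero, one
    [1.0, 0.02, 0.01, 1.0, 0.0],   # 0 XOR 0 = 0
    [1.0, 0.01, 0.98, 0.0, 1.0],   # 0 XOR 1 = 1
    [1.0, 1.03, 0.00, 0.0, 1.0],   # 1 XOR 0 = 1
    [1.0, 0.99, 1.01, 1.0, 0.0],   # 1 XOR 1 = 0
])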
The accuracy is always around 0.5. What has gone wrong? Is the architecture of the neural network wrong, or is there something wrong with the code?
import tensorflow as tf
import numpy as np
from random import randint
DEBUG=True
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def model(X, weight_hidden, weight_output):
    # [1,3] x [3,n_hiddent_units] = [1,n_hiddent_units]
    hiddern_units_output = tf.nn.sigmoid(tf.matmul(X, weight_hidden))
    # [1,n_hiddent_units] x [n_hiddent_units, 2] = [1,2]
    return hiddern_units_output
    #return tf.matmul(hiddern_units_output, weight_output)

def getHiddenLayerOutput(X, weight_hidden):
    hiddern_units_output = tf.nn.sigmoid(tf.matmul(X, weight_hidden))
    return hiddern_units_output
total_inputs = 100
zeros = tf.zeros([total_inputs,1])
ones = tf.ones([total_inputs,1])
around_zeros = tf.random_normal([total_inputs,1], mean=0, stddev=0.01)
around_ones = tf.random_normal([total_inputs,1], mean=1, stddev=0.01)
batch_size = 10
n_hiddent_units = 2
X = tf.placeholder("float", [None, 3])
Y = tf.placeholder("float", [None, 2])
weight_hidden = init_weights([3, n_hiddent_units])
weight_output = init_weights([n_hiddent_units, 2])
hiddern_units_output = getHiddenLayerOutput(X, weight_hidden)
py_x = model(X, weight_hidden, weight_output)
#cost = tf.square(Y - py_x)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y))
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(cost)
with tf.Session() as sess:
    tf.global_variables_initializer().run()

    trX_0_0 = sess.run(tf.concat([ones, around_zeros, around_zeros, ones, zeros], axis=1))
    trX_0_1 = sess.run(tf.concat([ones, around_zeros, around_ones, zeros, ones], axis=1))
    trX_1_0 = sess.run(tf.concat([ones, around_ones, around_zeros, zeros, ones], axis=1))
    trX_1_1 = sess.run(tf.concat([ones, around_ones, around_ones, ones, zeros], axis=1))
    trX = sess.run(tf.concat([trX_0_0, trX_0_1, trX_1_0, trX_1_1], axis=0))
    trX = sess.run(tf.random_shuffle(trX))
    print(trX)

    for i in range(10):
        for start, end in zip(range(0, len(trX), batch_size), range(batch_size, len(trX) + 1, batch_size)):
            trY = tf.identity(trX[start:end,3:5])
            trY = sess.run(tf.reshape(trY,[batch_size, 2]))
            sess.run(train_op, feed_dict={ X: trX[start:end,0:3], Y: trY })
        start_index = randint(0, (total_inputs*4)-batch_size)
        y_0 = sess.run(py_x, feed_dict={X: trX[start_index:start_index+batch_size,0:3]})
        print("iteration :",i, " accuracy :", np.mean(np.absolute(trX[start_index:start_index+batch_size,3:5]-y_0)),"\n")
Check the comments section for the updated code.
The problem was with the randomly assigned weights. Here is the modified version, obtained after a series of trial and error.
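The modified code itself is not included in this excerpt; purely as an illustration of the kind of change described (re-scaling the randomly initialized weights), the initializer might be adjusted like this:
def init_weights(shape):
    # Hypothetical tweak: a larger stddev than 0.01, so the sigmoid units do not
    # all start out nearly identical (the exact values used in the actual fix
    # are not shown here).
    return tf.Variable(tf.random_normal(shape, stddev=1.0))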