Super high cost Tensorflow - python

I'm trying to make some price predictions on a Kaggle dataset with TensorFlow.
My neural network is learning, but my cost function is really high and my predictions are far from the real output.
I tried to change my network by adding or removing some layers, neurons and activation functions.
I also experimented a lot with my hyperparameters, but that didn't change much.
I don't think the problem comes from my data; I checked on Kaggle and it's the dataset most people use.
If you have any idea why my cost is so high and how to reduce it, and if you could explain it to me, that would be really great!
Here's my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.utils import shuffle
df = pd.read_csv(r"C:\Users\User\Documents\TENSORFLOW\Prediction prix\train2.csv", sep=';')
df.head()
df = df.loc[:, ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'SalePrice']]
df = df.replace(np.nan, 0)
df
%matplotlib inline
plt = sns.pairplot(df)
plt
df = shuffle(df)
df_train = df[0:1000]
df_test = df[1001:1451]
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 500
display_step = 50
n_samples = inputX.shape[0]
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def add_layer(inputs, in_size, out_size, activation_function=None):
    Weights = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.1))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        output = Wx_plus_b
    else:
        output = activation_function(Wx_plus_b)
    return output
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*n_samples)
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = batch_size
        # Loop over all batches
        for i in range(total_batch):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: inputX,
                                                          y: inputY})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=",
                  "{:.9f}".format(avg_cost))
    print("Optimization Finished!")
    # Test model
    correct_prediction = tf.equal(pred, y)
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: inputX, y: inputY}))
    print(sess.run(pred, feed_dict={x: inputX_test}))
Epoch: 0001 cost= 10142407502702304395526144.000000000
Epoch: 0051 cost= 3256106752.000019550
Epoch: 0101 cost= 3256106752.000019550
Epoch: 0151 cost= 3256106752.000019550
Epoch: 0201 cost= 3256106752.000019550
...
Thanks for your help!

I see a couple of problems with the implementation:
The inputs are not scaled.
Use sklearn's StandardScaler to scale the inputs inputX and inputY (and also inputX_test and inputY_test) so they have zero mean and unit variance. You can use inverse_transform to convert the outputs back to the original scale afterwards.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputX_test = sc.transform(inputX_test)
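The working code below fits a second scaler, sc1, on the targets; as a minimal sketch of the inverse step (not part of the original answer), you would map the network's outputs back to real prices like this:
scaled_preds = sess.run(pred, feed_dict={x: inputX_test})   # predictions on the scaled targets
price_preds = sc1.inverse_transform(scaled_preds)           # back to the original SalePrice scale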
The batch_size is too large: you are passing the entire set as a single batch. This should not cause the particular problem you are facing, but for better convergence try a reduced batch size. Implement a get_batch() generator function and do the following:
for batch_X, batch_Y in get_batch(input_X, input_Y, batch_size):
    _, c = sess.run([optimizer, cost], feed_dict={x: batch_X,
                                                  y: batch_Y})
Try smaller Weights initialization (stddev) if you still see issues.
WORKING CODE BELOW:
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
sc1 = StandardScaler().fit(inputY)
inputY = sc1.transform(inputY)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputX_test = sc.transform(inputX_test)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
inputY_test = sc1.transform(inputY_test)

learning_rate = 0.01
training_epochs = 1000
batch_size = 50
display_step = 50
n_samples = inputX.shape[0]

x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])

def get_batch(inputX, inputY, batch_size):
    duration = len(inputX)
    for i in range(0, duration//batch_size):
        idx = i*batch_size
        yield inputX[idx:idx+batch_size], inputY[idx:idx+batch_size]

def add_layer(inputs, in_size, out_size, activation_function=None):
    Weights = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.005))
    biases = tf.Variable(tf.zeros([1, out_size]))
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        output = Wx_plus_b
    else:
        output = activation_function(Wx_plus_b)
    return output

l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_mean(tf.pow(tf.subtract(pred, y), 2))
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = batch_size
        # Loop over all batches
        #for i in range(total_batch):
        for batch_x, batch_y in get_batch(inputX, inputY, batch_size):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, _l1, _pred = sess.run([optimizer, cost, l1, pred], feed_dict={x: batch_x, y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f} ".format(avg_cost))
            #print(_l1, _pred)
    print("Optimization Finished!")

I have already had a similar problem: a very high cost reached after a few training steps, with the cost then staying constant there. For me it was a kind of overflow, with gradients that were too big and created NaN values quite early in training. I solved it by starting with a smaller learning rate (potentially much smaller) until the cost and gradients became more reasonable (a few dozen steps), and then going back to a regular one (higher at the start, potentially decaying).
See my answer to this post for a similar case that was solved just by using a smaller learning rate at the start.
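As a minimal sketch of that warm-up idea (the names lr and current_lr and the step counts are illustrative, not from the original code), you can feed the learning rate through a placeholder and raise it once training stabilises:
lr = tf.placeholder(tf.float32, [])
optimizer = tf.train.GradientDescentOptimizer(lr).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(training_epochs):
        # start very small, switch to the regular rate once cost and gradients look reasonable
        current_lr = 1e-5 if epoch < 50 else 1e-2
        _, c = sess.run([optimizer, cost], feed_dict={x: inputX, y: inputY, lr: current_lr})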
You can also clip your gradients to avoid this problem, using tf.clip_by_value. It sets a minimum and maximum value for your gradients, which prevents huge gradients from sending your weights straight to NaN after the first few iterations. To use it (with min and max at -1 and 1, which is probably too tight), replace
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
by
opt= tf.train.GradientDescentOptimizer(learning_rate)
gvs = opt.compute_gradients(cost)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)
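An alternative worth knowing (shown here as a sketch, not part of the original answer) is tf.clip_by_global_norm, which rescales all gradients together only when their combined norm exceeds a threshold, instead of capping each element independently:
opt = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = opt.compute_gradients(cost)
grads, variables = zip(*grads_and_vars)
clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)   # 5.0 is an arbitrary example threshold
optimizer = opt.apply_gradients(zip(clipped_grads, variables))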

Related

LSTM+FFN performs more poorly than FFN

I am building several simple networks to predict bike rentals at 500 stations in the upcoming hour, given the rentals at all stations in the previous 24 hours. I am working with two architectures: one with a graph convolution (which amounts to updating each station with a learned linear combination of the other stations, at each hour) and an FNN layer for the prediction, and a second with a graph convolution -> LSTM -> FNN for the prediction.
Before I describe more: I'm getting poorer performance from the model which includes an LSTM unit, which is confusing me.
See these two images for a description of each architecture. For each architecture I also add hourly meta-data (weather, time, etc.) as a variation; it is shown in red in the images and is not relevant to my question. Image links are at the bottom of the post.
[Architecture 1: GCNN + FNN][1]
[Architecture 2: GCNN + LSTM + FNN][2]
Confusingly, the test RMSE for the first model is 3.46, while for the second model it's 3.57. Could someone please explain why the second wouldn't be lower, since it seems to run the exact same process, just with an additional LSTM unit?
Here are relevant snippets of my code for the GCNN+FNN model:
def gcnn_ddgf(hidden_layer, node_num, feature_in, horizon, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training, Y_training, X_val, Y_val, X_test, Y_test, scaler, display_step):
    n_output_vec = node_num * horizon  # length of output vector at the final layer
    early_stop_k = 0  # early stop patience
    best_val = 10000
    traing_error = 0
    test_error = 0
    pred_Y = []

    tf.reset_default_graph()
    batch_size = batch_size
    early_stop_th = early_stop_th
    training_epochs = training_epochs

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # y is the regression output

    # define dictionaries to store layers weight & bias
    weights_hidden = {}
    weights_A = {}
    biases = {}

    vec_length = feature_in
    weights_hidden['h1'] = tf.Variable(tf.random_normal([vec_length, hidden_layer], stddev=0.5))
    biases['b1'] = tf.Variable(tf.random_normal([1, hidden_layer], stddev=0.5))
    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([hidden_layer, horizon], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, horizon], stddev=0.5))

    # Construct model
    pred = gcn(X, weights_hidden, weights_A, biases, node_num, horizon)  # see below
    pred = scaler.inverse_transform(pred)
    Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y_original, 2)))
    #optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            num_train = X_training.shape[0]
            total_batch = int(num_train/batch_size)
            for i in range(total_batch):
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
                                                              Y: Y_training[i*batch_size:(i+1)*batch_size,]})
                avg_cost_sq += np.square(c) * batch_size  #/ total_batch
            # rest part of training dataset
            if total_batch * batch_size != num_train:
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
                                                              Y: Y_training[total_batch*batch_size:num_train,]})
                avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)
            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})
            if c_val < best_val:
                # testing
                c_tes, preds, Y_true = sess.run([cost, pred, Y_original], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                test_error = c_tes
                traing_error = avg_cost
                pred_Y = preds
                early_stop_k = 0  # reset to 0
            # update early stopping patience
            if c_val >= best_val:
                early_stop_k += 1
            # threshold
            if early_stop_k == early_stop_th:
                break
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "Training RMSE: ", "{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

    print("epoch is ", epoch)
    print("training RMSE is ", traing_error)
    print("Optimization Finished! the lowest validation RMSE is ", best_val)
    print("The test RMSE is ", test_error)
    return best_val, pred_Y, Y_true, test_error

# code that creates the model
def gcn(signal_in, weights_hidden, weights_A, biases, node_num, horizon):
    signal_in = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    feature_len = signal_in.shape[2]  # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1])  # node_num, batch*feature_in

    Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in)  # node_num, batch*feature_in
    Z = tf.reshape(Z, [-1, int(feature_len)])  # node_num * batch, feature_in
    signal_output = tf.add(tf.matmul(Z, weights_hidden['h1']), biases['b1'])
    signal_output = tf.nn.relu(signal_output)  # node_num * batch, hidden_vec
    final_output = tf.add(tf.matmul(signal_output, weights_hidden['out']), biases['bout'])  # node_num * batch, horizon
    # final_output = tf.nn.relu(final_output)
    final_output = tf.reshape(final_output, [node_num, -1, horizon])  # node_num, batch, horizon
    final_output = tf.transpose(final_output, [1, 0, 2])  # batch, node_num, horizon
    final_output = tf.reshape(final_output, [-1, node_num*horizon])  # batch, node_num*horizon
    return final_output
And the code for the GCNN+LSTM+FNN model:
def gcnn_ddgf_lstm(node_num, feature_in, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training,
                   Y_training, X_val, Y_val, X_test, Y_test, scaler, lstm_layer):
    n_output_vec = node_num  # length of output vector at the final layer
    early_stop_k = 0  # early stop patience
    display_step = 1  # frequency of printing results
    best_val = 10000
    traing_error = 0
    test_error = 0
    predic_res = []

    tf.reset_default_graph()
    batch_size = batch_size
    early_stop_th = early_stop_th
    training_epochs = training_epochs

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # y is the regression output

    lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_layer, state_is_tuple=True)

    # define dictionaries to store layers weight & bias
    weights_hidden = {}
    weights_A = {}
    biases = {}

    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['h1'] = tf.Variable(tf.random_normal([lstm_layer, node_num], stddev=0.5))
    biases['h1'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))

    # Construct model
    pred = gcn_lstm(X, weights_hidden, weights_A, biases, node_num, lstm_cell)
    # pred = scaler.inverse_transform(pred)
    # Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y, 2)))
    #optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            num_train = X_training.shape[0]
            total_batch = int(num_train/batch_size)
            for i in range(total_batch):
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
                                                              Y: Y_training[i*batch_size:(i+1)*batch_size,]})
                avg_cost_sq += np.square(c) * batch_size  #/ total_batch
            # rest part of training dataset
            if total_batch * batch_size != num_train:
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
                                                              Y: Y_training[total_batch*batch_size:num_train,]})
                avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)
            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})
            if c_val < best_val:
                c_tes, preds = sess.run([cost, pred], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                # save model
                #saver.save(sess, './bikesharing_gcnn_ddgf')
                test_error = c_tes
                traing_error = avg_cost
                early_stop_k = 0  # reset to 0
            # update early stopping patience
            if c_val >= best_val:
                early_stop_k += 1
            # threshold
            if early_stop_k == early_stop_th:
                pred_Y = scaler.inverse_transform(preds)
                Y_true = scaler.inverse_transform(Y_test)
                test_err = tf.sqrt(tf.reduce_mean(tf.pow(pred_Y - Y_true, 2)))
                break
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "Training RMSE: ", "{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

    print("epoch is ", epoch)
    print("training RMSE is ", traing_error)
    print("Optimization Finished! the lowest validation RMSE is ", best_val)
    print("The scaled test RMSE is ", test_error)
    return pred_Y, Y_true

def gcn_lstm(signal_in, weights_hidden, weights_A, biases, node_num, lstm_cell):
    signal_in = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    feature_len = signal_in.shape[2]  # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1])  # node_num, batch*feature_in

    Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in)  # node_num, batch*feature_in
    Z = tf.reshape(Z, [node_num, -1, int(feature_len)])  # node_num, batch, feature_in
    Z = tf.transpose(Z, [1, 2, 0])  # batch, feature_in, node_num

    # init_state = cell.zero_state(batch_size, tf.float32)
    _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype=tf.float32)  # init_state?
    dense_output = tf.add(tf.matmul(Z[1], weights_hidden['h1']), biases['h1'])
    dense_output = tf.nn.relu(dense_output)
    final_output = tf.add(tf.matmul(dense_output, weights_hidden['out']), biases['bout'])  # batch, node_num*horizon
    return final_output
In particular, should I be wary that _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype=tf.float32) causes my variables defined elsewhere not to train?
Thanks a lot for any help :)
[1]: https://i.stack.imgur.com/MAO2t.png
[2]: https://i.stack.imgur.com/UDjHw.png
I resolved this.
I have three years of bike-use data to make the prediction, and I was using roughly the last three months as my validation/test set. Those last few months were winter, with lower bike use. I got the expected results (GCNN+LSTM outperforms GCNN, though not by much) when I shuffled my data prior to allocating it to the sets (with the sequences preserved for the LSTM).
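A minimal sketch of that idea (the helper and variable names are illustrative, assuming hourly rentals in an array of shape (hours, stations)): build the 24-hour windows first, then shuffle whole windows before splitting, so each window's internal order stays intact.
import numpy as np
def make_windows(rentals, window=24):
    X, y = [], []
    for t in range(window, len(rentals)):
        X.append(rentals[t - window:t])   # previous 24 hours, all stations
        y.append(rentals[t])              # next hour, all stations
    return np.array(X), np.array(y)
X, y = make_windows(rentals)
idx = np.random.permutation(len(X))       # shuffle whole windows, not individual hours
X, y = X[idx], y[idx]
split = int(0.8 * len(X))
X_train, X_val = X[:split], X[split:]
Y_train, Y_val = y[:split], y[split:]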

Getting Error while running sigmoid cost function

I have training data with 1000 rows. I am using TensorFlow to train on this data, and I am also trying to divide it into mini-batches of size 32. While training the data, I am getting the error below:
InvalidArgumentError: Incompatible shapes: [1000] vs. [32]
[[{{node logistic_loss_1/mul}}]]
On the contrary, if I don't divide my training data into minibatches, or use a single minibatch of size 1000, the code works fine.
I have defined the weights as tf.Variables and am running the TensorFlow session. See the code below.
def sigmoid_cost(z, Y):
    print("Entered Cost")
    z = tf.squeeze(z)
    Y = tf.cast(Y_train, tf.float64)
    logits = tf.transpose(z)
    labels = (Y)
    print(logits.shape)
    print(labels.shape)
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))

def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.0001,
          num_epochs = 1500, minibatch_size = 32, print_cost = True):
    hidden_layer = 4
    m, n = X_train.shape
    n_y = Y_train.shape[0]

    X = tf.placeholder(tf.float64, shape=(None, n), name="X")
    Y = tf.placeholder(tf.float64, shape=(None), name="Y")

    parameters = init_params(n)
    z4, parameters = fwd_model(X, parameters)
    cost = sigmoid_cost(z4, Y)

    num_minibatch = m/minibatch_size
    print("Getting Minibatches")
    num_minibatch = tf.cast(num_minibatch, tf.int32)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost)
    print("Gradient Defination Done")

    init = tf.global_variables_initializer()
    init_op = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        sess.run(init_op)
        for epoch in range(0, num_epochs):
            minibatches = []
            minibatches = minibatch(X_train, Y_train, minibatch_size)
            minibatch_cost = 0
            for i in range(0, len(minibatches)):
                (X_m, Y_m) = minibatches[i]
                Y_m = np.squeeze(Y_m)
                print("Minibatch %d X shape Y Shape ", i, X_m.shape, Y_m.shape)
                _, minibatch_cost = sess.run([optimizer, cost], feed_dict={X: X_m, Y: Y_m})
                print("Mini Batch Cost is ", minibatch_cost)
            epoch_cost = minibatch_cost/num_minibatch
            if print_cost == True and epoch % 100 == 0:
                print("Cost after epoch %i: %f" % (epoch, epoch_cost))
                print(epoch_cost)
For some reason, while running the cost function, the size of either the X or the Y batch is being taken as 32 or 1000, and vice versa. Any help would be appreciated.
I think you are getting the above error because of the Y = tf.cast(Y_train, tf.float64) line inside the sigmoid_cost function. Here, Y_train has 1000 rows, but the loss function is expecting 32 (your batch size).
It should be Y = tf.cast(Y, tf.float64). In fact, there is no need to cast the data type here at all, as Y is already of type tf.float64. Check the line below:
Y = tf.placeholder(tf.float64,shape=(None),name="Y")
That's why your code was working fine when you were using a single minibatch of size 1000 (the full Y_train data).
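A minimal sketch of the corrected function with that fix applied (and the redundant cast dropped); this is not the poster's final code:
def sigmoid_cost(z, Y):
    # z: logits from the forward pass; Y: the placeholder fed with the current minibatch
    z = tf.squeeze(z)
    logits = tf.transpose(z)
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logits))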

Tensorflow Linear Regression Not Converging to Correct Values

I have a CSV data set with 2 columns, an input column and an output column, and when I use Excel to find the trend line, I get:
y = -0.4571x + 0.9011
When I run the following code, w and b converge to different values depending on the learning rate and batch size I choose. I have played around with different values without any luck. Maybe I am missing something?
The cost doesn't seem to change either.
learningRate = 0.001
epochs = 2000
batchSize = 20
df = pd.read_csv("C:\\Users\\Brian\\Desktop\\data.csv")
X = df[df.columns[0]].values
Y = df[df.columns[1]].values
def getBatch(batchSize, inputs, outputs):
    idx = np.arange(0, len(inputs))
    np.random.shuffle(idx)
    idx = idx[:batchSize]
    xBatch = [inputs[i] for i in idx]
    yBatch = [outputs[i] for i in idx]
    xBatch = np.reshape(xBatch, (batchSize, 1))
    return np.asarray(xBatch), np.asarray(yBatch)

w = tf.Variable(0.0, tf.float32)
b = tf.Variable(0.0, tf.float32)
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)

prediction = tf.add(tf.multiply(x, w), b)
cost = tf.reduce_sum(tf.square(prediction-y))
optimizer = tf.train.GradientDescentOptimizer(learningRate).minimize(cost)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(epochs):
        xBatch, yBatch = getBatch(batchSize, X, Y)
        #for (trainX, trainY) in zip(xBatch, yBatch):
        sess.run(optimizer, feed_dict={x: xBatch, y: yBatch})
        if (epoch+1) % 50 == 0:
            c = sess.run(cost, feed_dict={x: X, y: Y})
            print("Epoch:", (epoch+1), "cost=", "{:.4f}".format(c), "w=", sess.run(w), "b=", sess.run(b))
    print("Optimization Finished")
    trainingCost = sess.run(cost, feed_dict={x: X, y: Y})
    print("Training cost=", trainingCost, "w=", sess.run(w), "b=", sess.run(b))
When I run the following code, w and b converge to different values depending on the learning rate and batch size I choose.
That's because, when you run sess.run(optimizer, feed_dict={x: xBatch, y: yBatch}), TensorFlow does something like the following:
w -= learningRate * dw
where dw is the value calculated by the gradient descent optimizer.
So if you change learningRate and then run the program, you get a different value of w. Moreover, w affects dw, and dw affects the next w, so it's difficult to predict what value w will end up at if you change the learningRate.
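As a rough NumPy sketch of what one such step does for this particular model (w_val and b_val stand for the current variable values; this is illustrative, assuming numpy is imported as np as in the question, and not TensorFlow internals):
w_val, b_val = sess.run([w, b])                 # current values of the variables
xb = xBatch.ravel()                             # flatten so it matches yBatch's shape
residual = (w_val * xb + b_val) - yBatch        # prediction error on the sampled batch
dw = np.sum(2 * residual * xb)                  # gradient of the reduce_sum cost w.r.t. w
db = np.sum(2 * residual)                       # gradient w.r.t. b
w_val -= learningRate * dw                      # the update the optimizer applies
b_val -= learningRate * db
Because the residual depends on which batch was sampled and the step size depends on learningRate, different batch sizes and learning rates lead to different sequences of updates, and therefore to different final values of w and b.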

Making batch tensorflow

So I have this problem of making batches in my code. The thing is, I tried to search for how batching is done, but all I found were methods like next_batch in the MNIST sample program. I would really appreciate it if someone could give me some tips on how to make batches in my program below.
import tensorflow as tf
import numpy as np
from sklearn import cross_validation
import pandas as pd
np.random.seed(20160612)
tf.set_random_seed(20160612)
#this is input data, data is a 7x86594 and label is a 5x86594
data2 = pd.read_csv('rawdata.csv', sep=',', header=None)
data = np.array(data2)
label2=pd.read_csv('class.csv', sep='\t', header=None)
label=np.array(label2)
train_x,test_x,train_t,test_t=cross_validation.train_test_split(data,label,test_size=0.1,random_state=None)
#this is supposed to be neural size in hidden layer
num_units = 15
x = tf.placeholder(tf.float32, [None, 7])
t = tf.placeholder(tf.float32, [None, 5])
w1 = tf.Variable(tf.truncated_normal([7, num_units], mean=0.0, stddev=0.05))
b1 = tf.Variable(tf.zeros([num_units]))
hidden1 = tf.nn.relu(tf.matmul(x, w1) + b1)
w0 = tf.Variable(tf.zeros([num_units, 5]))
b0 = tf.Variable(tf.zeros([5]))
p = tf.nn.softmax(tf.matmul(hidden1, w0) + b0)
loss = -tf.reduce_sum(t * tf.log(tf.clip_by_value(p,1e-10,1.0)))
train_step = tf.train.AdamOptimizer().minimize(loss)
correct_prediction = tf.equal(tf.argmax(p, 1), tf.argmax(t, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
#this is how i think batching is
batch_size = 100
for j in range(0, 86594, batch_size):
    xs, ys = train_x[j:j+batch_size], train_t[j:j+batch_size]
    i = 0
    for _ in range(4000):
        i += 1
        sess.run(train_step, feed_dict={x: xs, t: ys})
        if i % 100 == 0:
            loss_val, acc_val = sess.run([loss, accuracy], feed_dict={x: test_x, t: test_t})
            print('Step: %d, Loss: %f, Accuracy: %f' % (i, loss_val, acc_val))
The result of this program, of course, isn't right.
Keep extracting batches of your data and feeding them to the network for training. In each epoch, all the samples of your training dataset should be run once. So you can rewrite your code like this:
Required part of the code only:
epochs = 4000
batch_size = 100
for epoch_no in range(epochs):
    for index, offset in enumerate(range(0, 86594, batch_size)):
        xs, ys = train_x[offset: offset + batch_size], train_t[offset: offset + batch_size]
        sess.run(train_step, feed_dict={x: xs, t: ys})
        if index % 100 == 0:
            loss_val, acc_val = sess.run([loss, accuracy], feed_dict={x: test_x, t: test_t})
            print('Epoch %d, Step: %d, Loss: %f, Accuracy: %f' % (epoch_no, index, loss_val, acc_val))
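Optionally (this goes beyond the original answer, but it is common practice), you can also reshuffle the training set at the top of each epoch so the batches differ from epoch to epoch, for example:
for epoch_no in range(epochs):
    perm = np.random.permutation(train_x.shape[0])     # new sample order every epoch
    train_x, train_t = train_x[perm], train_t[perm]
    # ... then iterate over the batches as above ...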

unable to get the updated value of tensor in tensorflow

I used the code below for simple logistic regression. I was able to get the updated value of b: the values of b.eval() before/after training are different. However, the value of W.eval() remains the same. I was wondering what mistake I made? Thank you!
from __future__ import print_function
import tensorflow as tf
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
learning_rate = 0.01
training_epochs = 20
batch_size = 100
display_step = 1
# tf Graph Input
x = tf.placeholder(tf.float32, [None, 784]) # mnist data image of shape 28*28=784
y = tf.placeholder(tf.float32, [None, 10]) # 0-9 digits recognition => 10 classes
# Set model weights
W = tf.Variable(tf.random_normal([784, 10]))
b = tf.Variable(tf.zeros([10]))
# Construct model
pred = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax
# Minimize error using cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    print('W is:')
    print(W.eval())
    print('b is:')
    print(b.eval())
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished!")
    print('W is:')
    print(W.eval())
    print('b is:')
    print(b.eval())
    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
When we print a NumPy array, only the initial and last values get printed. In the case of MNIST, those indices of the weight matrix are not updating because the corresponding pixels in the images remain constant: all the digits are written in the centre part of the array/image, not along the boundary regions.
The pixels that actually vary from one input sample to another are the centre pixels, so only the corresponding weight elements get updated.
To compare the weights before and after training you can use numpy.array_equal(w1, w2),
or you can print the whole NumPy array by doing:
import numpy
numpy.set_printoptions(threshold=numpy.inf)
or you can compare them element by element, and print only those values of the array which differ by a certain threshold.
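A minimal sketch of that comparison (variable names are illustrative), taking snapshots inside the session before and after the training loop:
w_before = sess.run(W)                               # snapshot right after sess.run(init)
# ... training loop runs here ...
w_after = sess.run(W)                                # snapshot after training
print(numpy.array_equal(w_before, w_after))          # False if anything changed
diff = numpy.abs(w_after - w_before)
changed = diff > 1e-6                                # arbitrary example threshold
print("number of updated weights:", numpy.sum(changed))
print("updated values:", w_after[changed])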
