Background:
I am trying to create a MLP using Tensorflow, my first time using tensorflow. It is a simple NN that will do the xor operation. I have 2 input neurons (for the 1s and 0s) a hidden layer that is 2 neurons wide. One output that will give me a 1 or 0. My activation is a simple sigmoid.
The Issue
I am running into an issue with launching the graph. Something I noticed is that when we launch the graph we get all of the batch instead of one at a time. for example I have the following in an array [[1,0],[0,0],[0,1],[1,1]]. When I try to start the graph I do the following:
x_vals = np.array([[1.0, 0.0],[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
result = run_nn(x,y)
with tf.Session() as sess:
sess.run(init)
results = sess.run(result, feed_dict={x: x_vals, y:y_vals})
print results
As we can see I feed in x and y into the neural network. Once I do this I need to multiply the weights * outputs (essentially the input [1,0]) and sum them The issue is that I get a mismatch in size between the x values and the weights array:
tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(l0.weights) , l0.outputs), 1))
InvalidArgumentError: Incompatible shapes: [2,3] vs. [4,3]
[[Node: Mul_6 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](transpose_10, concat_12)]]
What am I doing wrong here, I understand this is not a perfect implementation. But I want to do a NN step by step
Here is my full code:
import math
import numpy as np
momentum = 0.5
learning_rate = 2.0
class layer:
def __init__(self, num_neurons, num_weights, layer_type):#number of weights corresponds to number of neurons in next layer
self.num_neurons = num_neurons
self.num_weights = num_weights
self.layer_type = layer_type
if layer_type == 'hidden':
num_neurons = num_neurons+1#account for bias
self.num_neurons = num_neurons+1
self.weights = tf.random_normal([num_neurons, num_weights])
self.outputs = tf.zeros(num_neurons, tf.float32)
self.sums = tf.zeros(num_neurons, tf.float32)
self.deltas = tf.zeros(num_neurons, tf.float32)
self.gradiants = tf.zeros([num_neurons, num_weights], tf.float32)
self.weight_deltas = tf.zeros_like(self.gradiants)
def calculate_sums(self, p_layer):
self.sums = tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(p_layer.weights) , p_layer.outputs), 1))
return self.sums
def calculate_outputs(self, p_layer):
if self.layer_type == 'hidden':
self.outputs = tf.concat([sigmoid(self.sums, False), tf.constant([1.0])], 0)
else:
self.outputs = sigmoid(self.sums, False)
return self.outputs
def calculate_deltas(self, n_layer = None, y=None):
if self.layer_type == 'hidden':
self.deltas = sigmoid(self.sums, True) * n_layer.deltas * self.weights[:-1,0]
else:#output delta
E = self.outputs[:self.num_neurons]-y
#print 'error: {}'.format(E)
self.deltas = -E* sigmoid(self.sums, True)
return self.deltas
def calculate_gradiants(self, n_layer):
shape = (tf.shape(self.outputs)[0], 1)
self.gradiants += tf.reshape(self.outputs, shape=shape) * tf.transpose(n_layer.deltas)#we add the gradiants for every batch completion then update, dont want to update every time
return self.gradiants
def update_weights(self):
self.weight_deltas = self.gradiants*learning_rate + momentum * self.weight_deltas
self.weights += self.weight_deltas
# for i in range(len(self.gradiants)):
# for j in range(len(self.gradiants[0])):
# self.weight_deltas[i,j] = weight_change(self.gradiants[i,j], self.weight_deltas[i,j])
# self.weights[i,j] += self.weight_deltas[i,j]
def sigmoid(x, derivative = False):
if derivative == True:
return (1.0/(1+tf.exp(-x))) * (1.0 - (1.0/(1+tf.exp(-x))))
return 1.0/(1+tf.exp(-x))
#the output delta is just E*f'i, essentially the error * the derivative of the activation function
def weight_change(g, p_w_delta):#gradiant, previous weight delta
return learning_rate*g + momentum * p_w_delta
def run_nn(x_val, y_val):
l0.outputs = tf.concat([x_val, tf.ones(shape=(tf.shape(x_val)[0],1))], 1)
print 'set output'
#forward pass
# l1.calculate_sums(l0)
# print 'l1 calc sum'
# l1.calculate_outputs(l0)
# print 'l1 calc output'
# ol.calculate_sums(l1)
# print 'ol calc sum'
# ol.calculate_outputs(l1)
# print 'ol calc output'
# #backwards pass
# ol.calculate_deltas(y=y_val)
# print 'ol calc deltas'
# l1.calculate_deltas(ol)
# print 'l1 calc deltas'
# l1.calculate_gradiants(ol)
# print 'l1 calc gradiants'
# l0.calculate_gradiants(l1)
# print 'l0 calc gradiants'
# #we dont want to update the weights every time, just after we have gone through every batch/minibatch
# l1.update_weights()
# print 'l1 update weights'
# l0.update_weights()
# print 'l0 uipdate weights'
# l1.gradiants = tf.zeros_like(l1.gradiants)
# print 'l1 zero gradiants'
# l0.gradiants = tf.zeros_like(l0.gradiants)
# print 'l0 zero gradiants'
# #test
# print 'run test'
# l0.outputs = tf.concat([x, tf.constant([1.0])], 0 )
# #forward pass
# l1.calculate_sums(l0)
# l1.calculate_outputs(l0)
#
# ol.calculate_sums(l1)
# ol.calculate_outputs(l1)
# print 'DONE'
return tf.transpose(tf.reduce_sum(tf.multiply(tf.transpose(l0.weights) , l0.outputs), 1))
l0 = layer(2,2,'hidden')#input
l1 = layer(2,1,'hidden')#hidden
ol = layer(1,0,'output')#output
x_vals = np.array([[1.0, 0.0],[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
# initialize variables
init = tf.global_variables_initializer()
x = tf.placeholder('float', None)
y = tf.placeholder('float', None)
result = run_nn(x,y)
with tf.Session() as sess:
sess.run(init)
results = sess.run(result, feed_dict={x: x_vals, y:y_vals})
print results
Here is some equivalent code in pure python/numpy
import math
import numpy as np
momentum = 0.5
learning_rate = 2.0
class layer:
def __init__(self, num_neurons, num_weights, layer_type):#number of weights corresponds to number of neurons in next layer
self.layer_type = layer_type
if layer_type == 'hidden':
num_neurons = num_neurons+1#account for bias
self.weights = np.random.rand(num_neurons,num_weights)
self.outputs = np.zeros(shape=(1,num_neurons))
self.sums = np.zeros(shape=(1,num_neurons))
self.deltas = np.zeros(shape=(1,num_neurons)).T
self.gradiants = np.zeros(shape=(num_neurons,num_weights))
self.weight_deltas = np.zeros_like(self.gradiants)
def calculate_sums(self, p_layer):
self.sums = np.array([(sum(p_layer.weights * p_layer.outputs))]).T
return self.sums;
def calculate_outputs(self, p_layer):
if self.layer_type == 'hidden':
self.outputs = np.concatenate((np.array([[sigmoid(X, False)] for X in self.sums]), np.array([[1.0]])))
else:
self.outputs = np.array([[sigmoid(X, False)] for X in self.sums])
return self.outputs
def calculate_deltas(self, n_layer = None):
if self.layer_type == 'hidden':
self.deltas = np.array([[sigmoid(X, True)] for X in self.sums]) * n_layer.deltas * self.weights[:-1]
else:#output delta
E = self.outputs-y
#print 'error: {}'.format(E)
self.deltas = -E* sigmoid(self.sums, True)
return self.deltas
def calculate_gradiants(self, n_layer):
self.gradiants += self.outputs * n_layer.deltas.T#we add the gradiants for every batch completion then update, dont want to update every time
return self.gradiants
def update_weights(self):
for i in range(len(self.gradiants)):
for j in range(len(self.gradiants[0])):
self.weight_deltas[i,j] = weight_change(self.gradiants[i,j], self.weight_deltas[i,j])
self.weights[i,j] += self.weight_deltas[i,j]
def sigmoid(x, derivative = False):
if derivative == True:
return (1.0/(1+math.exp(-x))) * (1.0 - (1.0/(1+math.exp(-x))))
return 1.0/(1+math.exp(-x))
#the output delta is just E*f'i, essentially the error * the derivative of the activation function
def weight_change(g, p_w_delta):#gradiant, previous weight delta
return learning_rate*g + momentum * p_w_delta
input_layer = layer(3,2, 'hidden')
hidden_layer1 = layer(2,1, 'hidden')
output_layer = layer(1,0, 'output')
x_vals = []
y_vals = []
for i in range(2):
for j in range(2):
for k in range(2):
x_vals.append(np.array([[float(i)],[float(j)],[float(k)]]))
y_vals.append(np.array([float(i ^ j ^ k)]))
#x_vals = [np.array([[1.0], [0.0]]), np.array([[0.0], [0.0]]), np.array([[0.0], [1.0]]),np.array([[1.0], [1.0]])]
#y_vals = np.array([[1.0],[0.0],[1.0],[0.0]])
#input_layer.weights = np.array([[-0.06782947598673161,0.9487814395569221],[0.22341077197888182,0.461587116462548], [-0.4635107399577998, 0.09750161997450091]])
#hidden_layer1.weights = np.array([[-0.22791948943117624],[0.581714099641357], [0.7792991203673414]])
Error = []
for n in range(10000):
for x, y in zip(x_vals, y_vals):
input_layer.outputs = np.concatenate((x, np.array([[1.0]])))
#forward pass
hidden_layer1.calculate_sums(input_layer)
hidden_layer1.calculate_outputs(input_layer)
output_layer.calculate_sums(hidden_layer1)
output_layer.calculate_outputs(hidden_layer1)
Error.append(-(output_layer.outputs-y))
#backwards pass
output_layer.calculate_deltas()
hidden_layer1.calculate_deltas(output_layer)
hidden_layer1.calculate_gradiants(output_layer)
input_layer.calculate_gradiants(hidden_layer1)
if n % 1000 == 0:
print 'Epoch #{}; error: {}'.format(n, sum(Error)/len(Error))
Error = []
#we dont want to update the weights every time, just after we have gone through every batch/minibatch
hidden_layer1.update_weights()
input_layer.update_weights()
hidden_layer1.gradiants.fill(0.0)
input_layer.gradiants.fill(0.0)
#test
for x, y in zip(x_vals, y_vals):
input_layer.outputs = np.concatenate((x, np.array([[1.0]])))
#forward pass
hidden_layer1.calculate_sums(input_layer)
hidden_layer1.calculate_outputs(input_layer)
output_layer.calculate_sums(hidden_layer1)
output_layer.calculate_outputs(hidden_layer1)
print 'Y_hat: {}, Y: {}'.format(round(float(output_layer.outputs), 3), float(y))
Can anyone point me in the right direction.
Thanks
Related
I am trying to create a nueral network using tensor flow. I am not using keras api. I have some parameter estimation(weight,bias and some other parameters) to do. The code is working but the parameter estimation is really bad and error percentage is very high what is the problem here? I tried so many ways still no improvement. the loss fn is less.
I tried creating my own optimizer but the process is slow and the error is large. Is there any way to apply optimizers parameter.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from pyDOE import lhs
import math as ma
class PhysicsInformedNN:
def __init__(self,X_n,v,layers,lb,ub):
self.lb = lb
self.ub = ub
self.layers = layers
self.dx_n = tf.convert_to_tensor(X_n[:,0:1],dtype = 'float32')
self.t_n = tf.convert_to_tensor(X_n[:,1:2],dtype = 'float32')
self.v_r = tf.convert_to_tensor(v,dtype = 'float32')
self.lambda_1 = tf.Variable(0,dtype = 'float32')#1.5
self.lambda_2 = tf.Variable(-6,dtype = 'float32')
self.para =[self.lambda_1,self.lambda_2]
self.weights, self.biases = self.initialize_NN(layers)
def initialize_NN(self,layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(0,num_layers-1):
W = self.xavier_init(size=[layers[l], layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype='float32'), dtype='float32')
weights.append(W)
biases.append(b)
return weights, biases
def xavier_init(self, size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
return tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype='float32')
def neural_net(self, X, weights, biases):
num_layers = len(weights) + 1
H = 2.0*(X - self.lb)/(self.ub - self.lb) - 1.0
for l in range(0,num_layers-2):
W = weights[l]
b = biases[l]
H = tf.math.tanh(tf.math.add(tf.linalg.matmul(H, W), b))
W = weights[-1]
b = biases[-1]
Y = tf.math.add(tf.linalg.matmul(H, W), b)
return Y
def net_u(self, x, t):
v = self.neural_net(tf.concat([x,t],1), self.weights, self.biases)
return v
def net_f(self, x, t):
lambda_1 = self.para[0]
lambda_2 = tf.exp(self.para[1])
with tf.GradientTape(persistent=True) as tape :
tape.watch(t)
tape.watch(x)
u = self.net_u(x,t)
u_x = tape.gradient(u,x)
u_t = tape.gradient(u,t)
u_xx = tape.gradient(u_x,x)
f = u_t + lambda_1*u*u_x - lambda_2*u_xx
del tape
return f
def callback(self, loss,n):
print('Loss:', loss, ' Epoch : ', n)
def train(self,epoch):
for i in range(epoch):
with tf.GradientTape(persistent=True) as tape :
tape.watch(self.weights)
tape.watch(self.biases)
tape.watch(self.para)
f_pred = self.net_f(self.dx_n, self.t_n)
v_pred = self.net_u(self.dx_n, self.t_n)
loss = tf.reduce_mean(tf.square(self.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
dw = tape.gradient(loss,self.weights)
db = tape.gradient(loss,self.biases)
dp = tape.gradient(loss,self.para)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=1e-2,
decay_steps=10000,
decay_rate=0.9)
optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.0001)
optimizer1.apply_gradients(zip(dw, self.weights))
optimizer1.apply_gradients(zip(db, self.biases))
optimizer2 = tf.keras.optimizers.Adam(learning_rate=0.001)
optimizer2.apply_gradients(zip(dp, self.para))
del tape
self.callback(loss,i)
def predict(self, X_star):
v_star = self.net_u(X_star[:,0:1], X_star[:,1:2])
f_star = f_pred = self.net_f(X_star[:,0:1], X_star[:,1:2])
para_last = self.para
return v_star, f_star, para_last
if __name__ == '__main__':
#PARAMETERS for the problem
np.random.seed(123)
nu =0.01/np.pi
layers = [2, 20, 20, 20, 20, 1]
N_u = 2000
data = scipy.io.loadmat('burgers_shock.mat')
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
Exact = np.real(data['usol']).T
X, T = np.meshgrid(x,t)
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]
lb = X_star.min(0)
ub = X_star.max(0)
idx = np.random.choice(X_star.shape[0], N_u, replace=False)
X_u_train = X_star[idx,:]
u_train = u_star[idx,:]
model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
model.train(1000)
X_star = tf.convert_to_tensor(X_star,dtype='float32')
u_pred, f_pred, param = model.predict(X_star)
error_lambda_1 = np.abs(param[0] - 1.0)*100
error_lambda_2 = np.abs( np.exp(param[1])- nu)/nu * 100
print(error_lambda_1,error_lambda_2)
The Code below gives about 95 % accuracy if I do not use dropout in training.
The accuracy drops to 11 % if I use dropout.
The network is built using Numpy.
I have used a class Neural Networks which contains many layer objects.
The last layer has sigmoid activation and the rest have Relu.
The code is:
import numpy as np
import idx2numpy as idx
import matplotlib.pyplot as plt
np.random.seed(0)
img = r"C:\Users\Aaditya\OneDrive\Documents\ML\train-image"
lbl = r'C:\Users\Aaditya\OneDrive\Documents\ML\train-labels-idx1-ubyte'
t_lbl = r'C:\Users\Aaditya\OneDrive\Documents\ML\t10k-labels.idx1-ubyte'
t_img = r'C:\Users\Aaditya\OneDrive\Documents\ML\t10k-images.idx3-ubyte'
image = idx.convert_from_file(img)
iput = np.reshape(image, (60000,784))/255
otput = np.eye(10)[idx.convert_from_file(lbl)]
test_image = idx.convert_from_file(t_img)
test_input = np.reshape(test_image, (10000,784))/255
test_output = idx.convert_from_file(t_lbl)
def sigmoid(x):
sigmoid = 1/(1+ np.exp(-x))
return sigmoid
def tanh(x):
return np.tanh(x)
def relu(x):
return np.where(x>0,x,0)
def reluprime(x):
return (x>0).astype(x.dtype)
def sigmoid_prime(x):
return sigmoid(x)*(1-sigmoid(x))
def tanh_prime(x):
return 1 - tanh(x)**2
class Layer_Dense:
def __init__(self,n_inputs,n_neurons,activation="sigmoid",keep_prob=1):
self.n_neurons=n_neurons
if activation == "sigmoid":
self.activation = sigmoid
self.a_prime = sigmoid_prime
elif activation == "tanh":
self.activation = tanh
self.a_prime = tanh_prime
else :
self.activation = relu
self.a_prime = reluprime
self.keep_prob = keep_prob
self.weights = np.random.randn(n_inputs ,n_neurons)*0.1
self.biases = np.random.randn(1,n_neurons)*0.1
def cal_output(self,input,train=False):
output = np.array(np.dot(input,self.weights) + self.biases,dtype="float128")
if train == True:
D = np.random.randn(1,self.n_neurons)
self.D = (D>self.keep_prob).astype(int)
output = output * self.D
return output
def forward(self,input):
return self.activation(self.cal_output(input))
def back_propagate(self,delta,ap,lr=1,keep_prob=1):
dz = delta
self.weights -= 0.001*lr*(np.dot(ap.T,dz)*self.D)
self.biases -= 0.001*lr*(np.sum(dz,axis=0,keepdims=True)*self.D)
return np.multiply(np.dot(dz,self.weights.T),(1-ap**2))
class Neural_Network:
def __init__(self,input,output):
self.input=input
self.output=output
self.layers = []
def Add_layer(self,n_neurons,activation="relu",keepprob=1):
if len(self.layers) != 0:
newL = Layer_Dense(self.layers[-1].n_neurons,n_neurons,activation,keep_prob=keepprob)
else:
newL = Layer_Dense(self.input.shape[1],n_neurons,activation,keep_prob=keepprob)
self.layers.append(newL)
def predict(self,input):
output = input
for layer in self.layers:
output = layer.forward(output)
return output
def cal_zs(self,input):
self.activations = []
self.activations.append(input)
output = input
for layer in self.layers:
z = layer.cal_output(output,train=True)
activation = layer.activation(z)
self.activations.append(activation)
output = activation
def train(self,input=None,output=None,lr=10):
if input is None:
input=self.input
output=self.output
if len(input)>1000:
indices = np.arange(input.shape[0])
np.random.shuffle(indices)
input = input[indices]
output = output[indices]
for _ in range(100):
self.lr = lr
for i in range(int(len(input)/100)):
self.lr *=0.99
self.train(input[i*100:i*100+100],output[i*100:i*100+100],self.lr)
return
self.cal_zs(input)
for i in range(1,len(self.layers)+1):
if i==1:
delta = self.activations[-1] - output
self.delta = self.layers[-1].back_propagate(delta,self.activations[-2],lr)
else:
self.delta = self.layers[-i].back_propagate(self.delta,self.activations[-i-1],lr)
def MSE(self):
predict = self.predict(self.input)
error = (predict - self.output)**2
mse = sum(sum(error))
print(mse)
def Logloss(self):
predict = self.predict(self.input)
error = np.multiply(self.output,np.log(predict)) + np.multiply(1-self.output,np.log(1-predict))
logloss = -1*sum(sum(error))
print(logloss)
def accuracy(self):
predict = self.predict(test_input)
prediction = np.argmax(predict,axis=1)
correct = np.mean(prediction == test_output)
print(correct*100)
# def train(self,input,output):
model = Neural_Network(iput,otput)
# model.Add_layer(4)
model.Add_layer(64)
model.Add_layer(16)
model.Add_layer(10,"sigmoid")
lrc= 6
for _ in range(10):
model.accuracy()
model.Logloss()
model.train(lr=lrc)
model.accuracy()
I have used MNIST database the link is THIS
One of the reason can be that you might be dropping too much neurons. In below code
D = np.random.randn(1,self.n_neurons)
self.D = (D>self.keep_prob).astype(int)
Matrix generated in first line might contain many values which are less then zero. Because of that when comparing it with self.keep_prob (which has value 1) lot of neurons are getting dropped
Please try with one change
self.D = (D < self.keep_prob).astype(int)
There could be various reasons for that. One was specified by #anuragal.
Basically dropout is used to reduce overfitting and to help the network correct errors. But when you use dropout before your final layer, it could be that the network is unable to correct itself, thus leading to a lower accuracy
Another reason could be that I see your network is small. Usually, shallow networks aren't benefitted by dropouts
I try to save the model using the saver method (I use the save function in the DDPG class to save), but when restoring the model, the result is far from the one I saved (I save the model when the episodic award is zero, the restor method in the code is commented out ) My code is below with all the features. I use Python 3.7, gym 0.16.0 and TensorFlow version 1.13.1
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
# peform the action and move to the next state s
s_, r, done, info = env.step(a)
# store the the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
I solved this problem completely by rewriting the code and adding the learning function in a separate session
I've posted the following to the Pytorch discussion board too. I'm trying to keep the learning-rates per parameter for the already existing parameters when adding more neurons (to existing layers, not new layers) to a network. I’ve written the following class which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0].bias.data
weight = self.layers[0].weight.data
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0].weight.data[0:-increment[0],:] = weight
self.layers[0].bias.data[0:-increment[0]] = bias
else:
self.layers[0].weight.data[0:,:] = weight
self.layers[0].weight.data = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i].bias.data
weight = self.layers[i].weight.data
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:-increment[i-1]] = weight
else:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:] = weight
else:
if increment[i-1] >0:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:-increment[i-1]] = weight
else:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:] = weight
bias = self.layers[-1].bias.data
weight = self.layers[-1].weight.data
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:-increment[-1]] = weight
else:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
else:
action = np.random.randint(self.num_actions)
return action
Now I’ve written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 layers with both 1 neuron should fail to learn the x-or function, whereas a network where 4 neurons have been added should. If I initialise a new optimiser this indeed works. The optimiser I use is Adam, which keeps track of learning-rates per parameter. I’d like to keep the learning-rates of Adam for the weights and biases that already existed before I add additional neurons. The following is my failed attempt to doing so:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
# Add 5 neurons to first layer
capacity = [4,4]
network.increase_capacity(capacity)
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
optimizer.state.pop(p)
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
new_param_group.append(p)
# Biases
else:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
new_param_group.append(p)
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
print(network)
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work:p). Does anyone know of an (easier) way to do this or see where I’m going wrong?
I was trying to train a simple polynomial linear model with pytorch using Hermite polynomials since they seem to have a better conditioned Hessian.
To do that I decided to use the hermvander since it gives the Vandermonde matrix with each entry being a Hermite term. To do that I just made my feature vectors be the outpute of hermvander:
Kern_train = hermvander(X_train,Degree_mdl)
however, when I proceeded to train I get NaN all the time. I suspected it could have been a step size issue but I decided to use the step size suggested by this question that already has my example working in R, so there is no need to search for a step size I thought. However, when I tried it it does not work.
Anyone has any idea whats going on?
Same error occurs in tensorflow:
import pdb
import numpy as np
from numpy.polynomial.hermite import hermvander
import random
import tensorflow as tf
def get_batch(X,Y,M):
N = len(Y)
valid_indices = np.array( range(N) )
batch_indices = np.random.choice(valid_indices,size=M,replace=False)
batch_xs = X[batch_indices,:]
batch_ys = Y[batch_indices]
return batch_xs, batch_ys
##
D0=1
logging_freq = 100
## SGD params
M = 5
eta = 0.1
#eta = lambda i: eta/(i**0.6)
nb_iter = 500*10
##
lb,ub = 0,1
freq_sin = 4 # 2.3
f_target = lambda x: np.sin(2*np.pi*freq_sin*x)
N_train = 10
X_train = np.linspace(lb,ub,N_train)
Y_train = f_target(X_train).reshape(N_train,1)
x_horizontal = np.linspace(lb,ub,1000).reshape(1000,1)
## degree of mdl
Degree_mdl = N_train-1
## Hermite
Kern_train = hermvander(X_train,Degree_mdl)
print(f'Kern_train.shape={Kern_train.shape}')
Kern_train = Kern_train.reshape(N_train,Kern_train.shape[1])
##
Kern_train_pinv = np.linalg.pinv( Kern_train )
c_pinv = np.dot(Kern_train_pinv, Y_train)
nb_terms = c_pinv.shape[0]
##
condition_number_hessian = np.linalg.cond(Kern_train)
##
graph = tf.Graph()
with graph.as_default():
X = tf.placeholder(tf.float32, [None, nb_terms])
Y = tf.placeholder(tf.float32, [None,1])
w = tf.Variable( tf.zeros([nb_terms,1]) )
#w = tf.Variable( tf.truncated_normal([Degree_mdl,1],mean=0.0,stddev=1.0) )
#w = tf.Variable( 1000*tf.ones([Degree_mdl,1]) )
##
f = tf.matmul(X,w) # [N,1] = [N,D] x [D,1]
#loss = tf.reduce_sum(tf.square(Y - f))
loss = tf.reduce_sum( tf.reduce_mean(tf.square(Y-f), 0))
l2loss_tf = (1/N_train)*2*tf.nn.l2_loss(Y-f)
##
learning_rate = eta
#global_step = tf.Variable(0, trainable=False)
#learning_rate = tf.train.exponential_decay(learning_rate=eta, global_step=global_step,decay_steps=nb_iter/2, decay_rate=1, staircase=True)
train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session(graph=graph) as sess:
Y_train = Y_train.reshape(N_train,1)
tf.global_variables_initializer().run()
# Train
for i in range(nb_iter):
#if i % (nb_iter/10) == 0:
if i % (nb_iter/10) == 0 or i == 0:
current_loss = sess.run(fetches=loss, feed_dict={X: Kern_train, Y: Y_train})
print(f'tf: i = {i}, current_loss = {current_loss}')
## train
batch_xs, batch_ys = get_batch(Kern_train,Y_train,M)
sess.run(train_step, feed_dict={X: batch_xs, Y: batch_ys})
print(f'condition_number_hessian = {condition_number_hessian}')
print('\a')
Totally self contained code in pytorch:
import numpy as np
from numpy.polynomial.hermite import hermvander
import random
import torch
from torch.autograd import Variable
def vectors_dims_dont_match(Y,Y_):
'''
Checks that vector Y and Y_ have the same dimensions. If they don't
then there might be an error that could be caused due to wrong broadcasting.
'''
DY = tuple( Y.size() )
DY_ = tuple( Y_.size() )
if len(DY) != len(DY_):
return True
for i in range(len(DY)):
if DY[i] != DY_[i]:
return True
return False
def index_batch(X,batch_indices,dtype):
'''
returns the batch indexed/sliced batch
'''
if len(X.shape) == 1: # i.e. dimension (M,) just a vector
batch_xs = torch.FloatTensor(X[batch_indices]).type(dtype)
else:
batch_xs = torch.FloatTensor(X[batch_indices,:]).type(dtype)
return batch_xs
def get_batch2(X,Y,M,dtype):
'''
get batch for pytorch model
'''
# TODO fix and make it nicer, there is pytorch forum question
X,Y = X.data.numpy(), Y.data.numpy()
N = len(Y)
valid_indices = np.array( range(N) )
batch_indices = np.random.choice(valid_indices,size=M,replace=False)
batch_xs = index_batch(X,batch_indices,dtype)
batch_ys = index_batch(Y,batch_indices,dtype)
return Variable(batch_xs, requires_grad=False), Variable(batch_ys, requires_grad=False)
def get_sequential_lifted_mdl(nb_monomials,D_out, bias=False):
return torch.nn.Sequential(torch.nn.Linear(nb_monomials,D_out,bias=bias))
def train_SGD(mdl, M,eta,nb_iter,logging_freq ,dtype, X_train,Y_train):
##
#pdb.set_trace()
N_train,_ = tuple( X_train.size() )
#print(N_train)
for i in range(1,nb_iter+1):
# Forward pass: compute predicted Y using operations on Variables
batch_xs, batch_ys = get_batch2(X_train,Y_train,M,dtype) # [M, D], [M, 1]
## FORWARD PASS
y_pred = mdl.forward(batch_xs)
## Check vectors have same dimension
if vectors_dims_dont_match(batch_ys,y_pred):
raise ValueError('You vectors don\'t have matching dimensions. It will lead to errors.')
## LOSS + Regularization
batch_loss = (1/M)*(y_pred - batch_ys).pow(2).sum()
## BACKARD PASS
batch_loss.backward() # Use autograd to compute the backward pass. Now w will have gradients
## SGD update
for W in mdl.parameters():
delta = eta(i)*W.grad.data
W.data.copy_(W.data - delta)
## train stats
if i % (nb_iter/10) == 0 or i == 0:
#X_train_, Y_train_ = Variable(X_train), Variable(Y_train)
X_train_, Y_train_ = X_train, Y_train
current_train_loss = (1/N_train)*(mdl.forward(X_train_) - Y_train_).pow(2).sum().data.numpy()
print('\n-------------')
print(f'i = {i}, current_train_loss = {current_train_loss}\n')
print(f'eta*W.grad.data = {eta*W.grad.data}')
print(f'W.grad.data = {W.grad.data}')
## Manually zero the gradients after updating weights
mdl.zero_grad()
final_sgd_error = current_train_loss
return final_sgd_error
##
D0=1
logging_freq = 100
#dtype = torch.cuda.FloatTensor
dtype = torch.FloatTensor
## SGD params
M = 5
eta = 0.1
eta = lambda i: eta/(i**0.6)
nb_iter = 500*10
##
lb,ub = 0,1
freq_sin = 4 # 2.3
f_target = lambda x: np.sin(2*np.pi*freq_sin*x)
N_train = 10
X_train = np.linspace(lb,ub,N_train)
Y_train = f_target(X_train).reshape(N_train,1)
x_horizontal = np.linspace(lb,ub,1000).reshape(1000,1)
## degree of mdl
Degree_mdl = N_train-1
## Hermite
Kern_train = hermvander(X_train,Degree_mdl)
Kern_train = Kern_train.reshape(N_train,Kern_train.shape[2])
##
Kern_train_pinv = np.linalg.pinv( Kern_train )
c_pinv = np.dot(Kern_train_pinv, Y_train)
##
condition_number_hessian = np.linalg.cond(Kern_train)
## linear mdl to train with SGD
nb_terms = c_pinv.shape[0]
mdl_sgd = get_sequential_lifted_mdl(nb_monomials=nb_terms,D_out=1, bias=False)
mdl_sgd[0].weight.data.normal_(mean=0,std=0.001)
mdl_sgd[0].weight.data.fill_(0)
## Make polynomial Kernel
Kern_train_pt, Y_train_pt = Variable(torch.FloatTensor(Kern_train).type(dtype), requires_grad=False), Variable(torch.FloatTensor(Y_train).type(dtype), requires_grad=False)
final_sgd_error = train_SGD(mdl_sgd, M,eta,nb_iter,logging_freq ,dtype, Kern_train_pt,Y_train_pt)
## PRINT ERRORS
from plotting_utils import *
train_error_pinv = (1/N_train)*(np.linalg.norm(Y_train-np.dot(Kern_train,c_pinv))**2)
print('\n-----------------')
print(f'N_train={N_train}')
print(f'train_error_pinv = {train_error_pinv}')
print(f'final_sgd_error = {final_sgd_error}')
print(f'condition_number_hessian = {condition_number_hessian}')
print('\a')
Maybe it's a bit late, but you might have a look at this https://github.com/Orcuslc/OrthNet