I am trying to create a nueral network using tensor flow. I am not using keras api. I have some parameter estimation(weight,bias and some other parameters) to do. The code is working but the parameter estimation is really bad and error percentage is very high what is the problem here? I tried so many ways still no improvement. the loss fn is less.
I tried creating my own optimizer but the process is slow and the error is large. Is there any way to apply optimizers parameter.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from pyDOE import lhs
import math as ma
class PhysicsInformedNN:
def __init__(self,X_n,v,layers,lb,ub):
self.lb = lb
self.ub = ub
self.layers = layers
self.dx_n = tf.convert_to_tensor(X_n[:,0:1],dtype = 'float32')
self.t_n = tf.convert_to_tensor(X_n[:,1:2],dtype = 'float32')
self.v_r = tf.convert_to_tensor(v,dtype = 'float32')
self.lambda_1 = tf.Variable(0,dtype = 'float32')#1.5
self.lambda_2 = tf.Variable(-6,dtype = 'float32')
self.para =[self.lambda_1,self.lambda_2]
self.weights, self.biases = self.initialize_NN(layers)
def initialize_NN(self,layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(0,num_layers-1):
W = self.xavier_init(size=[layers[l], layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype='float32'), dtype='float32')
weights.append(W)
biases.append(b)
return weights, biases
def xavier_init(self, size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
return tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype='float32')
def neural_net(self, X, weights, biases):
num_layers = len(weights) + 1
H = 2.0*(X - self.lb)/(self.ub - self.lb) - 1.0
for l in range(0,num_layers-2):
W = weights[l]
b = biases[l]
H = tf.math.tanh(tf.math.add(tf.linalg.matmul(H, W), b))
W = weights[-1]
b = biases[-1]
Y = tf.math.add(tf.linalg.matmul(H, W), b)
return Y
def net_u(self, x, t):
v = self.neural_net(tf.concat([x,t],1), self.weights, self.biases)
return v
def net_f(self, x, t):
lambda_1 = self.para[0]
lambda_2 = tf.exp(self.para[1])
with tf.GradientTape(persistent=True) as tape :
tape.watch(t)
tape.watch(x)
u = self.net_u(x,t)
u_x = tape.gradient(u,x)
u_t = tape.gradient(u,t)
u_xx = tape.gradient(u_x,x)
f = u_t + lambda_1*u*u_x - lambda_2*u_xx
del tape
return f
def callback(self, loss,n):
print('Loss:', loss, ' Epoch : ', n)
def train(self,epoch):
for i in range(epoch):
with tf.GradientTape(persistent=True) as tape :
tape.watch(self.weights)
tape.watch(self.biases)
tape.watch(self.para)
f_pred = self.net_f(self.dx_n, self.t_n)
v_pred = self.net_u(self.dx_n, self.t_n)
loss = tf.reduce_mean(tf.square(self.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
dw = tape.gradient(loss,self.weights)
db = tape.gradient(loss,self.biases)
dp = tape.gradient(loss,self.para)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=1e-2,
decay_steps=10000,
decay_rate=0.9)
optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.0001)
optimizer1.apply_gradients(zip(dw, self.weights))
optimizer1.apply_gradients(zip(db, self.biases))
optimizer2 = tf.keras.optimizers.Adam(learning_rate=0.001)
optimizer2.apply_gradients(zip(dp, self.para))
del tape
self.callback(loss,i)
def predict(self, X_star):
v_star = self.net_u(X_star[:,0:1], X_star[:,1:2])
f_star = f_pred = self.net_f(X_star[:,0:1], X_star[:,1:2])
para_last = self.para
return v_star, f_star, para_last
if __name__ == '__main__':
#PARAMETERS for the problem
np.random.seed(123)
nu =0.01/np.pi
layers = [2, 20, 20, 20, 20, 1]
N_u = 2000
data = scipy.io.loadmat('burgers_shock.mat')
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
Exact = np.real(data['usol']).T
X, T = np.meshgrid(x,t)
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]
lb = X_star.min(0)
ub = X_star.max(0)
idx = np.random.choice(X_star.shape[0], N_u, replace=False)
X_u_train = X_star[idx,:]
u_train = u_star[idx,:]
model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
model.train(1000)
X_star = tf.convert_to_tensor(X_star,dtype='float32')
u_pred, f_pred, param = model.predict(X_star)
error_lambda_1 = np.abs(param[0] - 1.0)*100
error_lambda_2 = np.abs( np.exp(param[1])- nu)/nu * 100
print(error_lambda_1,error_lambda_2)
Related
I have created a pytorch model and I want to reduce the model size.
Defining Model Architecture :-
import torch
import torch.quantization
import torch.nn as nn
import copy
import os
import time
import numpy as np
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.utils.prune as prune
import torch.nn.functional as F
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import time
import codecs
import pickle
import torch
from torch.autograd import Variable
import torch.nn.utils.prune as prune
from config import Config
from loader import *
from utils import *
from model import BiLSTM_CRF
START_TAG = '<START>'
STOP_TAG = '<STOP>'
def init_embedding(input_embedding):
"""
Initialize embedding
"""
bias = np.sqrt(3.0 / input_embedding.size(1))
nn.init.uniform(input_embedding, -bias, bias)
def init_linear(input_linear):
"""
Initialize linear transformation
"""
bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))
nn.init.uniform(input_linear.weight, -bias, bias)
if input_linear.bias is not None:
input_linear.bias.data.zero_()
def init_lstm(input_lstm):
"""
Initialize lstm
"""
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.weight_ih_l' + str(ind))
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
weight = eval('input_lstm.weight_hh_l' + str(ind))
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
if input_lstm.bidirectional:
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.weight_ih_l' + str(ind) + '_reverse')
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
weight = eval('input_lstm.weight_hh_l' + str(ind) + '_reverse')
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
if input_lstm.bias:
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.bias_ih_l' + str(ind))
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
weight = eval('input_lstm.bias_hh_l' + str(ind))
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
if input_lstm.bidirectional:
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.bias_ih_l' + str(ind) + '_reverse')
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
weight = eval('input_lstm.bias_hh_l' + str(ind) + '_reverse')
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
def to_scalar(var):
return var.view(-1).data.tolist()[0]
def argmax(vec):
_, idx = torch.max(vec, 1)
return to_scalar(idx)
def log_sum_exp(vec):
# vec 2D: 1 * tagset_size
max_score = vec[0, argmax(vec)]
max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
return max_score + \
torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
class BiLSTM_CRF(nn.Module):
def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, char_lstm_dim=25,
char_to_ix=None, pre_word_embeds=None, char_embedding_dim=25, use_gpu=False,
n_cap=None, cap_embedding_dim=None, use_crf=True, char_mode='CNN'):
super(BiLSTM_CRF, self).__init__()
self.use_gpu = use_gpu
self.embedding_dim = embedding_dim #100
self.hidden_dim = hidden_dim #200
self.vocab_size = vocab_size
self.tag_to_ix = tag_to_ix
self.n_cap = n_cap
self.cap_embedding_dim = cap_embedding_dim
self.use_crf = use_crf
self.tagset_size = len(tag_to_ix)
self.out_channels = char_lstm_dim #25
self.char_mode = char_mode
print('char_mode: %s, out_channels: %d, hidden_dim: %d, ' % (char_mode, char_lstm_dim, hidden_dim))
if self.n_cap and self.cap_embedding_dim:
self.cap_embeds = nn.Embedding(self.n_cap, self.cap_embedding_dim)
# print("self.cap_embeds.weight------",self.cap_embeds.weight)
init_embedding(self.cap_embeds.weight)
if char_embedding_dim is not None:
self.char_lstm_dim = char_lstm_dim
self.char_embeds = nn.Embedding(len(char_to_ix), char_embedding_dim)
# print("self.char_embeds.weight-------", self.char_embeds.weight)
init_embedding(self.char_embeds.weight)
if self.char_mode == 'LSTM':
self.char_lstm = nn.LSTM(char_embedding_dim, char_lstm_dim, num_layers=1, bidirectional=True)
init_lstm(self.char_lstm)
if self.char_mode == 'CNN':
self.char_cnn3 = nn.Conv2d(in_channels=1, out_channels=self.out_channels, kernel_size=(3, char_embedding_dim), padding=(2,0))
self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
if pre_word_embeds is not None:
self.pre_word_embeds = True
self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
else:
self.pre_word_embeds = False
self.dropout = nn.Dropout(0.5)
if self.n_cap and self.cap_embedding_dim:
if self.char_mode == 'LSTM':
self.lstm = nn.LSTM(embedding_dim+char_lstm_dim*2+cap_embedding_dim, hidden_dim, bidirectional=True)
if self.char_mode == 'CNN':
self.lstm = nn.LSTM(embedding_dim+self.out_channels+cap_embedding_dim, hidden_dim, bidirectional=True)
else:
if self.char_mode == 'LSTM':
self.lstm = nn.LSTM(embedding_dim+char_lstm_dim*2, hidden_dim, bidirectional=True)
if self.char_mode == 'CNN':
self.lstm = nn.LSTM(embedding_dim+self.out_channels, hidden_dim, bidirectional=True)
init_lstm(self.lstm)
self.hw_trans = nn.Linear(self.out_channels, self.out_channels)
self.hw_gate = nn.Linear(self.out_channels, self.out_channels)
self.h2_h1 = nn.Linear(hidden_dim*2, hidden_dim)
self.tanh = nn.Tanh()
self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)
init_linear(self.h2_h1)
init_linear(self.hidden2tag)
init_linear(self.hw_gate)
init_linear(self.hw_trans)
if self.use_crf:
self.transitions = nn.Parameter(
torch.zeros(self.tagset_size, self.tagset_size))
self.transitions.data[tag_to_ix[START_TAG], :] = -10000
self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
def _score_sentence(self, feats, tags):
# tags is ground_truth, a list of ints, length is len(sentence)
# feats is a 2D tensor, len(sentence) * tagset_size
r = torch.LongTensor(range(feats.size()[0]))
if self.use_gpu:
r = r.cuda()
pad_start_tags = torch.cat([torch.cuda.LongTensor([self.tag_to_ix[START_TAG]]), tags])
pad_stop_tags = torch.cat([tags, torch.cuda.LongTensor([self.tag_to_ix[STOP_TAG]])])
else:
pad_start_tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
pad_stop_tags = torch.cat([tags, torch.LongTensor([self.tag_to_ix[STOP_TAG]])])
score = torch.sum(self.transitions[pad_stop_tags, pad_start_tags]) + torch.sum(feats[r, tags])
return score
def _get_lstm_features(self, sentence, chars2, caps, chars2_length, d):
if self.char_mode == 'LSTM':
# self.char_lstm_hidden = self.init_lstm_hidden(dim=self.char_lstm_dim, bidirection=True, batchsize=chars2.size(0))
chars_embeds = self.char_embeds(chars2).transpose(0, 1)
packed = torch.nn.utils.rnn.pack_padded_sequence(chars_embeds, chars2_length)
lstm_out, _ = self.char_lstm(packed)
outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out)
outputs = outputs.transpose(0, 1)
chars_embeds_temp = Variable(torch.FloatTensor(torch.zeros((outputs.size(0), outputs.size(2)))))
if self.use_gpu:
chars_embeds_temp = chars_embeds_temp.cuda()
for i, index in enumerate(output_lengths):
chars_embeds_temp[i] = torch.cat((outputs[i, index-1, :self.char_lstm_dim], outputs[i, 0, self.char_lstm_dim:]))
chars_embeds = chars_embeds_temp.clone()
for i in range(chars_embeds.size(0)):
chars_embeds[d[i]] = chars_embeds_temp[i]
if self.char_mode == 'CNN':
chars_embeds = self.char_embeds(chars2).unsqueeze(1)
chars_cnn_out3 = self.char_cnn3(chars_embeds)
chars_embeds = nn.functional.max_pool2d(chars_cnn_out3, kernel_size=(chars_cnn_out3.size(2), 1)).view(chars_cnn_out3.size(0), self.out_channels)
# t = self.hw_gate(chars_embeds)
# g = nn.functional.sigmoid(t)
# h = nn.functional.relu(self.hw_trans(chars_embeds))
# chars_embeds = g * h + (1 - g) * chars_embeds
embeds = self.word_embeds(sentence)
if self.n_cap and self.cap_embedding_dim:
cap_embedding = self.cap_embeds(caps)
if self.n_cap and self.cap_embedding_dim:
embeds = torch.cat((embeds, chars_embeds, cap_embedding), 1)
else:
embeds = torch.cat((embeds, chars_embeds), 1)
embeds = embeds.unsqueeze(1)
embeds = self.dropout(embeds)
lstm_out, _ = self.lstm(embeds)
lstm_out = lstm_out.view(len(sentence), self.hidden_dim*2)
lstm_out = self.dropout(lstm_out)
lstm_feats = self.hidden2tag(lstm_out)
return lstm_feats
def _forward_alg(self, feats):
# calculate in log domain
# feats is len(sentence) * tagset_size
# initialize alpha with a Tensor with values all equal to -10000.
init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
forward_var = autograd.Variable(init_alphas)
if self.use_gpu:
forward_var = forward_var.cuda()
for feat in feats:
emit_score = feat.view(-1, 1)
tag_var = forward_var + self.transitions + emit_score
max_tag_var, _ = torch.max(tag_var, dim=1)
tag_var = tag_var - max_tag_var.view(-1, 1)
forward_var = max_tag_var + torch.log(torch.sum(torch.exp(tag_var), dim=1)).view(1, -1) # ).view(1, -1)
terminal_var = (forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]).view(1, -1)
alpha = log_sum_exp(terminal_var)
# Z(x)
return alpha
def viterbi_decode(self, feats):
backpointers = []
# analogous to forward
init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
init_vvars[0][self.tag_to_ix[START_TAG]] = 0
forward_var = Variable(init_vvars)
if self.use_gpu:
forward_var = forward_var.cuda()
for feat in feats:
next_tag_var = forward_var.view(1, -1).expand(self.tagset_size, self.tagset_size) + self.transitions
_, bptrs_t = torch.max(next_tag_var, dim=1)
bptrs_t = bptrs_t.squeeze().data.cpu().numpy()
next_tag_var = next_tag_var.data.cpu().numpy()
viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t]
viterbivars_t = Variable(torch.FloatTensor(viterbivars_t))
if self.use_gpu:
viterbivars_t = viterbivars_t.cuda()
forward_var = viterbivars_t + feat
backpointers.append(bptrs_t)
terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
terminal_var.data[self.tag_to_ix[STOP_TAG]] = -10000.
terminal_var.data[self.tag_to_ix[START_TAG]] = -10000.
best_tag_id = argmax(terminal_var.unsqueeze(0))
path_score = terminal_var[best_tag_id]
best_path = [best_tag_id]
for bptrs_t in reversed(backpointers):
best_tag_id = bptrs_t[best_tag_id]
best_path.append(best_tag_id)
start = best_path.pop()
assert start == self.tag_to_ix[START_TAG]
best_path.reverse()
return path_score, best_path
def neg_log_likelihood(self, sentence, tags, chars2, caps, chars2_length, d):
# sentence, tags is a list of ints
# features is a 2D tensor, len(sentence) * self.tagset_size
feats = self._get_lstm_features(sentence, chars2, caps, chars2_length, d)
if self.use_crf:
forward_score = self._forward_alg(feats)
gold_score = self._score_sentence(feats, tags)
return forward_score - gold_score
else:
tags = Variable(tags)
scores = nn.functional.cross_entropy(feats, tags)
return scores
def forward(self, sentence, chars, caps, chars2_length, d):
feats = self._get_lstm_features(sentence, chars, caps, chars2_length, d)
# viterbi to get tag_seq
if self.use_crf:
score, tag_seq = self.viterbi_decode(feats)
else:
score, tag_seq = torch.max(feats, 1)
tag_seq = list(tag_seq.cpu().data)
return score, tag_seq
create Model Instance:-
model_fp32 = BiLSTM_CRF(vocab_size=len(word_to_id),
tag_to_ix=tag_to_id,
embedding_dim=parameters['word_dim'],
hidden_dim=parameters['word_lstm_dim'],
use_gpu=parameters['use_gpu'],
char_to_ix=char_to_id,
pre_word_embeds=word_embeds,
use_crf=parameters['crf'],
char_mode=parameters['char_mode'])
Apply Quantization
model_int8 = torch.quantization.quantize_dynamic(
model_fp32, # the original model
{nn.LSTM,nn.Linear}, # a set of layers to dynamically quantize
dtype=torch.qint8)
Checking Quantization Results:
def print_size_of_model(model, label=""):
torch.save(model.state_dict(), "temp.p")
size=os.path.getsize("temp.p")
print("model: ",label,' \t','Size (KB):', size/1e3)
os.remove('temp.p')
return size
compare the sizes
f=print_size_of_model(model_fp32,"model_fp32")
q=print_size_of_model(model_int8,"model_int8")
print("{0:.2f} times smaller".format(f/q))
Results
model: model_fp32 Size (KB): 806494.996
model: model_int8 Size (KB): 804532.412
1.00 times smaller
is there any way to reduce the model size significantly??
Based on Results section of question and vocab_size of approximately 2 million, it's seems reasonable to quantize attribute word_embeds. Expected that quantization of only this module alone will result in significant drop of memory occupation by weights. According to documentation there is no support for dynamic quantization(which is used for nn.Linear and nn.LSTM in snippet above) of nn.Embedding(type of word_embeds), but static quantization can handle this. Default qconfig which is used in some pytorch examples seems not working on nn.Embedding, but there is a hint in issue discussion how to quantize nn.Embedding. After training:
from torch.quantization.qconfig import float_qparams_weight_only_qconfig
model_fp32.word_embeds.qconfig = float_qparams_weight_only_qconfig
torch.quantization.prepare(model_fp32, inplace=True)
torch.quantization.convert(model_fp32, inplace=True)
And after that word_embeds in model_fp32 will be quantized to torhc.quint8.
I've been doing a very simply binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay and stochastic gradient descent, and a large training set, but the issue remained. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special
#Data Preprocessing (the same for dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train','cats_train']
epsilon = 1e-8
for category in categories:
path_animal = os.path.join(path, category)
for img in os.listdir(path_animal):
try:
img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
new_img_array = cv2.resize(img_array, (img_size, img_size))
flattened_img_array = new_img_array.reshape(img_size*img_size)
train_set.append([flattened_img_array, categories.index(category)])
except:
continue
import random
random.shuffle(train_set)
X_train = []
Y_train = []
for sample in train_set:
X_train.append(sample[0])
Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))
def create_mini_batches(X, Y, mini_batch_size):
m = X.shape[1]
mini_batches = []
num_mini_batches = m // mini_batch_size
permutation = list(np.random.permutation(m))
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, permutation]
for i in range(num_mini_batches):
select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
mini_batch = (select_X, select_Y)
mini_batches.append(mini_batch)
if m % mini_batch_size != 0:
last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
last_mini_batch = (last_X, last_Y)
mini_batches.append(last_mini_batch)
return mini_batches
def initialize_parameters(layers_dims):
L = len(layers_dims) # number of layers (including input layer), in this case L=4.
parameters = {}
for l in range(1,L): # range(1,4).
parameters['W' + str(l)] = np.random.randn(layers_dims[l],layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
parameters['b' + str(l)] = np.zeros((layers_dims[l],1))
return parameters
def sigmoid(Z):
A = special.expit(Z)
return A,Z
def relu(Z):
A = np.maximum(0.01*Z, Z)
return A,Z
def forward_propagation(X, parameters):
caches = [] #list containing Z for every node
A = X
L = int(len(parameters)/2)
for l in range(1,L):
A_prev = A
W = parameters['W'+str(l)]
b = parameters['b'+str(l)]
Z = np.dot(W, A_prev) + b
A, activation_cache = relu(Z) #activation_cache contains z[l].
linear_cache = (A_prev, W, b) #linear_cache contains A[l-1], W[l], b[l].
cache = (linear_cache, activation_cache)
caches.append(cache)
W = parameters['W'+str(L)]
b = parameters['b'+str(L)]
Z = np.dot(W, A) + b
AL, activation_cache = sigmoid(Z)
linear_cache = (A, W, b)
cache = (linear_cache, activation_cache)
caches.append(cache)
return AL, caches
def compute_cost(AL, Y, parameters, lambd):
m = Y.shape[1] # number of examples
L = int(len(parameters)/2) #[6400,100,20,1] L=3 (0,1,2)
reg_cost = 0
for l in range(L):
W = parameters['W' + str(l+1)]
reg_cost += np.sum(np.square(W))
J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
J = np.squeeze(J)
return J
def linear_backward(dZ, linear_cache, lambd):
A_prev, W, b = linear_cache
m = A_prev.shape[1]
dW = (1/m) * np.dot(dZ,A_prev.T) + (lambd/m)*W
db = (1/m) * np.sum(dZ,axis=1,keepdims=True)
dA_prev = np.dot(W.T,dZ)
return dA_prev, dW, db
def relu_gradient(Z):
dZ = np.where(Z > 0, 1, 0.01)
return dZ
def sigmoid_gradient(Z):
dZ = special.expit(Z)*(1-special.expit(Z))
return dZ
def linear_activation_backward(dA, cache, lambd, A, Y, activation):
linear_cache, activation_cache = cache
if activation == 'relu':
dZ = dA * relu_gradient(activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
elif activation == 'sigmoid':
dZ = A - Y
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
return dA_prev, dW, db
def L_model_backward(AL, Y, caches, lambd):
grads = {}
L = len(caches)
m = AL.shape[1]
Y = Y.reshape(AL.shape)
cache_final_layer = caches[L-1]
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(_, cache_final_layer, lambd, AL, Y, activation='sigmoid')
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, _, _, activation='relu')
return grads
def update_parameters(parameters, grads, learning_rate):
L = len(parameters) // 2
for l in range(L):
parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
return parameters
def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size) #[(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
costs_train = []
costs_dev = []
parameters = initialize_parameters(layers_dims)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
for i in range(num_epoch):
for mini_batch in mini_batches:
(minibatch_X, minibatch_Y) = mini_batch
AL, caches = forward_propagation(minibatch_X, parameters)
J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
grads = L_model_backward(AL, minibatch_Y, caches, lambd)
parameters = update_parameters(parameters, grads, learning_rate)
if i % 10 == 0:
costs_train.append(J_train)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
if i % 100 == 0:
print ("Cost after epoch %i: %f" %(i, J_train))
learning_rate = learning_rate * (k**(i/50))
plt.plot(np.squeeze(costs_train),'r')
plt.plot(np.squeeze(costs_dev),'b')
plt.ylabel('cost')
plt.xlabel('epochs (per thirties)')
plt.show()
return parameters, costs_train, costs_dev
parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful for anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice as to how to address this issue? I'm at a loss here because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When Validation Loss starts to increase in early beginning like images you added, it means that there's there is something wrong in the model.
It's not clear what's it as you didn't show your model.
You could check the following links that will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code
I try to save the model using the saver method (I use the save function in the DDPG class to save), but when restoring the model, the result is far from the one I saved (I save the model when the episodic award is zero, the restor method in the code is commented out ) My code is below with all the features. I use Python 3.7, gym 0.16.0 and TensorFlow version 1.13.1
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
# peform the action and move to the next state s
s_, r, done, info = env.step(a)
# store the the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
I solved this problem completely by rewriting the code and adding the learning function in a separate session
I am trying to interface CasADi and Tensorflow. CasADi is a toolbox that uses symbolic variables and does automatic differentiation. It is often used for dynamic/static optimization problems.
I found an example where GPflow is used (https://web.casadi.org/blog/tensorflow/). In this case, the GP model is firstly trained with data as follows
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
Then the prediction model is build without passing the real values but a tensor
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
Such that CasADi can substitute real values by using a callback function that calls tensorflow.
I want to use the tf.keras.Sequential() model instead of a GPflow model since I want to implement a recurrent neural network. But for the sequential model the method _build_predict(X) does not exist. I tried to use just predict but I get the following error
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype double and shape [35039,1,8]
[[{{node Placeholder}}]]
Do you know what is the equivalent in this case?
Here the complete code using GPflow
from casadi import *
T = 10. # Time horizon
N = 20 # number of control intervals
# Declare model variables
x1 = MX.sym('x1')
x2 = MX.sym('x2')
x = vertcat(x1, x2)
u = MX.sym('u')
# Model equations
xdot = vertcat((1-x2**2)*x1 - x2 + u, x1)
# Formulate discrete time dynamics
if False:
# CVODES from the SUNDIALS suite
dae = {'x':x, 'p':u, 'ode':xdot}
opts = {'tf':T/N}
F = integrator('F', 'cvodes', dae, opts)
else:
# Fixed step Runge-Kutta 4 integrator
M = 4 # RK4 steps per interval
DT = T/N/M
f = Function('f', [x, u], [xdot])
X0 = MX.sym('X0', 2)
U = MX.sym('U')
X = X0
Q = 0
for j in range(M):
k1 = f(X, U)
k2 = f(X + DT/2 * k1, U)
k3 = f(X + DT/2 * k2, U)
k4 = f(X + DT * k3, U)
X=X+DT/6*(k1 +2*k2 +2*k3 +k4)
F = Function('F', [X0, U], [X],['x0','p'],['xf'])
# Start with an empty NLP
w=[]
w0 = []
lbw = []
ubw = []
g=[]
lbg = []
ubg = []
# "Lift" initial conditions
Xk = MX.sym('X0', 2)
w += [Xk]
lbw += [0, 1]
ubw += [0, 1]
w0 += [0, 1]
# Formulate the NLP
for k in range(N):
# New NLP variable for the control
Uk = MX.sym('U_' + str(k))
w += [Uk]
lbw += [-1]
ubw += [1]
w0 += [0]
# Integrate till the end of the interval
Fk = F(x0=Xk, p=Uk)
Xk_end = Fk['xf']
# New NLP variable for state at end of interval
Xk = MX.sym('X_' + str(k+1), 2)
w += [Xk]
lbw += [-0.25, -inf]
ubw += [ inf, inf]
w0 += [0, 0]
# Add equality constraint
g += [Xk_end-Xk]
lbg += [0, 0]
ubg += [0, 0]
nd = N+1
import gpflow
import time
from tensorflow_casadi import TensorFlowEvaluator
class GPR(TensorFlowEvaluator):
def __init__(self, model, session, opts={}):
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
mean = tf.reshape(mean,(1,1))
TensorFlowEvaluator.__init__(self,[X],[mean],session,opts)
self.counter = 0
self.time = 0
def eval(self,arg):
self.counter += 1
t0 = time.time()
ret = TensorFlowEvaluator.eval(self,arg)
self.time += time.time()-t0
return [ret]
# Create
np.random.seed(0)
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
import tensorflow as tf
with tf.Session() as session:
model.initialize()
GPR = GPR(model, session)
w = vertcat(*w)
# Create an NLP solver
prob = {'f': GPR(w[0::3]), 'x': w , 'g': vertcat(*g)}
options = {"ipopt": {"hessian_approximation": "limited-memory"}}
solver = nlpsol('solver', 'ipopt', prob,options);
# Solve the NLP
sol = solver(x0=w0, lbx=lbw, ubx=ubw, lbg=lbg, ubg=ubg)
print("Ncalls",GPR.counter)
print("Total time [s]",GPR.time)
w_opt = sol['x'].full().flatten()
# Plot the solution
x1_opt = w_opt[0::3]
x2_opt = w_opt[1::3]
u_opt = w_opt[2::3]
tgrid = [T/N*k for k in range(N+1)]
import matplotlib.pyplot as plt
plt.figure(1)
plt.clf()
plt.plot(tgrid, x1_opt, '--')
plt.plot(tgrid, x2_opt, '-')
plt.step(tgrid, vertcat(DM.nan(1), u_opt), '-.')
plt.xlabel('t')
plt.legend(['x1','x2','u'])
plt.grid()
plt.show()
and the class TensorFlowEvaluator
import casadi
import tensorflow as tf
class TensorFlowEvaluator(casadi.Callback):
def __init__(self,t_in,t_out,session, opts={}):
"""
t_in: list of inputs (tensorflow placeholders)
t_out: list of outputs (tensors dependeant on those placeholders)
session: a tensorflow session
"""
casadi.Callback.__init__(self)
assert isinstance(t_in,list)
self.t_in = t_in
assert isinstance(t_out,list)
self.t_out = t_out
self.construct("TensorFlowEvaluator", opts)
self.session = session
self.refs = []
def get_n_in(self): return len(self.t_in)
def get_n_out(self): return len(self.t_out)
def get_sparsity_in(self,i):
return casadi.Sparsity.dense(*self.t_in[i].get_shape().as_list())
def get_sparsity_out(self,i):
return casadi.Sparsity.dense(*self.t_out[i].get_shape().as_list())
def eval(self,arg):
# Associate each tensorflow input with the numerical argument passed by CasADi
d = dict((v,arg[i].toarray()) for i,v in enumerate(self.t_in))
# Evaluate the tensorflow expressions
ret = self.session.run(self.t_out,feed_dict=d)
return ret
# Vanilla tensorflow offers just the reverse mode AD
def has_reverse(self,nadj): return nadj==1
def get_reverse(self,nadj,name,inames,onames,opts):
# Construct tensorflow placeholders for the reverse seeds
adj_seed = [tf.placeholder(shape=self.sparsity_out(i).shape,dtype=tf.float64) for i in range(self.n_out())]
# Construct the reverse tensorflow graph through 'gradients'
grad = tf.gradients(self.t_out, self.t_in,grad_ys=adj_seed)
# Create another TensorFlowEvaluator object
callback = TensorFlowEvaluator(self.t_in+adj_seed,grad,self.session)
# Make sure you keep a reference to it
self.refs.append(callback)
# Package it in the nominal_in+nominal_out+adj_seed form that CasADi expects
nominal_in = self.mx_in()
nominal_out = self.mx_out()
adj_seed = self.mx_out()
return casadi.Function(name,nominal_in+nominal_out+adj_seed,callback.call(nominal_in+adj_seed),inames,onames)
if __name__=="__main__":
from casadi import *
a = tf.placeholder(shape=(2,2),dtype=tf.float64)
b = tf.placeholder(shape=(2,1),dtype=tf.float64)
y = tf.matmul(tf.sin(a), b)
with tf.Session() as session:
f_tf = TensorFlowEvaluator([a,b], [y], session)
a = MX.sym("a",2,2)
b = MX.sym("a",2,1)
y = f_tf(a,b)
yref = mtimes(sin(a),b)
f = Function('f',[a,b],[y])
fref = Function('f',[a,b],[yref])
print(f(DM([[1,2],[3,4]]),DM([[1],[3]])))
print(fref(DM([[1,2],[3,4]]),DM([[1],[3]])))
f = Function('f',[a,b],[jacobian(y,a)])
fref = Function('f',[a,b],[jacobian(yref,a)])
print(f(DM([[1,2],[3,4]]),DM([[1],[3]])))
print(fref(DM([[1,2],[3,4]]),DM([[1],[3]])))
And here is my attempt:
# design network
model = tf.keras.Sequential()
LSTM = tf.keras.layers.LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(LSTM) #, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(tf.keras.layers.Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)
with tf.Session() as session:
testXshape = test_X.shape
GPR = GPR(model, session,testXshape)
Thanks!
I've let the TensorFlowEvaluator the same and created the GPR class this way:
class ValFcn(TensorFlowEvaluator):
import tensorflow as tf
def __init__(self, NN, session, opts={}):
self.X = self.tf.placeholder(shape=(1,4), dtype=self.tf.float32)
self.output = NN(self.X)
TensorFlowEvaluator.__init__(self, [self.X], [self.output], session, opts)
def eval(self, arg):
ret = TensorFlowEvaluator.eval(self, arg)
return ret
I was working with float32 so I had to change it there and in the TensorFlowEvaluator.
I'm actually using this model as a cost function term for an OCP.
Hope it works!
I need help with my machine learning code. I am trying to train my network using RPROP, and the train accuracy has not been updating.
Here's the code:
import theano
from theano import *
import theano.tensor as T
from theano.ifelse import ifelse
import numpy as np
from random import random
from sklearn import datasets
from sklearn.model_selection import train_test_split
import math
#GETTING TEST DATA
def get_iris_data():
""" Read the iris data set and split them into training and test sets """
iris = datasets.load_iris()
data = iris.data
target = iris.target
# Prepend the column of 1s for bias
N, M = data.shape
all_X = np.ones((N, M + 1))
all_X[:, 1:] = data
# Convert into one-hot vectors
num_labels = len(np.unique(target))
all_Y = np.eye(num_labels)[target] # One liner trick!
return train_test_split(all_X, all_Y, test_size=0.33)
#SETTING INITIAL WEIGHT VALUES
def init_weights(shape):
""" Weight initialization """
weights = np.asarray(np.random.randn(*shape) * 0.01, dtype=theano.config.floatX)
return theano.shared(weights)
def feedforward(X, w1, w2):
hidden = T.nnet.sigmoid(T.dot(X,w1))
out = T.nnet.softmax(T.dot(hidden,w2))
return out
def rprop(cost, params, learning_rate):
""" Back-propagation """
#RPROP Variables
updates = []
gradients = T.grad(cost = cost, wrt = params)
#Default Values
prevparams = params
deltaMax = 50.
deltaMin = math.exp(-6)
deltas = -0.1 * numpy.ones(len(params))
prevgradients = numpy.zeros(len(params))
npos = 1.2
nneg = 0.5
#All Values
allvalues = zip(params, prevparams, gradients, deltas, prevgradients)
for param, prevparam, gradient, delta, prevgradient in allvalues:
polarity = T.sgn(gradient * prevgradient)
prevdelta = delta
if T.gt(polarity, 0):
delta = T.minimum(prevdelta * npos, deltaMax)
change = - T.sgn(gradient) * delta
prevgradient = gradient
elif T.lt(polarity,0):
delta = T.maximu(prevdelta * nneg, deltaMin)
prevgradient = 0
change = -prevgradient
else:
change = - T.sign(gradient) * delta
prevgradient = gradient
updates.append((param, param - change * learning_rate))
return updates
#MAIN FUNCTION
def main():
#Initialization of Variables and data
train_X, test_X, train_Y, test_Y = get_iris_data()
learning_rate = 0.01
X = T.fmatrix()
Y = T.fmatrix()
#Set ANN Network Size
in_size = train_X.shape[1]
hid_size = 256
out_size = train_Y.shape[1]
#Set weights inbetween
w1 = init_weights((in_size, hid_size))
w2 = init_weights((hid_size, out_size))
#Forward Propagation Function for Neuron activation and transfer funtion
yHat = feedforward(X,w1,w2)
#Backpropagation for correction
cost = T.mean(T.nnet.categorical_crossentropy(yHat, Y))
params = [w1,w2]
updates = rprop(cost, params, learning_rate)
# Train and predict
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
pred_Y = T.argmax(yHat, axis=1)
predict = theano.function(inputs=[X], outputs=pred_Y, allow_input_downcast=True)
# Run SGD
for iter in range(2000):
for i in range(len(train_X)):
train(train_X[i: i + 1], train_Y[i: i + 1])
train_accuracy = np.mean(np.argmax(train_Y, axis=1) == predict(train_X))
test_accuracy = np.mean(np.argmax(test_Y, axis=1) == predict(test_X))
print("Iteration = %d, train accuracy = %.2f%%, test accuracy = %.2f%%"
% (iter + 1, 100 * train_accuracy, 100 * test_accuracy))
if __name__ == '__main__':
main()