Exploding Gradient on fully connected layer - python

I am trying to learn on the MNIST dataset using a deep learning model.
My model has the following structure:
Input(28*28*1)
Conv2d(14*14*32)
Conv2d(7*7*64)-flatten
FC(3136*1024)
FC(1024*10)
10 class prediction of MNIST
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
mnist = tf.keras.datasets.mnist
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
y_train,y_test=one_hot(y_train),one_hot(y_test)
x_train=np.reshape(x_train,[x_train.shape[0],28,28,1])
x_test=np.reshape(x_test,[x_test.shape[0],28,28,1])
x_dataset=tf.data.Dataset.from_tensor_slices(x_train)
y_dataset=tf.data.Dataset.from_tensor_slices(y_train)
train_dataset=tf.data.Dataset.zip((x_dataset,y_dataset)).repeat().batch(50)
iterator=tf.data.Iterator.from_structure(train_dataset.output_types,train_dataset.output_shapes)
next_element=iterator.get_next()
training_init_op=iterator.make_initializer(train_dataset)
x_testds=tf.data.Dataset.from_tensor_slices(x_test)
y_testds=tf.data.Dataset.from_tensor_slices(y_test)
testds=tf.data.Dataset.zip((x_testds,y_testds)).repeat().batch(2000)
valid_inti_op=iterator.make_initializer(testds)
##%%##
def one_hot(y_train):
    y_train1=np.zeros((y_train.shape[0],10))
    for i in range(y_train.shape[0]):
        y_train1[i][y_train[i]]=1
    return y_train1
def conv_layer(input,channels_in,channels_out,name="conv"):
    with tf.name_scope(name):
        input=tf.cast(input,tf.float32)
        w=tf.Variable(tf.truncated_normal([5,5,channels_in,channels_out],stddev=0.1),name="W")
        b=tf.Variable(tf.truncated_normal([channels_out],stddev=0.1),name="B")
        conv=tf.nn.conv2d(input,w,strides=[1,1,1,1],padding="SAME")
        act=tf.nn.relu(conv+b)
        tf.summary.histogram("weights",w)
        tf.summary.histogram("biases",b)
        tf.summary.histogram("activation",act)
        return act
def fc_layer(input,channels_in,channels_out,name="fc"):
    with tf.name_scope(name):
        w=tf.Variable(tf.truncated_normal([channels_in,channels_out],stddev=0.1),name="W")
        b=tf.Variable(tf.zeros([channels_out]),name="B")
        act=tf.nn.relu(tf.matmul(input,w)+b)
        tf.summary.histogram("weights",w)
        tf.summary.histogram("biases",b)
        tf.summary.histogram("activation",act)
        return act
conv1=conv_layer(next_element[0],1,32,"conv1")
pool1=tf.nn.max_pool(conv1,ksize=[1,2,2,1],strides=[1,2,2,1],padding="SAME",name="pool1")
conv2=conv_layer(pool1,32,64,"conv2")
pool2=tf.nn.max_pool(conv2,ksize=[1,2,2,1],strides=[1,2,2,1],padding="SAME",name="pool2")
flattened=tf.reshape(pool2,[-1,7*7*64])
fc1=fc_layer(flattened,7*7*64,1024,"fc1")
logits=fc_layer(fc1,1024,10,"fc2")
##%%##
with tf.name_scope("cross_entropy"):
    cross_entropy=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,labels=next_element[1]))
with tf.name_scope("train"):
    train_step=tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope("accuracy"):
    correct_prediction=tf.equal(tf.argmax(logits,1),tf.argmax(next_element[1],1))
    accuracy=tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
##%%
sess=tf.Session()
tf.summary.scalar('cross_entropy',cross_entropy)
tf.summary.scalar('accuracy',accuracy)
tf.summary.image('input',next_element[0])
merged_summary=tf.summary.merge_all()
writer=tf.summary.FileWriter("D:/work/ML/tensorboard/MNIST/deep/4")
writer.add_graph(sess.graph)
##%%
sess.run(tf.global_variables_initializer())
sess.run(training_init_op)
for i in range(600):
    s=sess.run(merged_summary)
    if(i%5==0):
        writer.add_summary(s,i)
    print(i,end="\r")
sess.run(valid_inti_op)
for i in range(1,6):
    s1=sess.run(merged_summary)
    writer.add_summary(s1,601+i)
My accuracy and cross_entropy are stuck. After looking at TensorBoard, the issue seems to be that the weights of my FC layers are stuck at very large values, even though I have initialised them to 0. If this really is the error, I don't know how to fix it; and if it isn't, I don't know what the error is.

A sess.run(train_step) after inputting the data fixed my code: the training loop above only ever ran merged_summary, so the Adam update step never executed.
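For reference, a minimal sketch of the corrected loop (same variable names as above, assuming the rest of the graph is unchanged):

sess.run(tf.global_variables_initializer())
sess.run(training_init_op)
for i in range(600):
    # run the optimizer together with the summaries so the weights actually update
    _, s = sess.run([train_step, merged_summary])
    if i % 5 == 0:
        writer.add_summary(s, i)
    print(i, end="\r")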

Related

Handling extremely long timestep sequence in LSTM (NLP Multi-label classification)

This is my first time asking a question on Stack Overflow, so sorry if I am not asking in the correct format. Let's say I am working with some extremely long timestep sequence data (10000000 timesteps), with 2701 samples and only one feature, so my input array is [2701, 10000000, 1], and my dataset looks like:
[ 2.81143e-01 4.98219e-01 -8.08500e-03 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 1.95077e-01 2.20920e-02 -1.68663e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
...
[ 1.06033e-01 8.96650e-02 -3.20860e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 6.85510e-02 -3.83653e-01 -2.19265e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 2.51404e-01 8.02280e-02 2.84610e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]]
However, from what I have read, LSTM networks usually perform better with sequences in the range of 200~400 time steps. Even ignoring performance, I cannot successfully train with even a single sample [1, 10000000, 1]. I believe the network itself is functional, since when I limit the length of each sample to 1500 (giving [2701, 1500, 1]) it finally stops getting stuck at the first epoch. Here is my code if needed:
from keras.utils import Sequence
import numpy as np
from numpy.lib.format import open_memmap
import gc
import platform
import pandas as pd
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Masking
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC' #A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
stock_price=pd.read_csv("C:/Users/user/Desktop/Final-Year-Project-master/stock_classification_7Days_test.csv",sep=',', dtype={"ID":"string","Class":int})
print(stock_price)
print (platform.architecture())
y_data=[]
x_data=[]
y_data=pd.get_dummies(stock_price['Class'])
def embedded_reader(file_path):
    with open(file_path) as embedded_raw:
        for line in embedded_raw:
            for word in line.split(','):
                try:
                    val=float(word)
                    yield val
                except:
                    pass
    embedded_raw.close()
    gc.collect()
for y in range(len(stock_price)):
    if int(stock_price.at[y,'Class']) is not None:
        i = stock_price.at[y,'ID']
        print("Company code current: ",i)
        embedded_current=[]
        try:
            gen=embedded_reader("C:/Users/user/Desktop/Final-Year-Project-master/json_test/{}.jsonl".format(i))
            while True:
                val=next(gen)
                embedded_current.append(val)
        except:
            pass
        fp=np.memmap('embedded_array.mymemmap', dtype=np.uint8,mode='w+',shape=(1,))
        fp=np.delete(fp,0)
        fp=np.concatenate((fp,embedded_current),axis=0)
        fp=np.pad(fp, (0,(10000000-len(embedded_current))), 'constant', constant_values=(100, 100))
        print(fp)
        x_data.append(fp)
        print(np.shape(x_data))
        del fp
        print("embedded_data current: ",len(embedded_current))
        print("this is number {}".format(y))
        print("-----------------------------")
        gc.collect()
gc.collect()
print(len(x_data))
print(np.shape(x_data))
print("-"*20)
print(np.shape(y_data))
print(np.size(y_data))
X_train, X_test, y_train, y_test = train_test_split(x_data,y_data,test_size=0.2,random_state=0)
print(np.shape(X_train))
print(np.shape(X_test))
X_train=np.array(X_train)
X_test=np.array(X_test)
print(np.shape(X_train))
print(np.shape(X_test))
print(X_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_train.shape[1], 1))
print(np.shape(X_train))
print(np.shape(X_test))
y_train=np.array(y_train)
y_test=np.array(y_test)
print(len(X_test[0]))
print(np.shape(y_train))
model=Sequential()
model.add(Masking(mask_value=100, input_shape=(10000000,1)))
model.add(LSTM(units=1, return_sequences = True, input_shape=(10000000,1)))
model.add(LSTM(units=1,return_sequences=False))
model.add(Dense(5,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
model.fit(X_train,y_train,epochs=50,batch_size=4,verbose=1)
print(model.predict(X_test))
print("class label:", reverse_label(model.predict_classes(X_test)))
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
model.save('my_model')
Some tutorials mentioned reshaping the array, so I tried to reshape mine into something like [2701*25000, 10000000/25000, 1], but then I got stuck with the problem that the number of x_data samples and y_data samples is no longer the same. I also saw model.fit_generator mentioned, but that seems to solve the problem of a huge number of samples, whereas in my case the model does not even work on a single sample (I am new to neural networks, so I am not sure I am understanding it correctly). I have totally got no clue and would really appreciate any help, thank you.
Edit: just to state my question clearly: "any advice on treating such long input using LSTM?"
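Since the reshape attempt stumbles on x_data and y_data no longer matching, here is a minimal sketch of one way to keep them aligned: split each long sequence into fixed-length windows and repeat the per-sequence label for every window. The window length of 400 and the helper name are illustrative assumptions, not something from the original post:

import numpy as np

def split_into_windows(x, y, window_len=400):
    # x: [n_samples, seq_len, n_features], y: [n_samples, n_classes]
    n_samples, seq_len, n_features = x.shape
    n_windows = seq_len // window_len              # drop any remainder for simplicity
    x = x[:, :n_windows * window_len, :]
    x_windows = x.reshape(n_samples * n_windows, window_len, n_features)
    y_windows = np.repeat(np.asarray(y), n_windows, axis=0)  # one label copy per window
    return x_windows, y_windows

# hypothetical usage with the shapes from the question (2701 samples, 10000000 steps, 1 feature):
# X_win, y_win = split_into_windows(X_train, y_train, window_len=400)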

Debugging tensorflow fit not making sense

So I was getting different results between a self-implemented training loop and TensorFlow's results. I wanted to test each value to see where my error was (loss, gradients, optimizer, etc.).
Therefore I wrote a test code like the one in this repo, inspired by the fashion MNIST example. Just for simplicity I will copy-paste it at the end of the question.
Logic:
Basically, I do 1 epoch on 1 batch. And then save:
Weights before training
Gradients
Weights after only one epoch and batch.
As I use TensorFlow's default SGD algorithm, the saved gradients should be equal to (initial_weights - final_weights) / 0.01. This idea was taken from here.
However, this does not happen; what's more, the results get closer if I divide by 0.0001 instead of 0.01, which, strangely enough, is 0.01^2.
Is there an error in my logic or in my testing code? I cannot find it.
PS: I tried tf versions 2.2.0 and 2.4.1 on Linux.
import tensorflow as tf
import numpy as np
from pdb import set_trace
def get_dataset():
    fashion_mnist = tf.keras.datasets.fashion_mnist
    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
    return (train_images, train_labels), (test_images, test_labels)
def get_model(init1='glorot_uniform', init2='glorot_uniform'):
    tf.random.set_seed(1)
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu', kernel_initializer=init1),
        tf.keras.layers.Dense(10, kernel_initializer=init2)
    ])
    model.compile(optimizer='sgd',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                  metrics=['accuracy'])
    return model
def train(model, x_fit, y_fit):
    np.save("initial_weights.npy", np.array(model.get_weights()))
    with tf.GradientTape() as g:
        y_pred = model(x_fit)
        loss = tf.keras.losses.categorical_crossentropy(y_pred=y_pred, y_true=y_fit)
    np.save("loss.npy", np.array(loss))
    gradients = g.gradient(loss, model.trainable_weights)
    np.save("gradients.npy", np.array(gradients))
    model.fit(x_fit, y_fit, epochs=1, batch_size=100)
    np.save("final_weights.npy", np.array(model.get_weights()))
if __name__ == "__main__":
    (train_images, train_labels), (test_images, test_labels) = get_dataset()
    model = get_model()
    y_fit = np.zeros((100, 10))
    for i, val in enumerate(train_labels[:100]):
        y_fit[i][val] = 1.
    train(model, train_images[:100], y_fit)
    results = {
        "loss": np.load("loss.npy", allow_pickle=True),
        "init_weights": np.load("initial_weights.npy", allow_pickle=True),
        "gradients": np.load("gradients.npy", allow_pickle=True),
        "final_weights": np.load("final_weights.npy", allow_pickle=True)
    }
    for i_w, f_w, gr in zip(results["init_weights"], results["final_weights"], results["gradients"]):
        gr = gr.numpy()
        print(np.allclose(gr, (i_w - f_w) / 0.01))
        # set_trace()
It looks like the call to fit is averaging the gradient over the batch size; I don't know whether that is a bug or by design. (That would also explain the factor you observed: with a batch size of 100, 0.01 / 100 = 0.0001, so it is the batch size rather than 0.01^2 that matters.)
Since you compute your gradient manually anyway, you can just call model.optimizer.apply_gradients to update your weights, and you should get the correct results.
def train(model, x_fit, y_fit):
    np.save("initial_weights.npy", np.array(model.get_weights()))
    with tf.GradientTape() as g:
        y_pred = model(x_fit)
        loss = tf.keras.losses.categorical_crossentropy(y_pred=y_pred, y_true=y_fit)
    np.save("loss.npy", np.array(loss))
    gradients = g.gradient(loss, model.trainable_weights)
    np.save("gradients.npy", np.array(gradients))
    model.optimizer.apply_gradients(zip(gradients, model.trainable_weights))
    np.save("final_weights.npy", np.array(model.get_weights()))
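With apply_gradients, the weight delta divided by the learning rate should recover the saved gradients. A quick check one might run after re-running the script with this modified train function (assuming the default SGD learning rate of 0.01 and the same results dictionary as in the question):

for i_w, f_w, gr in zip(results["init_weights"], results["final_weights"], results["gradients"]):
    # SGD applies w_new = w - lr * grad, so (w_old - w_new) / lr should recover the gradient
    # per the answer, these should now agree up to float32 rounding
    print(np.allclose(gr.numpy(), (i_w - f_w) / 0.01))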

The accuracy problem of hand sign gestures recognition with using CNN in Python

I'm working on my senior project at my university and I have only 2 days to fix this problem. I created a hand gesture recognition model using a CNN in Python. I used 78000 images with 50x50px values, but I got stuck in the last part of my model: I cannot improve my accuracy. When I start to train the data with 100 epochs, the first 15 epochs show 0.039 accuracy, which is horrible, so I don't even wait for the training to finish. Maybe it happens because of the values of conv2d or pooling, because I don't know how to put the correct values into conv2d, pooling, etc.
I'm new and I could not fix the problem. If you can help me, I will be grateful.
The code I wrote is given below:
from keras.models import Sequential
from keras.layers import Convolution2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import pickle
import cv2
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
from numpy import asarray
DATADIR = "asl_alphabet_train"
CATEGORIES = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]
X_train = []
y_train = []
X_test=[]
y_test=[]
IMG_SIZE=50
def create_training_data():
    for category in CATEGORIES:
        path = os.path.join(DATADIR,category)  # path to this category's folder
        class_num = CATEGORIES.index(category)  # numeric class label for this category
        for img in tqdm(os.listdir(path)):  # iterate over each image in the category
            try:
                img_array = cv2.imread(os.path.join(path,img))  # read the image as an array
                #new_array = cv2.resize(img_array, (28, 50))  # resize to normalize data size
                X_train.append(img_array)  # add the image to our training data
                y_train.append(class_num)  # add the matching label
            except Exception as e:  # in the interest of keeping the output clean...
                pass
create_training_data()
X_train = asarray(X_train)
y_train = asarray(y_train)
"""
nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))
"""
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2,random_state=0)
N = y_train.size
M = y_train.max()+1
resultArray = np.zeros((N,M),int)
idx = (np.arange(N)*M) + y_train
resultArray.ravel()[idx] = 1
y_train=resultArray
classifier=Sequential()
#convolution step
classifier.add(Convolution2D(filters=96, input_shape=(50,50,3), kernel_size=(11,11), padding='valid',activation="relu"))
#pooling step
classifier.add(MaxPooling2D(pool_size=(2,2)))
#convolution step
classifier.add(Convolution2D(filters=256,kernel_size=(11,11),padding="valid",activation="relu"))
#pooling step
classifier.add(MaxPooling2D(pool_size=(2,2)))
classifier.add(Convolution2D(filters=384,kernel_size=(3,3),padding="valid",activation="relu"))
classifier.add(MaxPooling2D(pool_size=(2,2)))
#flatten step
classifier.add(Flatten())
#Dense(Fully connected step)
classifier.add(Dense(output_dim=128,activation="relu"))
#Dropout to decrease the possibility of overfitting
classifier.add(Dropout(0.5))
#Dense to determine the output
classifier.add(Dense(output_dim=26,activation="softmax"))
#compile step
classifier.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
classifier.fit(X_train,y_train,epochs=100,batch_size=32)
filename="CNN_TEST.sav"
pickle.dump(classifier, open(filename, 'wb'))
y_pred=classifier.predict(X_test)
print(y_pred)
I would recommend the following (a quick sketch follows the list):
1) Reduce the kernel size in the first two convolutional layers of your model.
2) I believe a MaxPooling layer is not necessary after every convolutional layer. Do verify this.
3) A Dropout of 0.5 could drop a large number of essential neurons; you might want to lower it.
4) Vary the number of epochs and see how your model performs each time.
Plot "train accuracy vs val accuracy" and "train loss vs val loss" for each attempt and see if your model overfits or underfits.
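For illustration only, a minimal sketch of an architecture along the lines of points 1, 2 and 3, assuming 50x50x3 inputs and 26 classes; the filter counts and dropout rate are placeholder choices, not values taken from the question or guaranteed to fix the accuracy:

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

classifier = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=(50, 50, 3)),  # smaller kernels (point 1)
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation="relu"),
    Conv2D(64, kernel_size=(3, 3), activation="relu"),  # no pooling after every conv (point 2)
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation="relu"),
    Dropout(0.3),  # lower dropout than 0.5 (point 3)
    Dense(26, activation="softmax"),
])
classifier.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
classifier.summary()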

keras rl - dqn model update

I am reading through the DQN implementation in keras-rl /rl/agents/dqn.py and see that in the compile() step essentially 3 keras models are instantiated:
self.model : provides q value predictions
self.trainable_model : same as self.model but has the loss function we want to train
self.target_model : target model which provides the q targets and is updated every k steps with the weights from self.model
The only model on which train_on_batch() is called is trainable_model, however - and this is what I don't understand - this also updates the weights of model.
In the definition of trainable_model one of the output tensors y_pred is referencing the output from model:
y_pred = self.model.output
y_true = Input(name='y_true', shape=(self.nb_actions,))
mask = Input(name='mask', shape=(self.nb_actions,))
loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_true, y_pred, mask])
ins = [self.model.input] if type(self.model.input) is not list else self.model.input
trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
When trainable_model.train_on_batch() is called, BOTH the weights in trainable_model and in model change. I am surprised because even though the two models reference the same output tensor object (trainable_model.y_pred = model.output), the instantiation of trainable_model = Model(...) should also instantiate a new set of weights, no?
Thanks for the help!
This is a small example to show that when you instantiate a new keras.models.Model() using the input and output tensor from another model, then the weights of these two models are shared. They don't get re-initialized.
# keras version: 2.2.4
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import SGD
np.random.seed(123)
model1 = Sequential()
model1.add(Dense(1, input_dim=1, activation="linear", name="model1_dense1", weights=[np.array([[10]]),np.array([10])]))
model1.compile(optimizer=SGD(), loss="mse")
model2 = Model(inputs=model1.input, outputs=model1.output)
model2.compile(optimizer=SGD(), loss="mse")
x = np.random.normal(size=2000)
y = 2 * x + np.random.normal(size=2000)
print("model 1 weights", model1.get_weights())
print("model 2 weights", model2.get_weights())
model2.fit(x,y, epochs=3, batch_size=32)
print("model 1 weights", model1.get_weights())
print("model 2 weights", model2.get_weights())
Definitely something to keep in mind; it wasn't intuitive to me.
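As an aside (my own addition, not part of the original answer): if independent weights are wanted instead, keras.models.clone_model builds a new model with the same architecture but freshly initialized, unshared variables:

from keras.models import clone_model

model3 = clone_model(model1)                 # same architecture, new (unshared) weights
model3.set_weights(model1.get_weights())     # optional: copy the current values across
model3.compile(optimizer=SGD(), loss="mse")  # needs its own compile before training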

Dropout layer directly in tensorflow: how to train?

After I created my model in Keras, I want to get the gradients and apply them directly in TensorFlow with the tf.train.AdamOptimizer class. However, since I am using a Dropout layer, I don't know how to tell the model whether it is in training mode or not. The training keyword is not accepted. This is the code:
net_input = Input(shape=(1,))
net_1 = Dense(50)
net_2 = ReLU()
net_3 = Dropout(0.5)
net = Model(net_input, net_3(net_2(net_1(net_input))))
#mycost = ...
optimizer = tf.train.AdamOptimizer()
gradients = optimizer.compute_gradients(mycost, var_list=[net.trainable_weights])
# perform some operations on the gradients
# gradients = ...
trainstep = optimizer.apply_gradients(gradients)
I get the same behavior with and without the dropout layer, even with dropout rate=1. How can I solve this?
As @Sharky already said, you can use the training argument when invoking the call() method of the Dropout class. However, if you want to train in TensorFlow graph mode, you need to pass a placeholder and feed it a boolean value during training. Here is an example of fitting Gaussian blobs, applicable to your case:
import tensorflow as tf
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import Input
from tensorflow.keras import Model
x_train, y_train = make_blobs(n_samples=10,
                              n_features=2,
                              centers=[[1, 1], [-1, -1]],
                              cluster_std=1)
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2)
# `istrain` indicates whether it is inference or training
istrain = tf.placeholder(tf.bool, shape=())
y = tf.placeholder(tf.int32, shape=(None))
net_input = Input(shape=(2,))
net_1 = Dense(2)
net_2 = Dense(2)
net_3 = Dropout(0.5)
net = Model(net_input, net_3(net_2(net_1(net_input)), training=istrain))
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y, logits=net.output)
loss_fn = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(0.01)
grads_and_vars = optimizer.compute_gradients(loss_fn,
                                             var_list=[net.trainable_variables])
trainstep = optimizer.apply_gradients(grads_and_vars)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    l1 = loss_fn.eval({net_input:x_train,
                       y:y_train,
                       istrain:True})  # apply dropout
    print(l1)  # 1.6264652
    l2 = loss_fn.eval({net_input:x_train,
                       y:y_train,
                       istrain:False})  # no dropout
    print(l2)  # 1.5676715
    sess.run(trainstep, feed_dict={net_input:x_train,
                                   y:y_train,
                                   istrain:True})  # train with dropout
Keras layers inherit from the tf.keras.layers.Layer class, and the Keras API handles this internally with model.fit. When a Keras Dropout layer is used with a pure TensorFlow training loop, it supports a training argument in its call function.
So you can control it with
dropout = tf.keras.layers.Dropout(rate, noise_shape, seed)(prev_layer, training=is_training)
From the official TF docs:
Note: The following optional keyword arguments are reserved for specific uses:
* training: Boolean scalar tensor or Python boolean indicating whether the call is meant for training or inference.
* mask: Boolean input mask.
If the layer's call method takes a mask argument (as some Keras layers do), its default value will be set to the mask generated for inputs by the previous layer (if input did come from a layer that generated a corresponding mask, i.e. if it came from a Keras layer with masking support).
https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dropout#call
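As a quick sanity check that the flag actually takes effect (a sketch assuming TF 2.x eager execution, which is simpler than, and different from, the graph-mode setup in the question):

import tensorflow as tf

drop = tf.keras.layers.Dropout(0.5)
x = tf.ones((1, 10))
print(drop(x, training=False).numpy())  # inference: input passes through unchanged
print(drop(x, training=True).numpy())   # training: roughly half the entries zeroed, the rest scaled by 1/(1-0.5)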
