Below is a piece of NumPy code implementing a 2-layer neural network for a fitting problem. The activation function is ReLU, the training algorithm is Adam, and the loss function is half of the mean squared error. However, when the batch size is large (e.g. 10000), the loss becomes NaN after some iterations; the problem does not occur for small batch sizes. Could anyone help me explain why this may happen? (The data come from a MATLAB workspace: 6_final_mapping_pos.mat.)
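For reference, the loss being minimized (with N the batch size and reg the L2 strength), as implemented in feedforward below, is

loss = (1/N) * sum( 0.5 * (output - target)^2 ) + 0.5 * reg * (||W1||^2 + ||W2||^2)

which is why backpropagation starts from d_output = (output_layer - train_output) / batch_size.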
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = sio.loadmat('6_final_mapping_pos.mat')
class NeuralNetwork():
def __init__(self):
self.batch_size = 256
self.input_size = 5 # input dimension is 5
self.hidden_layer1_size = 50
self.output_size = 1 # output dimension is 1
self.train_data = data['training_data_pos']
self.df_traindata = pd.DataFrame(data=self.train_data)
self.validation_data_num = 17142
self.valid_data = data['validation_data_pos']
self.df_validdata = pd.DataFrame(data=self.valid_data)
# weight initialization for ReLU
self.W1 = np.random.randn(self.input_size, self.hidden_layer1_size)/ np.sqrt(self.input_size/2)
self.W2 = np.random.randn(self.hidden_layer1_size, self.output_size)/ np.sqrt(self.hidden_layer1_size/2)
#bias initialization
self.b1 = np.zeros((1,self.hidden_layer1_size))
self.b2 = np.zeros((1,self.output_size))
self.lr = 5e-3 # learning rate
self.reg = 1e-3 # regularization strength
self.p = 0.5 # dropout probability = 1-p
self.first_moment_W3=0
self.second_moment_W3=0
self.first_moment_W2=0
self.second_moment_W2=0
self.first_moment_W1=0
self.second_moment_W1=0
self.first_moment_b3=0
self.second_moment_b3=0
self.first_moment_b2=0
self.second_moment_b2=0
self.first_moment_b1=0
self.second_moment_b1=0
def feedforward(self):
### randomly selected mini-batch as inputs
self.df_sample_t = self.df_traindata.sample(n = self.batch_size)
self.train_input = self.df_sample_t.as_matrix(columns=[0,1,2,3,4])
self.train_output = self.df_sample_t.as_matrix(columns=[5])
# hidden layer with dropout technique
self.hidden_layer1 = np.maximum(0, (np.dot(self.train_input, self.W1) + self.b1))
U1= np.random.rand(*self.hidden_layer1.shape) < self.p # drop mask
self.hidden_layer1 *= U1 # drop!
self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
self.data_loss = np.sum(0.5*(self.output_layer-self.train_output)**2) / self.batch_size
self.reg_loss = 0.5*self.reg*np.sum(self.W1*self.W1) + 0.5*self.reg*np.sum(self.W2*self.W2)
self.total_loss = self.data_loss + self.reg_loss
def backpropagation(self):
self.d_output = (self.output_layer-self.train_output)/ self.batch_size
#data part
self.dW2 = np.dot(self.hidden_layer1.T, self.d_output)
self.db2 = np.sum(self.d_output, axis=0, keepdims=True)
self.dhidden1 = np.dot(self.d_output, self.W2.T)
self.dhidden1[self.hidden_layer1<= 0] = 0
self.dW1 = np.dot(self.train_input.T, self.dhidden1)
self.db1 = np.sum(self.dhidden1, axis=0, keepdims=True)
#regularization part
self.dW2 = self.dW2 + self.reg * self.W2
self.dW1 = self.dW1 + self.reg * self.W1
def Adam(self, epoch, dW2, dW1, db2, db1):
beta1 = 0.9
beta2 = 0.99
self.first_moment_W2 = beta1*self.first_moment_W2 + (1-beta1)*dW2
self.second_moment_W2 = beta2*self.second_moment_W2 + (1-beta2)*dW2*dW2
first_unbias_W2 = self.first_moment_W2 /(1-beta1 ** epoch)
second_unbias_W2 = self.second_moment_W2 /(1-beta2 ** epoch)
self.W2 -= self.lr * first_unbias_W2 / (np.sqrt(second_unbias_W2) +1e-7)
self.first_moment_W1 = beta1*self.first_moment_W1 + (1-beta1)*dW1
self.second_moment_W1 = beta2*self.second_moment_W1 + (1-beta2)*dW1*dW1
first_unbias_W1 = self.first_moment_W1 /(1-beta1 ** epoch)
second_unbias_W1 = self.second_moment_W1 /(1-beta2 ** epoch)
self.W1 -= self.lr * first_unbias_W1 / (np.sqrt(second_unbias_W1) +1e-7)
self.first_moment_b2 = beta1*self.first_moment_b2 + (1-beta1)*db2
self.second_moment_b2 = beta2*self.second_moment_b2 + (1-beta2)*db2*db2
first_unbias_b2 = self.first_moment_b2 /(1-beta1 ** epoch)
second_unbias_b2 = self.second_moment_b2 /(1-beta2 ** epoch)
self.b2 -= self.lr * first_unbias_b2 / (np.sqrt(second_unbias_b2) +1e-7)
self.first_moment_b1 = beta1*self.first_moment_b1 + (1-beta1)*db1
self.second_moment_b1 = beta2*self.second_moment_b1 + (1-beta2)*db1*db1
first_unbias_b1 = self.first_moment_b1 /(1-beta1 ** epoch)
second_unbias_b1 = self.second_moment_b1 /(1-beta2 ** epoch)
self.b1 -= self.lr * first_unbias_b1 / (np.sqrt(second_unbias_b1) +1e-7)
def validation(self):
self.df_sample_v = self.df_validdata.sample(n = self.validation_data_num)
self.valid_input = self.df_sample_v.as_matrix(columns=[0,1,2,3,4])
self.valid_output = self.df_sample_v.as_matrix(columns=[5])
self.hidden_layer1 = np.maximum(0, np.dot(self.valid_input, self.W1) + self.b1) *self.p
self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
self.data_loss = np.sum(0.5*(self.output_layer-self.valid_output)**2) / self.validation_data_num
self.reg_loss = 0.5*self.reg*np.sum(self.W1*self.W1) + 0.5*self.reg*np.sum(self.W2*self.W2)
self.total_loss = self.data_loss + self.reg_loss
NN = NeuralNetwork()
num_iterations = 120
training_loss = np.array([])
validation_loss = np.array([])
validation_dataloss = np.array([])
t=1
T=np.array([range(1,num_iterations)]).T
# Training and validation
while(t < num_iterations):
NN.feedforward()
NN.backpropagation()
NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
training_loss = np.append(training_loss, NN.total_loss)
if t % 10 == 0:
print ("training:" + "total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss,NN.data_loss,NN.reg_loss))
NN.validation()
validation_loss = np.append(validation_loss, NN.total_loss)
validation_dataloss = np.append(validation_dataloss, NN.data_loss)
if t % 10 == 0:
print ("validation:" + "total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss,NN.data_loss,NN.reg_loss))
t+=1
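One way to localize a failure like this is to make NumPy raise on overflow/invalid values and stop at the first non-finite loss; a minimal debugging sketch (the printed quantities are just suggestions, not a fix):
np.seterr(over='raise', invalid='raise')  # raise FloatingPointError instead of silently producing inf/nan
NN = NeuralNetwork()
t = 1
while t < num_iterations:
    NN.feedforward()
    if not np.isfinite(NN.total_loss):
        print("loss became non-finite at iteration", t)
        print("max |output|:", np.max(np.abs(NN.output_layer)))
        print("max |W1|:", np.max(np.abs(NN.W1)), "max |W2|:", np.max(np.abs(NN.W2)))
        break
    NN.backpropagation()
    NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
    t += 1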
I have created a PyTorch model and I want to reduce its size.
Defining the model architecture:
import copy
import os
import sys
import time
import random
import codecs
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.quantization
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.prune as prune
import torch.autograd as autograd
from torch.autograd import Variable
from config import Config
from loader import *
from utils import *
from model import BiLSTM_CRF
START_TAG = '<START>'
STOP_TAG = '<STOP>'
def init_embedding(input_embedding):
"""
Initialize embedding
"""
bias = np.sqrt(3.0 / input_embedding.size(1))
nn.init.uniform(input_embedding, -bias, bias)
def init_linear(input_linear):
"""
Initialize linear transformation
"""
bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))
nn.init.uniform(input_linear.weight, -bias, bias)
if input_linear.bias is not None:
input_linear.bias.data.zero_()
def init_lstm(input_lstm):
"""
Initialize lstm
"""
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.weight_ih_l' + str(ind))
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
weight = eval('input_lstm.weight_hh_l' + str(ind))
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
if input_lstm.bidirectional:
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.weight_ih_l' + str(ind) + '_reverse')
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
weight = eval('input_lstm.weight_hh_l' + str(ind) + '_reverse')
bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
nn.init.uniform(weight, -bias, bias)
if input_lstm.bias:
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.bias_ih_l' + str(ind))
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
weight = eval('input_lstm.bias_hh_l' + str(ind))
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
if input_lstm.bidirectional:
for ind in range(0, input_lstm.num_layers):
weight = eval('input_lstm.bias_ih_l' + str(ind) + '_reverse')
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
weight = eval('input_lstm.bias_hh_l' + str(ind) + '_reverse')
weight.data.zero_()
weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
def to_scalar(var):
return var.view(-1).data.tolist()[0]
def argmax(vec):
_, idx = torch.max(vec, 1)
return to_scalar(idx)
def log_sum_exp(vec):
# vec 2D: 1 * tagset_size
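# subtracting the max before exponentiating is the log-sum-exp trick:
# it prevents exp() from overflowing without changing the result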
max_score = vec[0, argmax(vec)]
max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
return max_score + \
torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
class BiLSTM_CRF(nn.Module):
def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, char_lstm_dim=25,
char_to_ix=None, pre_word_embeds=None, char_embedding_dim=25, use_gpu=False,
n_cap=None, cap_embedding_dim=None, use_crf=True, char_mode='CNN'):
super(BiLSTM_CRF, self).__init__()
self.use_gpu = use_gpu
self.embedding_dim = embedding_dim #100
self.hidden_dim = hidden_dim #200
self.vocab_size = vocab_size
self.tag_to_ix = tag_to_ix
self.n_cap = n_cap
self.cap_embedding_dim = cap_embedding_dim
self.use_crf = use_crf
self.tagset_size = len(tag_to_ix)
self.out_channels = char_lstm_dim #25
self.char_mode = char_mode
print('char_mode: %s, out_channels: %d, hidden_dim: %d, ' % (char_mode, char_lstm_dim, hidden_dim))
if self.n_cap and self.cap_embedding_dim:
self.cap_embeds = nn.Embedding(self.n_cap, self.cap_embedding_dim)
# print("self.cap_embeds.weight------",self.cap_embeds.weight)
init_embedding(self.cap_embeds.weight)
if char_embedding_dim is not None:
self.char_lstm_dim = char_lstm_dim
self.char_embeds = nn.Embedding(len(char_to_ix), char_embedding_dim)
# print("self.char_embeds.weight-------", self.char_embeds.weight)
init_embedding(self.char_embeds.weight)
if self.char_mode == 'LSTM':
self.char_lstm = nn.LSTM(char_embedding_dim, char_lstm_dim, num_layers=1, bidirectional=True)
init_lstm(self.char_lstm)
if self.char_mode == 'CNN':
self.char_cnn3 = nn.Conv2d(in_channels=1, out_channels=self.out_channels, kernel_size=(3, char_embedding_dim), padding=(2,0))
self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
if pre_word_embeds is not None:
self.pre_word_embeds = True
self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
else:
self.pre_word_embeds = False
self.dropout = nn.Dropout(0.5)
if self.n_cap and self.cap_embedding_dim:
if self.char_mode == 'LSTM':
self.lstm = nn.LSTM(embedding_dim+char_lstm_dim*2+cap_embedding_dim, hidden_dim, bidirectional=True)
if self.char_mode == 'CNN':
self.lstm = nn.LSTM(embedding_dim+self.out_channels+cap_embedding_dim, hidden_dim, bidirectional=True)
else:
if self.char_mode == 'LSTM':
self.lstm = nn.LSTM(embedding_dim+char_lstm_dim*2, hidden_dim, bidirectional=True)
if self.char_mode == 'CNN':
self.lstm = nn.LSTM(embedding_dim+self.out_channels, hidden_dim, bidirectional=True)
init_lstm(self.lstm)
self.hw_trans = nn.Linear(self.out_channels, self.out_channels)
self.hw_gate = nn.Linear(self.out_channels, self.out_channels)
self.h2_h1 = nn.Linear(hidden_dim*2, hidden_dim)
self.tanh = nn.Tanh()
self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)
init_linear(self.h2_h1)
init_linear(self.hidden2tag)
init_linear(self.hw_gate)
init_linear(self.hw_trans)
if self.use_crf:
self.transitions = nn.Parameter(
torch.zeros(self.tagset_size, self.tagset_size))
self.transitions.data[tag_to_ix[START_TAG], :] = -10000
self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
def _score_sentence(self, feats, tags):
# tags is ground_truth, a list of ints, length is len(sentence)
# feats is a 2D tensor, len(sentence) * tagset_size
r = torch.LongTensor(range(feats.size()[0]))
if self.use_gpu:
r = r.cuda()
pad_start_tags = torch.cat([torch.cuda.LongTensor([self.tag_to_ix[START_TAG]]), tags])
pad_stop_tags = torch.cat([tags, torch.cuda.LongTensor([self.tag_to_ix[STOP_TAG]])])
else:
pad_start_tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
pad_stop_tags = torch.cat([tags, torch.LongTensor([self.tag_to_ix[STOP_TAG]])])
score = torch.sum(self.transitions[pad_stop_tags, pad_start_tags]) + torch.sum(feats[r, tags])
return score
def _get_lstm_features(self, sentence, chars2, caps, chars2_length, d):
if self.char_mode == 'LSTM':
# self.char_lstm_hidden = self.init_lstm_hidden(dim=self.char_lstm_dim, bidirection=True, batchsize=chars2.size(0))
chars_embeds = self.char_embeds(chars2).transpose(0, 1)
packed = torch.nn.utils.rnn.pack_padded_sequence(chars_embeds, chars2_length)
lstm_out, _ = self.char_lstm(packed)
outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out)
outputs = outputs.transpose(0, 1)
chars_embeds_temp = Variable(torch.FloatTensor(torch.zeros((outputs.size(0), outputs.size(2)))))
if self.use_gpu:
chars_embeds_temp = chars_embeds_temp.cuda()
for i, index in enumerate(output_lengths):
chars_embeds_temp[i] = torch.cat((outputs[i, index-1, :self.char_lstm_dim], outputs[i, 0, self.char_lstm_dim:]))
chars_embeds = chars_embeds_temp.clone()
for i in range(chars_embeds.size(0)):
chars_embeds[d[i]] = chars_embeds_temp[i]
if self.char_mode == 'CNN':
chars_embeds = self.char_embeds(chars2).unsqueeze(1)
chars_cnn_out3 = self.char_cnn3(chars_embeds)
chars_embeds = nn.functional.max_pool2d(chars_cnn_out3, kernel_size=(chars_cnn_out3.size(2), 1)).view(chars_cnn_out3.size(0), self.out_channels)
# t = self.hw_gate(chars_embeds)
# g = nn.functional.sigmoid(t)
# h = nn.functional.relu(self.hw_trans(chars_embeds))
# chars_embeds = g * h + (1 - g) * chars_embeds
embeds = self.word_embeds(sentence)
if self.n_cap and self.cap_embedding_dim:
cap_embedding = self.cap_embeds(caps)
if self.n_cap and self.cap_embedding_dim:
embeds = torch.cat((embeds, chars_embeds, cap_embedding), 1)
else:
embeds = torch.cat((embeds, chars_embeds), 1)
embeds = embeds.unsqueeze(1)
embeds = self.dropout(embeds)
lstm_out, _ = self.lstm(embeds)
lstm_out = lstm_out.view(len(sentence), self.hidden_dim*2)
lstm_out = self.dropout(lstm_out)
lstm_feats = self.hidden2tag(lstm_out)
return lstm_feats
def _forward_alg(self, feats):
# calculate in log domain
# feats is len(sentence) * tagset_size
# initialize alpha with a Tensor with values all equal to -10000.
init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
forward_var = autograd.Variable(init_alphas)
if self.use_gpu:
forward_var = forward_var.cuda()
for feat in feats:
emit_score = feat.view(-1, 1)
tag_var = forward_var + self.transitions + emit_score
max_tag_var, _ = torch.max(tag_var, dim=1)
tag_var = tag_var - max_tag_var.view(-1, 1)
forward_var = max_tag_var + torch.log(torch.sum(torch.exp(tag_var), dim=1)).view(1, -1) # ).view(1, -1)
terminal_var = (forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]).view(1, -1)
alpha = log_sum_exp(terminal_var)
# Z(x)
return alpha
def viterbi_decode(self, feats):
backpointers = []
# analogous to forward
init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
init_vvars[0][self.tag_to_ix[START_TAG]] = 0
forward_var = Variable(init_vvars)
if self.use_gpu:
forward_var = forward_var.cuda()
for feat in feats:
next_tag_var = forward_var.view(1, -1).expand(self.tagset_size, self.tagset_size) + self.transitions
_, bptrs_t = torch.max(next_tag_var, dim=1)
bptrs_t = bptrs_t.squeeze().data.cpu().numpy()
next_tag_var = next_tag_var.data.cpu().numpy()
viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t]
viterbivars_t = Variable(torch.FloatTensor(viterbivars_t))
if self.use_gpu:
viterbivars_t = viterbivars_t.cuda()
forward_var = viterbivars_t + feat
backpointers.append(bptrs_t)
terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
terminal_var.data[self.tag_to_ix[STOP_TAG]] = -10000.
terminal_var.data[self.tag_to_ix[START_TAG]] = -10000.
best_tag_id = argmax(terminal_var.unsqueeze(0))
path_score = terminal_var[best_tag_id]
best_path = [best_tag_id]
for bptrs_t in reversed(backpointers):
best_tag_id = bptrs_t[best_tag_id]
best_path.append(best_tag_id)
start = best_path.pop()
assert start == self.tag_to_ix[START_TAG]
best_path.reverse()
return path_score, best_path
def neg_log_likelihood(self, sentence, tags, chars2, caps, chars2_length, d):
# sentence, tags is a list of ints
# features is a 2D tensor, len(sentence) * self.tagset_size
feats = self._get_lstm_features(sentence, chars2, caps, chars2_length, d)
if self.use_crf:
forward_score = self._forward_alg(feats)
gold_score = self._score_sentence(feats, tags)
return forward_score - gold_score
else:
tags = Variable(tags)
scores = nn.functional.cross_entropy(feats, tags)
return scores
def forward(self, sentence, chars, caps, chars2_length, d):
feats = self._get_lstm_features(sentence, chars, caps, chars2_length, d)
# viterbi to get tag_seq
if self.use_crf:
score, tag_seq = self.viterbi_decode(feats)
else:
score, tag_seq = torch.max(feats, 1)
tag_seq = list(tag_seq.cpu().data)
return score, tag_seq
Create a model instance:
model_fp32 = BiLSTM_CRF(vocab_size=len(word_to_id),
tag_to_ix=tag_to_id,
embedding_dim=parameters['word_dim'],
hidden_dim=parameters['word_lstm_dim'],
use_gpu=parameters['use_gpu'],
char_to_ix=char_to_id,
pre_word_embeds=word_embeds,
use_crf=parameters['crf'],
char_mode=parameters['char_mode'])
Apply quantization:
model_int8 = torch.quantization.quantize_dynamic(
model_fp32, # the original model
{nn.LSTM,nn.Linear}, # a set of layers to dynamically quantize
dtype=torch.qint8)
Checking the quantization results:
def print_size_of_model(model, label=""):
torch.save(model.state_dict(), "temp.p")
size=os.path.getsize("temp.p")
print("model: ",label,' \t','Size (KB):', size/1e3)
os.remove('temp.p')
return size
Compare the sizes:
f=print_size_of_model(model_fp32,"model_fp32")
q=print_size_of_model(model_int8,"model_int8")
print("{0:.2f} times smaller".format(f/q))
Results
model: model_fp32 Size (KB): 806494.996
model: model_int8 Size (KB): 804532.412
1.00 times smaller
Is there any way to reduce the model size significantly?
Based on the Results section of the question and a vocab_size of approximately 2 million, it seems reasonable to quantize the word_embeds attribute. Quantizing this module alone should result in a significant drop in the memory occupied by the weights. According to the documentation there is no support for dynamic quantization (which is used for nn.Linear and nn.LSTM in the snippet above) of nn.Embedding (the type of word_embeds), but static quantization can handle it. The default qconfig used in some PyTorch examples does not seem to work on nn.Embedding, but there is a hint in an issue discussion on how to quantize nn.Embedding. After training:
from torch.quantization.qconfig import float_qparams_weight_only_qconfig
model_fp32.word_embeds.qconfig = float_qparams_weight_only_qconfig
torch.quantization.prepare(model_fp32, inplace=True)
torch.quantization.convert(model_fp32, inplace=True)
After that, word_embeds in model_fp32 will be quantized to torch.quint8.
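To verify the effect, the print_size_of_model helper defined in the question can be reused (a sketch, assuming f from the earlier comparison is still in scope):
q2 = print_size_of_model(model_fp32, "model with quantized word_embeds")
print("{0:.2f} times smaller".format(f / q2))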
I have been trying to create a simple standard neural network from scratch, but I can't seem to get it to work properly. Sometimes the cost skyrockets; other times it doesn't change at all. I'm not sure what the problem is, but it would be really helpful if someone could help me.
I have all of the information on GitHub. If any more information is needed, kindly reply and I will provide it.
https://github.com/enriquedellxps/Neural-Network
Function for generating batches:
def batcher(data, batch_size):
# get the number of batches
num_batches_norem = data.shape[1] // batch_size
if data.shape[1] % batch_size == 0:
remainder_quantity = 0
else:
remainder_size = data.shape[1] % batch_size
remainder_quantity = 1
num_batches = num_batches_norem + remainder_quantity
changer = 0
for mb in range(num_batches_norem):
current_batch = data[:, changer:changer + batch_size]
changer += batch_size
yield current_batch
for last_mb in range(remainder_quantity):
last_batch = data[:, changer:changer + remainder_size]
yield last_batch
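A minimal usage sketch, assuming examples are laid out along axis 1 as data.shape[1] above implies:
import numpy as np

X = np.random.randn(10, 1050)  # 10 features x 1050 examples
for batch in batcher(X, batch_size=100):
    print(batch.shape)  # ten (10, 100) batches, then one (10, 50) remainder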
Function for g(z):
def activationer(a, z):
# ACTIVATION FUNCTIONS
# Sigmoid Activation Function
def sigmoid(z):
g = scipy.special.expit(z)
return g
# Tanh (Hyperbolic Tangent Function) Activation Function
def tanh(z):
g = (np.exp(z) - np.exp(-1 * z)) / ((np.exp(z) + np.exp(-1 * z)))
return g
# ReLU (Rectified Linear Unit) Activation Function
def ReLU(z):
g = np.maximum(0, z)
return g
# Leaky ReLU (Leaky Rectified Linear Unit) Activation Function
def Leaky_ReLU(z):
g = np.maximum(0.01 * z, z)
return g
def softmax(z):
z_exp = np.exp(z)
g = z_exp / np.sum(z_exp, axis=0, keepdims=True)
return g
if a == "sigmoid":
res = sigmoid(z)
elif a == "tanh":
res = tanh(z)
elif a == "relu":
res = ReLU(z)
elif a == "leaky relu":
res = Leaky_ReLU(z)
elif a == "softmax":
res = softmax(z)
return res
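For example, a quick check of the dispatch (requires numpy and scipy to be imported):
z = np.array([[-1.0], [0.0], [2.0]])
print(activationer("relu", z))     # zeros for negative entries, z elsewhere
print(activationer("sigmoid", z))  # elementwise logistic of z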
NN Class:
class DeepNeuralNetwork:
def __init__(self, n_x, n_h, n_y, nl, activations, alpha):
assert nl == len(activations), f"L: {nl}, Number of Activations: {len(activations)}"
# Assign inputs to the self object
self.n_x = n_x
self.n_h = n_h
self.n_y = n_y
self.nl = nl
self.activations = activations
self.alpha = alpha
# Initialize Parameters
def initialize_parameters(self):
n_x = self.n_x
n_h = self.n_h
n_y = self.n_y
activations = self.activations
parameters = []
for l in range(self.nl):
np.random.seed(8)
if l == 0:
if activations[l] in ("relu", "leaky relu"): # note: == "relu" or "leaky relu" would always be truthy
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(2 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | He
else:
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(1 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | Xavier
elif l == self.nl - 1:
if activations[l] in ("relu", "leaky relu"):
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(2 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | He
else:
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(1 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | Xavier
else:
if activations[l] in ("relu", "leaky relu"):
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(2 / n_h), np.zeros((n_h, 1))]) # hidden params | He
else:
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(1 / n_h), np.zeros((n_h, 1))]) # hidden params | Xavier
return parameters
# Forward Propagation
def forward_propagation(self, parameters, input_data):
batch_size = input_data.shape[1] # Get the amount of examples in the batch
caches = []
self.caches = caches
current_activation = input_data # Set first activation - A0 - as the input
caches.append(current_activation)
for l in range(self.nl):
W, b = parameters[l][0], parameters[l][1] # Get weights and biases for current layer
Z = W @ current_activation + b # Compute the linear activation
current_activation = activationer(self.activations[l], Z) # Compute the full activation
caches.append(current_activation)
return current_activation
# Compute Cost
def compute_cost(self, yhat, y):
batch_size = y.shape[1] # Get the amount of examples in the batch
cost = (1 / batch_size) * np.sum(-1 * (y * np.log(yhat) + ((1 - y) * (np.log(1 - yhat))))) # Compute the cross-entropy cost
cost = np.squeeze(cost) # Turn [[17]] to 17
return cost
# Backward Propagation
def backward_propagation(self, parameters, y):
caches = self.caches
batch_size = y.shape[1]
grads = []
for l in reversed(range(1, self.nl + 1)):
if l == self.nl:
dZ = caches[l] - y
dW = (1 / batch_size) * dZ @ caches[l - 1].T
db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
grads.append([dW, db])
else:
dA = parameters[l][0].T @ dZ
dZ = dA * np.multiply(caches[l], (1 - caches[l]))
dW = (1 / batch_size) * dZ @ caches[l - 1].T
db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
grads.append([dW, db])
return grads
# Update Parameters
def update_parameters(self, parameters, gradients):
for l in range(self.nl):
parameters[l][0] = parameters[l][0] - self.alpha * gradients[self.nl - l - 1][0]
parameters[l][1] = parameters[l][1] - self.alpha * gradients[self.nl - l - 1][1]
return parameters
Running it:
dnn = DeepNeuralNetwork(12288, 20, 1, 4, ["relu", "relu", "relu", "sigmoid"], 0.001)
params = dnn.initialize_parameters()
epochs = 100
for e in range(epochs):
for i, j in zip(train_x_batched, train_y_batched):
yhat = dnn.forward_propagation(params, i)
cost = dnn.compute_cost(yhat, j)
grads = dnn.backward_propagation(params, j)
params = dnn.update_parameters(params, grads)
print(cost) # This usually starts going down, then skyrockets, even if I lower the learning rate to 0.00001
Thanks :)
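When the cost diverges like this, a finite-difference gradient check against backward_propagation is a standard diagnostic; here is a self-contained sketch of the idea (numerical_gradient and its usage are illustrative names, not from the repo):
import numpy as np

def numerical_gradient(f, w, eps=1e-5):
    # central-difference estimate of df/dw for a scalar-valued f()
    grad = np.zeros_like(w)
    it = np.nditer(w, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = w[idx]
        w[idx] = orig + eps
        fp = f()
        w[idx] = orig - eps
        fm = f()
        w[idx] = orig  # restore
        grad[idx] = (fp - fm) / (2 * eps)
        it.iternext()
    return grad

# e.g. compare against the analytic gradient for the first layer's weights:
# num = numerical_gradient(lambda: dnn.compute_cost(dnn.forward_propagation(params, i), j), params[0][0])
# print(np.max(np.abs(num - grads[-1][0])))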
I'm trying to implement a neural network from scratch in order to gain better insight into it, and I ran into a weird problem. When I use the ReLU function as the activation for the hidden layers, the model does not converge, whereas it does converge once the sigmoid function is used. Here is my vanilla code: when you change the first two layers' activation function from relu to sigmoid, you can see that it converges, though it may still have a problem sometimes. Where could the problem be? It's been three days and I still couldn't find it, though I did find some little bugs. Thanks in advance.
Here is the toy dataset I've been using (just paste it into the directory where this code is located).
Dataset
import numpy as np
import pandas as pd
class NeuralNetwork():
def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
# hyperparameters
self.epoch = epoch
self.alpha = alpha
self.algorithm = algorithm
# parameters
self.params = {}
self.layer_no = 1
# logs
self.cost_vals = []
def createLayer(self, size, activation_func, randomness=True):
if randomness == True:
self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
else:
self.params["W" + str(self.layer_no)] = np.zeros(size)
self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
self.params["func" + str(self.layer_no)] = activation_func
self.layer_no += 1
def sigmoid(self, X):
return 1 / (1 + np.exp(-X))
def relu(self, X):
return np.maximum(X, 0) * 0.01
def tanh(self, X):
return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))
def derivative_sigmoid(self, X):
der_x = self.sigmoid(X)
return der_x * (1 - der_x)
def derivative_relu(self, X):
X[X<=0] = 0
X[X>0] = 1
return X
def derivative_tanh(self, X):
tanhx = self.tanh(X)
return 1 - np.power(tanhx, 2)
def activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.sigmoid(Zl)
elif act_func_name == "relu":
return self.relu(Zl)
elif act_func_name == "tanh":
return self.tanh(Zl)
def derivative_activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.derivative_sigmoid(Zl)
elif act_func_name == "relu":
return self.derivative_relu(Zl)
elif act_func_name == "tanh":
return self.derivative_tanh(Zl)
def train(self, X, Y):
m = Y.shape[0] # number of training examples
self.params["A0"] = X
self.params["Z0"] = None
for i in range(self.epoch):
# forward prop
for l in range(1, self.layer_no): # 1,2,3
Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)] # linear function of a layer with vectorization
Al = self.activation_function(Zl, self.params["func" + str(l)]) # activated form of Zl
self.params["Z" + str(l)] = Zl
self.params["A" + str(l)] = Al
# cost function
cost_val = - 1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
cost_val = np.squeeze(cost_val)
if i % 500 == 0:
print(cost_val)
self.cost_vals.append(cost_val)
# backward prop
dAl = - (np.divide(Y, Al) - np.divide(1 - Y, 1 - Al)) # gradiant of last layer of A
for l in reversed(range(1, self.layer_no)): # 3,2,1
# backward prop
dZl = np.multiply(dAl,
self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)])) # gradient of layer l of Z
dAl1 = np.dot(self.params["W" + str(l)].T, dZl) # gradient of previous layer of A
dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T) # gradient of parameters W in layer l
dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True) # gradient of parameters b in layer l
# update parameters
self.params["W" + str(l)] -= self.alpha * dWl
self.params["b" + str(l)] -= self.alpha * dbl
dAl = dAl1 # assign gradient of previous layer of A to the current one so as to use it while back-propagation
def iris_data():
from sklearn.model_selection import train_test_split
datas = pd.read_csv('iris_nn.data').to_numpy()
X = datas[:, 0:4].astype(float)
Y = datas[:, 4:5]
Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
return X_train.T, Y_train.T
X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5,4), "relu")
model.createLayer((7,5), "relu")
model.createLayer((1,7), "sigmoid")
model.train(X,Y)
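One detail worth noting when comparing the activations above: relu returns np.maximum(X, 0) * 0.01 while derivative_relu returns plain 0/1 values (and mutates its argument in place), so the forward function and its derivative do not match. For reference, a standard, consistent pair looks like this:
def relu(X):
    return np.maximum(X, 0)

def derivative_relu(X):
    # returns a new array instead of mutating X in place
    return (X > 0).astype(float)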
I've been programming this neural network composed of 4 layers:
the first one with 2 neurons, the second with 2, the third with 2, and the output one with 1 neuron.
I made this schema to show what I'm trying to reproduce:
Here is the code; you can try to run it (Python 3.7):
import numpy as np
import matplotlib.pyplot as plt
# Computation of the sigmoid
def sigmoid(z):
return 1.0/(1+ np.exp(-z))
# Computation of the sigmoid derivative
def sigmoid_derivative(y):
return y * (1.0 - y)
# Initialisation of the class (input, output, targets, weights, biases)
class NeuralNetwork:
def __init__(self, x, y):
self.input = x
self.weights1 = np.random.rand(self.input.shape[1],2)
self.weights2 = np.random.rand(2,2)
self.weights3 = np.random.rand(2,2)
self.weights4 = np.random.rand(2,1)
self.y = y
self.output = np.zeros(self.y.shape)
self.bias1 = np.random.rand(1,2)
self.bias2 = np.random.rand(1,2)
self.bias3 = np.random.rand(1,2)
self.bias4 = np.random.rand(1,1)
self.learning_rate = 0.005
#simple feed forward
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
self.layer3 = sigmoid(np.dot(self.layer1, self.weights3) + self.bias3)
self.output = sigmoid(np.dot(self.layer2, self.weights4) + self.bias4)
# Backpropagation algorithm
def backprop(self):
# application of the chain rule to find derivative of the loss function with respect to weights4, weights3, weights2, weights1 and the associated bias
delta_4 = 2*(self.y - self.output) * sigmoid_derivative(self.output)
d_weights4 = np.dot(self.layer3.T, delta_4)
d_bias4 = delta_4
d_bias4 = d_bias4.mean(axis=0)
delta_3 = np.dot(delta_4, self.weights4.T) * sigmoid_derivative(self.layer3)
d_weights3 = np.dot(self.layer2.T, delta_3)
d_bias3 = delta_3
d_bias3 = d_bias3.mean(axis=0)
delta_2 = np.dot(delta_3, self.weights3.T) * sigmoid_derivative(self.layer2)
d_weights2 = np.dot(self.layer1.T, delta_2)
d_bias2 = delta_2
d_bias2 = d_bias2.mean(axis=0)
delta_1 = np.dot(delta_2, self.weights2.T) * sigmoid_derivative(self.layer1)
d_weights1 = np.dot(self.input.T, delta_1)
d_bias1 = delta_1
d_bias1 = d_bias1.mean(axis=0)
# update the weights with the derivative (slope) of the loss function
self.weights1 += d_weights1 * self.learning_rate
self.weights2 += d_weights2 * self.learning_rate
self.weights3 += d_weights3 * self.learning_rate
self.weights4 += d_weights4 * self.learning_rate
self.bias1 += d_bias1 * self.learning_rate
self.bias2 += d_bias2 * self.learning_rate
self.bias3 += d_bias3 * self.learning_rate
self.bias4 += d_bias4 * self.learning_rate
def cost(self):
return np.mean((self.output - self.y)**2)
if __name__ == "__main__":
#Number of rows per class
row_per_class = 200
#generate rows
# Creating a data set that is hard to separate
sick_people = (np.random.randn(row_per_class,2))
row_sick = int(row_per_class/8)
healthy_people = 2*(np.random.randn(row_sick,2)) + np.array([0,10])
healthy_people2 = 2*(np.random.randn(row_sick,2)) + np.array([0,-10])
healthy_people3 = 2*(np.random.randn(row_sick,2)) + np.array([10,0])
healthy_people4 = 2*(np.random.randn(row_sick,2)) + np.array([-10,0])
healthy_people5 = 2*(np.random.randn(row_sick,2)) + np.array([10,10])
healthy_people6 = 2*(np.random.randn(row_sick,2)) + np.array([10,-10])
healthy_people7 = 2*(np.random.randn(row_sick,2)) + np.array([-10,10])
healthy_people8 = 2*(np.random.randn(row_sick,2)) + np.array([-10,-10])
features = np.vstack([sick_people, healthy_people2, healthy_people, healthy_people3, healthy_people4, healthy_people5, healthy_people6, healthy_people7, healthy_people8])
targets = (np.concatenate((np.zeros(row_per_class), np.zeros(row_per_class)+1)))
# To visualize the dataset created just above
plt.scatter(features[:,0], features[:,1], c=targets, cmap = plt.cm.Spectral)
plt.show()
targets = targets[np.newaxis].T
# Initializing the neural network
nn = NeuralNetwork(features,targets)
# Test without training to see the initial accuracy
nn.feedforward()
predictions = np.around(nn.output)
print ("Accuracy", np.mean(predictions == nn.y))
#Training part
for i in range(30000):
if i % 1000 == 0:
print (nn.cost())
nn.feedforward()
nn.backprop()
# Re-test the feedforward after training
nn.feedforward()
predictions = np.around(nn.output)
print ("Accuracy", np.mean(predictions == nn.y))
predictions = np.around(np.squeeze(np.asarray(nn.output)))
#Show on graph how well the training went
plt.scatter(features[:,0], features[:,1], c=predictions, cmap = plt.cm.Spectral)
plt.show()
# To get a better view of the result, we project thousands of random points and plot them
row_per_class = 2000
#generate rows
sick_people = (np.random.randn(row_per_class,2))*4
sick_people2 = (np.random.randn(row_per_class,2))*4
healthy_people = (np.random.randn(row_per_class,2))*4
healthy_people2 = (np.random.randn(row_per_class,2))*4
features = np.vstack([sick_people,sick_people2, healthy_people, healthy_people2])
nn.input = features
nn.feedforward()
predictions = np.around(np.squeeze(np.asarray(nn.output)))
plt.scatter(features[:,0], features[:,1], c=predictions, cmap = plt.cm.Spectral)
plt.show()
It looks like I've respected the mathematical concept of backpropagation, but neither the accuracy nor the cost is ever good; they look random.
Here is the tutorial I used to make this code (especially the backpropagation):
https://theclevermachine.wordpress.com/2014/09/06/derivation-error-backpropagation-gradient-descent-for-neural-networks/
Thank you so much for your help!
The matrix connections in your feedforward function are wrong:
#simple feed forward
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
self.layer3 = sigmoid(np.dot(self.layer1, self.weights3) + self.bias3)
self.output = sigmoid(np.dot(self.layer2, self.weights4) + self.bias4)
must be
#simple feed forward
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
self.layer3 = sigmoid(np.dot(self.layer2, self.weights3) + self.bias3)
self.output = sigmoid(np.dot(self.layer3, self.weights4) + self.bias4)
I tried your code this way and it seems to work for me.
Here is how the prediction looks:
By the way, not that it makes a big difference, but theoretically you should use a binary cross-entropy cost function rather than MSE, because your problem here is logistic regression. MSE may make the optimization non-convex where it would otherwise be convex.
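A minimal NumPy sketch of such a cost (the clipping is only there to keep log() finite; the function name is illustrative):
def binary_cross_entropy(y, output, eps=1e-12):
    output = np.clip(output, eps, 1 - eps)  # avoid log(0)
    return -np.mean(y * np.log(output) + (1 - y) * np.log(1 - output))

# drop-in replacement for nn.cost(): binary_cross_entropy(nn.y, nn.output)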
This is a simple MLP I am writing for binary image classification, with backpropagation:
class MLP:
def __init__(self, size, epochs = 1000, learning_rate = 1):
self.l1weights = numpy.random.random((size + 1, 3))
self.l2weights = numpy.random.random(3)
self.epochs = epochs
self.learning_rate = learning_rate
def predict(self, _input_):
#Append bias at the beginning of input
l1output = self.sigmoid(numpy.dot(numpy.append([1], _input_), self.l1weights))
l2output = self.sigmoid(numpy.dot(l1output, self.l2weights))
return l1output, l2output
def train(self, training_set, training_goal):
for epoch in range(self.epochs):
l1squared_error = 0
l2squarederror = 0
for set_index in range(training_goal.shape[0]):
set = training_set[set_index]
l1output, l2output = self.predict(set)
l2error = training_goal[set_index] - l2output
l1error = l2error * self.dsigmoid(l2output) * self.l2weights
self.l1weights[0] = self.l1weights[0] + self.learning_rate * l1error
for index in range(len(self.l1weights) - 1):
self.l1weights[index + 1] += self.learning_rate * l1error * self.dsigmoid(l1output)
for index in range(len(self.l2weights)):
self.l2weights[index] += self.learning_rate * l2error * self.dsigmoid(l2output)
l1squared_error += sum(l1error ** 2)
l2squarederror += l2error ** 2
print("Squared error at epoch " + str(epoch) + " : " + str(l1squared_error) + ", " + str(l2squarederror))
def sigmoid(self, _input_):
# Sigmoid activation function
return 1 / (1 + numpy.exp(-_input_))
def dsigmoid(self, _input_):
return _input_ * (1 - _input_)
When run, sometimes all outputs converge to 1, but in the relatively more successful runs the predictions for class 0 converge to 0.5 while the predictions for class 1 stay near 0.75, with the layer-2 error staying the same after ~1000 epochs. This is from testing 2x2 image classification with the code below:
def image_class(input):
return 1 if input >= 2 else 0
training_set = ((numpy.arange(2**4)[:,None] & (1 << numpy.arange(4))) != 0)
training_goals = numpy.array([image_class(sum(i)) for i in training_set])
mlp = MLP(size=4)
mlp.train(training_set, training_goals)
I could solve this by adding a layer right after the output layer with step activation instead of sigmoid and training it separately from the initial network, at least for the 2x2 recognition.
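A sketch of one reading of that workaround, where the extra layer is simply a hard threshold on the trained network's output (the threshold value is an assumption, and the separate training mentioned above is omitted):
def step(x, threshold=0.5):
    # hard 0/1 readout in place of the sigmoid output
    return (numpy.asarray(x) >= threshold).astype(int)

_, l2output = mlp.predict(training_set[5])
print(step(l2output))  # 0 or 1 instead of a value near 0.5 / 0.75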