I am trying to create a Convolutional Neural Network to classify what language a certain "word" is from. There are two files ("english_words.txt" and "spanish_words.txt") which each contain about 60,000 words each. I have converted each word into a 29-dimensional vector where each element is a number between 0 and 1. I am training the model for 500 epochs with the optimizer "adam". However, when I train the model, the loss tends to hover around 0.7 and the accuracy around 0.5, and no matter how long I train it for, these metrics will not improve. Here is the code:
import keras
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
import re
train_labels = []
train_data = []
with open("english_words.txt") as words:
full_words = words.read()
full_words = full_words.split("\n")
# all of the labels are just 1.
# we now need to encode them into 29 dimensional vectors.
vector = []
i = 0
for word in full_words:
train_labels.append([1,0])
for letter in word:
vector.append((ord(letter) - 96) * (1.0 / 26.0))
i += 1
if (i < 29):
for x in range(0, 29 - i):
vector.append(0)
train_data.append(vector)
vector = []
i = 0
with open("spanish_words.txt") as words:
full_words = words.read()
full_words = full_words.replace(' ', '')
full_words = full_words.replace('\n', ',')
full_words = full_words.split(",")
vector = []
for word in full_words:
train_labels.append([0,1])
for letter in word:
vector.append((ord(letter) - 96) * (1.0 / 26.0))
i += 1
if (i < 29):
for x in range(0, 29 - i):
vector.append(0)
train_data.append(vector)
vector = []
i = 0
def shuffle_in_unison(a, b):
assert len(a) == len(b)
shuffled_a = np.empty(a.shape, dtype=a.dtype)
shuffled_b = np.empty(b.shape, dtype=b.dtype)
permutation = np.random.permutation(len(a))
for old_index, new_index in enumerate(permutation):
shuffled_a[new_index] = a[old_index]
shuffled_b[new_index] = b[old_index]
return shuffled_a, shuffled_b
train_data = np.asarray(train_data, dtype=np.float32)
train_labels = np.asarray(train_labels, dtype=np.float32)
train_data, train_labels = shuffle_in_unison(train_data, train_labels)
print(train_data.shape, train_labels.shape)
model = Sequential()
model.add(Dense(29, input_shape=(29,)))
model.add(Dense(60))
model.add(Dense(40))
model.add(Dense(25))
model.add(Dense(2))
model.compile(optimizer="adam",
loss="categorical_crossentropy",
metrics=["accuracy"])
model.summary()
model.fit(train_data, train_labels, epochs=500, batch_size=128)
model.save("language_predictor.model")
For some extra info, I am running python 3.x with tensorflow 1.15 and keras 1.15 on windows x64.
I can see several potential problems with your code.
You added several Dense layers one after another, but you really need to also include a non-linear activation function with the parameter activation= .... In the absence of any non-linear activation functions, all those fully-connected Dense layers will mathematically collapse into one single linear Dense layer incapable of learning a non-linear decision boundary.
In general, if you see your loss and accuracy not making any improvement or even getting worse, then the first thing to try is to reduce your learning rate.
You don't need to necessarily implement your own shuffling function. The Keras fit() function can do it if you use the shuffle=True parameter.
In addition to the points mentioned by stackoverflowuser2010:
I find this a very good read and highly suggest checking the mentioned points: 37 Reasons why your Neural Network is not working
Center your input data: Compute a component-wise mean vector and subtract it from every input.
Related
I am trying to train a word embedding to a list of repeated sentences where only the subject changes. I expected that the generated vectors corresponding the subjects provide a strong correlation after training as it is expected from a word embedding. However, the angle between the vectors of subjects is not always larger than the angle between subjects and a random word.
Man is going to write a very long novel that no one can read.
Woman is going to write a very long novel that no one can read.
Boy is going to write a very long novel that no one can read.
The code is based on pytorch tutorial:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
class EmbedTrainer(nn.Module):
def __init__(self, d_vocab, d_embed, d_context):
super(EmbedTrainer, self).__init__()
self.embed = nn.Embedding(d_vocab, d_embed)
self.fc_1 = nn.Linear(d_embed * d_context, 128)
self.fc_2 = nn.Linear(128, d_vocab)
def forward(self, x):
x = self.embed(x).view((1, -1)) # flatten after embedding
x = self.fc_2(F.relu(self.fc_1(x)))
x = F.log_softmax(x, dim=1)
return x
text = " ".join(["{} is going to write a very long novel that no one can read.".format(x) for x in ["Man", "Woman", "Boy"]])
text_split = text.split()
trigrams = [([text_split[i], text_split[i+1]], text_split[i+2]) for i in range(len(text_split)-2)]
dic = list(set(text.split()))
tok_to_ids = {w:i for i, w in enumerate(dic)}
tokens_text = text.split(" ")
d_vocab, d_embed, d_context = len(dic), 10, 2
""" Train """
loss_func = nn.NLLLoss()
model = EmbedTrainer(d_vocab, d_embed, d_context)
print(model)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
losses = []
epochs = 10
for epoch in range(epochs):
total_loss = 0
for input, target in trigrams:
tok_ids = torch.tensor([tok_to_ids[tok] for tok in input], dtype=torch.long)
target_id = torch.tensor([tok_to_ids[target]], dtype=torch.long)
model.zero_grad()
log_prob = model(tok_ids)
#if total_loss == 0: print("train ", log_prob, target_id)
loss = loss_func(log_prob, target_id)
total_loss += loss.item()
loss.backward()
optimizer.step()
print(total_loss)
losses.append(total_loss)
embed_map = {}
for word in ["Man", "Woman", "Boy", "novel"]:
embed_map[word] = model.embed.weight[tok_to_ids[word]]
print(word, embed_map[word])
def angle(a, b):
from numpy.linalg import norm
a, b = a.detach().numpy(), b.detach().numpy()
return np.dot(a, b) / norm(a) / norm(b)
print("man.woman", angle(embed_map["Man"], embed_map["Woman"]))
print("man.novel", angle(embed_map["Man"], embed_map["novel"]))
I expected that the generated vectors corresponding the subjects provide a strong correlation after training as it is expected from a word embedding
I don't really think you'll achieve that kind of result with only 3 sentences and like 40 iterations in 10 epochs (plus most of the data in your 40 iterations is repeated).
maybe try downloading a couple of free datasets out there, or try your own data with a proven model like a genism model.
I'll give you the code for training a gensim model, so you can test your dataset on another model and see if the problem comes from your data or from your model.
I've tested similar gensim models on datasets with millions of sentences and it worked like a charm, for smaller datasets you might want to change the parameters.
from gensim.models import Word2Vec
from multiprocessing import cpu_count
corpus_path = 'eachLineASentence.txt'
vecSize = 300
winSize = 5
numWorkers = cpu_count()-1
epochs = 20
minCount = 5
skipGram = False
modelName = f'mymodel.model'
model = Word2Vec(corpus_file=corpus_path,
size=vecSize,
window=winSize,
min_count=minCount,
workers=numWorkers,
iter=epochs,
sg=skipGram)
model.save(modelName)
P.S. I don't think it's a good idea to use the keyword input as a variable in your code.
It's most probably the training size. Training a 128d embedding is definitely overkill. Rule of thumb from the the google developers blog:
Why is the embedding vector size 3 in our example? Well, the following "formula" provides a general rule of thumb about the number of embedding dimensions:
embedding_dimensions = number_of_categories**0.25
That is, the embedding vector dimension should be the 4th root of the number of categories. Since our vocabulary size in this example is 81, the recommended number of dimensions is 3:
3 = 81**0.25
I know there are several questions about this here, but I haven't found one which fits exactly my problem.
I'm trying to fit an LSTM with data from Pandas DataFrames but getting confused about the format I have to provide them.
I created a small code snipped which shall show you what I try to do:
import pandas as pd, tensorflow as tf, random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
targets = pd.DataFrame(index=pd.date_range(start='2019-01-01', periods=300, freq='D'))
targets['A'] = [random.random() for _ in range(len(targets))]
targets['B'] = [random.random() for _ in range(len(targets))]
features = pd.DataFrame(index=targets.index)
for i in range(len(features)) :
features[str(i)] = [random.random() for _ in range(len(features))]
model = Sequential()
model.add(LSTM(units=targets.shape[1], input_shape=features.shape))
model.compile(optimizer='adam', loss='mae')
model.fit(features, targets, batch_size=10, epochs=10)
this results to:
ValueError: Input 0 of layer sequential is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [10, 300]
which I expect relates to the dimensions of the features DataFrame provided. I guess that once fixed this the next error would mention the targets DataFrame.
As far as I understand, 'units' parameter of my first layer defines the output dimensionality of this model. The inputs have to have a 3D shape, but I don't know how to create them out of the 2D world of the Data Frames.
I hope you can help me understanding the reshape mechanism in Python and how to use them in combination with Pandas DataFrames. (I'm quite new to Python and came from R)
Thankls in advance
Lets looks at the few popular ways in LSTMs are used.
Many to Many
Example: You have a sentence (composed of words in sequence). Give these sequence of words you would like to predict the Parts of speech (POS) of each word.
So you have n words and you feed each word per timestep to the LSTM. Each LSTM timestep (also called LSTM unwrapping) will produce and output. The word is represented by a a set of features normally word embeddings. So the input to LSTM is of size bath_size X time_steps X features
Keras code:
inputs = keras.Input(shape=(10,3))
lstm = keras.layers.LSTM(8, input_shape = (10, 3), return_sequences = True)(inputs)
outputs = keras.layers.TimeDistributed(keras.layers.Dense(5, activation='softmax'))(lstm)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
X = np.random.randn(4,10,3)
y = np.random.randint(0,2, size=(4,10,5))
model.fit(X, y, epochs=2)
print (model.predict(X).shape)
Many to One
Example: Again you have a sentence (composed of words in sequence). Give these sequence of words you would like to predict sentiment of the sentence if it is positive or negative.
Keras code
inputs = keras.Input(shape=(10,3))
lstm = keras.layers.LSTM(8, input_shape = (10, 3), return_sequences = False)(inputs)
outputs =keras.layers.Dense(5, activation='softmax')(lstm)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
X = np.random.randn(4,10,3)
y = np.random.randint(0,2, size=(4,5))
model.fit(X, y, epochs=2)
print (model.predict(X).shape)
Many to multi-headed
Example: You have a sentence (composed of words in sequence). Give these sequence of words you would like to predict sentiment of the sentence as well the author of the sentence.
This is multi-headed model where one head will predict the sentiment and another head will predict the author. Both the heads share the same LSTM backbone.
Keras code
inputs = keras.Input(shape=(10,3))
lstm = keras.layers.LSTM(8, input_shape = (10, 3), return_sequences = False)(inputs)
output_A = keras.layers.Dense(5, activation='softmax')(lstm)
output_B = keras.layers.Dense(5, activation='softmax')(lstm)
model = keras.Model(inputs=inputs, outputs=[output_A, output_B])
model.compile(loss='categorical_crossentropy', optimizer='adam')
X = np.random.randn(4,10,3)
y_A = np.random.randint(0,2, size=(4,5))
y_B = np.random.randint(0,2, size=(4,5))
model.fit(X, [y_A, y_B], epochs=2)
y_hat_A, y_hat_B = model.predict(X)
print (y_hat_A.shape, y_hat_B.shape)
What you are looking for is Many to Multi head model where your predictions for A will be made by one head and another head will make predictions for B
The input data for the LSTM has to be 3D.
If you print the shapes of your DataFrames you get:
targets : (300, 2)
features : (300, 300)
The input data has to be reshaped into (samples, time steps, features). This means that targets and features must have the same shape.
You need to set a number of time steps for your problem, in other words, how many samples will be used to make a prediction.
For example, if you have 300 days and 2 features the time step can be 3. So that three days will be used to make one prediction (you can choose this arbitrarily). Here is the code for reshaping your data (with a few more changes):
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
data = pd.DataFrame(index=pd.date_range(start='2019-01-01', periods=300, freq='D'))
data['A'] = [random.random() for _ in range(len(data))]
data['B'] = [random.random() for _ in range(len(data))]
# Choose the time_step size.
time_steps = 3
# Use numpy for the 3D array as it is easier to handle.
data = np.array(data)
def make_x_y(ts, data):
"""
Parameters
ts : int
data : numpy array
This function creates two arrays, x and y.
x is the input data and y is the target data.
"""
x, y = [], []
offset = 0
for i in data:
if offset < len(data)-ts:
x.append(data[offset:ts+offset])
y.append(data[ts+offset])
offset += 1
return np.array(x), np.array(y)
x, y = make_x_y(time_steps, data)
print(x.shape, y.shape)
nodes = 100 # This is the width of the network.
out_size = 2 # Number of outputs produced by the network. Same size as features.
model = Sequential()
model.add(LSTM(units=nodes, input_shape=(x.shape[1], x.shape[2])))
model.add(Dense(out_size)) # For the output a Dense (fully connected) layer is used.
model.compile(optimizer='adam', loss='mae')
model.fit(x, y, batch_size=10, epochs=10)
Well, just to finalize this issue I would like to provide one solution I have meanwhile worked on. The class TimeseriesGenerator in tf.keras.... enabled me quite easy to provide the data in the right shape to an LSTM model
from keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np
window_size = 7
batch_size = 8
sampling_rate = 1
train_gen = TimeseriesGenerator(X_train.values, y_train.values,
length=window_size, sampling_rate=sampling_rate,
batch_size=batch_size)
valid_gen = TimeseriesGenerator(X_valid.values, y_valid.values,
length=window_size, sampling_rate=sampling_rate,
batch_size=batch_size)
test_gen = TimeseriesGenerator(X_test.values, y_test.values,
length=window_size, sampling_rate=sampling_rate,
batch_size=batch_size)
There are many other ways on implementing generators e.g. using the more_itertools which provides the function windowed, or making use of tensorflow.Dataset and its function window.
For me the TimeseriesGenerator was sufficient to feed the tests I did.
In case you would like to see an example modeling the DAX based on some stocks I'm sharing a notebook on Github.
Im trying to implement a text-classifier using triplet loss to classify different job descriptions into categories based on this paper. But whatever i do, the classifier yields very bad results.
For the embedding i followed this tutorial and the NN architecture is based on this article.
I create my encodings using:
max_char_len = 20
group_numbers = range(0, len(job_groups))
char_vocabulary = {'PAD':0}
X_char = []
y_temp = []
i = 1
for group, number in zip(job_groups, group_numbers):
for job in group:
job_cleaned = some_cleaning_function(job)
job_enc = []
for c in job_cleaned:
if c in char_vocabulary.keys():
job_enc.append(char_vocabulary[c])
else:
char_vocabulary[c] = i
job_enc.append(char_vocabulary[c])
i+=1
X_char.append(job_enc)
y_temp.append(number)
X_char = pad_sequences(X_char, maxlen = max_char_length, truncating='post')
My Neural Network is set up the following way:
def create_base_model():
char_in = Input(shape=(max_char_length,), name='Char_Input')
char_enc = Embedding(input_dim=len(char_vocabulary)+1, output_dim=20, mask_zero=True,name='Char_Embedding')(char_in)
x = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.2, dropout=0.4))(char_enc)
x = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.2, dropout=0.4))(x)
x = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.2, dropout=0.4))(x)
x = Bidirectional(LSTM(64, return_sequences=False, recurrent_dropout=0.2, dropout=0.4))(x)
out = Dense(128, activation = "softmax")(x)
return Model(char_in, out)
def get_siamese_triplet_char():
anchor_input_c = Input(shape=(max_char_length,),name='Char_Input_Anchor')
pos_input_c = Input(shape=(max_char_length,),name='Char_Input_Positive')
neg_input_c = Input(shape=(max_char_length,),name='Char_Input_Negative')
base_model = create_base_model(encoding_generator)
encoded_anchor = base_model(anchor_input_c)
encoded_positive = base_model(pos_input_c)
encoded_negative = base_model(neg_input_c)
inputs = [anchor_input_c, pos_input_c, neg_input_c]
outputs = [encoded_anchor, encoded_positive, encoded_negative]
siamese_triplet = Model(inputs, outputs)
siamese_triplet.add_loss((triplet_loss(outputs)))
siamese_triplet.compile(loss=None, optimizer='adam')
return siamese_triplet, base_model
The triplet loss is defined as follows:
def triplet_loss(inputs):
anchor, positive, negative = inputs
positive_distance = K.square(anchor - positive)
negative_distance = K.square(anchor - negative)
positive_distance = K.sqrt(K.sum(positive_distance, axis=-1, keepdims = True))
negative_distance = K.sqrt(K.sum(negative_distance, axis=-1, keepdims = True))
loss = positive_distance - negative_distance
loss = K.maximum(0.0, 1 + loss)
return K.mean(loss)
The model is then trained with:
siamese_triplet_char.fit(x=
[Anchor_chars_train,
Positive_chars_train,
Negative_chars_train],
shuffle=True, batch_size=8, epochs=22, verbose=1)
My goal is to: First, train the network with no label data in order to minimize the space of the different phrases and second, add a classification layer and create the final classifier.
My general problem is that even the first phase shows sinking cost-values it overfits and the validation results jump around and the second phase fails badly as I'm not able to train the model to actually classify.
My questions are the following:
Could someone explain the Embedding Architecture? What is the output dimension refering to? The individual characters? Would that even make sense? Or is there a better way to encode the input data?
How can i add validation_data to a network that does not contain labeled data? I could use validation_split, but i would rather prefer passing specific data to validate as my data is stratified.
Is there a reason why the classification does not work? Applying a simple K-Nearest Neighbor algorithm achieves at best 0.5 accuracy! Is it because of the data? Or is there a systematic error in my system?
All ideas and suggestions are really appreciated!
Using examples from Lipton et al (2016), target replication is basically calculating the loss at each time step (except final) of the LSTM (or GRU) and averaging this loss and adding it to the main loss while training. Mathematically, it is given by -
Graphically, it can be represented as -
So how do I go about exactly implementing this in Keras? Say, I have binary classification task. Let's say my model is a simple one given below -
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='binary_crossentropy', class_weights={0:0.5, 1:4}, optimizer=Adam(), metrics=['accuracy'])
model.fit(x_train, y_train)
I think y_train needs to be reshaped/tiled from (batch_size, 1) to (batch_size, time_step) right?
The dense layer needs TimeDistributed to be applied correctly to the LSTM after setting return_sequences=True?
How do I exactly implement the exact loss function given above? Will class_weights need to be modified?
Target replication is only during training. How to implement validation set evaluation using only the main loss?
How should I deal with zero paddings in target replication? My sequences are padded to a max_len of 15 with average length being 7. Since the target replication loss averages over all the steps, how do I make sure it doesn't use the padded words in calculating the loss? Basically, dynamically assign T the actual sequence length.
Question 1:
So, for the targets, you need it shaped as (batch_size, time_steps, 1). Just use:
y_train = np.stack([y_train]*time_steps, axis=1)
Question 2:
You're correct, but TimeDistributed is optional in Keras 2.
Question 3:
I don't know how class weights will behave, but a regular loss function should go like:
from keras.losses import binary_crossentropy
def target_replication_loss(alpha):
def inner_loss(true,pred):
losses = binary_crossentropy(true,pred)
return (alpha*K.mean(losses[:,:-1], axis=-1)) + ((1-alpha)*losses[:,-1])
return inner_loss
model.compile(......, loss = target_replication_loss(alpha), ...)
Question 3a:
Since the above doens't work well with class weights, I created an alternative where the weights go into the loss:
def target_replication_loss(alpha, class_weights):
def get_weights(x):
b = class_weights[0]
a = class_weights[1] - b
return (a*x) + b
def inner_loss(true,pred):
#this will only work for classification with only one class 0 or 1
#and only if the target is the same for all classes
true_classes = true[:,-1,0]
weights = get_weights(true_classes)
losses = binary_crossentropy(true,pred)
return weights*((alpha*K.mean(losses[:,:-1], axis=-1)) + ((1-alpha)*losses[:,-1]))
return inner_loss
Question 4:
To avoid complexity, I'd say you should use an additional metric in validation:
def last_step_BC(true,pred):
return binary_crossentropy(true[:,-1], pred[:,-1])
model.compile(....,
loss = target_replication_loss(alpha),
metrics=[last_step_BC])
Question 5:
This is a hard one and I'd need to research a little....
As an initial workaround, you can set the model with an input shape of (None, features), and train each sequence individually.
Working example without class_weight
def target_replication_loss(alpha):
def inner_loss(true,pred):
losses = binary_crossentropy(true,pred)
#print(K.int_shape(losses))
#print(K.int_shape(losses[:,:-1]))
#print(K.int_shape(K.mean(losses[:,:-1], axis=-1)))
#print(K.int_shape(losses[:,-1]))
return (alpha*K.mean(losses[:,:-1], axis=-1)) + ((1-alpha)*losses[:,-1])
return inner_loss
alpha = 0.6
i1 = Input((5,2))
i2 = Input((5,2))
out = LSTM(1, activation='sigmoid', return_sequences=True)(i1)
model = Model(i1, out)
model.compile(optimizer='adam', loss = target_replication_loss(alpha))
model.fit(np.arange(30).reshape((3,5,2)), np.arange(15).reshape((3,5,1)), epochs = 200)
Working example with class weights:
def target_replication_loss(alpha, class_weights):
def get_weights(x):
b = class_weights[0]
a = class_weights[1] - b
return (a*x) + b
def inner_loss(true,pred):
#this will only work for classification with only one class 0 or 1
#and only if the target is the same for all classes
true_classes = true[:,-1,0]
weights = get_weights(true_classes)
losses = binary_crossentropy(true,pred)
print(K.int_shape(losses))
print(K.int_shape(losses[:,:-1]))
print(K.int_shape(K.mean(losses[:,:-1], axis=-1)))
print(K.int_shape(losses[:,-1]))
print(K.int_shape(weights))
return weights*((alpha*K.mean(losses[:,:-1], axis=-1)) + ((1-alpha)*losses[:,-1]))
return inner_loss
alpha = 0.6
class_weights={0: 0.5, 1:4.}
i1 = Input(batch_shape=(3,5,2))
i2 = Input((5,2))
out = LSTM(1, activation='sigmoid', return_sequences=True)(i1)
model = Model(i1, out)
model.compile(optimizer='adam', loss = target_replication_loss(alpha, class_weights))
model.fit(np.arange(30).reshape((3,5,2)), np.arange(15).reshape((3,5,1)), epochs = 200)
I want train a sort network based on rnn using the keras library. But when I train the net in the CPU, the CPU memory usage is increasing all the time. And the cpu will break down soon.
I use the gc module of python to observe the size change of all objects, but sizes of all objects seem to be same all the time. And I found when I set a bigger size, the speed of increasing of memory usage is faster.
The network is as followed
code
#!/usr/bin/env python
from collections import defaultdict
from gc import get_objects
from keras.models import Sequential
from keras.layers.core import Activation, RepeatVector, TimeDistributedDense, Dropout, Dense
from keras.layers import recurrent
import numpy as np
from keras import backend as K
import sys
#from data import batch_gen, encode
RNN = recurrent.LSTM
# Neural networks take input as vectors so we have to convert integers to
vectors using one-hot encoding
# This function will encode a given integer sequence into RNN compatible format (one-hot representation)
def encode(X,seq_len, vocab_size):
x = np.zeros((len(X),seq_len, vocab_size), dtype=np.float32)
for ind,batch in enumerate(X):
for j, elem in enumerate(batch):
x[ind, j, elem] = 1
return x
# This is a generator function which can generate infinite-stream of inputs for training
def batch_gen(batch_size=32, seq_len=6, max_no=10):
# Randomly generate a batch of integer sequences (X) and its sorted
# counterpart (Y)
x = np.zeros((batch_size, seq_len, max_no), dtype=np.float32)
y = np.zeros((batch_size, seq_len, max_no), dtype=np.float32)
while True:
# Generates a batch of input
X = np.random.randint(max_no, size=(batch_size, seq_len))
Y = np.sort(X, axis=1)
for ind,batch in enumerate(X):
for j, elem in enumerate(batch):
x[ind, j, elem] = 1
for ind,batch in enumerate(Y):
for j, elem in enumerate(batch):
y[ind, j, elem] = 1
yield x, y
x.fill(0.0)
y.fill(0.0)
# global parameters.
batch_size=32
seq_len = 6
max_no = 10
# Initializing model
model = Sequential()
# This is encoder RNN (we are taking a variant of RNN called LSTM, because plain RNN's suffer from long-term dependencies issues
model.add(RNN(max_no))#, input_shape=(seq_len, max_no)))
# Dropout to enhace RNN's generalization capacities
model.add(Dropout(0.25))
# At this point RNN will generate a summary vector of the sequence, so to feed it to decoder we need to repeat it lenght of output seq. number of times
model.add(RepeatVector(seq_len))
# Decoder RNN, which will return output sequence
model.add(RNN(128, return_sequences=True))
# Adding linear layer at each time step
model.add(TimeDistributedDense(128,max_no))
# Adding non-linearity on top of linear layer at each time-step, since output at each time step is supposed to be probability distribution over max. no of integer in sequence
# we add softmax non-linearity
model.add(Dropout(0.5))
model.add(Activation('softmax'))
# Since this is a multiclass classification task, crossentropy loss is being used. Optimizer is adam, which is a particular instance of adaptive learning rate Gradient Descent methods
model.compile(loss='categorical_crossentropy', optimizer='adam')#,
#metrics=['accuracy'])
# Now the training loop, we'll sample input batches from the generator
function written previously and feed it to the RNN for learning
#using get_object to observe the object
before = defaultdict(int)
after = defaultdict(int)
for i in get_objects():
before[type(i)] += 1
for ind,(X,Y) in enumerate(batch_gen(batch_size, seq_len, max_no)):
for i in get_objects():
after[type(i)] += 1
print(after)
print([(k, after[k] - before[k]) for k in after if after[k] - before[k]])
before = after
loss, acc = model.train_on_batch(X, Y, accuracy=True)
if ind % 100 == 99:
print(ind, 'loss=',loss, 'acc=', acc)