Skip-gram word2vec loss doesn't decrease - python

I'm working on implementaion of word2vec architecture from scratch. But my model doesn't converge.
class SkipGramBatcher:
def __init__(self, text):
self.text = text.results
def get_batches(self, batch_size):
n_batches = len(self.text)//batch_size
pairs = []
for idx in range(0, len(self.text)):
window_size = 5
idx_neighbors = self._get_neighbors(self.text, idx, window_size)
#one_hot_idx = self._to_one_hot(idx)
#idx_pairs = [(one_hot_idx, self._to_one_hot(idx_neighbor)) for idx_neighbor in idx_neighbors]
idx_pairs = [(idx,idx_neighbor) for idx_neighbor in idx_neighbors]
pairs.extend(idx_pairs)
for idx in range(0, len(pairs), batch_size):
X = [pair[0] for pair in pairs[idx:idx+batch_size]]
Y = [pair[1] for pair in pairs[idx:idx+batch_size]]
yield X,Y
def _get_neighbors(self, text, idx, window_size):
text_length = len(text)
start = max(idx-window_size,0)
end = min(idx+window_size+1,text_length)
neighbors_words = set(text[start:end])
return list(neighbors_words)
def _to_one_hot(self, indexes):
n_values = np.max(indexes) + 1
return np.eye(n_values)[indexes]
I use text8 corpus and have applied preprocessing techniques such as stemming, lemmatization and subsampling. Also I've excluded English stop words and limited vocabulary
vocab_size = 20000
text_len = len(text)
test_text_len = int(text_len*0.15)
preprocessed_text = PreprocessedText(text,vocab_size)
I use tensorflow for graph computation
train_graph = tf.Graph()
with train_graph.as_default():
inputs = tf.placeholder(tf.int32, [None], name='inputs')
labels = tf.placeholder(tf.int32, [None, None], name='labels')
n_embedding = 300
with train_graph.as_default():
embedding = tf.Variable(tf.random_uniform((vocab_size, n_embedding), -1, 1))
embed = tf.nn.embedding_lookup(embedding, inputs)
And apply negative sampling
# Number of negative labels to sample
n_sampled = 100
with train_graph.as_default():
softmax_w = tf.Variable(tf.truncated_normal((vocab_size, n_embedding))) # create softmax weight matrix here
softmax_b = tf.Variable(tf.zeros(vocab_size), name="softmax_bias") # create softmax biases here
# Calculate the loss using negative sampling
loss = tf.nn.sampled_softmax_loss(
weights=softmax_w,
biases=softmax_b,
labels=labels,
inputs=embed,
num_sampled=n_sampled,
num_classes=vocab_size)
cost = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer().minimize(cost)
Finally I train my model
epochs = 10
batch_size = 64
avg_loss = []
with train_graph.as_default():
saver = tf.train.Saver()
with tf.Session(graph=train_graph) as sess:
iteration = 1
loss = 0
sess.run(tf.global_variables_initializer())
for e in range(1, epochs+1):
batches = skip_gram_batcher.get_batches(batch_size)
start = time.time()
for batch_x,batch_y in batches:
feed = {inputs: batch_x,
labels: np.array(batch_y)[:, None]}
train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
loss += train_loss
if iteration % 100 == 0:
end = time.time()
print("Epoch {}/{}".format(e, epochs),
"Iteration: {}".format(iteration),
"Avg. Batch loss: {:.4f}".format(loss/iteration),
"{:.4f} sec/batch".format((end-start)/100))
#loss = 0
avg_loss.append(loss/iteration)
start = time.time()
iteration += 1
save_path = saver.save(sess, "checkpoints/text8.ckpt")
But after running this model my average batch loss doesn't decrease dramatically
I guess I should have made a mistake somewhere. Any help is apprciated

What makes you say "my average batch loss doesn't decrease dramatically"? The graph you've attached shows some (unlabeled) value decreasing significantly, and still decreasing at a strong slope towards the end of data.
"Convergence" would show up as the improvement-in-loss first slowing, then stopping.
But if your loss is still noticeably dropping, just keep training! Using more epochs can be especially important on small datasets – like the tiny text8 you're using.

Related

How to add BiLSTM on top of BERT from Huggingface + CUDA out of memory. Tried to allocate 16.00 MiB

I have the below code for a binary classification and it works fine but i would like to modify the nn.Sequential parameters and add an BiLSTM layer. I have the below code:
class BertClassifier(nn.Module):
def __init__(self, freeze_bert=False):
super(BertClassifier, self).__init__()
# Specify hidden size of BERT, hidden size of our classifier, and number of labels
D_in, H, D_out = 768, 50, 2
# Instantiate BERT model
self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
# Instantiate an one-layer feed-forward classifier
self.classifier = nn.Sequential(nn.Linear(D_in, H),nn.ReLU(),nn.Linear(H, D_out))
# Freeze the BERT model
if freeze_bert:
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask):
# Feed input to BERT
outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
# Extract the last hidden state of the token `[CLS]` for classification task
last_hidden_state_cls = outputs[0][:, 0, :]
# Feed input to classifier to compute logits
logits = self.classifier(last_hidden_state_cls)
return logits
I have tried to modify the sequential like this self.classifier = nn.Sequential(nn.LSTM(D_in, H, batch_first=True, bidirectional=True),nn.ReLU(),nn.Linear(H, D_out)) but then it throws the error RuntimeError: input must have 3 dimensions, got 2 on line logits = self.classifier(last_hidden_state_cls). I found that I can use nn.ModuleDict instead of nn.Sequential and i made the below :
self.classifier = nn.ModuleDict({
'lstm': nn.LSTM(input_size=D_in, hidden_size=H,batch_first=True, bidirectional=True ),
'linear': nn.Linear(in_features=H,out_features=D_out)})
But now I'm having issues computing the forward function with this. Can someone advice how i can properly modify the forward function?
Update: I also installed CUDA and now when I run the code it returns the error CUDA out of memory. Tried to allocate 16.00 MiB and I tried to lower the batch size but that doesn't fix the problem. I also tried the below but didn't resolved either. Any advice, please?
import torch, gc
gc.collect()
torch.cuda.empty_cache()
Update with the code:
MAX_LEN = 64
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32
VALID_BATCH_SIZE = 4
file1 = open('MH.txt', 'r')
list_com = []
list_label = []
for line in file1:
possible_labels = 'positive|negative'
label = re.findall(possible_labels, line)
line = re.sub(possible_labels, ' ', line)
line = re.sub('\n', ' ', line)
list_com.append(line)
list_label.append(label[0])
list_tuples = list(zip(list_com, list_label))
file1.close()
labels = ['positive', 'negative']
df = pd.DataFrame(list_tuples, columns=['text', 'label'])
df['label'] = df['label'].map({'positive': 1, 'negative': 0})
for i in range(0,len(df['label'])):
list_label[i] = df['label'][i]
#print(df)
#print(df['label'].value_counts())
X = df.text.values
y = df.label.values
X_train, X_val, y_train, y_val =\
train_test_split(X, y, test_size=0.1, random_state=2020)
def text_preprocessing(text):
# Remove '#name'
text = re.sub(r'(#.*?)[\s]', ' ', text)
# Replace '&' with '&'
text = re.sub(r'&', '&', text)
# Remove trailing whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
input_ids = []
attention_masks = []
for sent in data:
encoded_sent = tokenizer.encode_plus(
text=text_preprocessing(sent), # Preprocess sentence
add_special_tokens=True, # Add `[CLS]` and `[SEP]`
max_length=MAX_LEN, # Max length to truncate/pad
pad_to_max_length=True, # Pad sentence to max length
# return_tensors='pt', # Return PyTorch tensor
return_attention_mask=True # Return attention mask
)
# Add the outputs to the lists
input_ids.append(encoded_sent.get('input_ids'))
attention_masks.append(encoded_sent.get('attention_mask'))
# Convert lists to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
return input_ids, attention_masks
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)
# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
# Create the BertClassfier class
class BertClassifier(nn.Module):
"""Bert Model for Classification Tasks."""
def __init__(self, freeze_bert=False):
"""
#param bert: a BertModel object
#param classifier: a torch.nn.Module classifier
#param freeze_bert (bool): Set `False` to fine-tune the BERT model
"""
super(BertClassifier, self).__init__()
# Specify hidden size of BERT, hidden size of our classifier, and number of labels
D_in, H, D_out = 768, 50, 2
# Instantiate BERT model
self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
# Instantiate an one-layer feed-forward classifier
self.classifier = nn.ModuleDict({
'lstm': nn.LSTM(input_size=D_in, hidden_size=H, batch_first=True, bidirectional=True),
'linear': nn.Linear(in_features=H, out_features=D_out)})
# Freeze the BERT model
if freeze_bert:
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask):
outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
sequence_output = outputs[0]
sequence_output, _ = self.lstm(sequence_output)
linear_output = self.linear(sequence_output[:, -1])
return linear_output
def initialize_model(epochs=4):
# Instantiate Bert Classifier
bert_classifier = BertClassifier(freeze_bert=False)
print(bert_classifier)
# Tell PyTorch to run the model on GPU
bert_classifier.to(device)
# Create the optimizer
optimizer = AdamW(bert_classifier.parameters(), lr=5e-5)
# Total number of training steps
total_steps = len(train_dataloader) * epochs
# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
return bert_classifier, optimizer, scheduler
# Specify loss function
loss_fn = nn.CrossEntropyLoss()
def set_seed(seed_value=42):
"""Set seed for reproducibility."""
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
"""Train the BertClassifier model."""
# Start training loop
print("Start training...\n")
for epoch_i in range(epochs):
# Print the header of the result table
print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
print("-" * 70)
# Measure the elapsed time of each epoch
t0_epoch, t0_batch = time.time(), time.time()
# Reset tracking variables at the beginning of each epoch
total_loss, batch_loss, batch_counts = 0, 0, 0
# Put the model into the training mode
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
batch_counts += 1
# Load batch to GPU
b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
# Zero out any previously calculated gradients
model.zero_grad()
# Perform a forward pass. This will return logits.
logits = model(b_input_ids, b_attn_mask)
# Compute loss and accumulate the loss values
loss = loss_fn(logits, b_labels)
batch_loss += loss.item()
total_loss += loss.item()
# Perform a backward pass to calculate gradients
loss.backward()
# Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and the learning rate
optimizer.step()
scheduler.step()
# Print the loss values and time elapsed for every 20 batches
if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
# Calculate time elapsed for 20 batches
time_elapsed = time.time() - t0_batch
# Print training results
print(
f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
# Reset batch tracking variables
batch_loss, batch_counts = 0, 0
t0_batch = time.time()
# Calculate the average loss over the entire training data
avg_train_loss = total_loss / len(train_dataloader)
print("-" * 70)
#Evaluation
if evaluation == True:
# After the completion of each training epoch, measure the model's performance
# on our validation set.
val_loss, val_accuracy = evaluate(model, val_dataloader)
# Print performance over the entire training data
time_elapsed = time.time() - t0_epoch
print(
f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
print("-" * 70)
print("\n")
print("Training complete!")
def evaluate(model, val_dataloader):
"""After the completion of each training epoch, measure the model's performance
on our validation set.
"""
# Put the model into the evaluation mode. The dropout layers are disabled during
# the test time.
model.eval()
# Tracking variables
val_accuracy = []
val_loss = []
# For each batch in our validation set...
for batch in val_dataloader:
# Load batch to GPU
b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attn_mask)
# Compute loss
loss = loss_fn(logits, b_labels)
val_loss.append(loss.item())
# Get the predictions
preds = torch.argmax(logits, dim=1).flatten()
# Calculate the accuracy rate
accuracy = (preds == b_labels).cpu().numpy().mean() * 100
val_accuracy.append(accuracy)
# Compute the average accuracy and loss over the validation set.
val_loss = np.mean(val_loss)
val_accuracy = np.mean(val_accuracy)
return val_loss, val_accuracy
def accuracy(probs, y_true):
"""
- Print AUC and accuracy on the test set
#params probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
#params y_true (np.array): an array of the true values with shape (len(y_true),)
fpr, tpr, threshold = roc_curve(y_true, preds)
roc_auc = auc(fpr, tpr)
print(f'AUC: {roc_auc:.4f}')
"""
preds = probs[:, 1]
# Get accuracy over the test set
y_pred = np.where(preds >= 0.5, 1, 0)
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
def bert_predict(model, test_dataloader):
"""Perform a forward pass on the trained BERT model to predict probabilities on the test set."""
# Put the model into the evaluation mode. The dropout layers are disabled during the test time.
model.eval()
all_logits = []
# For each batch in our test set...
for batch in test_dataloader:
# Load batch to GPU
b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attn_mask)
all_logits.append(logits)
# Concatenate logits from each batch
all_logits = torch.cat(all_logits, dim=0)
# Apply softmax to calculate probabilities
probs = F.softmax(all_logits, dim=1).cpu().numpy()
return probs
set_seed(42) # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)
# start training
train(bert_classifier, train_dataloader, val_dataloader, epochs=3, evaluation=True)
# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, val_dataloader)
# Evaluate the Bert classifier
accuracy(probs, y_val)

Dropout not computed in tensorflow

I am trying to set an instance so that dropout is compute only during the training session, but somehow it seems that the model doesn't see the dropout layer, as when modifying the probabilities nothing happens. I suspect it's a logic issue in my code, but I can't spot where. Also, I'm relatively new to this world, so please cope with my inexperience. Any help will be much appreciated.
Here's the code. I first create a Boolean placeholder
Train = tf.placeholder(tf.bool,shape=())
which will be then passed into a dictionary value as true(training) or False(test). Then I implemented the forward propagation as follows.
def forward_prop_cost(X, parameters,string,drop_probs,Train):
"""
Implements the forward propagation for the model: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SOFTMAX
Arguments:
X -- input dataset placeholder, of shape (input size, number of examples)
parameters -- python dictionary containing your parameters "W1", "b1", ...
string - ReLU or tanh
drop_probs = drop probabilities for each layer. First and last == 0
Train = boolean
Returns:
ZL -- the output of the last LINEAR unit
"""
L = len(drop_probs)-1
activations = []
activations.append(X)
if string == 'ReLU':
for i in range(1,L):
Zi = tf.matmul(parameters['W'+str(i)],activations[i-1]) + parameters['b'+str(i)]
if (Train == True and drop_probs[i] != 0):
Ai = tf.nn.dropout(tf.nn.relu(Zi),drop_probs[i])
else:
Ai = tf.nn.relu(Zi)
activations.append(Ai)
elif string == 'tanh': #needs update!
for i in range(1,L):
Zi = tf.matmul(parameters['W'+str(i)],activations[i-1]) + parameters['b'+str(i)]
Ai = tf.nn.dropout(tf.nn.tanh(Zi),drop_probs[i])
activations.append(Ai)
ZL = tf.matmul(parameters['W'+str(L)],activations[L-1]) + parameters['b'+str(L)]
logits = tf.transpose(ZL)
labels = tf.transpose(Y)
return ZL
Then I call the model function, where just at the end I pass the values of the Train as true or false, depending on the data set I'm using.
def model(X_train, Y_train, X_test, Y_test,hidden = [12288,25,12,6], string = 'ReLU',drop_probs = [0.,0.4,0.2,0.],
regular_param = 0.0, starter_learning_rate = 0.0001,
num_epochs = 1500, minibatch_size = 32, print_cost = True, learning_decay = False):
'''
Returns:
parameters -- parameters learnt by the model. They can then be used to predict.
'''
ops.reset_default_graph()
tf.set_random_seed(1)
seed = 3
(n_x, m) = X_train.shape # (n_x: input size, m : number of examples in the train set)
n_y = Y_train.shape[0] # n_y : output size
costs = [] # To keep track of the cost
graph = tf.Graph()
X, Y ,Train = create_placeholders(n_x, n_y)
parameters = initialize_parameters(hidden)
#print([n.name for n in tf.get_default_graph().as_graph_def().node])
ZL = forward_prop_cost(X, parameters,'ReLU',drop_probs,Train)
#cost = forward_prop_cost(X, parameters,'ReLU',drop_probs,regular_param )
cost = compute_cost(ZL,Y,parameters,regular_param)
#optimizer = tf.train.AdamOptimizer(learning_rate = starter_learning_rate).minimize(cost)
if learning_decay == True:
increasing = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(starter_learning_rate,increasing * minibatch_size,m, 0.95, staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost,global_step=increasing)
else:
optimizer = tf.train.AdamOptimizer(learning_rate = starter_learning_rate).minimize(cost)
# Initialize all the variables
init = tf.global_variables_initializer()
# Start the session to compute the tensorflow graph
with tf.Session() as sess:
# Run the initialization
sess.run(init, { Train: True } )
# Do the training loop
for epoch in range(num_epochs):
epoch_cost = 0.
num_minibatches = int(m / minibatch_size)
seed = seed + 1
minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)
for minibatch in minibatches:
(minibatch_X, minibatch_Y) = minibatch
_ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
epoch_cost += minibatch_cost / num_minibatches
# Print the cost every 100 epoch
if print_cost == True and epoch % 100 == 0:
print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
if print_cost == True and epoch % 5 == 0:
costs.append(epoch_cost)
# plot the cost
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per fives)')
plt.title("Learning rate =" + str(learning_rate))
plt.show()
parameters = sess.run(parameters)
print ("Parameters have been trained!")
# Calculate accuracy on the test set
correct_prediction = tf.equal(tf.argmax(ZL), tf.argmax(Y))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train, Train: True}))
print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test, Train: False}))
return parameters

LSTM Loss remains the same after numerous iterations

Thanks for looking into this question! :)
I attempted to train an LSTM network to predict next 10-day stock prices of Google based on past 30-day stock prices. I trained the LSTM but the loss barely reduced even after 200 iterations. I suspected that the issue might be due to the feed_dict in tf Session. However, I have not identified any issue with that (perhaps due to my superficial knowledge). It seems that the optimizer refreshes every iterations in the tf Session.
Would appreciate if I could seek advice on what might have gone wrong in the code, if my understanding on the usage of Optimizer has been wrong.
Thanks for your help!!
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import csv
import random
import tensorflow as tf
from tensorflow.contrib import rnn
# Define data reader
def read_data(fname):
with open(fname) as f:
data = list(csv.reader(f))
d_mat = np.array(data)
d_trsp = np.transpose(d_mat)
date = np.transpose(d_trsp[0])
p_open = d_trsp[1]
vol = d_trsp[6]
chg = d_trsp[7]
chg = [float(i) for i in chg]
return vol, chg
vol, training_data = read_data('GOOGL.csv')
training_data = training_data[0:300]
print("Loading training data..")
#Split data for learning
ratio_train = 0.70
ratio_valid = 0.90-ratio_train
ratio_test = 0.10 #fixed at 10% of dataset
# Parameters
learning_rate = 0.005
training_iters = 100
display_step = 1
x_size = 30
y_size = 5
n_hidden = 256
# Variables
x = tf.placeholder("float", [265, x_size])
y = tf.placeholder("float", [265, y_size])
weights = {
'out': tf.Variable(tf.random_normal([n_hidden, y_size]))
}
biases = {
'out': tf.Variable(tf.random_normal([y_size]))
}
# Preprocess Data
def prod_data(data):
x = []
y = []
iter = len(data)-x_size-y_size
for i in range(0, iter):
x.append(data[i:i+x_size])
y.append(data[i+x_size+1: i+x_size+1+y_size])
return x, y
a,b = prod_data(training_data)
# Define RNN architecture
def RNN(x, weights, biases):
# Reshape x to [1, n_input]
x = tf.reshape(x, [-1, x_size])
x = tf.split(x, x_size, 1)
rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden)])
outputs, states = rnn.static_rnn(rnn_cell, x, dtype = tf.float32)
return tf.matmul(outputs[-1], weights['out'] + biases['out'])
pred = RNN(x, weights, biases)
# Loss and Optimizer
cost = tf.reduce_mean((pred-y)**2)
optimizer = tf.train.RMSPropOptimizer(learning_rate = learning_rate).minimize(cost)
# Initialization
init = tf.global_variables_initializer()
# Launch Tensor graph
with tf.Session() as sess:
sess.run(init)
step = 0
loss_total = 0
loss_coll = []
end_offset = len(training_data)-y_size-x_size-1
while step < training_iters:
_, loss, model_pred = sess.run([optimizer, cost, pred], \
feed_dict={x: a, y: b})
# Update total loss and accuracy
loss_total += loss
loss_coll.append(loss)
if (step+1) % display_step == 0:
print("Loss at step " + str(step) + " = " + str(loss))
loss_total = 0
step += 1
print("Optimization Finished!")

Not able to train a simple Char RNN

I have been working on a vanilla char rnn in tensorflow. I am not able it to produce any thing sensible even after training it a couple of hours. The code is tf version of Keras code from Chollet's Deep learning with pythonGithub
I tried playing around with hyper params without much success. Chollet mentioned in the book that the model produced good output after 80 epochs. I have able to get anything resonable after 50K+ epochs :( Curious if there is something I missed while converting this code to tensorflow.
n_layers = 1
num_units = 128
batch_size = 150
X = tf.placeholder(tf.float32, [None, maxlen, len(unique_chars)], name="Placeholder_X")
y = tf.placeholder(tf.int64, [None, len(unique_chars)], name="Placeholder_Y")
lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=num_units) for layer in range(n_layers)]
multi_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
outputs, current_state = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)
top_layer_h_state = current_state[-1][1]
logits = tf.layers.dense(top_layer_h_state, len(unique_chars), name="softmax")
xentropy=tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=y)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)
pred = tf.nn.softmax(logits)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Sampling Code:
with tf.Session() as sess:
init.run()
saver.restore(sess, model_name)
# Output some data
start_index = random.randint(0, len(text) - maxlen - 1)
generated_text = text[start_index: start_index + maxlen]
print("Seed: ", generated_text)
final_string = ""
sampled = np.zeros((1, maxlen, len(unique_chars)))
for i in range(50):
for t, char in enumerate(generated_text):
sampled[0, t, char_to_idx[char]] = 1.
preds_eval = sess.run([pred], feed_dict={X: sampled})
preds=preds_eval[0][0]
next_index = sample(preds, 0.5)
next_char = unique_chars[next_index]
generated_text += next_char
final_string += next_char
generated_text = generated_text[1:]
print("New String: " , final_string)
Sample Input Seed: is,
as is generally acknowledged nowadays, no better sopori
Input generation:
maxlen = 60
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
sentences.append(text[i:i + maxlen])
next_chars.append(text[i + maxlen])
unique_chars = sorted(list(set(text)))
char_to_idx = dict((char, unique_chars.index(char)) for char in unique_chars)
data_X = np.zeros((len(sentences), maxlen, len(unique_chars)), dtype=np.float32)
data_Y = np.zeros((len(sentences), len(unique_chars)), dtype=np.int64)
for idx, sentence in enumerate(sentences):
for t, char in enumerate(sentence):
data_X[idx, t, char_to_idx[char]] = 1
data_Y[idx, char_to_idx[next_chars[idx]]] = 1
Output from the model: vatsoéätlæéättire
It looks like you are trying to make a language model. I didn't read your entire code carefully. Just from the first part I noticed a couple of things. Why is your placeholder for x of type tf.float32 instead of integers? More importantly, why is the shape of y equal to batch size by vocab size? It should be batch_size by max_len -1 by vocab_size. In a language model you are always trying to predict the next character at every step. It's not a good way to train it to read a whole sequence of characters and then just predict one more at the end.

Tensorflow - Retrieving weights/biases of the trained feedforward neural network after training

I'm currently trying to create a simple web application for interactive neural network training with Flask. What I'm struggling with is retrieving the weights of the hidden layers after a feedforward neural network has been trained - my goal is to make a real back-end for the Tensorflow's Playground.
Take into consideration the following weights initialisation:
# Weight initializations
tW1 = init_weights(shape=(n_features, hidden_nodes))
tW2 = init_weights(shape=(hidden_nodes, output_nodes))
How does one go about retrieving the calculated weights of tW1 and tW2 after the training has been completed in Tensorflow?
Here's a sample of the code:
def retrieve_data():
"""Retrieves the data - to be expanded for custom database access + S3 retrieval + URL"""
result = pd.read_csv('snp_data.csv', parse_dates=['Date'], index_col=['Date'])
return result
def get_columns(data, columns):
features = data.ix[:, columns]
return features
def preprocess(data):
"""Data preprocessing"""
result = (data - data.mean()) / data.std(ddof=0)
result = result.fillna(0)
return result
def init_weights(shape):
""" Weights initialization """
weights = tf.random_normal(shape=shape, stddev=0.1)
return tf.Variable(weights)
def forwardprop(X, w_1, w_2):
"""Forward propagation"""
h = tf.nn.relu(tf.matmul(X, w_1))
y_hat = tf.matmul(h, w_2)
return y_hat
# #app.route('/train')
def train():
data = retrieve_data()
train_x = get_columns(data, columns=['Open', 'Close'])
train_x = preprocess(data=train_x).as_matrix().astype(np.float32)
train_x = train_x[:(len(train_x) - (len(train_x) % 32))]
train_y = get_columns(data, columns=['Adj Close']).as_matrix().astype(np.float32)
train_y = train_y[:(len(train_y) - (len(train_y) % 32))]
# Number of input nodes
n_features = train_x.shape[1]
# Number of output nodes
output_nodes = train_y.shape[1]
# Number of hidden nodes
hidden_nodes = 20
# TF Placeholders for the inputs and outputs
tx = tf.placeholder(tf.float32, shape=(None, n_features))
ty = tf.placeholder(tf.float32, shape=(None, output_nodes))
# Weight initializations
tW1 = init_weights(shape=(n_features, hidden_nodes))
tW2 = init_weights(shape=(hidden_nodes, output_nodes))
# Forward propagation
y_hat = forwardprop(tx, tW1, tW2)
# Backward Propagation
tMSE = tf.reduce_mean(tf.square(y_hat - ty))
learning_rate = 0.001
tOptimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
tOptimize = tOptimizer.minimize(tMSE)
batch_size = 32
n_epochs = 8
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for i_e in range(n_epochs):
for i in range(0, train_x.shape[0], batch_size):
batch_X = train_x[i:i + batch_size, ...]
batch_y = train_y[i:i + batch_size]
_, loss = sess.run([tOptimize, tMSE], feed_dict={tx: batch_X, ty: batch_y})
print(i, loss)
return 'Flask Dockerized'
This should be as simple as final_tW1, final_tW2 = sess.run([tW1, tW2]) after the for loop has completed. You don't need to feed anything because the variables maintain their own values that don't depend on placeholders.

Categories

Resources