I'm trying to implement English to Hindi translation using Deep Learning LSTM. But when I train the model it shows 'nan' loss in both actual and validation.
Link of text file containing translation pairs-: http://www.manythings.org/anki/
Below is my Jupyter notebook code-:
import string
import re
from numpy import array, argmax, random, take, delete
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
# function to read raw text file
def read_text(filename):
# open the file
file = open(filename, mode='rt', encoding='utf-8')
# read all text
text = file.read()
file.close()
return text
# split a text into sentences
def to_lines(text):
sents = text.strip().split('\n')
sents = [i.split('\t') for i in sents]
return sents
data = read_text("/content/drive/My Drive/Colab Notebooks/Language Translator New/hin.txt")
eng_hin = to_lines(data)
eng_hin = array(eng_hin)
eng_hin = delete(eng_hin, 2, axis=1)
# Remove punctuation
eng_hin[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in eng_hin[:,0]]
eng_hin[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in eng_hin[:,1]]
# convert to lowercase
for i in range(len(eng_hin)):
eng_hin[i,0] = eng_hin[i,0].lower()
eng_hin[i,1] = eng_hin[i,1].lower()
# empty lists
eng_l = []
hin_l = []
# populate the lists with sentence lengths
for i in eng_hin[:,0]:
eng_l.append(len(i.split()))
for i in eng_hin[:,1]:
hin_l.append(len(i.split()))
print(max(eng_l))
print(max(hin_l))
# function to build a tokenizer
def tokenization(lines):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
return tokenizer
# prepare english tokenizer
eng_tokenizer = tokenization(eng_hin[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = 25
print('English Vocabulary Size: %d' % eng_vocab_size)
# prepare Hindi tokenizer
hin_tokenizer = tokenization(eng_hin[:, 1])
hin_vocab_size = len(hin_tokenizer.word_index) + 1
hin_length = 25
print('Hindi Vocabulary Size: %d' % hin_vocab_size)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
# integer encode sequences
seq = tokenizer.texts_to_sequences(lines)
# pad sequences with 0 values
seq = pad_sequences(seq, maxlen=length, padding='post')
return seq
# Model Building
from sklearn.model_selection import train_test_split
train, test = train_test_split(eng_hin, test_size=0.2, random_state = 12)
# prepare training data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(hin_tokenizer, hin_length, train[:, 1])
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(hin_tokenizer, hin_length, test[:, 1])
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
model = Sequential()
model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
model.add(LSTM(units))
model.add(RepeatVector(out_timesteps))
model.add(LSTM(units, return_sequences=True))
model.add(Dense(out_vocab, activation='softmax'))
return model
model = build_model(hin_vocab_size, eng_vocab_size, hin_length, eng_length, 512)
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
filename = '/content/drive/My Drive/Colab Notebooks/Language Translator New/Englis_Hindi_Checkpoints/model.h1.31_dec_19'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
epochs=100, batch_size=64,
validation_split = 0.2,
callbacks=[checkpoint], verbose=1)
model.save('/content/drive/My Drive/Colab Notebooks/Language Translator New/Englis_Hindi_Checkpoints/eng2hin.h5')
When I try to fit the model, it runs but shows 'nan' in loss. Please help me to resolve my issue.
In simple words, it usually happens because the loss function / optimizer isn't suiting the network calculations. I recently used this network to create a calculator. Try using loss='categorical_crossentropy' and optimizer='adam' and see if it works.
Related
I am working on an NLP sentiment analysis model to classify the sentiment (neutral, positive, negative) of a tweet based on the content of the tweet on Google Colab. I have prepped the test_x and train_x data into sequences of ints using the Tokenizer module. I followed the Tokenizer tutorial on the official TensorFlow Youtube channel so there should be nothing wrong with that part.
However, when beginning to train the model, I run into UnimplementedError: Graph Execution.
I tried changing the layers of the model and decreasing the size of my data sets but the same error still popped up every time.
Could anyone clarify what this error means and is trying to say and also point out what is wrong with my code? Thanks!
import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
print("Found GPU at: {}".format(device_name))
else:
device_name = "/device:CPU:0"
print("No GPU, using {}.".format(device_name))
# Load dataset into a dataframe
train_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/train.csv"
test_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/test.csv"
train_df = pd.read_csv(train_data_path, encoding='unicode_escape')
test_df = pd.read_csv(test_data_path, encoding='unicode_escape')
train_df.head()
# Function to convert df into a list of strings
def convert_to_list(df, x):
selected_text_list = []
labels = []
for index, row in df.iterrows():
selected_text_list.append(str(row[x]))
labels.append(str(row['sentiment']))
return np.array(selected_text_list), np.array(labels)
train_sentences, train_labels = convert_to_list(train_df, 'selected_text')
test_sentences, test_labels = convert_to_list(test_df, 'text')
print(train_sentences)
print(train_labels)
# Instantiate tokenizer and create word_index
tokenizer = Tokenizer(num_words=1000, oov_token='<oov>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Convert sentences into a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
test_sequence = tokenizer.texts_to_sequences(test_sentences)
# Padding sequences
pad_test_seq = pad_sequences(test_sequence, padding='post')
max_len = pad_test_seq[0].size
pad_train_seq = pad_sequences(train_sequence, padding='post', maxlen=max_len)
model = tf.keras.Sequential([
tf.keras.layers.Embedding(10000, 24, input_length=max_len),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(24, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
with tf.device(device_name):
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 20
with tf.device(device_name):
history = model.fit(pad_train_seq, train_labels, epochs=num_epochs, validation_data=(pad_test_seq, test_labels), verbose=2)
Here is a screenshot of the error:
I've been following the following tutorial to try and understand LSTMs and tensorflow a bit more. From running, it the training of the model goes smoothly, but when I try to use the trained tokenizer on the test data and then convert it to a numpy array, it doesn't work and I'm not really sure what the problem is. The relevant portion that goes wrong is below:
# test model
x_test = np.array(tokenizer.texts_to_sequences([str(txt) for txt in df_test['text'].values]))
The error it presents is as below:
Traceback (most recent call last):
File "/Users/pranavnair/Documents/Code/wpd/wpd.py", line 85, in <module>
x_test = np.array(x_test_data)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10824,) + inhomogeneous part.
I've tried using np.hstack instead of np.array, and that doesn't fix it. Would appreciate any help at all, thanks in advance.
Full code below for reference
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras import utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers import Embedding
from keras.optimizers import Adam
# set random seed for reproducibility
RANDOM_SEED = 4
np.random.seed(RANDOM_SEED)
# import datasets
df_neut = pd.read_csv("./input/good.csv")
df_prom = pd.read_csv("./input/promotional.csv")
# clean up data to only include text
df_prom = df_prom.drop(df_prom.columns[1:], axis=1)
df_neut = df_neut.drop(df_neut.columns[1:], axis=1)
# combine datasets
df_neut.insert(1, 'label', 0) # neutral labels
df_prom.insert(1, 'label', 1) # promotional labels
# merge dataframes
df = pd.concat((df_neut, df_prom), ignore_index=True, axis=0)
# randomize order of dataframes
df = df.reindex(np.random.permutation(df.index))
# split into training and testing datasets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# perform data preprocessing using keras tokenizer
text_data = [str(txt) for txt in df_train['text'].values] # convert text data to strings
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', lower=True) # create tokenizer
tokenizer.fit_on_texts(text_data) # make dictionary
# vectorize dataset
x_train = tokenizer.texts_to_sequences(text_data)
# Max number of words in each sequence
MAX_SEQUENCE_LENGTH = 400
# pad sequence lengths
x_train = utils.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)
# get test labels
y_train = df_train['label'].values
# create sequential model
model = Sequential()
# create embedding layer
EMBEDDING_DIM = 100
model.add(Embedding(MAX_NB_WORDS+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
# add LSTM layer to model
model.add(LSTM(80))
# setup model layers
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# setup binary classification via binary cross entropy loss
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# train for two epochs
EPOCHS = 4
BATCH_SIZE = 64
history = model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.15)
# test model
x_test = np.array(tokenizer.texts_to_sequences([str(txt) for txt in df_test['text'].values]))
x_test = utils.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)
y_test = np.array(df_test['label'].values)
# evaluate model
scores = model.evaluate(x_test, y_test, batch_size=128)
print("The model has a test loss of %.2f and a test accuracy of %.1f%%" % (scores[0], scores[1]*100))
I'm trying to build a deep learning model to predict the top 5 probable movie genres, using movies' synopses as input. The movie genres I'm including in the data are 19, but regardless of test input, the model always predicts the same 5 movie genres. Below is my code building the model. However, the accuracy during fitting is 90%. Can you point me to the right direction as to what I'm doing wrong?
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
import pandas as pd
import numpy as np
import re
data = pd.read_csv('train.csv', encoding = 'utf-8')
#Create column with comma separated genres
data['genres_comma'] = data['genres'].str.split()
mlb = MultiLabelBinarizer()
#Create new dataframe with one hot encoded labels
train = pd.concat([
data.drop(['genres', 'genres_comma'], 1),
pd.DataFrame(mlb.fit_transform(data['genres_comma']), columns=mlb.classes_),
], 1)
genre_names = list(mlb.classes_)
genres = train.drop(['movie_id', 'synopsis'], 1)
def preprocess_text(sen):
# Remove punctuations and numbers
sentence = re.sub('[^a-zA-Z]', ' ', sen)
# Single character removal
sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
# Removing multiple spaces
sentence = re.sub(r'\s+', ' ', sentence)
return sentence
X = []
sentences = list(train['synopsis'])
for sen in sentences:
X.append(preprocess_text(sen))
y = genres.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
#Convert text inputs into embedded vectors.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
maxlen = 200
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
#GloVe word embeddings to convert text inputs to their numeric counterparts
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")
for line in glove_file:
records = line.split()
word = records[0]
vector_dimensions = asarray(records[1:], dtype='float32')
embeddings_dictionary[word] = vector_dimensions
glove_file.close()
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
embedding_vector = embeddings_dictionary.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
#Model Creation
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(19, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model.summary())
history = model.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=1)
Did you check for the class distribution in your training data? accuracy is not a good measure when you have strongly imbalanced classes. For instance, if these 5 genres make up >90% of your training data, the model may not have learned how to classify any other genre, but still achieve 90% accuracy (a common failure mode is constant output, which may still have a high accuracy with imbalanced classes). So, the first step should be to look at the number of training movies in the respective categories.
If my hunch is correct, you may try looking into class balancing weights, or into other loss functions that essentially also give more weight to rare classes. You may also want to fine-tune your model without class weights, after it has learned to classify all genres, in order to learn the real prior probabilities.
Another approach that sometimes helps is to group the 14 minority classes into a class "other". Maybe that is frequent enough for your classifier to learn it.
If so, then you can train another classifier to predict the right class, only for the "other" examples.
Alternatively, you could enrich the training set with more examples of the minority classes.
I've made simple prediction model with keras and bag of words based on the code which I found in the tutorials. Loading dataset and training finished without problem and accuracy is around 88%.
Dataset has two columns text and tag (i.e. "some text, a"). How can I test trained model with some other data which is not in dataset like model.predict(some text)?
This is sample dataset:
tekst,tag
Sconto,n
Trg Vinodolskog zakona 5,a
I wish to save the model so I don't have to train it every time I run the script. Is correct way to put at the end of the script "model.save('my_model.h5')"?
How can I the load the model and make prediction with data that it's not in dataset?
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
df = pd.read_csv('dataset3.csv')
df = df[pd.notnull(df['tag'])]
df.head(10)
def print_plot(index):
example = df[df.index == index][['tekst', 'tag']].values[0]
if len(example) > 0:
print(example[0])
print('Tag:', example[1])
print_plot(0)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|#,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
"""
text: a string
return: modified initial string
"""
text = BeautifulSoup(text, "lxml").text # HTML decoding
text = text.lower() # lowercase text
text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
return text
df['tekst'] = df['tekst'].apply(clean_text)
print_plot(0)
import itertools
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
train_size = int(len(df) * .7)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))
train_posts = df['tekst'][:train_size]
train_tags = df['tag'][:train_size]
test_posts = df['tekst'][train_size:]
test_tags = df['tag'][train_size:]
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
batch_size = 32
epochs = 2
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
history = model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_split=0.1)
score = model.evaluate(x_test, y_test,
batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])
Once you have finished training your model, you can save the weights to disk by using model.save_weights(path).
You can then load the weights into a model of the same architecture using model.load_weights(path).
If you also want to save the model architecture, you can use the more general model.save(path) which will save
The model weights,
The model architecture,
The optimizer states.
You can then load the model using
from keras.models import load_model
model = load_model(path)
After you have recovered the model and its weight, you can then evaluate the model to determine its accuracy or do predictions on new data using
prediction = model.predict(x_test)
loss, metrics = model.evaluate(x_test, y_test)
Yes, according to the Keras Documentation FAQ page. You just type: model.save(filepath).
In case you want to load an already existing model, go with: keras.models.load_model(filepath).
I want to use keras for authorship attribution. I have a list of (text,labels). I am trying to use the keras builtin vectorizer but I get the following error:
Vectorizing sequence data... Traceback (most recent call last): File "", line 1, in File
"/home/angelo/org/courses/corpusling/finalproject/src/neuralnet.py",
line 46, in
X_train = tokenizer.texts_to_matrix(X_train, mode='binary') File "/home/angelo/org/courses/corpusling/finalproject/venv0/lib/python3.5/site-packages/keras/preprocessing/text.py",
line 166, in texts_to_matrix
sequences = self.texts_to_sequences(texts) File "/home/angelo/org/courses/corpusling/finalproject/venv0/lib/python3.5/site-packages/keras/preprocessing/text.py",
line 131, in texts_to_sequences
for vect in self.texts_to_sequences_generator(texts): File "/home/angelo/org/courses/corpusling/finalproject/venv0/lib/python3.5/site-packages/keras/preprocessing/text.py",
line 150, in texts_to_sequences_generator
i = self.word_index.get(w) AttributeError: 'Tokenizer' object has no attribute 'word_index'
The following is my code so far:
import glob
import os
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
def get_label(filename):
tmp = os.path.split(filename)[0]
label = os.path.basename(tmp)
return label
def read_file(filename):
with open(filename) as f:
text = f.read()
return text
traindocs = "../data/C50/C50train/*/*.txt"
testdocs = "../data/C50/C50test/*/*.txt"
documents_train = (read_file(f) for f in glob.iglob(traindocs))
labels_train = (get_label(f) for f in glob.iglob(traindocs))
documents_test = (read_file(f) for f in glob.iglob(testdocs))
labels_test = (get_label(f) for f in glob.iglob(testdocs))
df_train = pd.DataFrame([documents_train, labels_train])
df_train = df_train.transpose()
df_train.rename(columns={0: 'text', 1: 'author'}, inplace=True)
df_test = pd.DataFrame([documents_test, labels_test])
df_test = df_test.transpose()
df_test.rename(columns={0: 'text', 1: 'author'}, inplace=True)
max_words = 1000
print('Vectorizing sequence data...')
tokenizer = Tokenizer(nb_words=max_words)
X_train, Y_train = df_train.text, df_train.author
X_test, Y_test = df_test.text, df_test.author
X_train = tokenizer.texts_to_matrix(X_train, mode='binary')
X_test = tokenizer.texts_to_matrix(X_test, mode='binary')
nb_classes = np.max(Y_train) + 1
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
model = Sequential()
model.add(Dense(output_dim=512, input_dim=(max_words,)))
model.add(Activation("relu"))
model.add(Dense(output_dim=(np.max(Y_train)+1)))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy',
optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, Y_train, nb_epoch=5, batch_size=32)
loss_and_metrics = model.evaluate(X_test, Y_test, batch_size=32)
You need to use tokenizer.fit_on_texts(texts) before using tokenizer.texts_to_matrix()
here texts is the list of the the text data (both train and test).
fit_on_texts() uses it to build word_index. Its nothing but unique word to number mapping. And this mapping is later used to generate the matrix.