I'm trying to train a simple TensorFlow model to detect the sentiment of tweets. The datatypes and sizes of the arrays are consistent, and the model trains just fine when recurrent_dropout is set to some float value. However, that disables cuDNN, and I'd really like to speed this up (don't we all), but whenever I remove the recurrent_dropout argument, training crashes before the end of the first epoch.
Below is the relevant code; I've left out the imports and the loading of the CSV files. After the relevant code are the final input dimensions and the error output. Additionally, I have figured out why Colab seemed to be cutting the training data: Colab displays the number of batches (steps per epoch) rather than the number of sequences, so with the default batch size of 32 we were seeing 859 (27,481 / 32 ≈ 859). The crash when not using recurrent dropout is still an issue. Side note: this code is a very rough draft, with the data cleaning all being done within the same notebook, hence the lack of typical formatting.
def remove_case(X):
removed_case = []
X = X.copy()
for text in X:
text = str(text).lower()
removed_case.append(text)
X = removed_case
return X
def remove_hyperlinks(X):
removed_hyperlinks = []
X = X.copy()
for text in X:
text = str(text)
text = re.sub(r'http\S+', '', text)
text = re.sub(r'https\S+', '', text)
text = re.sub(r'www\S+', '', text)
removed_hyperlinks.append(text)
X = removed_hyperlinks
return X
def remove_punctuation(X):
removed_punc = []
X = X.copy()
for text in X:
text = str(text)
text = "".join([char for char in text if char not in punctuation])
removed_punc.append(text)
X = removed_punc
return X
def split_text(X):
split_tweets = []
X = X.copy()
for text in X:
text = str(text).split()
split_tweets.append(text)
X = split_tweets
return X
def map_sentiment(X, l, m, n):
keys = ['negative', 'neutral', 'positive']
values = [l, m, n]
dictionary = dict(zip(keys, values))
X = X.copy()
X = X.map(dictionary)
return X
# # def sentiment_to_onehot(X):
# sentiment_foofs = []
# X = X.copy()
# for integer in X:
# if integer == "negative": # Negative
# integer = [1, 0, 0]
# elif integer == "neutral": # Neutral
# integer = [0, 1, 0]
# elif integer == "positive": # Positive
# integer = [0, 0, 1]
# else:
# break
# sentiment_foofs.append(integer)
# X = sentiment_foofs
# return X
train_no_punc_lowercase = train.copy()
train_no_punc_lowercase['text'] = remove_case(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_hyperlinks(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_punctuation(train_no_punc_lowercase['text'])
train_no_punc_lowercase['sentiment'] = map_sentiment(train_no_punc_lowercase['sentiment'], 0, 1, 2)
train_no_punc_lowercase.head()
test_no_punc_lowercase = test.copy()
test_no_punc_lowercase['text'] = remove_case(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_hyperlinks(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_punctuation(test_no_punc_lowercase['text'])
test_no_punc_lowercase['sentiment'] = map_sentiment(test_no_punc_lowercase['sentiment'], 0, 1, 2)
features = train.columns.tolist()
features.remove('textID') # all unique, high cardinality feature
features.remove('selected_text') # target
target = 'selected_text'
X_train_no_punc_lowercase = train_no_punc_lowercase[features]
y_train_no_punc_lowercase = train_no_punc_lowercase[target]
X_test_no_punc_lowercase = test_no_punc_lowercase[features]
def stemming_column(df_column):
ps = PorterStemmer()
stemmed_word_list = []
for i, string in enumerate(df_column):
tokens = word_tokenize(string)
new_string = ""
for j, words in enumerate(tokens):
new_string = new_string + ps.stem(words) + " "
stemmed_word_list.append(new_string)
return stemmed_word_list
def create_lookup_table(list1, list2):
main_list = []
lookup_dict = {}
i = 1 # used to create a value in the dictionary
main_list.append(list1)
main_list.append(list2)
for list in main_list:
for string in list:
for word in string.split():
if word not in lookup_dict:
lookup_dict[word] = i
i += 1
return lookup_dict
def encode(input_list, input_dict):
encoded_list = []
for string in input_list:
sentence_list = []
for word in string.split():
sentence_list.append(input_dict[word]) # value lookup from dictionary.. int
encoded_list.append(sentence_list)
return encoded_list
def pad_data(list_of_lists):
padded_data = tf.keras.preprocessing.sequence.pad_sequences(list_of_lists, padding='post')
return padded_data
def create_array_sentiment_integers(list):
sent_int_list = []
for sentiment in list:
sent_int_list.append(sentiment)
return np.asarray(sent_int_list, dtype=np.int32)
X_train_stemmed_list = stemming_column(X_train_no_punc_lowercase['text'])
X_test_stemmed_list = stemming_column(X_test_no_punc_lowercase['text'])
lookup_table = create_lookup_table(X_train_stemmed_list, X_test_stemmed_list)
X_train_encoded_list = encode(X_train_stemmed_list, lookup_table)
X_train_padded_data = pad_data(X_train_encoded_list)
Y_train = create_array_sentiment_integers(train_no_punc_lowercase['sentiment'])
max_features = 3 # 3 choices 0, 1, 2
Y_train_final = np.zeros((Y_train.shape[0], max_features), dtype=np.float32)
Y_train_final[np.arange(Y_train.shape[0]), Y_train] = 1.0
input_dimension = len(lookup_table) + 1
output_dimension = 64
input_length = 33
model = Sequential()
model.add(tf.keras.layers.Embedding(input_dim=input_dimension,
output_dim=output_dimension,
input_length=input_length,
mask_zero=True))
model.add(tf.keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.fit(X_train_padded_data, Y_train_final, validation_split=0.20, epochs=10)
model.save('Tweet_sentiment.model')
Additionally, here are the shapes of the datasets:
x train shape: (27481, 33, 1) x train type: <class 'numpy.ndarray'> y train shape: (27481, 3)
Error output
Epoch 1/3
363/859 [===========>..................] - ETA: 9s - loss: 0.5449 - accuracy: 0.5674
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-103-1d4af3962607> in <module>()
----> 1 model.fit(X_train_padded_data, Y_train_final, epochs=3,)
8 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
UnknownError: [_Derived_] CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1496): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)'
[[{{node cond_38/then/_0/CudnnRNNV3}}]]
[[sequential_5/lstm_4/StatefulPartitionedCall]] [Op:__inference_train_function_36098]
Function call stack:
train_function -> train_function -> train_function
I see some problems in your code. They are mentioned below:
1. You are using input_dimension = len(lookup_table) + 1. len(lookup_table) is the size of your vocabulary, and its value will be very high, at least more than 30,000. It is recommended to use only a subset of those words, so you can set input_dimension = 10000 or input_dimension = 15000 (you may experiment with this value); it should solve the problem. Having said that, it will not impact the accuracy of the model.
2. Why setting recurrent_dropout to a float value works: a non-zero recurrent_dropout makes Keras fall back to the generic (non-cuDNN) LSTM implementation, so the cuDNN kernel that raises CUDNN_STATUS_BAD_PARAM is never used, and hence it does not crash.
3. You should use return_sequences=True only if you have another LSTM layer after an LSTM layer. Since you have only one LSTM layer, return_sequences should be set to False.
4. Since you have 3 classes, you shouldn't use binary_crossentropy. You should use sparse_categorical_crossentropy if you are not one-hot encoding your target, or categorical_crossentropy if you are one-hot encoding your target.
5. Are you sure you want to use masking (mask_zero=True) in the Embedding layer?
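Putting points 1 to 5 together, here is a rough sketch of what the model definition could look like. This is only an illustration of the suggestions above (it keeps your one-hot encoded Y_train_final and your existing output_dimension and input_length values; the layer sizes are unchanged):

import tensorflow as tf

vocab_size = 10000        # capped vocabulary (point 1); every word index must then be < vocab_size
output_dimension = 64     # embedding size, as in your code
input_length = 33         # padded sequence length, as in your code

model = tf.keras.Sequential([
    # no mask_zero here (point 5); add it back only if you really need masking
    tf.keras.layers.Embedding(input_dim=vocab_size,
                              output_dim=output_dimension,
                              input_length=input_length),
    # a single LSTM layer, so return_sequences stays at its default of False (point 3)
    tf.keras.layers.LSTM(512, dropout=0.2),
    tf.keras.layers.Dense(256, activation='sigmoid'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax'),
])

# one-hot encoded targets (Y_train_final), hence categorical_crossentropy (point 4)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Note that if you cap input_dim at 10,000 you also have to re-encode or clip X_train_padded_data so that no word index is greater than or equal to 10,000, for example by building the lookup table from only the most frequent words, the way Tokenizer(num_words=vocab_size) does in the tutorial below.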
Also, I see that you are using many functions and many lines of code for data preprocessing, like removing hyperlinks, removing punctuation, tokenizing, etc.
So I thought I would provide an end-to-end tutorial for text classification, which should help you as well as the Stack Overflow community. The code is shown below:
#!pip install tensorflow==2.1
#!pip install nltk
#!pip install tika
#!pip install textblob
#!pip3 install --upgrade numpy
#!pip install scikit-learn
# To handle Paths
import os
# To remove Hyperlinks and Dates
import re
# To remove Punctuations
import string
# This helps to remove the unnecessary words from our Text Data
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
# To Parse the Input Data Files
from tika import parser
from textblob import TextBlob
# In order to use the Libraries of Tensorflow
import tensorflow as tf
# For Preprocessing the Text => To Tokenize the Text
from tensorflow.keras.preprocessing.text import Tokenizer
# If the Two Articles are of different length, pad_sequences will make the length equal
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Package for performing Numerical Operations
import numpy as np
# MatplotLib for Plotting Graphs
import matplotlib.pyplot as plt
# To shuffle the Data
from random import shuffle
# To Partition the Data into Train Data and Test Data
from sklearn.model_selection import train_test_split
# To add Regularizer in order to reduce Overfitting
from tensorflow.keras.regularizers import l2
# Give the Path of our Data
Path_Of_Data = 'Data'
# Extract the Labels from the Folders inside the Path mentioned above
Unique_Labels_List = ['negative', 'neutral', 'positive']
def GetNumericLabel(EachLabel):
if EachLabel=='negative':
return 0
elif EachLabel=='neutral':
return 1
elif EachLabel=='positive':
return 2
def Pre_Process_Data_And_Create_BOW(folder_path):
#creating empty lists in order to Create Resume Text and the respective Label
Resumes_List = []
Labels_List = []
for EachLabel in Unique_Labels_List:
for root, dirs, files in os.walk(os.path.join(folder_path, EachLabel),topdown=False):
for file in files:
i = 0
if file.endswith('.pdf'):
#Access individual file
Full_Resume_Path = os.path.join(root, file)
# Parse the Data inside the file
file_data = parser.from_file(Full_Resume_Path)
# Extract the Content of the File
Resume_Text = file_data['content']
# Below Code removes the Hyperlinks in the Resume, like LinkedIn Profile, Certifications, etc..
HyperLink_Regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
Text_Without_HL = re.sub(HyperLink_Regex, ' ', Resume_Text, flags=re.MULTILINE)
# Below Code removes the Date from the Resume
Date_regEx = r'(?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+'
CleanedText = re.sub(Date_regEx,' ',Text_Without_HL)
List_Of_All_Punctuations = list(string.punctuation)
Important_Punctuations = ['#', '.', '+' , '-'] #Add more, if any other Punctuation is observed as Important
NewLineChar = '\n'
# Below Set Comprises all the Punctuations, which can be Removed from the Text of Resume
Total_Punct = len(List_Of_All_Punctuations)
for EachImpPunct in Important_Punctuations:
for CountOfPunct in range(Total_Punct):
if CountOfPunct == Total_Punct:
break
elif EachImpPunct == List_Of_All_Punctuations[CountOfPunct]:
del List_Of_All_Punctuations[CountOfPunct]
Total_Punct = Total_Punct - 1
List_Of_All_Punctuations.append(NewLineChar)
for EachPunct in List_Of_All_Punctuations:
CleanedText = CleanedText.replace(EachPunct, " ")
# Below Code converts all the Words in the Resume to Lowercase ======> Check if it has to come after Tokenization if the Splitting Code is deleted instead of integrated
#Final_Cleaned_Resume_Text = Text_Without_Punct.lower()
Final_Cleaned_Resume_Text = CleanedText.lower()
#Code to remove Stopwords from each Resume
Resume_Text = Final_Cleaned_Resume_Text
for word in STOPWORDS:
#stop_token = ' ' + word + ' '
stop_token = word
Resume_Text = Resume_Text.replace(stop_token, ' ')
#Resume_Text = Resume_Text.replace(' ', ' ')
Resumes_List.append(Resume_Text)
Numeric_Label = GetNumericLabel(EachLabel)
Labels_List.append(Numeric_Label)
#print('Successfully executed for the Folder, ', EachLabel)
#Return Final Lists
return Resumes_List, Labels_List
#calling the function and passing the path
Resumes_List, Labels_List = Pre_Process_Data_And_Create_BOW(Path_Of_Data)
vocab_size = 10000 # This is very important for you
# We want the Output of the Embedding Layer to be 64
embedding_dim = 64
max_length = 800
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
# Taking 80% of the Data as Training Data and remaining 20% will be for Test Data
training_portion = .8
# Size of Train Data is 80% of the Entire Dataset
Train_Resume_Size = int(len(Resumes_List) * training_portion)
Labels_List = np.asarray(Labels_List)
Train_Resume_Data, Validation_Resume_Data, Train_Labels, Validation_Labels = \
train_test_split(Resumes_List, Labels_List, train_size = training_portion,
shuffle = True
, stratify= Labels_List)
from statistics import mean
print('Average Number of Words in Each Training Resume is {}'.format(mean([len(i.split()) for i in Train_Resume_Data])))
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(Train_Resume_Data)
word_index = tokenizer.word_index
# Convert the Word Tokens into Integer equivalents, before passing it to keras embedding layer
train_sequences = tokenizer.texts_to_sequences(Train_Resume_Data)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
validation_sequences = tokenizer.texts_to_sequences(Validation_Resume_Data)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(len(validation_sequences))
print(validation_padded.shape)
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Check your Data
def decode_article(text):
return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('-------------------------------------------------------------------------')
print(Train_Resume_Data[10])
Regularizer = l2(0.001)
model = tf.keras.Sequential([
# Add an Embedding layer expecting an input vocab of size 10000 (vocab_size), and output embedding dimension of size 64, which we set at the top
tf.keras.layers.Embedding(vocab_size, embedding_dim,
embeddings_regularizer = Regularizer),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
# use ReLU in place of tanh function since they are very good alternatives of each other.
tf.keras.layers.Dense(embedding_dim, activation='relu'),
# Add a Dense layer with 3 units and softmax activation.
# When we have multiple outputs, softmax convert outputs layers into a probability distribution.
tf.keras.layers.Dense(3, activation='softmax')
])
model.summary()
#Using Early Stopping in order to handle Overfitting
ES_Callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])
num_epochs = 100
history = model.fit(x = train_padded, y = Train_Labels, epochs=num_epochs,
callbacks=[ES_Callback],
validation_data=(validation_padded, Validation_Labels),
batch_size = 32, shuffle=True, verbose=1)
def plot_graphs(history, string):
plt.plot(history.history[string])
plt.plot(history.history['val_'+string])
plt.xlabel("Epochs")
plt.ylabel(string)
plt.legend([string, 'val_'+string])
plt.show()
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
version = 1
MODEL_DIR = 'Resume_Classification_Model'
export_path = os.path.join(MODEL_DIR, str(version))
tf.keras.models.save_model(model = model, filepath = export_path)
!ls -l {export_path}
!saved_model_cli show --dir {export_path} --all
For more information, please refer to this beautiful article.
Hope this solves your issue. Happy Learning!
Related
Please help, I am new to NMT. I hard-coded a model to translate English to my dialect (Tiv). The model has worked perfectly, but I am having trouble defining user input for actual translation with the model. Below is the source of the model evaluation. Your assistance will be really appreciated.
# load a clean dataset
def load_clean_sentences(filename):
return load(open(filename, 'rb'))
# load datasets
dataset = load_clean_sentences('english-tiv-both.pkl')
train = load_clean_sentences('english-tiv-train.pkl')
test = load_clean_sentences('english-tiv-test.pkl')
# fit a tokenizer
def create_tokenizer(lines):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
return tokenizer
# max sentence length
def max_length(lines):
return max(len(line.split()) for line in lines)
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare tiv tokenizer
tiv_tokenizer = create_tokenizer(dataset[:, 1])
tiv_vocab_size = len(tiv_tokenizer.word_index) + 1
tiv_length = max_length(dataset[:, 1])
## encode and pad sequences
def encode_sequences(tokenizer, length, lines):
# integer encode sequences
X = tokenizer.texts_to_sequences(lines)
# pad sequences with 0 values
X = pad_sequences(X, maxlen=length, padding='post')
return X
## one hot encode target sequence
def encode_output(sequences, vocab_size):
ylist = list()
for sequence in sequences:
encoded = to_categorical(sequence, num_classes=vocab_size)
ylist.append(encoded)
y = array(ylist)
return y
# prepare data
trainX = encode_sequences(tiv_tokenizer, tiv_length, train[:, 1])
testX = encode_sequences(tiv_tokenizer, tiv_length, test[:, 1])
# load model
model = load_model('model.h5')
# map an integer to a word
def word_for_id(integer, tokenizer):
for word, index in tokenizer.word_index.items():
if index == integer:
return word
return None
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
prediction = model.predict(source, verbose=0)[0]
integers = [argmax(vector) for vector in prediction]
target = list()
for i in integers:
word = word_for_id(i, tokenizer)
if word is None:
break
target.append(word)
return ' '.join(target)
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
actual, predicted = list(), list()
for i, source in enumerate(sources):
# translate encoded source text
source = source.reshape((1, source.shape[0]))
translation = predict_sequence(model, tiv_tokenizer, source)
raw_target, raw_src = raw_dataset[i]
if i < 10:
print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
actual.append(raw_target.split())
predicted.append(translation.split())
# calculate BLEU score
print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
I want an interface where users can input a sentence in English and get output in the Tiv language using the trained model. My presentation is coming up on Monday and I am stuck at the moment.
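One way to build that interface (a rough sketch that reuses only the functions already defined above: encode_sequences, predict_sequence, eng_tokenizer, eng_length and tiv_tokenizer; it assumes English was the input side during training and Tiv the output side, so swap the eng_/tiv_ objects if your training ran in the other direction) is to encode the user's sentence exactly the way the training data was encoded and then decode the prediction:

def translate_sentence(model, sentence):
    # encode and pad the raw English sentence the same way the training data was encoded
    encoded = encode_sequences(eng_tokenizer, eng_length, [sentence])
    # predict_sequence (defined above) turns the predicted ids back into Tiv words
    return predict_sequence(model, tiv_tokenizer, encoded)

while True:
    user_input = input('English sentence (blank to quit): ')
    if not user_input.strip():
        break
    print('Tiv:', translate_sentence(model, user_input))

Words that the tokenizer has never seen are simply dropped by texts_to_sequences, so very short or unusual inputs may come back empty.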
I tried to run a neural network to learn more about categorical embeddings (the explanation of the neural network code is here: https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/), but Spyder gives an AttributeError after trying to run the training loop at the end.
Traceback (most recent call last):
File "", line 1, in
File "C:\Workspace\Python_Runtime\Python\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "C:\Workspace\Python_Runtime\Python\lib\multiprocessing\spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TabularDataset' on <module '__main__' (built-in)>
My understanding is that this comes from Spyder having an issue with the multiprocessing functionality.
I have tried, as some answers suggested, to wrap everything that is not in a class or def in
if __name__ == '__main__':
but that did not seem to help; the error still comes up.
I also tried to import the multiprocess package instead of multiprocessing, but that did not help either. I guess I would need to go and change the line in the spawn.py file, but I am not sure how exactly.
The issue is that on my current PC I only have Spyder. I ran the same code on another dataset on my personal PC at home with PyCharm and it worked alright, with no errors at all.
Does anyone know how can I resolve the issue in Spyder?
The code for the neural network that I used is here:
from torch.utils.data import Dataset, DataLoader
class TabularDataset(Dataset):
def __init__(self, data, cat_cols=None, output_col=None):
"""
Characterizes a Dataset for PyTorch
Parameters
----------
data: pandas data frame
The data frame object for the input data. It must
contain all the continuous, categorical and the
output columns to be used.
cat_cols: List of strings
The names of the categorical columns in the data.
These columns will be passed through the embedding
layers in the model. These columns must be
label encoded beforehand.
output_col: string
The name of the output variable column in the data
provided.
"""
self.n = data.shape[0]
if output_col:
self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
else:
self.y = np.zeros((self.n, 1))
self.cat_cols = cat_cols if cat_cols else []
self.cont_cols = [col for col in data.columns
if col not in self.cat_cols + [output_col]]
if self.cont_cols:
self.cont_X = data[self.cont_cols].astype(np.float32).values
else:
self.cont_X = np.zeros((self.n, 1))
if self.cat_cols:
self.cat_X = data[cat_cols].astype(np.int64).values
else:
self.cat_X = np.zeros((self.n, 1))
def __len__(self):
"""
Denotes the total number of samples.
"""
return self.n
def __getitem__(self, idx):
"""
Generates one sample of data.
"""
return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]
import torch
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardNN(nn.Module):
def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
output_size, emb_dropout, lin_layer_dropouts):
"""
Parameters
----------
emb_dims: List of two element tuples
This list will contain a two element tuple for each
categorical feature. The first element of a tuple will
denote the number of unique values of the categorical
feature. The second element will denote the embedding
dimension to be used for that feature.
no_of_cont: Integer
The number of continuous features in the data.
lin_layer_sizes: List of integers.
The size of each linear layer. The length will be equal
to the total number
of linear layers in the network.
output_size: Integer
The size of the final output.
emb_dropout: Float
The dropout to be used after the embedding layers.
lin_layer_dropouts: List of floats
The dropouts to be used after each linear layer.
"""
super().__init__()
# Embedding layers
self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
for x, y in emb_dims])
no_of_embs = sum([y for x, y in emb_dims])
self.no_of_embs = no_of_embs
self.no_of_cont = no_of_cont
# Linear Layers
first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
lin_layer_sizes[0])
self.lin_layers = nn.ModuleList([first_lin_layer] + [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1]) for i in range(len(lin_layer_sizes) - 1)])
for lin_layer in self.lin_layers:
nn.init.kaiming_normal_(lin_layer.weight.data)
# Output Layer
self.output_layer = nn.Linear(lin_layer_sizes[-1],
output_size)
nn.init.kaiming_normal_(self.output_layer.weight.data)
# Batch Norm Layers
self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
for size in lin_layer_sizes])
# Dropout Layers
self.emb_dropout_layer = nn.Dropout(emb_dropout)
self.droput_layers = nn.ModuleList([nn.Dropout(size)
for size in lin_layer_dropouts])
def forward(self, cont_data, cat_data):
if self.no_of_embs != 0:
x = [emb_layer(cat_data[:, i])
for i,emb_layer in enumerate(self.emb_layers)]
x = torch.cat(x, 1)
x = self.emb_dropout_layer(x)
if self.no_of_cont != 0:
normalized_cont_data = self.first_bn_layer(cont_data)
if self.no_of_embs != 0:
x = torch.cat([x, normalized_cont_data], 1)
else:
x = normalized_cont_data
for lin_layer, dropout_layer, bn_layer in\
zip(self.lin_layers, self.droput_layers, self.bn_layers):
x = F.relu(lin_layer(x))
x = bn_layer(x)
x = dropout_layer(x)
x = self.output_layer(x)
return x
categorical_features = ["cat1", "cat2", "cat3"]
output_feature = ["output"]
data = data[output_feature + categorical_features + ["cont1", "cont2"]].copy().dropna()
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for cat_col in categorical_features:
label_encoders[cat_col] = LabelEncoder()
data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])
dataset = TabularDataset(data=data, cat_cols=categorical_features,output_col=output_feature)
batchsize = 256
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=1)
cat_dims = [int(data[col].nunique()) for col in categorical_features]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FeedForwardNN(emb_dims, no_of_cont=2, lin_layer_sizes=[50, 100],
output_size=1, emb_dropout=0.04,
lin_layer_dropouts=[0.001,0.01]).to(device)
import tqdm
no_of_epochs = 5
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in tqdm.tqdm(range(no_of_epochs)):
for y, cont_x, cat_x in dataloader:
cat_x = cat_x.to(device)
cont_x = cont_x.to(device)
y = y.to(device)
# Forward Pass
preds = model(cont_x, cat_x)
loss = criterion(preds, y)
# Backward Pass and Optimization
optimizer.zero_grad()
loss.backward()
optimizer.step()
You could try to run the code using the console namespace instead of an empty one (to try to preserve the TabularDataset definition). For that you need to check the option Run in Console's namespace instead of an empty one in the Preferences dialog: menu Tools > Preferences (or the 🔧 button to show the dialog), then Run > General settings > Run in Console's namespace instead of an empty one.
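If changing that preference is not enough, a common workaround on Windows is to combine the if __name__ == '__main__': guard already mentioned in the question with num_workers=0, so the DataLoader never has to spawn a worker process that re-imports TabularDataset. A minimal sketch, reusing the names from the code above:

if __name__ == '__main__':
    # num_workers=0 loads batches in the main process, so the worker spawning
    # (and the pickling that raises the AttributeError) never happens
    dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=0)

    for epoch in tqdm.tqdm(range(no_of_epochs)):
        for y, cont_x, cat_x in dataloader:
            cat_x = cat_x.to(device)
            cont_x = cont_x.to(device)
            y = y.to(device)
            preds = model(cont_x, cat_x)
            loss = criterion(preds, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()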
Train on 28624 samples
Epoch 1/10
32/28624 [..............................] - ETA: 15:20
InvalidArgumentError Traceback (most recent call last)
<ipython-input-25-4679097c6578> in <module>
----> 1 model.fit(X_train_indices, Y_train_OH, epochs = 10, batch_size = 32)
InvalidArgumentError: indices[15,2] = -2147483648 is not in [0, 1193514)
[[node model_1/embedding_1/embedding_lookup (defined at <ipython-input-25-4679097c6578>:1) ]] [Op:__inference_distributed_function_6120]
Errors may have originated from an input operation.
Input Source operations connected to node model_1/embedding_1/embedding_lookup:
model_1/embedding_1/embedding_lookup/4992 (defined at C:\Users\shash\Anaconda3\envs\sentiment_analysis\lib\contextlib.py:81)
Function call stack:
distributed_function
System information
Have I written custom code (as opposed to using a stock example script provided in TensorFlow): Yes
OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Windows 10
TensorFlow installed from (source or binary): Conda
TensorFlow version (use command below): 2.1.0
Python version: 3.6.10
CUDA/cuDNN version: NA
GPU model and memory: Disabled (Hardcoding TensorFlow without GPU)
Code
->Embedding layer
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
vocab_len = len(word_to_index) + 1 #1193514
emb_matrix = np.zeros((vocab_len,embedding_dim))
for word, idx in word_to_index.items():
emb_matrix[idx, :] = word_to_vec_map[word]
# Definning a pre-trained Embedding layer
embedding_layer = layers.Embedding(
vocab_len,
embedding_dim,
trainable = False
)
# Build the embedding layer, it is required before setting the weights of the embedding layer.
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
embedding_layer.set_weights([emb_matrix])
return embedding_layer
->Model
def sentiment_model(input_shape, word_to_vec_map, word_to_index):
sentence_indices =layers.Input(shape=input_shape, dtype='float32')
# Create the embedding layer pretrained with GloVe Vectors
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
# Propagate sentence_indices through your embedding layer
# (See additional hints in the instructions).
embeddings = embedding_layer(sentence_indices)
x = layers.LSTM(128)(embeddings)
x = layers.Dropout(0.5)(x)
predictions = layers.Dense(2, activation="sigmoid", name="predictions")(x)
# Create Model instance which converts sentence_indices into X.
model = keras.Model(inputs=sentence_indices,outputs=predictions)
return model
def sentences_to_indices(X, word_to_index, max_len):
X_indices = np.zeros((m,max_len))
# Assign indices to words
for i,sentence in enumerate(X):
sentence_words = sentence.lower().split()
for j,word in enumerate(sentence_words):
X_indices[i, j] = word_to_index[word]
return X_indices
def get_word_embedding_dictionary():
""" creates word_to_vector, word_to_index and index_to_word dictionaries """
with open(embeding_path, 'r', encoding='utf-8') as f:
words = set()
word_to_vec_map = {}
# Extracting word and its vectors
for line in f:
line_list = line.split()
# Ignoring unresolvable words
if len(line_list)!=embedding_dim+1:
continue
curr_word = line_list[0]
words.add(curr_word)
word_to_vec_map[curr_word] = np.array(line_list[1:], dtype=np.float64)
word_to_index = {}
index_to_word = {}
for i,w in enumerate(sorted(words)):
word_to_index[w] = i
index_to_word[i] = w
return word_to_index, index_to_word, word_to_vec_map
X_train_indices = sentences_to_indices(X_train, word_to_index, max_features)
Y_train_OH = to_categorical(Y_train)
model.fit(X_train_indices, Y_train_OH, epochs = 10, batch_size = 32)
The problem occurs when the words are replaced by their corresponding indices: if a word wasn't found in the vocabulary/word_to_index dictionary, it was being stored as nan.
The vocabulary is all the words present in the word embeddings (I have used GloVe twitter embeddings).
Modified function:
def sentences_to_indices(X, word_to_index, max_len):
X_indices = np.zeros((m,max_len))
# Assign indices to words
for i,sentence in enumerate(X):
sentence_words = sentence.lower().split()
for j,word in enumerate(sentence_words):
X_indices[i, j] = word_to_index.get(word,0) #improvement
return X_indices
Though I am not sure whether words not present in the word embeddings should be mapped to index zero.
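On that last point: one common convention (just an option, not something the code above requires) is to reserve index 0 for padding and unknown words and give it an all-zero embedding row, shifting every real word up by one. A hedged sketch using the same names as the code above:

import numpy as np

# Reserve index 0 for padding / out-of-vocabulary words, so that
# word_to_index.get(word, 0) maps unknown words to an all-zero vector
word_to_index = {w: i + 1 for i, w in enumerate(sorted(words))}  # real words start at 1

vocab_len = len(word_to_index) + 1                 # +1 for the reserved index 0
emb_matrix = np.zeros((vocab_len, embedding_dim))  # row 0 stays all zeros
for word, idx in word_to_index.items():
    emb_matrix[idx, :] = word_to_vec_map[word]

Whether this helps accuracy is an empirical question; the point is only that padding and unknown words no longer share an index with a real word.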
I've been trying to research how to use Keras to train a POS tagger; specifically I want it to use an LSTM architecture and to use word embeddings, namely, GloVe. I've taken inspiration from two blogs. One uses an LSTM w/o pretrained embeddings to perform POS; the other uses LSTM w/ word embeddings to classify text.
https://nlpforhackers.io/lstm-pos-tagger-keras/
https://nlpforhackers.io/keras-intro/
The below script "works" in the sense that no errors are triggered; however, it overpredicts "padding" cells and underpredicts the other tokens. (When following the POS blog verbatim, the accuracy is ~99%.) I don't understand why the addition of word embeddings has hurt performance so badly.
Data preprocessing:
import nltk
tagged_sentences = nltk.corpus.treebank.tagged_sents()
import numpy as np
sentences, sentence_tags =[], []
for tagged_sentence in tagged_sentences:
sentence, tags = zip(*tagged_sentence)
sentences.append(np.array(sentence))
sentence_tags.append(np.array(tags))
from sklearn.model_selection import train_test_split
(train_sentences, test_sentences,
train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)
def assemble_text(array):
return ' '.join([word for word in array])
train_sentences = [assemble_text(arr) for arr in train_sentences]
test_sentences = [assemble_text(arr) for arr in test_sentences]
tags = set([])
for ts in train_tags:
for t in ts:
tags.add(t)
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0 # The special value used to padding
train_tags_y = []
for s in train_tags:
train_tags_y.append([tag2index[t] for t in s])
test_tags_y= []
for s in test_tags:
test_tags_y.append([tag2index[t] for t in s])
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True,
lowercase=True, min_df=3, max_df=0.9, max_features=5000)
X_train_onehot = vectorizer.fit_transform(train_sentences)
word2idx = {word: idx for idx, word in enumerate(vectorizer.get_feature_names())}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()
def to_sequence(tokenizer, preprocessor, index, text):
words = tokenizer(preprocessor(text))
indexes = [index[word] for word in words if word in index]
return indexes
X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in train_sentences]
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in test_sentences]
# Compute the max lenght of a text
MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGHT=", MAX_SEQ_LENGHT)
from tensorflow.keras.preprocessing.sequence import pad_sequences
N_FEATURES = len(vectorizer.get_feature_names())
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, padding='post')
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGHT, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_SEQ_LENGHT, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_SEQ_LENGHT, padding='post')
def to_categorical(sequences, categories):
cat_sequences = []
for s in sequences:
cats = []
for item in s:
cats.append(np.zeros(categories))
cats[-1][item] = 1.0
cat_sequences.append(cats)
return np.array(cat_sequences)
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
cat_test_tags_y = to_categorical(test_tags_y, len(tag2index))
Importing word vectors
import numpy as np
GLOVE_PATH = '/Users/jdmoore7/Downloads/glove.6B/glove.6B.50d.txt'
GLOVE_VECTOR_LENGHT = 50
def read_glove_vectors(path, lenght):
embeddings = {}
with open(path) as glove_f:
for line in glove_f:
chunks = line.split()
assert len(chunks) == lenght + 1
embeddings[chunks[0]] = np.array(chunks[1:], dtype='float32')
return embeddings
GLOVE_INDEX = read_glove_vectors(GLOVE_PATH, GLOVE_VECTOR_LENGHT)
# Init the embeddings layer with GloVe embeddings
embeddings_index = np.zeros((len(vectorizer.get_feature_names()) + 1, GLOVE_VECTOR_LENGHT))
for word, idx in word2idx.items():
try:
embedding = GLOVE_INDEX[word]
embeddings_index[idx+1] = embedding
except:
pass
Model and accuracy metrics
from tensorflow.keras import backend as K
def ignore_class_accuracy(to_ignore=0):
def ignore_accuracy(y_true, y_pred):
y_true_class = K.argmax(y_true, axis=-1)
y_pred_class = K.argmax(y_pred, axis=-1)
ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
return accuracy
return ignore_accuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Dropout
from tensorflow.keras.optimizers import Adam
model = Sequential()
model.add(InputLayer(input_shape=(MAX_SEQ_LENGHT, )))
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
GLOVE_VECTOR_LENGHT, # Embedding size
weights=[embeddings_index],
input_length=MAX_SEQ_LENGHT,
trainable=False))
model.add(Bidirectional(LSTM(256, activation='relu', return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer=Adam(0.001),
metrics=['accuracy',ignore_class_accuracy(0)])
model.fit(X_train_sequences, cat_train_tags_y,
epochs=40, batch_size=128, verbose=1,
validation_data=(X_test_sequences, cat_test_tags_y))
def logits_to_tokens(sequences, index):
token_sequences = []
for categorical_sequence in sequences:
token_sequence = []
for categorical in categorical_sequence:
token_sequence.append(index[np.argmax(categorical)])
token_sequences.append(token_sequence)
return token_sequences
import string
def pipe(text):
words = ''.join([char.lower() for char in text if char not in string.punctuation]).split(' ')
arr = [to_sequence(tokenize, preprocess, word2idx, text) ]
arr = pad_sequences(arr, maxlen=MAX_SEQ_LENGHT, padding='post')
pred = model.predict(arr)
values = logits_to_tokens(pred,
{i: t for t, i in tag2index.items()})[0]
return [(w,t) for w,t in zip(words,values)]
pipe('the walk down the hill')
>>>
[('the', '-PAD-'),
('walk', '-PAD-'),
('down', '-PAD-'),
('the', '-PAD-'),
('hill', '-PAD-')]
The accuracy produced during model fitting came out to be 0.00%. So I can only conclude that I've used the word embeddings wrong in some way. Is my model architecture flawed? Is the way I handle the word embeddings themselves flawed? Or is it something else?
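Before blaming the architecture, one thing that may be worth sanity-checking (a hedged diagnostic sketch built only from the variables defined above, not a confirmed diagnosis): embeddings_index is filled at idx + 1, while to_sequence looks words up at idx and pad_sequences also uses 0, so it is worth confirming which embedding row the model actually sees for a given word and whether the padding value collides with a real word's id:

import numpy as np

# probe = the first vocabulary word that also has a GloVe vector
probe = next(w for w in word2idx if w in GLOVE_INDEX)
idx = word2idx[probe]
print(probe, 'has index', idx)
print('row idx matches its GloVe vector:    ',
      np.allclose(embeddings_index[idx], GLOVE_INDEX[probe]))
print('row idx + 1 matches its GloVe vector:',
      np.allclose(embeddings_index[idx + 1], GLOVE_INDEX[probe]))
print('padding value 0 is also the id of:', [w for w, i in word2idx.items() if i == 0])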
I'm trying to create a speaker recognition system which takes sound files from any movie, trains on those sound files using a neural network and MFCC (sound features), and then, given another sound file, tells me which speaker is talking in it.
So that's what I did:
Created an MFCC vector for each speaker and put it in an array named X (a speaker can appear more than once)
Created an output number for each speaker
Created this model with TensorFlow:
Dense Layer(512, 'relu')
Dropout (0.3)
Dense Layer(256, 'relu')
Dense Layer(128, 'relu')
Flatten
Dense Layer(length of outputs, 'relu')
Then I trained and finally checked my results, but as I said, unfortunately my results are not high enough, only ~45% accuracy :(
I attach my full code and my dataset. Notice that my dataset can contain some mistakes, for example taking the voice of Leonard and labeling it Sheldon, because it is based on the movie's SRT file and the SRT files sometimes have mistakes.
My Full Code :
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
import os
import shutil
from os.path import isfile, join
from random import shuffle
from matplotlib import pyplot
from tqdm import tqdm
import tensorflow as tf
win_len = 0.04 # in seconds
step = win_len / 2
nfft = 2048
for TestNum in tqdm(range(5)): # We check it several times
X = [] # inputs
Y = [] # outputs
onlyfiles = [f for f in listdir("FinalAudios/") if isfile(join("FinalAudios/", f))] # Files in dir
names = [] # names of the speakers
for file in onlyfiles: # for each wav sound
# UNNECESSARY TO UNDERSTAND THE CODE
if " " not in file.split("_")[0]:
names.append(file.split("_")[0])
else:
names.append(file.split("_")[0].split(" ")[0])
only_speakers = [] + names
namesWithoutDuplicate = list(dict.fromkeys(names))
namesWithoutDuplicateCopy = namesWithoutDuplicate[:]
for name in namesWithoutDuplicateCopy: # we remove low samples files
if names.count(name) < 60:
namesWithoutDuplicate.remove(name)
names = namesWithoutDuplicate
print(names) # print it
vector_names = [] # output for each name
i = 0
for name in names:
vector_for_each_name = i
vector_names.append(np.array(vector_for_each_name))
i += 1
for f in onlyfiles: # for all the files
if " " not in f.split("_")[0]:
f_speaker = f.split("_")[0]
else:
f_speaker = f.split("_")[0].split(" ")[0]
if f_speaker in namesWithoutDuplicate:
fs, audio = wav.read("FinalAudios/" + f) # read the file
try:
# compute MFCC
mfcc_feat = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len,
winstep=step, nfft=nfft, appendEnergy=False)
flat_list = [item for sublist in mfcc_feat for item in sublist]
# Create output + inputs
X.append(np.array(flat_list))
Y.append(np.array(vector_names[names.index(f_speaker)]))
except IndexError:
pass
else:
if not os.path.exists("TooLowSamples"): # if path not exist we create it
os.makedirs("TooLowSamples")
shutil.move("FinalAudios\\" + f, "TooLowSamples\\" + f)
# ------------------- RANDOMIZATION, UNNECESSARY TO UNDERSTAND THE CODE ------------------- #
Z = list(zip(X, Y))
shuffle(Z) # WE SHUFFLE X,Y TO PERFORM RANDOM ON THE TEST LEVEL
X, Y = zip(*Z)
X = list(X)
Y = list(Y)
lenX = len(X)
# ------------------- RANDOMIZATION, UNNECESSARY TO UNDERSTAND THE CODE ------------------- #
y_test = np.asarray(Y[:100]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
x_test = np.asarray(X[:100]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
x_train = np.asarray(X[100:]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
y_train = np.asarray(Y[100:]) # CHOOSE 100 FOR TEST, OTHERS FOR TRAIN
x_val = x_train[-100:] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
y_val = y_train[-100:] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
x_train = x_train[:-100] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
y_train = y_train[:-100] # FROM THE TRAIN CHOOSE 100 FOR VALIDATION
x_train = x_train.reshape(np.append(x_train.shape, 1)) # RESHAPE FOR INPUT
x_test = x_test.reshape(np.append(x_test.shape, 1)) # RESHAPE FOR INPUT
x_val = x_val.reshape(np.append(x_val.shape, 1)) # RESHAPE FOR INPUT
# -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(512, activation='relu'),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(len(names), activation='softmax'),
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
# -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
print("fitting")
history = model.fit(x_train, y_train, epochs=4, validation_data=(x_val, y_val))
print("testing")
results = model.evaluate(x_test, y_test)
print(results)
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()
My data set - https://filebin.net/ajho6kgzx66xayyn
Note: I also tried convolutional layers, but they performed even worse.