Please house, I am new to NMT. I hard-coded a model to translate English into my dialect. The model works perfectly, but I am having trouble defining user input for actual translation with the model. Below is the source of the model evaluation. Your assistance will be really appreciated.
# imports needed by the evaluation script below
from pickle import load
from numpy import array, argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
# load a clean dataset
def load_clean_sentences(filename):
return load(open(filename, 'rb'))
# load datasets
dataset = load_clean_sentences('english-tiv-both.pkl')
train = load_clean_sentences('english-tiv-train.pkl')
test = load_clean_sentences('english-tiv-test.pkl')
# fit a tokenizer
def create_tokenizer(lines):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
return tokenizer
# max sentence length
def max_length(lines):
return max(len(line.split()) for line in lines)
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare tiv tokenizer
tiv_tokenizer = create_tokenizer(dataset[:, 1])
tiv_vocab_size = len(tiv_tokenizer.word_index) + 1
tiv_length = max_length(dataset[:, 1])
## encode and pad sequences
def encode_sequences(tokenizer, length, lines):
# integer encode sequences
X = tokenizer.texts_to_sequences(lines)
# pad sequences with 0 values
X = pad_sequences(X, maxlen=length, padding='post')
return X
## one hot encode target sequence
def encode_output(sequences, vocab_size):
ylist = list()
for sequence in sequences:
encoded = to_categorical(sequence, num_classes=vocab_size)
ylist.append(encoded)
y = array(ylist)
y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
return y
# prepare data
trainX = encode_sequences(tiv_tokenizer, tiv_length, train[:, 1])
testX = encode_sequences(tiv_tokenizer, tiv_length, test[:, 1])
# load model
model = load_model('model.h5')
# map an integer to a word
def word_for_id(integer, tokenizer):
for word, index in tokenizer.word_index.items():
if index == integer:
return word
return None
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
prediction = model.predict(source, verbose=0)[0]
integers = [argmax(vector) for vector in prediction]
target = list()
for i in integers:
word = word_for_id(i, tokenizer)
if word is None:
break
target.append(word)
return ' '.join(target)
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
actual, predicted = list(), list()
for i, source in enumerate(sources):
# translate encoded source text
source = source.reshape((1, source.shape[0]))
translation = predict_sequence(model, tokenizer, source)
raw_target, raw_src = raw_dataset[i]
if i < 10:
print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
actual.append(raw_target.split())
predicted.append(translation.split())
# calculate BLEU score
print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
I want an interface where users can input a sentence in English and get the output in the Tiv language using the trained model. My presentation is coming up on Monday and I am stuck at the moment.
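A minimal sketch of such an interface, using only the functions already defined above. It assumes the model was trained with English as the source (encoded with eng_tokenizer and padded to eng_length) and Tiv as the target (decoded with tiv_tokenizer); if the training ran in the other direction, swap the tokenizers and lengths. You may also want to apply the same cleaning you used when building the .pkl files before encoding.
# sketch of an interactive translation loop (assumptions noted above)
def translate_sentence(model, sentence):
    # encode and pad the raw English sentence the same way as the training data
    encoded = encode_sequences(eng_tokenizer, eng_length, [sentence])
    # decode the model output back into Tiv words
    return predict_sequence(model, tiv_tokenizer, encoded)

while True:
    line = input('Enter an English sentence (or "quit"): ').strip()
    if line.lower() == 'quit':
        break
    print('Tiv:', translate_sentence(model, line))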
I am trying to follow this tutorial. When I try to predict, I get a runtime error on the line below:
# compute N predictions
pred = model_RNN.predict(n=FC_N, future_covariates=covariates)
RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 421 but got size 71 for tensor number 1 in the list.
Which list is it referring to? My training data size is 421 and my validation size is 105, so I am not sure what the 71 refers to.
Below is the full code
## load data
from darts import TimeSeries
df = df[['close']]
ts = TimeSeries.from_dataframe(df, freq='b')
#ts = pd.Series(df['close'], index=df.index)
series = df
plt.figure(100, figsize=(12,5))
series.plot()
# analyze its seasonality
is_seasonal, periodicity = check_seasonality(ts, max_lag=240)
dict_seas ={
"is seasonal?":is_seasonal,
"periodicity (months)":f'{periodicity:.1f}',
"periodicity (~years)": f'{periodicity/12:.1f}'}
_ = [print(k,":",v) for k,v in dict_seas.items()]
# split training vs test dataset
train, val = ts.split_after(0.8)
# normalize the time series
trf = Scaler()
# fit the transformer to the training dataset
train_trf = trf.fit_transform(train)
# apply the transformer to the validation set and the complete series
val_trf = trf.transform(val)
ts_trf = trf.transform(ts)
# create month and year covariate series
year_series = datetime_attribute_timeseries(
pd.date_range(start=ts.start_time(),
freq=ts.freq_str,
periods=len(ts)),
attribute='year',
one_hot=False)
year_series = Scaler().fit_transform(year_series)
month_series = datetime_attribute_timeseries(
year_series,
attribute='month',
one_hot=True)
covariates = year_series.stack(month_series)
cov_train, cov_val = covariates.split_after(0.8)
# helper function: fit the RNN model
def fit_it(model, train, val, flavor):
t_start = time.perf_counter()
print("\nbeginning the training of the {0} RNN:".format(flavor))
res = model.fit(train,
future_covariates=covariates,
val_series=val,
val_future_covariates=covariates,
verbose=True)
res_time = time.perf_counter() - t_start
print("training of the {0} RNN has completed:".format(flavor), f'{res_time:.2f} sec')
return res
# set up, fit, run, plot, and evaluate the RNN model
def run_RNN(flavor, ts, train, val):
# set the model up
model_RNN = RNNModel(
model=flavor,
model_name=flavor + str(" RNN"),
input_chunk_length=periodicity,
training_length=20,
hidden_dim=20,
batch_size=16,
n_epochs=EPOCH,
dropout=0,
optimizer_kwargs={'lr': 1e-3},
log_tensorboard=True,
random_state=42,
force_reset=True)
if flavor == "RNN": flavor = "Vanilla"
# fit the model
fit_it(model_RNN, train, val, flavor)
# compute N predictions
pred = model_RNN.predict(n=FC_N, future_covariates=covariates)
# plot predictions vs actual
plot_fitted(pred, ts, flavor)
# print accuracy metrics
res_acc = accuracy_metrics(pred, ts)
print(flavor + " : ")
_ = [print(k,":",f'{v:.4f}') for k,v in res_acc.items()]
return [pred, res_acc]
# run 3 different flavors of RNN on the time series:
flavors = ["LSTM", "GRU", "RNN"]
# call the RNN model setup for each of the 3 RNN flavors
res_flavors = [run_RNN(flv, ts_trf, train_trf, val_trf) for flv in flavors]
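A small diagnostic sketch (using only the objects defined above) can help narrow down where the size-71 tensor comes from: compare the lengths and time spans darts has to line up at predict time, and check that the future covariates extend far enough past the end of the target series for the forecast horizon FC_N.
# diagnostic only: print the lengths and time spans darts must align at predict time
print('ts        :', len(ts), ts.start_time(), ts.end_time())
print('train/val :', len(train_trf), len(val_trf))
print('covariates:', len(covariates), covariates.start_time(), covariates.end_time())
print('input_chunk_length:', periodicity, ' forecast horizon FC_N:', FC_N)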
I was training my dataset, but when I called the training method I got an error saying my tensor has no dimensions. Can anyone tell me how I can solve this issue and determine whether my tensor has a dimension? The code for the method where the exception occurred is here, and the method that calls it is here.
METHODS
METHOD
def pad_sequence(sequences, batch_first=False, padding_value =0):
r"""Pad a list of variable length Tensors with ``padding_value``
``pad_sequence`` stacks a list of Tensors along a new dimension,
and pads them to equal length. For example, if the input is list of
sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
otherwise.
`B` is batch size. It is equal to the number of elements in ``sequences``.
`T` is length of the longest sequence.
`L` is length of the sequence.
`*` is any number of trailing dimensions, including none.
Example:
>>> from torch.nn.utils.rnn import pad_sequence
>>> a = torch.ones(25, 300)
>>> b = torch.ones(22, 300)
>>> c = torch.ones(15, 300)
>>> pad_sequence([a, b, c]).size()
torch.Size([25, 3, 300])
Note:
This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
where `T` is the length of the longest sequence. This function assumes
trailing dimensions and type of all the Tensors in sequences are same.
Arguments:
sequences (list[Tensor]): list of variable length sequences.
batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
``T x B x *`` otherwise
padding_value (float, optional): value for padded elements. Default: 0.
Returns:
Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
Tensor of size ``B x T x *`` otherwise
"""
print("test "+ str(padding_value))
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
max_size = sequences[0].size()
trailing_dims = max_size[1:]
max_len = max([s.size(0) for s in sequences])
if batch_first:
out_dims = (len(sequences), max_len) + trailing_dims
else:
out_dims = (max_len, len(sequences)) + trailing_dims
out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
for i, tensor in enumerate(sequences):
length = tensor.size(0)
# use index notation to prevent duplicate references to the tensor
if batch_first:
out_tensor[i, :length, ...] = tensor
else:
out_tensor[:length, i, ...] = tensor
return out_tensor
METHOD
def pad_batch(batch):
#for i in batch:
# print("1" + str(i[0]))
#print("2" + str(i[1]))
padded_inputs = pad_sequence(batch[0],batch_first=True,padding_value=EOS_token)
padded_targets = pad_sequence(batch[1],batch_first=False,padding_value=EOS_token)
return (padded_inputs, padded_targets)
METHOD
"""seperates data into batches of size batch_size"""
def batchify(data, input_lang, batch_size, shuffle_data=True):
if shuffle_data == True:
shuffle(data)
number_of_batches = len(data) // batch_size
batches = list(range(number_of_batches))
longest_elements = list(range(number_of_batches))
for batch_number in range(number_of_batches):
longest_input = 0
longest_target = 0
input_variables = list(range(batch_size))
#target_variables = list(range(batch_size))
index = 0
for pair in range((batch_number*batch_size),((batch_number+1)*batch_size)):
input_variables[index] = tensorsFromPair(input_lang, data[pair])
#print(range((batch_number*batch_size),((batch_number+1)*batch_size)))
if len(input_variables[index]) >= longest_input:
longest_input = len(input_variables[index])
#if len(target_variables[index]) >= longest_target:
#longest_target = len(target_variables[index])
index += 1
batches[batch_number] = (input_variables)
longest_elements[batch_number] = (longest_input)
return batches , longest_elements, number_of_batches
METHOD
def train_and_test(epochs, test_eval_every, plot_every, learning_rate,
lr_schedule, train_pairs, test_pairs, input_lang
, batch_size, test_batch_size, encoder, decoder,
loss_criterion, trim, save_weights):
times = []
losses = {'train set':[], 'test set': []}
test_batches, longest_seq, n_o_b = batchify(test_pairs, input_lang,
test_batch_size,
shuffle_data=False)
start = time.time()
for i in range(1,epochs+1):
#adjust the learning rate according to the learning rate schedule
#specified in lr_schedule
if i in lr_schedule.keys():
learning_rate /= lr_schedule.get(i)
encoder.train()
decoder.train()
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
batches, longest_seq, n_o_b = batchify(train_pairs, input_lang,
batch_size,
shuffle_data=True)
train_loss = train(batches, encoder, decoder, encoder_optimizer,
decoder_optimizer, loss_criterion)
now = time.time()
print("Iter: %s \nLearning Rate: %s \nTime: %s \nTrain Loss: %s \n"
# % (i, learning_rate, asHours(now-start), train_loss))
if create_txt:
with open(print_to, 'a') as f:
f.write("Iter: %s \nLeaning Rate: %s \nTime: %s \nTrain Loss: %s \n" \
# % (i, learning_rate, asHours(now-start), train_loss))
if i % test_eval_every == 0:
if test_pairs:
test_loss = test(test_batches, encoder, decoder, criterion)
print("Test set loss: %s" % (test_loss))
if create_txt:
with open(print_to, 'a') as f:
f.write("Test Loss: %s \n" % (test_loss))
evaluate_randomly(encoder, decoder, test_pairs, trim)
else:
evaluate_randomly(encoder, decoder, train_pairs, trim)
if i % plot_every == 0:
times.append((time.time()-start)/60)
losses['train set'].append(train_loss)
if test_pairs:
losses['test set'].append(test_loss)
showPlot(times, losses, output_file_name)
if save_weights:
torch.save(encoder.state_dict(), output_file_name+'_enc_weights.pt')
torch.save(decoder.state_dict(), output_file_name+'_dec_weights.pt')
CALLING THE TRAINING METHOD
input_lang_name = 'test1.txt'
#output_lang_name = 'en'
"""name of your dataset"""
dataset = 'orig'
"""file path of dataset in the form of a tuple. If translated sentences are
stored in two files, this tuple will have two elements"""
raw_data_file_path = ('test1.txt',)
"""True if you want to reverse the order of the sentence pairs. For example,
in our dataset the sentence pairs list the English sentence first followed by
the French translation. But we want to translate from French to English,
so we set reverse as True."""
reverse=True
"""Remove sentences from dataset that are longer than trim (in either language)"""
trim = 10
"""max number of words in the vocabulary for both languages"""
max_vocab_size= 20000
"""if true removes sentences from the dataset that don't start with eng_prefixes.
Typically will want to use False, but implemented to compare results with Pytorch
tutorial. Can also change the eng_prefixes to prefixes of other languages or
other English prefixes. Just be sure that the prefixes apply to the OUTPUT
language (i.e. the language that the model is translating to NOT from)"""
start_filter = True
"""denotes what percentage of the data to use as training data. the remaining
percentage becomes test data. Typically want to use 0.8-0.9. 1.0 used here to
compare with PyTorch results where no test set was utilized"""
perc_train_set = 1.0
"""OUTPUT OPTIONS"""
"""denotes how often to evaluate a loss on the test set and print
sample predictions on the test set.
if no test set, simply prints sample predictions on the train set."""
test_eval_every = 1
"""denotes how often to plot the loss values of train and test (if applicable)"""
plot_every = 1
"""if true creates a txt file of the output"""
create_txt = True
"""if true saves the encoder and decoder weights to seperate .pt files for later use"""
save_weights= False
"""HYPERPARAMETERS: FEEL FREE TO PLAY WITH THESE TO TRY TO ACHIEVE BETTER RESULTS"""
"""signifies whether the Encoder and Decoder should be bidirectional LSTMs or not"""
bidirectional = True
if bidirectional:
directions = 2
else:
directions = 1
"""number of layers in both the Encoder and Decoder"""
layers = 2
"""Hidden size of the Encoder and Decoder"""
hidden_size = 440
"""Dropout value for Encoder and Decoder"""
dropout = 0.2
"""Training set batch size"""
batch_size = 32
"""Test set batch size"""
test_batch_size = 32
"""number of epochs (full passes through the training data)"""
epochs = 100
"""Initial learning rate"""
learning_rate= 1
"""Learning rate schedule. Signifies by what factor to divide the learning rate
at a certain epoch. For example {5:10} would divide the learning rate by 10
before the 5th epoch and {5:10, 10:100} would divide the learning rate by 10
before the 5th epoch and then again by 100 before the 10th epoch"""
lr_schedule = {}
"""loss criterion, see https://pytorch.org/docs/stable/nn.html for other options"""
criterion = nn.NLLLoss()
"""******************************************************************
********************NO NEED TO ALTER ANYTHING BELOW******************
******************************************************************"""
use_cuda = torch.cuda.is_available()
"""for plotting of the loss"""
plt.switch_backend('agg')
output_file_name = "testdata.%s_trim.%s_vocab.%s_directions.%s_layers.%s_hidden.%s_dropout.%s_learningrate.%s_batch.%s_epochs.%s" % (dataset,trim,max_vocab_size,directions,layers,hidden_size,dropout,learning_rate,batch_size,epochs)
if create_txt:
print_to = output_file_name+'.txt'
with open(print_to, 'w+') as f:
f.write("Starting Training \n")
else:
print_to = None
input_lang, train_pairs, test_pairs = prepareData(
input_lang_name, raw_data_file_path,
max_vocab_size=max_vocab_size, reverse=reverse, trim=trim,
start_filter=start_filter, perc_train_set=perc_train_set, print_to=print_to)
print('Train Pairs #')
print(len(train_pairs))
"""for gradient clipping from
https://github.com/pytorch/examples/blob/master/word_language_model/main.py"""
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM Language Model')
parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
args = parser.parse_args()
mem()
if create_txt:
with open(print_to, 'a') as f:
f.write("\nRandom Train Pair: %s \n\nRandom Test Pair: %s \n\n"
# % (random.choice(train_pairs),random.choice(test_pairs)
if test_pairs else "None"))
f.write(mem())
"""create the Encoder"""
encoder = EncoderRNN(input_lang.vocab_size, hidden_size, layers=layers,
dropout=dropout, bidirectional=bidirectional)
"""create the Decoder"""
decoder = DecoderAttn(hidden_size, input_lang.vocab_size, layers=layers,
dropout=dropout, bidirectional=bidirectional)
print('Encoder and Decoder Created')
mem()
if use_cuda:
print('Cuda being used')
encoder = encoder.cuda()
decoder = decoder.cuda()
print('Number of epochs: '+str(epochs))
if create_txt:
with open(print_to, 'a') as f:
f.write('Encoder and Decoder Created\n')
f.write(mem())
f.write("Number of epochs %s \n" % (epochs))
train_and_test(epochs, test_eval_every, plot_every, learning_rate, lr_schedule,
train_pairs, test_pairs, input_lang, batch_size,
test_batch_size, encoder, decoder, criterion, trim, save_weights)
Error Message
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-359-5d8455e6bed2> in <module>()
70 train_and_test(epochs, test_eval_every, plot_every, learning_rate, lr_schedule,
71 train_pairs, test_pairs, input_lang, batch_size,
---> 72 test_batch_size, encoder, decoder, criterion, trim, save_weights)
4 frames
<ipython-input-354-56663cee9c3b> in train_and_test(epochs, test_eval_every, plot_every, learning_rate, lr_schedule, train_pairs, test_pairs, input_lang, batch_size, test_batch_size, encoder, decoder, loss_criterion, trim, save_weights)
36 shuffle_data=True)
37 train_loss = train(batches, encoder, decoder, encoder_optimizer,
---> 38 decoder_optimizer, loss_criterion)
39
40 now = time.time()
<ipython-input-348-941378bc39df> in train(train_batches, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_criterion)
8 for batch in train_batches:
9 i += 1
---> 10 (input_batch, target_batch) = pad_batch(batch)
11 batch_loss = train_batch(input_batch, target_batch, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_criterion)
12 round_loss += batch_loss
<ipython-input-343-bfbf80f81df7> in pad_batch(batch)
33 # print("1" + str(i[0]))
34 #print("2" + str(i[1]))
---> 35 padded_inputs = pad_sequence(batch[0],batch_first=True,padding_value=EOS_token)
36 padded_targets = pad_sequence(batch[1],batch_first=False,padding_value=EOS_token)
37 return (padded_inputs, padded_targets)
<ipython-input-337-c13d37c75733> in pad_sequence(sequences, batch_first, padding_value)
40 max_size = sequences[0].size()
41 trailing_dims = max_size[1:]
---> 42 max_len = max([s.size(0) for s in sequences])
43 if batch_first:
44 out_dims = (len(sequences), max_len) + trailing_dims
<ipython-input-337-c13d37c75733> in <listcomp>(.0)
40 max_size = sequences[0].size()
41 trailing_dims = max_size[1:]
---> 42 max_len = max([s.size(0) for s in sequences])
43 if batch_first:
44 out_dims = (len(sequences), max_len) + trailing_dims
IndexError: dimension specified as 0 but tensor has no dimensions
As explained here, the problem happens when the last batch contains only one sample.
An easy workaround is to either remove one sample from your set, or change your batch size.
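A minimal sketch of that workaround, using the train_pairs and batch_size names from the code above; a defensive alternative is to make pad_sequence tolerate 0-dimensional (scalar) tensors, which is exactly what the IndexError complains about.
# drop one sample so no batch is left with a single element
if len(train_pairs) % batch_size == 1:
    train_pairs = train_pairs[:-1]

# alternative: inside pad_sequence, before computing max_len,
# promote any scalar tensor to a 1-element tensor
sequences = [s.unsqueeze(0) if s.dim() == 0 else s for s in sequences]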
I'm trying to train a simple tensorflow model to detect the sentiment of tweets. The datatypes and sizes of arrays are consistent and the model trains just fine when the recurrent_dropout is set to some float value. However this disables cuDNN and I'd really like to speed this up (don't we all) but whenever I remove the recurrent dropout argument the model training will crash before the end of the first epoch.
Below is the relevant code; I've left out the imports and the loading of the CSV files. After the relevant code are the final input dimensions and the error output. Additionally, I have figured out why Colab seemed to be cutting the training data: Colab displays the number of sequences after the data has been split into batches, so with the default batch size of 32 we were getting 859 sequences. The crash when not using recurrent dropout is still an issue. Side note: this code is a very rough draft, with the data cleaning all done within the same notebook, hence the lack of typical formatting.
def remove_case(X):
removed_case = []
X = X.copy()
for text in X:
text = str(text).lower()
removed_case.append(text)
X = removed_case
return X
def remove_hyperlinks(X):
removed_hyperlinks = []
X = X.copy()
for text in X:
text = str(text)
text = re.sub(r'http\S+', '', text)
text = re.sub(r'https\S+', '', text)
text = re.sub(r'www\S+', '', text)
removed_hyperlinks.append(text)
X = removed_hyperlinks
return X
def remove_punctuation(X):
removed_punc = []
X = X.copy()
for text in X:
text = str(text)
text = "".join([char for char in text if char not in punctuation])
removed_punc.append(text)
X = removed_punc
return X
def split_text(X):
split_tweets = []
X = X.copy()
for text in X:
text = str(text).split()
split_tweets.append(text)
X = split_tweets
return X
def map_sentiment(X, l, m, n):
keys = ['negative', 'neutral', 'positive']
values = [l, m, n]
dictionary = dict(zip(keys, values))
X = X.copy()
X = X.map(dictionary)
return X
# # def sentiment_to_onehot(X):
# sentiment_foofs = []
# X = X.copy()
# for integer in X:
# if integer == "negative": # Negative
# integer = [1, 0, 0]
# elif integer == "neutral": # Neutral
# integer = [0, 1, 0]
# elif integer == "positive": # Positive
# integer = [0, 0, 1]
# else:
# break
# sentiment_foofs.append(integer)
# X = sentiment_foofs
# return X
train_no_punc_lowercase = train.copy()
train_no_punc_lowercase['text'] = remove_case(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_hyperlinks(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_punctuation(train_no_punc_lowercase['text'])
train_no_punc_lowercase['sentiment'] = map_sentiment(train_no_punc_lowercase['sentiment'], 0, 1, 2)
train_no_punc_lowercase.head()
test_no_punc_lowercase = test.copy()
test_no_punc_lowercase['text'] = remove_case(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_hyperlinks(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_punctuation(test_no_punc_lowercase['text'])
test_no_punc_lowercase['sentiment'] = map_sentiment(test_no_punc_lowercase['sentiment'], 0, 1, 2)
features = train.columns.tolist()
features.remove('textID') # all unique, high cardinality feature
features.remove('selected_text') # target
target = 'selected_text'
X_train_no_punc_lowercase = train_no_punc_lowercase[features]
y_train_no_punc_lowercase = train_no_punc_lowercase[target]
X_test_no_punc_lowercase = test_no_punc_lowercase[features]
def stemming_column(df_column):
ps = PorterStemmer()
stemmed_word_list = []
for i, string in enumerate(df_column):
tokens = word_tokenize(string)
new_string = ""
for j, words in enumerate(tokens):
new_string = new_string + ps.stem(words) + " "
stemmed_word_list.append(new_string)
return stemmed_word_list
def create_lookup_table(list1, list2):
main_list = []
lookup_dict = {}
i = 1 # used to create a value in the dictionary
main_list.append(list1)
main_list.append(list2)
for list in main_list:
for string in list:
for word in string.split():
if word not in lookup_dict:
lookup_dict[word] = i
i += 1
return lookup_dict
def encode(input_list, input_dict):
encoded_list = []
for string in input_list:
sentence_list = []
for word in string.split():
sentence_list.append(input_dict[word]) # value lookup from dictionary.. int
encoded_list.append(sentence_list)
return encoded_list
def pad_data(list_of_lists):
padded_data = tf.keras.preprocessing.sequence.pad_sequences(list_of_lists, padding='post')
return padded_data
def create_array_sentiment_integers(list):
sent_int_list = []
for sentiment in list:
sent_int_list.append(sentiment)
return np.asarray(sent_int_list, dtype=np.int32)
X_train_stemmed_list = stemming_column(X_train_no_punc_lowercase['text'])
X_test_stemmed_list = stemming_column(X_test_no_punc_lowercase['text'])
lookup_table = create_lookup_table(X_train_stemmed_list, X_test_stemmed_list)
X_train_encoded_list = encode(X_train_stemmed_list, lookup_table)
X_train_padded_data = pad_data(X_train_encoded_list)
Y_train = create_array_sentiment_integers(train_no_punc_lowercase['sentiment'])
max_features = 3 # 3 choices 0, 1, 2
Y_train_final = np.zeros((Y_train.shape[0], max_features), dtype=np.float32)
Y_train_final[np.arange(Y_train.shape[0]), Y_train] = 1.0
input_dimension = len(lookup_table) + 1
output_dimension = 64
input_length = 33
model = Sequential()
model.add(tf.keras.layers.Embedding(input_dim=input_dimension,
output_dim=output_dimension,
input_length=input_length,
mask_zero=True))
model.add(tf.keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.fit(X_train_padded_data, Y_train_final, validation_split=0.20, epochs=10)
model.save('Tweet_sentiment.model')
Additionally, here are the shapes of the datasets..
x train shape: (27481, 33, 1)
x train type: <class 'numpy.ndarray'>
y train shape: (27481, 3)
Error code
Epoch 1/3
363/859 [===========>..................] - ETA: 9s - loss: 0.5449 - accuracy: 0.5674
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-103-1d4af3962607> in <module>()
----> 1 model.fit(X_train_padded_data, Y_train_final, epochs=3,)
8 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
UnknownError: [_Derived_] CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1496): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)'
[[{{node cond_38/then/_0/CudnnRNNV3}}]]
[[sequential_5/lstm_4/StatefulPartitionedCall]] [Op:__inference_train_function_36098]
Function call stack:
train_function -> train_function -> train_function
I see some problems in your code. They are mentioned below:
You are using input_dimension = len(lookup_table) + 1. len(lookup_table) is the size of your vocabulary (the number of unique words), so its value will be very high, likely well over 30,000. It is usually better to keep only a subset of those words, e.g. input_dimension = 10000 or input_dimension = 15000 (you may experiment with this value). Note that if you cap it, the indices produced by your encoding must be capped as well (for example by keeping only the most frequent words), otherwise the Embedding layer will receive out-of-range indices. Limiting the vocabulary should not noticeably hurt the accuracy of the model.
Why does setting recurrent_dropout to a float value work? When recurrent_dropout is non-zero, Keras falls back to the generic (non-cuDNN) LSTM implementation, which is slower but avoids the cuDNN-specific code path that is crashing here.
You should use return_sequences=True only when another recurrent layer follows an LSTM layer. Since you have only one LSTM layer, return_sequences should be set to False.
Since you have 3 classes, you shouldn't use binary_crossentropy. Use sparse_categorical_crossentropy if you are not one-hot encoding your target, or categorical_crossentropy if you are (as you do with Y_train_final).
Are you sure you want to use masking in the Embedding layer? It is worth trying the model without mask_zero=True as well.
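Putting those points together, here is a minimal sketch of your model definition with the changes applied. It keeps your existing preprocessing, your Dense(256, activation='sigmoid') layer, the one-hot Y_train_final, and input_dimension = len(lookup_table) + 1 (so no index falls outside the embedding); dropping mask_zero is something to try, per the last point.
model = Sequential()
model.add(tf.keras.layers.Embedding(input_dim=input_dimension,
                                    output_dim=output_dimension,
                                    input_length=input_length))   # try without mask_zero first
model.add(tf.keras.layers.LSTM(512, dropout=0.2,
                               return_sequences=False))           # single LSTM layer
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',                    # one-hot encoded targets
              optimizer='adam',
              metrics=['accuracy'])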
Also, I see that you are using many functions and many lines of code for data preprocessing (removing hyperlinks, removing punctuation, tokenizing, etc.).
So I thought I would provide an end-to-end tutorial for text classification, which should help you as well as the Stack Overflow community. The code is shown below:
#!pip install tensorflow==2.1
#!pip install nltk
#!pip install tika
#!pip install textblob
#!pip3 install --upgrade numpy
#!pip install scikit-learn
# To handle Paths
import os
# To remove Hyperlinks and Dates
import re
# To remove Puncutations
import string
# This helps to remove the unnecessary words from our Text Data
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
# To Parse the Input Data Files
from tika import parser
from textblob import TextBlob
# In order to use the Libraries of Tensorflow
import tensorflow as tf
# For Preprocessing the Text => To Tokenize the Text
from tensorflow.keras.preprocessing.text import Tokenizer
# If the Two Articles are of different length, pad_sequences will make the length equal
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Package for performing Numerical Operations
import numpy as np
# MatplotLib for Plotting Graphs
import matplotlib.pyplot as plt
# To shuffle the Data
from random import shuffle
# To Partition the Data into Train Data and Test Data
from sklearn.model_selection import train_test_split
# To add Regularizer in order to reduce Overfitting
from tensorflow.keras.regularizers import l2
# Give the Path of our Data
Path_Of_Data = 'Data'
# Extract the Labels from the Folders inside the Path mentioned above
Unique_Labels_List = ['negative', 'neutral', 'positive']
def GetNumericLabel(EachLabel):
if EachLabel=='negative':
return 0
elif EachLabel=='neutral':
return 1
elif EachLabel=='positive':
return 2
def Pre_Process_Data_And_Create_BOW(folder_path):
#creating empty lists in order to Create Resume Text and the respective Label
Resumes_List = []
Labels_List = []
for EachLabel in Unique_Labels_List:
for root, dirs, files in os.walk(os.path.join(folder_path, EachLabel),topdown=False):
for file in files:
i = 0
if file.endswith('.pdf'):
#Access individual file
Full_Resume_Path = os.path.join(root, file)
# Parse the Data inside the file
file_data = parser.from_file(Full_Resume_Path)
# Extract the Content of the File
Resume_Text = file_data['content']
# Below Code removes the Hyperlinks in the Resume, like LinkedIn Profile, Certifications, etc..
HyperLink_Regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
Text_Without_HL = re.sub(HyperLink_Regex, ' ', Resume_Text, flags=re.MULTILINE)
# Below Code removes the Date from the Resume
Date_regEx = r'(?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+'
CleanedText = re.sub(Date_regEx,' ',Text_Without_HL)
List_Of_All_Punctuations = list(string.punctuation)
Important_Punctuations = ['#', '.', '+' , '-'] #Add more, if any other Punctuation is observed as Important
NewLineChar = '\n'
# Below Set Comprises all the Punctuations, which can be Removed from the Text of Resume
Total_Punct = len(List_Of_All_Punctuations)
for EachImpPunct in Important_Punctuations:
for CountOfPunct in range(Total_Punct):
if CountOfPunct == Total_Punct:
break
elif EachImpPunct == List_Of_All_Punctuations[CountOfPunct]:
del List_Of_All_Punctuations[CountOfPunct]
Total_Punct = Total_Punct - 1
List_Of_All_Punctuations.append(NewLineChar)
for EachPunct in List_Of_All_Punctuations:
CleanedText = CleanedText.replace(EachPunct, " ")
# Below code converts all the words in the resume to lowercase (check whether this should instead happen after tokenization)
#Final_Cleaned_Resume_Text = Text_Without_Punct.lower()
Final_Cleaned_Resume_Text = CleanedText.lower()
# Code to remove stopwords from each resume
# (accumulate on Resume_Text so every stopword is removed, not just the last one)
Resume_Text = Final_Cleaned_Resume_Text
for word in STOPWORDS:
    stop_token = ' ' + word + ' '   # pad with spaces so only whole words are removed
    Resume_Text = Resume_Text.replace(stop_token, ' ')
Resumes_List.append(Resume_Text)
Numeric_Label = GetNumericLabel(EachLabel)
Labels_List.append(Numeric_Label)
#print('Successfully executed for the Folder, ', EachLabel)
#Return Final Lists
return Resumes_List, Labels_List
#calling the function and passing the path
Resumes_List, Labels_List = Pre_Process_Data_And_Create_BOW(Path_Of_Data)
vocab_size = 10000 # This is very important for you
# We want the Output of the Embedding Layer to be 64
embedding_dim = 64
max_length = 800
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
# Taking 80% of the Data as Training Data and remaining 20% will be for Test Data
training_portion = .8
# Size of Train Data is 80% of the Entire Dataset => 0.8 * 2225
Train_Resume_Size = int(len(Resumes_List) * training_portion)
Labels_List = np.asarray(Labels_List)
Train_Resume_Data, Validation_Resume_Data, Train_Labels, Validation_Labels = \
train_test_split(Resumes_List, Labels_List, train_size = training_portion,
shuffle = True
, stratify= Labels_List)
from statistics import mean
print('Average Number of Words in Each Training Resume is {}'.format(mean([len(i) for i in Train_Resume_Data])))
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(Train_Resume_Data)
word_index = tokenizer.word_index
# Convert the Word Tokens into Integer equivalents, before passing it to keras embedding layer
train_sequences = tokenizer.texts_to_sequences(Train_Resume_Data)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
validation_sequences = tokenizer.texts_to_sequences(Validation_Resume_Data)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(len(validation_sequences))
print(validation_padded.shape)
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Check your Data
def decode_article(text):
return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('-------------------------------------------------------------------------')
print(Train_Resume_Data[10])
Regularizer = l2(0.001)
model = tf.keras.Sequential([
# Add an Embedding layer expecting an input vocab of size vocab_size (10000), with the output embedding dimension of 64 we set at the top
tf.keras.layers.Embedding(vocab_size, embedding_dim,
embeddings_regularizer = Regularizer),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
# use ReLU in place of tanh, since they are good alternatives to each other
tf.keras.layers.Dense(embedding_dim, activation='relu'),
# Add a Dense layer with 3 units and softmax activation.
# When we have multiple outputs, softmax convert outputs layers into a probability distribution.
tf.keras.layers.Dense(3, activation='softmax')
])
model.summary()
#Using Early Stopping in order to handle Overfitting
ES_Callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])
num_epochs = 100
history = model.fit(x = train_padded, y = Train_Labels, epochs=num_epochs,
callbacks=[ES_Callback],
validation_data=(validation_padded, Validation_Labels),
batch_size = 32, shuffle=True, verbose=1)
def plot_graphs(history, string):
plt.plot(history.history[string])
plt.plot(history.history['val_'+string])
plt.xlabel("Epochs")
plt.ylabel(string)
plt.legend([string, 'val_'+string])
plt.show()
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
version = 1
MODEL_DIR = 'Resume_Classification_Model'
export_path = os.path.join(MODEL_DIR, str(version))
tf.keras.models.save_model(model = model, filepath = export_path)
!ls -l {export_path}
!saved_model_cli show --dir {export_path} --all
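Once the model is exported, a short usage sketch for classifying a new piece of text, reusing the tokenizer, max_length, padding_type, trunc_type and Unique_Labels_List defined above (the sample text is just a placeholder):
# load the exported model and classify a new text
loaded_model = tf.keras.models.load_model(export_path)
new_texts = ["sample resume text to classify"]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded = pad_sequences(new_sequences, maxlen=max_length,
                           padding=padding_type, truncating=trunc_type)
probabilities = loaded_model.predict(new_padded)
print(Unique_Labels_List[np.argmax(probabilities[0])])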
For more information, please refer to this article.
Hope this solves your issue. Happy Learning!
I have tried writing training code for language translation. Now, while testing the classifier, I am facing an issue. It just needs adjusting the X and Y values for the test input, but I am a bit confused.
This is the code for training the model
def train_seq2seq(self):
print("Input sequence read, starting training")
s2s = seq2seq(self.vocab_size + 3, self.maxlen + 2, \
self.vocab_size + 3)
self.model = s2s.seq2seq_plain()
#For testing considering 100 epoch instead of 10000
for e in range(10):
print("epoch %d \n" % e)
for ind, (X,Y) in enumerate(self.proproces.gen_batch()):
loss, acc = model.train_on_batch(X, Y)#, batch_size=64, nb_epoch=1)
#print("Loss is %f, accuracy is %f " % (loss, acc), end='\r')
# After one epoch test one sentence
if ind % 10 == 0:
testX = X[0,:].reshape(1, self.maxlen + 2)
testY = Y[0]
pred = model.predict(testX, batch_size=1)
self.decode(testX, pred)
The test code in which I am facing the issue is:
def encode(self):
#Encodes input sentence into fixed length vector
#print("Enter sentence in hindi")
inp = raw_input("Please enter the sentence\n").decode("utf-8")
tokens = inp.split()
seq = []
for token in tokens:
if token in self.proproces.vocab_tar:
seq.append(self.proproces.vocab_tar[token])
else:
token = "UNK"
seq.append(self.proproces.vocab_tar[token])
#seq = map(lambda x:self.proproces.vocab_hind[x], tokens)
# Normalize seq to maxlen
X = []
X.append(seq)
print(X) #[[400, 23, 400]]
temp = pad_sequences(X, maxlen=self.maxlen)
print(temp.shape) #(1, 6)
temp[0:len(seq)] = seq
# print(len(temp))
# temp = np.asarray(temp).reshape(128,)
# print(temp.shape)
prob = self.model.predict_on_batch(temp)#, batch_size=1, verbose=0)
translated = self.decode(prob)
print("Tranlated is", translated)
print("Probabilities are", prob)
print("Shape of prob tensor is",prob.shape)
I am referring this tutorial - https://github.com/shashankg7/Seq2Seq/blob/master/seq2seq/seq2seq.py
I am confused about adjusting the shape of X and Y; any guidance is much appreciated.
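Based on the training snippet above, where testX = X[0,:].reshape(1, self.maxlen + 2) is what gets fed to predict, a minimal sketch of the shape adjustment inside encode() would pad the user input to that same length. It also assumes the input sentence should be encoded with the source-language vocabulary (the commented-out vocab_hind line), not the target vocabulary.
# inside encode(), after building `seq` from the vocabulary lookup
X = [seq]
testX = pad_sequences(X, maxlen=self.maxlen + 2)   # shape (1, self.maxlen + 2), as in training
pred = self.model.predict(testX, batch_size=1)
self.decode(testX, pred)                           # decode() took (testX, pred) during training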
I am following through the tutorial here:
https://pythonprogramming.net/train-test-tensorflow-deep-learning-tutorial/
I can get the Neural Network trained and print out the accuracy.
However, I do not know how to use the Neural Network to make a prediction.
Here is my attempt. Specifically, the issue is this line; I believe the problem is that I cannot get my input string into the format the model expects:
features = get_features_for_input("This was the best store i've ever seen.")
result = (sess.run(tf.argmax(prediction.eval(feed_dict={x:features}),1)))
Here is a larger listing:
def train_neural_network(x):
prediction = neural_network_model(x)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
optimizer = tf.train.AdamOptimizer().minimize(cost)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(hm_epochs):
epoch_loss = 0
i = 0
while i < len(train_x):
start = i
end = i + batch_size
batch_x = np.array(train_x[start:end])
batch_y = np.array(train_y[start:end])
_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
epoch_loss += c
i+=batch_size
print('Epoch', epoch, 'completed out of', hm_epochs, 'loss:', epoch_loss)
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct,'float'))
print('Accuracy', accuracy.eval({x:test_x, y:test_y}))
# pos: [1,0] , argmax: 0
# neg: [0,1] , argmax: 1
features = get_features_for_input("This was the best store i've ever seen.")
result = (sess.run(tf.argmax(prediction.eval(feed_dict={x:features}),1)))
if result[0] == 0:
print('Positive:',input_data)
elif result[0] == 1:
print('Negative:',input_data)
def get_features_for_input(input):
current_words = word_tokenize(input.lower())
current_words = [lemmatizer.lemmatize(i) for i in current_words]
features = np.zeros(len(lexicon))
for word in current_words:
if word.lower() in lexicon:
index_value = lexicon.index(word.lower())
# OR DO +=1, test both
features[index_value] += 1
features = np.array(list(features))
train_neural_network(x)
Following your comment above, it feels like your error ValueError: Cannot feed value of shape () is due to the fact that features is None, because your function get_features_for_input doesn't return anything.
I added the return features line and gave features a correct shape of [1, len(lexicon)] to match the shape of the placeholder.
def get_features_for_input(input):
current_words = word_tokenize(input.lower())
current_words = [lemmatizer.lemmatize(i) for i in current_words]
features = np.zeros((1, len(lexicon)))
for word in current_words:
if word.lower() in lexicon:
index_value = lexicon.index(word.lower())
# OR DO +=1, test both
features[0, index_value] += 1
return features
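With that fix, the prediction call from the question can be used largely as-is. A short usage sketch, run inside the same session where x and prediction are defined:
# usage sketch: features now has shape (1, len(lexicon)), matching the placeholder
features = get_features_for_input("This was the best store i've ever seen.")
result = sess.run(tf.argmax(prediction, 1), feed_dict={x: features})
print('Positive' if result[0] == 0 else 'Negative')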
Your get_features_for_input function returns a single list representing the features of a sentence, but for feed_dict the input needs to be of size [num_examples, features_size]; here num_examples is 1.
The following code should work.
def get_features_for_input(input):
current_words = word_tokenize(input.lower())
current_words = [lemmatizer.lemmatize(i) for i in current_words]
features = np.zeros(len(lexicon))
for word in current_words:
if word.lower() in lexicon:
index_value = lexicon.index(word.lower())
# OR DO +=1, test both
features[index_value] += 1
features = np.array(list(features))
batch_features = []
batch_features.append(features)
return np.array(batch_features)
The basic principle of any machine learning algorithm is that the dimensions should be the same during training and testing.
During training, you created a matrix of shape (number of training samples, len(lexicon)). You are using a bag-of-words approach here, and the lexicon is simply the set of unique words in your training data.
During testing, your input vector size should be the same as the vector size used for training, i.e. the size of the lexicon created during training. Each element in the test vector corresponds to the word at the same index in the lexicon.
Now, coming to your problem: in get_features_for_input(input) you used lexicon, which you must have defined somewhere in your program. Given the error, what I conclude is that your lexicon list is empty, so in get_features_for_input, features = np.zeros(len(lexicon)) produces a zero-shape array and the loop is never entered.
A few suggested modifications:
You can find the function create_feature_sets_and_labels in your tutorial. It returns your cleaned, formatted training data. Change the return statement to return the lexicon list along with the data.
return train_x,train_y,test_x,test_y,lexicon
Make a small change to collect the lexicon list (ref: here):
train_x,train_y,test_x,test_y,lexicon = create_feature_sets_and_labels('/path/to/pos.txt','/path/to/neg.txt')
Then just pass this lexicon list along with your input to the get_features_for_input function:
features = get_features_for_input("This was the best store i've ever seen.",lexicon)
And make a small change to the get_features_for_input function:
def get_features_for_input(text,lexicon):
featureset = []
current_words = word_tokenize(text.lower())
current_words = [lemmatizer.lemmatize(i) for i in current_words]
features = np.zeros(len(lexicon))
for word in current_words:
if word.lower() in lexicon:
index_value = lexicon.index(word.lower())
features[index_value] += 1
featureset.append(features)
return np.asarray(featureset)