I am working on an NLP sentiment analysis model to classify the sentiment (neutral, positive, negative) of a tweet based on the content of the tweet on Google Colab. I have prepped the test_x and train_x data into sequences of ints using the Tokenizer module. I followed the Tokenizer tutorial on the official TensorFlow Youtube channel so there should be nothing wrong with that part.
However, when beginning to train the model, I run into UnimplementedError: Graph Execution.
I tried changing the layers of the model and decreasing the size of my data sets but the same error still popped up every time.
Could anyone clarify what this error means and is trying to say and also point out what is wrong with my code? Thanks!
import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
print("Found GPU at: {}".format(device_name))
else:
device_name = "/device:CPU:0"
print("No GPU, using {}.".format(device_name))
# Load dataset into a dataframe
train_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/train.csv"
test_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/test.csv"
train_df = pd.read_csv(train_data_path, encoding='unicode_escape')
test_df = pd.read_csv(test_data_path, encoding='unicode_escape')
train_df.head()
# Function to convert df into a list of strings
def convert_to_list(df, x):
selected_text_list = []
labels = []
for index, row in df.iterrows():
selected_text_list.append(str(row[x]))
labels.append(str(row['sentiment']))
return np.array(selected_text_list), np.array(labels)
train_sentences, train_labels = convert_to_list(train_df, 'selected_text')
test_sentences, test_labels = convert_to_list(test_df, 'text')
print(train_sentences)
print(train_labels)
# Instantiate tokenizer and create word_index
tokenizer = Tokenizer(num_words=1000, oov_token='<oov>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Convert sentences into a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
test_sequence = tokenizer.texts_to_sequences(test_sentences)
# Padding sequences
pad_test_seq = pad_sequences(test_sequence, padding='post')
max_len = pad_test_seq[0].size
pad_train_seq = pad_sequences(train_sequence, padding='post', maxlen=max_len)
model = tf.keras.Sequential([
tf.keras.layers.Embedding(10000, 24, input_length=max_len),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(24, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
with tf.device(device_name):
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 20
with tf.device(device_name):
history = model.fit(pad_train_seq, train_labels, epochs=num_epochs, validation_data=(pad_test_seq, test_labels), verbose=2)
Here is a screenshot of the error:
Related
I've been following the following tutorial to try and understand LSTMs and tensorflow a bit more. From running, it the training of the model goes smoothly, but when I try to use the trained tokenizer on the test data and then convert it to a numpy array, it doesn't work and I'm not really sure what the problem is. The relevant portion that goes wrong is below:
# test model
x_test = np.array(tokenizer.texts_to_sequences([str(txt) for txt in df_test['text'].values]))
The error it presents is as below:
Traceback (most recent call last):
File "/Users/pranavnair/Documents/Code/wpd/wpd.py", line 85, in <module>
x_test = np.array(x_test_data)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10824,) + inhomogeneous part.
I've tried using np.hstack instead of np.array, and that doesn't fix it. Would appreciate any help at all, thanks in advance.
Full code below for reference
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras import utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers import Embedding
from keras.optimizers import Adam
# set random seed for reproducibility
RANDOM_SEED = 4
np.random.seed(RANDOM_SEED)
# import datasets
df_neut = pd.read_csv("./input/good.csv")
df_prom = pd.read_csv("./input/promotional.csv")
# clean up data to only include text
df_prom = df_prom.drop(df_prom.columns[1:], axis=1)
df_neut = df_neut.drop(df_neut.columns[1:], axis=1)
# combine datasets
df_neut.insert(1, 'label', 0) # neutral labels
df_prom.insert(1, 'label', 1) # promotional labels
# merge dataframes
df = pd.concat((df_neut, df_prom), ignore_index=True, axis=0)
# randomize order of dataframes
df = df.reindex(np.random.permutation(df.index))
# split into training and testing datasets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# perform data preprocessing using keras tokenizer
text_data = [str(txt) for txt in df_train['text'].values] # convert text data to strings
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', lower=True) # create tokenizer
tokenizer.fit_on_texts(text_data) # make dictionary
# vectorize dataset
x_train = tokenizer.texts_to_sequences(text_data)
# Max number of words in each sequence
MAX_SEQUENCE_LENGTH = 400
# pad sequence lengths
x_train = utils.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)
# get test labels
y_train = df_train['label'].values
# create sequential model
model = Sequential()
# create embedding layer
EMBEDDING_DIM = 100
model.add(Embedding(MAX_NB_WORDS+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
# add LSTM layer to model
model.add(LSTM(80))
# setup model layers
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# setup binary classification via binary cross entropy loss
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# train for two epochs
EPOCHS = 4
BATCH_SIZE = 64
history = model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.15)
# test model
x_test = np.array(tokenizer.texts_to_sequences([str(txt) for txt in df_test['text'].values]))
x_test = utils.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)
y_test = np.array(df_test['label'].values)
# evaluate model
scores = model.evaluate(x_test, y_test, batch_size=128)
print("The model has a test loss of %.2f and a test accuracy of %.1f%%" % (scores[0], scores[1]*100))
I am working on a text summarization task using encoder-decoder architecture in Keras. I would like to test the model's performance using different word embeddings such as GloVe and BERT. I already tested it out with GloVe embeddings but could not find an appropriate example for BERT embeddings in seq2seq models using Keras. This is an excerpt of my code:
<...>
# splitting the data
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(data['clean_texts'], data['clean_summaries'],
test_size=0.2,shuffle=True,random_state=0)
# prepare a tokenizer for inputs
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Xtrain)
X_train = tokenizer.texts_to_sequences(Xtrain)
X_test = tokenizer.texts_to_sequences(Xtest)
X_train = pad_sequences(X_train, maxlen= MAX_TEXT_LENGTH, padding='post')
X_test = pad_sequences(X_test, maxlen= MAX_TEXT_LENGTH, padding='post')
# prepare a tokenizer for outputs
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(ytrain)
y_train = y_tokenizer.texts_to_sequences(ytrain)
y_test = y_tokenizer.texts_to_sequences(ytest)
y_train = pad_sequences(y_train, maxlen= MAX_SUM_LENGTH, padding='post')
y_test = pad_sequences(y_test, maxlen= MAX_SUM_LENGTH, padding='post')
Textvocab_size = len(tokenizer.word_index) + 1
Sumvocab_size = len(y_tokenizer.word_index) + 1
# Encoder
encoder_inputs = Input(shape=(MAX_TEXT,))
encoder_embedding = Embedding(Textvocab_size, LATENT_DIMENSION,trainable=True)(encoder_inputs)
encoderlstm1 = Bidirectional(LSTM(LATENT_DIMENSION,return_sequences=True, return_state=True))
encoder_output1, forward_h1, forward_c1, backward_h1, backward_c1 = encoderlstm1(encoder_embedding)
state_h1 = Concatenate()([forward_h1, backward_h1])
state_c1 = Concatenate()([forward_c1, backward_c1])
encoder_states1 = [state_h1, state_c1]
<...>
How to add BERT word embeddings to such a model? I tried this implementation on my data frame before tokenization but I ran into an error:
AttributeError: 'str' object has no attribute 'device_typeid'
I could not find a solution to it. Are there any other ways how to simply add these word embeddings as GloVe?
The error says that what is you system processor type whether it is GPU or CPU machine.
for me i am using bert embedding is used bert_embedding library
from bert_embedding import BertEmbedding
embedding = BertEmbedding('man')
error code: not initializing machine type
Change the code to
embedding = BertEmbedding()# for cpu
embed = embedding('man')
#or for gpu
import mxnet as mx
ctx = mx.gpu(0)
embedding = BertEmbedding(ctx=ctx)
embed = embedding('man')
I'm trying to implement English to Hindi translation using Deep Learning LSTM. But when I train the model it shows 'nan' loss in both actual and validation.
Link of text file containing translation pairs-: http://www.manythings.org/anki/
Below is my Jupyter notebook code-:
import string
import re
from numpy import array, argmax, random, take, delete
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
# function to read raw text file
def read_text(filename):
# open the file
file = open(filename, mode='rt', encoding='utf-8')
# read all text
text = file.read()
file.close()
return text
# split a text into sentences
def to_lines(text):
sents = text.strip().split('\n')
sents = [i.split('\t') for i in sents]
return sents
data = read_text("/content/drive/My Drive/Colab Notebooks/Language Translator New/hin.txt")
eng_hin = to_lines(data)
eng_hin = array(eng_hin)
eng_hin = delete(eng_hin, 2, axis=1)
# Remove punctuation
eng_hin[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in eng_hin[:,0]]
eng_hin[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in eng_hin[:,1]]
# convert to lowercase
for i in range(len(eng_hin)):
eng_hin[i,0] = eng_hin[i,0].lower()
eng_hin[i,1] = eng_hin[i,1].lower()
# empty lists
eng_l = []
hin_l = []
# populate the lists with sentence lengths
for i in eng_hin[:,0]:
eng_l.append(len(i.split()))
for i in eng_hin[:,1]:
hin_l.append(len(i.split()))
print(max(eng_l))
print(max(hin_l))
# function to build a tokenizer
def tokenization(lines):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
return tokenizer
# prepare english tokenizer
eng_tokenizer = tokenization(eng_hin[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = 25
print('English Vocabulary Size: %d' % eng_vocab_size)
# prepare Hindi tokenizer
hin_tokenizer = tokenization(eng_hin[:, 1])
hin_vocab_size = len(hin_tokenizer.word_index) + 1
hin_length = 25
print('Hindi Vocabulary Size: %d' % hin_vocab_size)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
# integer encode sequences
seq = tokenizer.texts_to_sequences(lines)
# pad sequences with 0 values
seq = pad_sequences(seq, maxlen=length, padding='post')
return seq
# Model Building
from sklearn.model_selection import train_test_split
train, test = train_test_split(eng_hin, test_size=0.2, random_state = 12)
# prepare training data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(hin_tokenizer, hin_length, train[:, 1])
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(hin_tokenizer, hin_length, test[:, 1])
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
model = Sequential()
model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
model.add(LSTM(units))
model.add(RepeatVector(out_timesteps))
model.add(LSTM(units, return_sequences=True))
model.add(Dense(out_vocab, activation='softmax'))
return model
model = build_model(hin_vocab_size, eng_vocab_size, hin_length, eng_length, 512)
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
filename = '/content/drive/My Drive/Colab Notebooks/Language Translator New/Englis_Hindi_Checkpoints/model.h1.31_dec_19'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
epochs=100, batch_size=64,
validation_split = 0.2,
callbacks=[checkpoint], verbose=1)
model.save('/content/drive/My Drive/Colab Notebooks/Language Translator New/Englis_Hindi_Checkpoints/eng2hin.h5')
When I try to fit the model, it runs but shows 'nan' in loss. Please help me to resolve my issue.
In simple words, it usually happens because the loss function / optimizer isn't suiting the network calculations. I recently used this network to create a calculator. Try using loss='categorical_crossentropy' and optimizer='adam' and see if it works.
I'm trying to build a deep learning model to predict the top 5 probable movie genres, using movies' synopses as input. The movie genres I'm including in the data are 19, but regardless of test input, the model always predicts the same 5 movie genres. Below is my code building the model. However, the accuracy during fitting is 90%. Can you point me to the right direction as to what I'm doing wrong?
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
import pandas as pd
import numpy as np
import re
data = pd.read_csv('train.csv', encoding = 'utf-8')
#Create column with comma separated genres
data['genres_comma'] = data['genres'].str.split()
mlb = MultiLabelBinarizer()
#Create new dataframe with one hot encoded labels
train = pd.concat([
data.drop(['genres', 'genres_comma'], 1),
pd.DataFrame(mlb.fit_transform(data['genres_comma']), columns=mlb.classes_),
], 1)
genre_names = list(mlb.classes_)
genres = train.drop(['movie_id', 'synopsis'], 1)
def preprocess_text(sen):
# Remove punctuations and numbers
sentence = re.sub('[^a-zA-Z]', ' ', sen)
# Single character removal
sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
# Removing multiple spaces
sentence = re.sub(r'\s+', ' ', sentence)
return sentence
X = []
sentences = list(train['synopsis'])
for sen in sentences:
X.append(preprocess_text(sen))
y = genres.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
#Convert text inputs into embedded vectors.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
maxlen = 200
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
#GloVe word embeddings to convert text inputs to their numeric counterparts
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")
for line in glove_file:
records = line.split()
word = records[0]
vector_dimensions = asarray(records[1:], dtype='float32')
embeddings_dictionary[word] = vector_dimensions
glove_file.close()
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
embedding_vector = embeddings_dictionary.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
#Model Creation
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(19, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model.summary())
history = model.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=1)
Did you check for the class distribution in your training data? accuracy is not a good measure when you have strongly imbalanced classes. For instance, if these 5 genres make up >90% of your training data, the model may not have learned how to classify any other genre, but still achieve 90% accuracy (a common failure mode is constant output, which may still have a high accuracy with imbalanced classes). So, the first step should be to look at the number of training movies in the respective categories.
If my hunch is correct, you may try looking into class balancing weights, or into other loss functions that essentially also give more weight to rare classes. You may also want to fine-tune your model without class weights, after it has learned to classify all genres, in order to learn the real prior probabilities.
Another approach that sometimes helps is to group the 14 minority classes into a class "other". Maybe that is frequent enough for your classifier to learn it.
If so, then you can train another classifier to predict the right class, only for the "other" examples.
Alternatively, you could enrich the training set with more examples of the minority classes.
I'm trying to train a model for a text classification and the model take a list of maximum 300 integer embedded from articles. The model trains without problem and all but the accuracy won't go up.
The target consists of 41 categories encoded into int from 0 to 41 and were then normalized.
The table would look like this
Also, I don't know how my model should look like since I refered on two different example as per below
A binary classifier with one input column and one output column Example 1
Multiple class classifier with multiple columns as input Example 2
I have tried modifying my model based on both model but the model accuracy won't change and even getting lower per epoch
Should I add more layers to my model or I have done something stupid that I haven't realized?
Note: If the 'df.pickle' download link broken, use this link
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from os.path import exists
from os import mkdir
import tensorflow as tf
import pandas as pd
import pickle
# Define dataframe path
df_path = 'df.pickle'
# Check if local dataframe exists
if not exists(df_path):
# Download binary from dropbox
content = urlopen('https://ucd92a22d5e0d4d29b8edb608305.dl.dropboxusercontent.com/cd/0/get/Askx_25n3JI-jmnZsWXmMmRgd4O2EH1w9l0U6zCMq7xdSXs_IN_i2zuUviseqa9N7-WrReFbGhQi8CeseV5cNsFTO8dzRmSdxjr-MWEDQNpPaZ8Ik29E_58YAjY57qTc4CA/file#').read()
# Write to file
with open(df_path, 'wb') as file: file.write(content)
# Load the dataframe from bytes
df = pickle.loads(content)
# If the file exists (aka. downloaded)
else:
# Load the dataframe from file
df = pickle.load(open(df_path, 'rb'))
# Normalize the category
df['Category_Code'] = df['Category_Code'].apply(lambda x: x / 41)
train_df, test_df = [pd.DataFrame() for _ in range(2)]
x_train, x_test, y_train, y_test = train_test_split(df['Content_Parsed'], df['Category_Code'], test_size=0.15, random_state=8)
train_df['Content_Parsed'], train_df['Category_Code'] = x_train, y_train
test_df['Content_Parsed'], test_df['Category_Code'] = x_test, y_test
# Variable containing the number of words we want to keep in our vocabulary
NUM_WORDS = 10000
# Input/Token length
SEQ_LEN = 300
# Create tokenizer for our data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS, oov_token='<UNK>')
tokenizer.fit_on_texts(train_df['Content_Parsed'])
# Convert text data to numerical indexes
train_seqs=tokenizer.texts_to_sequences(train_df['Content_Parsed'])
test_seqs=tokenizer.texts_to_sequences(test_df['Content_Parsed'])
# Pad data up to SEQ_LEN (note that we truncate if there are more than SEQ_LEN tokens)
train_seqs=tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen=SEQ_LEN, padding="post")
test_seqs=tf.keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen=SEQ_LEN, padding="post")
# Create Models folder if not exists
if not exists('Models'): mkdir('Models')
# Define local model path
model_path = 'Models/model.pickle'
# Check if model exists/pre-trained
if not exists(model_path):
# Define word embedding size
EMBEDDING_SIZE = 16
# Create new model
'''
model = tf.keras.Sequential([
tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
# tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
'''
model = tf.keras.Sequential([
tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
# tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(
optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy']
)
# Stop training when a monitored quantity has stopped improving.
es = tf.keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=1)
# Define batch size (Can be tuned to improve model accuracy)
BATCH_SIZE = 16
# Define number or cycle to train
EPOCHS = 20
# Using GPU (If error means you don't have GPU. Use CPU instead)
with tf.device('/GPU:0'):
# Train/Fit the model
history = model.fit(
train_seqs,
train_df['Category_Code'].values,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
validation_split=0.2,
validation_steps=30,
callbacks=[es]
)
# Evaluate the model
model.evaluate(test_seqs, test_df['Category_Code'].values)
# Save the model into a file
with open(model_path, 'wb') as file: file.write(pickle.dumps(model))
else:
# Load the model
model = pickle.load(open(model_path, 'rb'))
# Check the model
model.summary()
After 2 days of tweaking and understanding more examples I found this website which explains quite well about the multi-class classification.
The details of changes I made are as follows:
Since I'm going to build a model for multiple classes, during the model compilation the model should use categorical_crossentropy as it's loss function instead of binary_crossentropy.
The model should produce number of output with similar length as your total class you're going to classify which in my case 41. (One hot encoding)
The last layer's activation function should be "softmax" since we're choosing a label with the highest confidence level (closest to 1.0).
You will need to tweak the layers accordingly based on the number of classes you're going to classify. See here on how to improve your model.
My final code would look something just like this
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from functools import reduce
from os.path import exists
from os import listdir
from sys import exit
import tensorflow as tf
import pandas as pd
import pickle
import re
# Specify dataframe path
df_path = 'df.pickle'
# Check if the file exists
if not exists(df_path):
# Specify url of the dataframe binary
url = 'https://www.dropbox.com/s/76hibe24hmpz3bk/df.pickle?dl=1'
# Read the byte content from url
content = urlopen(url).read()
# Write to a file to save up time
with open(df_path, 'wb') as file: file.write(pickle.dumps(content))
# Unpickle the dataframe
df = pickle.loads(content)
else:
# Load the pickle dataframe
df = pickle.load(open(df_path, 'rb'))
# Useful variables
MAX_NUM_WORDS = 50000 # Vocabulary size for our tokenizer
MAX_SEQ_LENGTH = 600 # Maximum length of tokens (for padding later)
EMBEDDING_SIZE = 256 # Embedding size (Tweak to improve accuracy)
OUTPUT_LENGTH = len(df['Category'].unique()) # Number of class to be classified
# Create our tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
# Fit our tokenizer with words/tokens
tokenizer.fit_on_texts(df['Content_Parsed'].values)
# Get our token vocabulary
word_index = tokenizer.word_index
print('Found {} unique tokens'.format(len(word_index)))
# Parse our text into sequence of numbers using our tokenizer
X = tokenizer.texts_to_sequences(df['Content_Parsed'].values)
# Pad the sequence up to the MAX_SEQ_LENGTH
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_SEQ_LENGTH)
print('Shape of feature tensor: {}'.format(X.shape))
# Convert our labels into dummy variable (More info on the link provided above)
Y = pd.get_dummies(df['Category']).values
print('Shape of label tensor: {}'.format(Y.shape))
# Split our features and labels into test and train dataset
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Creating our model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(MAX_NUM_WORDS, EMBEDDING_SIZE, input_length=MAX_SEQ_LENGTH))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
# The number 64 could be changed based on your model performance
model.add(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# Our output layer with length similar to the OUTPUT_LENGTH
model.add(tf.keras.layers.Dense(OUTPUT_LENGTH, activation='softmax'))
# Compile our model with "categorical_crossentropy" loss function
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Model variables
EPOCHS = 100 # Number of cycle to run (The early stopping may stop the training process accordingly)
BATCH_SIZE = 64 # Batch size (Tweaking this may improve model performance a bit)
checkpoint_path = 'model_checkpoints' # Checkpoint path of our model
# Use GPU if available
with tf.device('/GPU:0'):
# Fit/Train our model
history = model.fit(
x_train, y_train,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
validation_split=0.1,
callbacks=[
tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001),
tf.keras.callbacks.ModelCheckpoint(
checkpoint_path,
monitor='val_acc',
save_best_only=True,
save_weights_only=False
)
],
verbose=1
)
Now, my model accuracies perform well and are increasing each epoch but since the validation accuracies (val_acc around 76~77 percent) are not performing well, I may need to tweak the model/layers a bit.
The output snapshot is provided below