Prediction using Keras NLP - python

I am a beginner in the field of Neural Networks.
I am trying to implement an LSTM model for predicting the secondary structure of a protein from a given primary sequence. My program is based on this Kaggle kernel: https://www.kaggle.com/helmehelmuto/secondary-structure-prediction-with-keras
I successfully trained the model, saved it to a pickle file, and I am able to load the weights from that file and make predictions. However, these predictions are on the test set created by scikit-learn's train_test_split function.
I aim to feed in a string containing a protein's primary sequence and get its predicted secondary structure.
The code uses Tokenizer to convert the data (the protein sequences) from the dataset into a numpy.ndarray which is used for making predictions.
Where I am stuck is taking a string (some protein sequence) as input, converting it into the same kind of object, and then making predictions on it.
I have tried using the same method the kernel's author used for converting the data from the .csv file to a numpy.ndarray, but I get an error: 'numpy.ndarray' object has no attribute 'lower'.
I would be grateful if someone could guide me here, converting strings to the objects which are used for prediction in the code.
Code (works fine; predictions are made directly on the test set generated by train_test_split):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
df = pd.read_csv(r'C:\Users\Viktor\Desktop\2018-06-06-ss.cleaned.csv')
df.len.hist(bins=100)
print(df.shape)
def seq2ngrams(seqs, n=3):
    return np.array([[seq[i:i+n] for i in range(len(seq))] for seq in seqs])
maxlen_seq = 128
input_seqs, target_seqs = df[['seq', 'sst3']][(df.len <= maxlen_seq) & (~df.has_nonstd_aa)].values.T
input_grams = seq2ngrams(input_seqs)
print(len(input_seqs))
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(input_grams)
input_data = tokenizer_encoder.texts_to_sequences(input_grams)
input_data = sequence.pad_sequences(input_data, maxlen=maxlen_seq, padding='post')
tokenizer_decoder = Tokenizer(char_level=True)
tokenizer_decoder.fit_on_texts(target_seqs)
target_data = tokenizer_decoder.texts_to_sequences(target_seqs)
target_data = sequence.pad_sequences(target_data, maxlen=maxlen_seq, padding='post')
target_data = to_categorical(target_data)
input_data.shape, target_data.shape
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1
print(n_words, n_tags)
input = Input(shape=(maxlen_seq,))
x = Embedding(input_dim=n_words, output_dim=128, input_length=maxlen_seq)(input)
x = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1))(x)
y = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
model = Model(input, y)
model.summary()
from sklearn.model_selection import train_test_split
from keras.metrics import categorical_accuracy
from keras import backend as K
import tensorflow as tf
def q3_acc(y_true, y_pred):
    y = tf.argmax(y_true, axis=-1)
    y_ = tf.argmax(y_pred, axis=-1)
    mask = tf.greater(y, 0)
    return K.cast(K.equal(tf.boolean_mask(y, mask), tf.boolean_mask(y_, mask)), K.floatx())
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy", q3_acc])
X_train, X_test, y_train, y_test = train_test_split(input_data, target_data, test_size=.4, random_state=0)
seq_train, seq_test, target_train, target_test = train_test_split(input_seqs, target_seqs, test_size=.4, random_state=0)
#model.fit(X_train, y_train, batch_size=128, epochs=5, validation_data=(X_test, y_test), verbose=1)
def onehot_to_seq(oh_seq, index):
    s = ''
    for o in oh_seq:
        i = np.argmax(o)
        if i != 0:
            s += index[i]
        else:
            break
    return s
def plot_results(x, y, y_):
    print("---")
    print("Input: " + str(x))
    print("Target: " + str(onehot_to_seq(y, revsere_decoder_index).upper()))
    print("Result: " + str(onehot_to_seq(y_, revsere_decoder_index).upper()))
    fig = plt.figure(figsize=(10,2))
    plt.imshow(y.T, cmap='Blues')
    plt.imshow(y_.T, cmap='Reds', alpha=.5)
    plt.yticks(range(4), [' '] + [revsere_decoder_index[i+1].upper() for i in range(3)])
    plt.show()
revsere_decoder_index = {value:key for key,value in tokenizer_decoder.word_index.items()}
revsere_encoder_index = {value:key for key,value in tokenizer_encoder.word_index.items()}
#N=3
#y_train_pred = model.predict(X_train[:N])
#y_test_pred = model.predict(X_test[:N])
#print('training')
#for i in range(N):
# plot_results(seq_train[i], y_train[i], y_train_pred[i])
#print('testing')
#for i in range(N):
# plot_results(seq_test[i], y_test[i], y_test_pred[i])
loaded_model = pickle.load(open( "save.p", "rb" ))
N=3
y_train_pred = loaded_model.predict(X_train[:N])
y_test_pred = loaded_model.predict(X_test[:N])
print('training')
for i in range(N):
    plot_results(seq_train[i], y_train[i], y_train_pred[i])
print('testing')
for i in range(N):
    plot_results(seq_test[i], y_test[i], y_test_pred[i])
#print(type(target_seqs))
CODE WHICH DOES NOT WORK AS EXPECTED:
Here, 'xf' is the CSV file containing the sequences for which I need predictions.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
df = pd.read_csv(r'C:\Users\Viktor\Desktop\2018-06-06-ss.cleaned.csv')
xf = pd.read_csv(r'C:\Users\Viktor\Desktop\sequence.csv')
df.len.hist(bins=100)
print(df.shape)
def seq2ngrams(seqs, n=3):
    return np.array([[seq[i:i+n] for i in range(len(seq))] for seq in seqs])
maxlen_seq = 128
input_seqs, target_seqs = df[['seq', 'sst3']][(df.len <= maxlen_seq) & (~df.has_nonstd_aa)].values.T
input_grams = seq2ngrams(input_seqs)
print(len(input_seqs))
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(input_grams)
input_data = tokenizer_encoder.texts_to_sequences(input_grams)
input_data = sequence.pad_sequences(input_data, maxlen=maxlen_seq, padding='post')
tokenizer_decoder = Tokenizer(char_level=True)
tokenizer_decoder.fit_on_texts(target_seqs)
target_data = tokenizer_decoder.texts_to_sequences(target_seqs)
target_data = sequence.pad_sequences(target_data, maxlen=maxlen_seq, padding='post')
target_data = to_categorical(target_data)
input_data.shape, target_data.shape
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1
print(n_words, n_tags)
input = Input(shape=(maxlen_seq,))
x = Embedding(input_dim=n_words, output_dim=128, input_length=maxlen_seq)(input)
x = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1))(x)
y = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
model = Model(input, y)
model.summary()
from sklearn.model_selection import train_test_split
from keras.metrics import categorical_accuracy
from keras import backend as K
import tensorflow as tf
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")
X_train, X_test, y_train, y_test = train_test_split(input_data, target_data, test_size=.4, random_state=0)
seq_train, seq_test, target_train, target_test = train_test_split(input_seqs, target_seqs, test_size=.4, random_state=0)
#model.fit(X_train, y_train, batch_size=128, epochs=1, validation_data=(X_test, y_test), verbose=1)
def onehot_to_seq(oh_seq, index):
    s = ''
    for o in oh_seq:
        i = np.argmax(o)
        if i != 0:
            s += index[i]
        else:
            break
    return s
def plot_results(x, y, y_):
    print("---")
    print("Input: " + str(x))
    print("Target: " + str(onehot_to_seq(y, revsere_decoder_index).upper()))
    print("Result: " + str(onehot_to_seq(y_, revsere_decoder_index).upper()))
    fig = plt.figure(figsize=(10,2))
    plt.imshow(y.T, cmap='Blues')
    plt.imshow(y_.T, cmap='Reds', alpha=.5)
    plt.yticks(range(4), [' '] + [revsere_decoder_index[i+1].upper() for i in range(3)])
    plt.show()
revsere_decoder_index = {value:key for key,value in tokenizer_decoder.word_index.items()}
revsere_encoder_index = {value:key for key,value in tokenizer_encoder.word_index.items()}
N=3
y_train_pred = model.predict(X_train[:N])
y_test_pred = model.predict(X_test[:N])
print('training')
for i in range(N):
    plot_results(seq_train[i], y_train[i], y_train_pred[i])
print('testing')
for i in range(N):
    plot_results(seq_test[i], y_test[i], y_test_pred[i])
loaded_model = pickle.load(open( "save.p", "rb" ))
N=3
y_train_pred = loaded_model.predict(X_train[:N])
y_test_pred = loaded_model.predict(X_test[:N])
print('training')
for i in range(N):
    plot_results(seq_train[i], y_train[i], y_train_pred[i])
print('testing')
for i in range(N):
    plot_results(seq_test[i], y_test[i], y_test_pred[i])
print("-----")
print(X_test[:3])
print("-----")
xf.len.hist(bins=100)
input_seqs1, target_seqs1 = xf[['seq', 'sst3']][(xf.len <= maxlen_seq) & (~xf.has_nonstd_aa)].values.T
input_grams1 = seq2ngrams(input_seqs1)
tokenizer_encoder1 = Tokenizer()
tokenizer_encoder1.fit_on_texts(input_grams1)
input_data1 = tokenizer_encoder1.texts_to_sequences(input_grams1)
input_data1 = sequence.pad_sequences(input_data1, maxlen=maxlen_seq, padding='post')
tokenizer_decoder1 = Tokenizer(char_level=True)
tokenizer_decoder1.fit_on_texts(target_seqs1)
target_data1 = tokenizer_decoder1.texts_to_sequences(target_seqs1)
target_data1 = sequence.pad_sequences(target_data1, maxlen=maxlen_seq, padding='post')
target_data1 = to_categorical(target_data1)
input_data1.shape, target_data1.shape
X_train, X_test, y_train, y_test = train_test_split(input_data1, target_data1, test_size=1, random_state=0)
seq_train, seq_test, target_train, target_test = train_test_split(input_seqs1, target_seqs1, test_size=1, random_state=0)
y_train_pred1 = loaded_model.predict(X_train)
y_test_pred1 = loaded_model.predict(X_test)
plot_results(seq_train, y_train, y_train_pred1)
plot_results(seq_test, y_test, y_test_pred1)
#print(input_data1[0])
##y_train_pred = loaded_model.predict(input_data)
#y_test_pred1 = loaded_model.predict(input_data1[0])
##plot_results(seq_train, y_train, y_train_pred)
#plot_results(input_seqs1, target_data1, y_test_pred1)
TRACEBACK:
Traceback (most recent call last):
  File "<ipython-input-38-e8f27dda0841>", line 1, in <module>
    runfile('C:/Users/Viktor/Desktop/rost_nocl.py', wdir='C:/Users/Viktor/Desktop')
  File "D:\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
    execfile(filename, namespace)
  File "D:\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/Viktor/Desktop/rost_nocl.py", line 116, in <module>
    tokenizer_encoder1.fit_on_texts(input_grams1)
  File "D:\Anaconda3\lib\site-packages\keras_preprocessing\text.py", line 223, in fit_on_texts
    self.split)
  File "D:\Anaconda3\lib\site-packages\keras_preprocessing\text.py", line 43, in text_to_word_sequence
    text = text.lower()
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
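Two things in the failing block stand out. First, a brand-new tokenizer_encoder1 is fitted on the prediction data: even when that runs, its word indices will not match the embedding the model was trained with, so the original tokenizer_encoder has to be reused. Second, seq2ngrams returns a numpy array; when all sequences happen to have the same length this becomes a 2-D array of strings, Keras then iterates over ndarray rows instead of lists, and calling .lower() on a row raises exactly the AttributeError above. A minimal sketch of predicting on one raw string, untested, reusing tokenizer_encoder, onehot_to_seq, revsere_decoder_index, maxlen_seq and loaded_model from the code above (the sequence literal is only an example):
def predict_secondary_structure(seq):
    # plain Python list of n-gram lists: Tokenizer accepts lists of strings,
    # but not numpy string arrays
    grams = [[seq[i:i+3] for i in range(len(seq))]]
    # reuse the tokenizer fitted on the training data; do NOT fit a new one
    tokenized = tokenizer_encoder.texts_to_sequences(grams)
    padded = sequence.pad_sequences(tokenized, maxlen=maxlen_seq, padding='post')
    pred = loaded_model.predict(padded)
    return onehot_to_seq(pred[0], revsere_decoder_index).upper()
print(predict_secondary_structure('MVLSPADKTNVKAAW'))  # example sequence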

Related

How to perform super pixel image segmentation and feature extraction

I am interested in multi-class segmentation of skin tissues. I have 3000 skin tissue labels classified into 4 classes, and I have created a CNN classification algorithm to train my classification model. I would like to use the classification model for the segmentation task on a new skin tissue image and perform feature extraction of the skin tissue belonging to each class.
The following is the code written to train my classification model:
from tensorflow.keras.layers import Input, Concatenate, Dropout, Flatten, Dense, GlobalAveragePooling2D, Conv2D
from tensorflow.keras import backend as K
#from tensorflow.keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.models import Sequential, Model, load_model
import tensorflow as tf
from tensorflow.keras.initializers import he_uniform
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping, CSVLogger, ReduceLROnPlateau
#from tensorflow.compat.keras.backend import KTF
#import keras.backend.tensorflow_backend as KTF
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.inception_v3 import InceptionV3
import os
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
#import numpy as np, Pillow, skimage, imageio, matplotlib
#from scipy.misc import imresize
from skimage.transform import resize
from tqdm import tqdm
from tensorflow.keras import metrics
#### PREPROCESS STAGE ####
# Path to superpixels class files
classes_file = "/home/DEV/SKIN_3000_CLASSES.csv"
concatenated_data= pd.read_csv(classes_file, header=None)
# Instances with targets
targets = concatenated_data[1].tolist()
# Split data according to their classes
class_0 = concatenated_data[concatenated_data[1] == 0]
class_1 = concatenated_data[concatenated_data[1] == 1]
class_2 = concatenated_data[concatenated_data[1] == 2]
class_3 = concatenated_data[concatenated_data[1] == 3]
# Holdout split train/test set (Other options are k-folds or leave-one-out)
split_proportion = 0.8
split_size_0 = int(len(class_0)*split_proportion)
split_size_1 = int(len(class_1)*split_proportion)
split_size_2 = int(len(class_2)*split_proportion)
split_size_3 = int(len(class_3)*split_proportion)
new_class_0_train = np.random.choice(len(class_0), split_size_0, replace=False)
new_class_0_train = class_0.iloc[new_class_0_train]
new_class_0_test = ~class_0.iloc[:][0].isin(new_class_0_train.iloc[:][0])
new_class_0_test = class_0[new_class_0_test]
new_class_1_train = np.random.choice(len(class_1), split_size_1, replace=False)
new_class_1_train = class_1.iloc[new_class_1_train]
new_class_1_test = ~class_1.iloc[:][0].isin(new_class_1_train.iloc[:][0])
new_class_1_test = class_1[new_class_1_test]
new_class_2_train = np.random.choice(len(class_2), split_size_2, replace=False)
new_class_2_train = class_2.iloc[new_class_2_train]
new_class_2_test = ~class_2.iloc[:][0].isin(new_class_2_train.iloc[:][0])
new_class_2_test = class_2[new_class_2_test]
new_class_3_train = np.random.choice(len(class_3), split_size_3, replace=False)
new_class_3_train = class_3.iloc[new_class_3_train]
new_class_3_test = ~class_3.iloc[:][0].isin(new_class_3_train.iloc[:][0])
new_class_3_test = class_3[new_class_3_test]
x_train_list = pd.concat(
    [new_class_0_train, new_class_1_train, new_class_2_train, new_class_3_train])
x_test_list = pd.concat(
    [new_class_0_test, new_class_1_test, new_class_2_test, new_class_3_test])
# Load superpixels files
imagePath = "/home/DEV/SKIN_SET_3000/"
x_train = []
y_train = []
for index, row in tqdm(x_train_list.iterrows(), total=x_train_list.shape[0]):
    try:
        loadedImage = plt.imread(imagePath + str(row[0]) + ".jpg")
        x_train.append(loadedImage)
        y_train.append(row[1])
    except:
        # Try with .png file format if images are not properly loaded
        try:
            loadedImage = plt.imread(imagePath + str(row[0]) + ".png")
            x_train.append(loadedImage)
            y_train.append(row[1])
        except:
            # Print file names whenever it is impossible to load image files
            print(imagePath + str(row[0]))
x_test = []
y_test = []
for index, row in tqdm(x_test_list.iterrows(), total=x_test_list.shape[0]):
    try:
        loadedImage = plt.imread(imagePath + str(row[0]) + ".jpg")
        x_test.append(loadedImage)
        y_test.append(row[1])
    except:
        # Try with .png file format if images are not properly loaded
        try:
            loadedImage = plt.imread(imagePath + str(row[0]) + ".png")
            x_test.append(loadedImage)
            y_test.append(row[1])
        except:
            # Print file names whenever it is impossible to load image files
            print(imagePath + str(row[0]))
# Reescaling of images
img_width, img_height = 139, 139
index = 0
for image in tqdm(x_train):
    #aux = resize(image, (img_width, img_height, 3), "bilinear")
    aux = resize(image, (img_width, img_height))
    x_train[index] = aux / 255.0  # Normalization
    index += 1
index = 0
for image in tqdm(x_test):
    #aux = resize(image, (img_width, img_height, 3), "bilinear")
    aux = resize(image, (img_width, img_height))
    x_test[index] = aux / 255.0  # Normalization
    index += 1
#### TRAINING STAGE ####
os.environ["KERAS_BACKEND"] = "tensorflow"
RANDOM_STATE = 42
def get_session(gpu_fraction=0.8):
    num_threads = os.environ.get('OMP_NUM_THREADS')
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    if num_threads:
        return tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options, intra_op_parallelism_threads=num_threads))
    else:
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
#KTF.set_session(get_session())
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
def fbeta_score(y_true, y_pred, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # Set F-score as 0 if there are no true positives (sklearn-like).
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0.0
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score
nb_classes = 4
final_model = []
# Option = InceptionV3
model = InceptionV3(weights="imagenet", include_top=False,
                    input_shape=(img_width, img_height, 3))
# Option = ResNet
# model = ResNet50(weights="imagenet", include_top=False, input_shape=(3,img_width, img_height))
# Creating new outputs for the model
x = model.output
x = Flatten()(x)
x = Dense(512, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(512, activation="relu")(x)
x = Dropout(0.5)(x)
predictions = Dense(nb_classes, activation='softmax')(x)
#predictions = Dense(nb_classes, activation='sigmoid')(x)
final_model = Model(inputs=model.input, outputs=predictions)
# Metrics
learningRate = 0.001
optimizer = optimizers.SGD(learning_rate=learningRate, momentum=0.88, nesterov=True)
# Compiling the model...
final_model.compile(loss="categorical_crossentropy", optimizer=optimizer,
                    metrics=["accuracy", fbeta_score])
final_model.summary()
#final_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
#model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
#x_train = np.array(x_train)
#x_test = np.array(x_test)
x_train = np.asarray(x_train).astype(np.float32)
#x_test = np.array(x_test)
x_test = np.asarray(x_test).astype(np.float32)
# Defining targets...
y_train = np.concatenate([np.full((new_class_0_train.shape[0]), 0), np.full((new_class_1_train.shape[0]), 1),
                          np.full((new_class_2_train.shape[0]), 2), np.full((new_class_3_train.shape[0]), 3)])
y_test = np.concatenate([np.full((new_class_0_test.shape[0]), 0), np.full((new_class_1_test.shape[0]), 1),
                         np.full((new_class_2_test.shape[0]), 2), np.full((new_class_3_test.shape[0]), 3)])
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
modelFilename = "/home/DEV/SKIN_SET_3000/model_inception.h5"
trainingFilename = "/home/DEV/SKIN_SET_3000/training.csv"
nb_train_samples = y_train.shape[0]
nb_test_samples = y_test.shape[0]
#epochs = 10000
epochs = 100
batch_size = 24
trainingPatience = 200
decayPatience = trainingPatience / 4
# Setting the data generator...
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    fill_mode="reflect",
    zoom_range=0.2
)
train_generator = train_datagen.flow(x_train, y_train, batch_size=batch_size)
# Saving the model
checkpoint = ModelCheckpoint(modelFilename,
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=False,
                             mode='auto',
                             save_freq=1)
adaptativeLearningRate = ReduceLROnPlateau(monitor='val_accuracy',
                                           factor=0.5,
                                           patience=decayPatience,
                                           verbose=1,
                                           mode='auto',
                                           min_delta=0.0001,
                                           cooldown=0,
                                           min_lr=1e-8)
early = EarlyStopping(monitor='val_accuracy',
                      min_delta=0,
                      patience=trainingPatience,
                      verbose=1,
                      mode='auto')
csv_logger = CSVLogger(trainingFilename, separator=",", append=False)
# Callbacks
callbacks = [checkpoint, early, csv_logger, adaptativeLearningRate]
# Training of the model
final_model.fit(train_generator,
                steps_per_epoch=nb_train_samples / batch_size,
                epochs=epochs,
                shuffle=True,
                validation_data=(x_test, y_test),
                validation_steps=nb_test_samples / batch_size,
                callbacks=callbacks)
final_model.save('/home/DEV/SKIN_SET_3000/model_inception.h5')
#compile metrics
To segment my image, I first transformed the input image into superpixels using SLIC:
from skimage.segmentation import slic
from skimage.segmentation import mark_boundaries
from skimage.util import img_as_float
from skimage import io; io.use_plugin('matplotlib')
import cv2 as cv
from skimage.color import label2rgb
img_width, img_height = 139, 139
# load the model we saved
model = load_model('/home/DEV/SKIN_SET_3000/model_inception.h5', compile=False)
# Get test image ready
img = img_as_float(io.imread('/home/DEV/SKIN_ULCER.jpg')).astype(np.float32)  # img_as_float and io are imported above
plt.imshow(img)
test_image_slic = slic(img, n_segments=500, compactness=10.0)
test_image_slic_out = mark_boundaries(img,test_image_slic)
plt.imshow(test_image_slic_out)
#test_image=test_image/255
test_image_array = np.array(test_image_slic_out)
test_image_resize = cv.resize(test_image_array, (img_width, img_height))  # cv2 was imported as cv
test_image_reshape = test_image_resize.reshape(1,img_width, img_height,3)
I would like to check whether each superpixel of my input image is labeled as one of my 4 target tissue classes, extract the superpixels belonging to each class as a mask, and quantify the total surface area of each mask.
Any suggestions on how to implement this approach would be appreciated.
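One possible approach, sketched below (untested; it reuses model, img, img_width, img_height and nb_classes from the code above, and assumes skimage.transform.resize is imported as in the training script): run SLIC once, crop each superpixel's bounding box, resize it to the network's input size, classify it, and paint the predicted class into a label mask; per-class pixel counts then give the surface areas.
def classify_superpixels(img, model):
    segments = slic(img, n_segments=500, compactness=10.0)
    class_mask = np.zeros(segments.shape, dtype=np.int32)
    for label in np.unique(segments):
        region = segments == label
        ys, xs = np.where(region)
        patch = np.zeros_like(img)
        patch[region] = img[region]  # keep only this superpixel's pixels
        patch = patch[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
        patch = resize(patch, (img_width, img_height, 3))  # rescale to match the training preprocessing
        pred = model.predict(patch[np.newaxis, ...])
        class_mask[region] = int(np.argmax(pred[0]))
    # total surface area (in pixels) covered by each class
    areas = {c: int((class_mask == c).sum()) for c in range(nb_classes)}
    return class_mask, areas
class_mask, areas = classify_superpixels(img, model)
print(areas)
A binary mask for a single class c is then simply class_mask == c, which can be visualized with label2rgb or mark_boundaries.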

ValueError: Error when checking input: expected dense_1_input to have shape (3000,) but got array with shape (1,)

I'm playing with this model https://vgpena.github.io/classifying-tweets-with-keras-and-tensorflow/ and trying to build a Twitter sentiment analysis tool. I'm getting:
File "C:\Users\sam\Desktop\proje\load_model.py", line 71, in <module>
pred = model.predict(top_tweets)
File "C:\Users\sam\anaconda3\lib\site-packages\keras\engine\training.py", line 1441, in predict
x, _, _ = self._standardize_user_data(x)
File "C:\Users\sam\anaconda3\lib\site-packages\keras\engine\training.py", line 579, in _standardize_user_data
exception_prefix='input')
File "C:\Users\sam\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 145, in standardize_input_data
str(data_shape))
ValueError: Error when checking input: expected dense_1_input to have shape (3000,) but got array with shape (1,)
I've searched, and it seems like there is a problem with my model, but I can't pinpoint the issue.
My code:
model.py:
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import numpy as np
import pandas as pd
training = np.genfromtxt('training.1600000.processed.noemoticon.csv', delimiter=',', skip_header=1, usecols=(0, 5), dtype=None, encoding='latin-1')
train_x = [x[1] for x in training]
train_y = np.asarray([x[0] for x in training])
max_words = 3000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)
dictionary = tokenizer.word_index
with open('dict.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)
def convert_text_to_index_array(text):
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]
allWordIndices = []
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)
allWordIndices = np.asarray(allWordIndices)
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')  # binary bag-of-words features
train_y = keras.utils.to_categorical(train_y, 2)  # one-hot labels
model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(train_x, train_y,
          batch_size=32,
          epochs=5,
          verbose=1,
          validation_split=0.1,
          shuffle=True)
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)
model.save_weights('model.h5')
print('Model saved!')
load_model.py:
import json
import numpy as np
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json
from numpy.core._multiarray_umath import ndarray
import GetOldTweets3 as got
import pandas as pd
def get_tweets(username, top_only, max_tweets):
    tweet_criteria = got.manager.TweetCriteria().setUsername(username) \
                                                .setTopTweets(top_only) \
                                                .setMaxTweets(max_tweets)
    tweet = got.manager.TweetManager.getTweets(tweet_criteria)
    text_tweets = [[tw.text] for tw in tweet]
    top_tweets = pd.DataFrame(text_tweets)
    return top_tweets
tokenizer = Tokenizer(num_words=3000)
labels = ['negative', 'pozitive']
with open('dict.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)
def convert_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print(("'%s' is getting ignored." % (word)))
    return wordIndices
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
model.load_weights('model.h5')
while True:
    username = input("Enter an username:")
    top_tweets = get_tweets(username,
                            top_only=True,
                            max_tweets=100)
    evalSentence = top_tweets
    if len([input]) < 1:
        break
    testArr = convert_to_index_array(evalSentence.to_string())
    username = tokenizer.sequences_to_matrix([testArr], mode="binary")
    pred = model.predict(top_tweets)
    print(("Your profile %s; %f%% confidince" % (labels(np.argmax(pred)), pred[0][np.argmax(pred)] * 100)))
Thanks
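A hedged guess at the fix (untested): the network was trained on 3000-dimensional binary bag-of-words vectors, so predict must receive that matrix rather than the raw DataFrame of tweets. The loop above actually builds the matrix with sequences_to_matrix, but stores it in username and then predicts on top_tweets. Something like this should match the expected (3000,) input shape; note that labels is a list, so it is indexed rather than called:
testArr = convert_to_index_array(top_tweets.to_string())
features = tokenizer.sequences_to_matrix([testArr], mode='binary')  # shape (1, 3000)
pred = model.predict(features)
print("Your profile %s; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))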

How to make a prediction as binary output? - Python (Tensorflow)

I'm learning text classification using movie reviews as data with TensorFlow, but I got stuck because the output prediction I get is different from the label (not rounded, not binary).
CODE
predict = model.predict([test_review])
print("Prediction: " + str(predict[0])) # [1.8203685e-19]
print("Actual: " + str(test_labels[0])) # 0
The expected output should be:
Prediction: [0.]
Actual: 0
What the output is giving:
Prediction: [1.8203685e-19]
Actual: 0
The output prediction should be 0 or 1, representing whether the review was good or not.
FULL CODE
import tensorflow as tf
from tensorflow import keras
import numpy as np
data = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words = 10000)
word_index = data.get_word_index()
word_index = {k:(v + 3) for k, v in word_index.items()}
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value = word_index['<PAD>'], padding = 'post', maxlen = 256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value = word_index['<PAD>'], padding = 'post', maxlen = 256)
def decode_review(text):
    """Decode the training and testing data into readable words."""
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print("\n")
print(decode_review(test_data[0]))
model = keras.Sequential()
model.add(keras.layers.Embedding(10000, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation = 'relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid'))
model.summary()
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
x_val = train_data[:10000]
x_train = train_data[10000:]
y_val = train_labels[:10000]
y_train = train_labels[10000:]
fitModel = model.fit(x_train, y_train, epochs = 40,
                     batch_size = 512,
                     validation_data = (x_val, y_val),
                     verbose = 1)
results = model.evaluate(test_data, test_labels)
test_review = test_data[0]
predict = model.predict([test_review])
print("Review: ")
print(decode_review(test_review))
print("Prediction: " + str(predict[0])) # [1.8203685e-19]
print("Actual: " + str(test_labels[0]))
print("\n[loss, accuracy]: ", results)
Replace the predict method with the predict_classes method:
model.predict_classes([test_review])
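Note that predict_classes is only available on Sequential models and has been removed in newer TensorFlow/Keras releases; thresholding the sigmoid output yourself is equivalent and version-independent:
pred = model.predict([test_review])
binary_pred = (pred > 0.5).astype(int)  # 0 = negative review, 1 = positive
print("Prediction: " + str(binary_pred[0]))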

Keras 2.2.4 fit_generator problem: ValueError, problem feeding values to the input layer

I am running a Keras multi_gpu model. My model takes 2 inputs: one input is given by the ImageDataGenerator, and the other is generated through a function inside the model. Please have a look at the following code:
import numpy as np
import keras
from keras.layers.convolutional import Conv2D
from keras.layers import ReLU,MaxPooling2D,ZeroPadding2D,BatchNormalization,Dense,Dropout, Activation, Flatten, Lambda, Concatenate, Add
from keras.models import Model
from keras.layers import Input
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from keras import backend as K
from keras_preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint
from keras.models import model_from_json
from keras.utils import multi_gpu_model
import pandas as pd
import os
import sys
from tqdm import *
# import skimage
import matplotlib.pyplot as plt
# %matplotlib inline
import cv2
import tensorflow as tf
import multiprocessing
# import pydot
########### Make Log directory #####################################
cwd=os.getcwd()
log_dir = cwd+'/log_dir/Relation_net_logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
tensorboard_logsdir = log_dir+"/tensorboard_logdir"
if not os.path.exists(tensorboard_logsdir):
    os.makedirs(tensorboard_logsdir)
######### Make Network##############################################
def ConvolutionNetworks(kernel_size=3, stride_size=2):
    def conv(model):
        model = Conv2D(24, (9, 9), strides=(stride_size, stride_size), activation='relu', input_shape=(100, 100, 3), data_format='channels_last')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (7, 7), strides=(stride_size, stride_size), activation='relu')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (kernel_size, kernel_size), strides=(stride_size, stride_size), activation='relu')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (5, 5), strides=(1, 1), activation='relu')(model)
        model = BatchNormalization()(model)
        return model
    return conv
######### Compute Relations #######
def compute_relations(objects):
    def get_top_dim_1(t):
        return t[:, 0, :, :]
    def get_all_but_top_dim_1(t):
        return t[:, 1:, :, :]
    def get_top_dim_2(t):
        return t[:, 0, :]
    def get_all_but_top_dim2(t):
        return t[:, 1:, :]
    slice_top_dim_1 = Lambda(get_top_dim_1)
    slice_all_but_top_dim_1 = Lambda(get_all_but_top_dim_1)
    slice_top_dim_2 = Lambda(get_top_dim_2)
    slice_all_but_top_dim2 = Lambda(get_all_but_top_dim2)
    d = K.int_shape(objects)[2]
    features = []
    for i in range(d):  # this loop extracts the top layer of the feature map
        features1 = slice_top_dim_1(objects)
        objects = slice_all_but_top_dim_1(objects)
        for j in range(d):  # this loop extracts each object from the "top layer" above and appends it to "features"
            features2 = slice_top_dim_2(features1)
            features1 = slice_all_but_top_dim2(features1)
            features.append(features2)
    relations = []
    concat = Concatenate()
    for feature1 in features:
        for feature2 in features:
            relations.append(concat([feature1, feature2]))
    return relations
############## f_theta ############################
def f_theta():
    def f(model):
        model = Dense(256, activation='relu')(model)
        # model = Activation('relu')(model)
        model = Dense(256, activation='relu')(model)
        # model = Activation('relu')(model)
        # model = Dropout(0.5)(model)
        model = Dense(256, activation='relu')(model)
        # model = Activation('relu')(model)
        model = Dense(256, activation='relu')(model)
        # model = Activation('relu')(model)
        return model
    return f
################# Relation module and tag building #########################################
from keras.utils import plot_model
def g_th(layers):
    def f(model):
        for n in range(len(layers)):
            model = layers[n](model)
        return model
    return f
def stack_layer(layers):
    def f(x):
        for k in range(len(layers)):
            x = layers[k](x)
        return x
    return f
def g_theta(h_unit=256, layers=4):
    r = []
    for k in range(layers):
        r.append(Dense(h_unit))
        r.append(Activation('relu'))
    return g_th(r)
def get_MLP():
    return g_th()
def RelationNetworks(objects):
    g_t = g_theta()
    relations = compute_relations(objects)
    print("length of relations={}".format(len(relations)))
    g_all = []
    for r in tqdm(relations):
        g_all.append(g_t(r))  # send each relation through g_t and collect the outputs for easy summation
    print("relation computed")
    combined_relation = Add()(g_all)
    print("relation combined")
    f_out = f_theta()(combined_relation)
    print("relation went through f_theta")
    return f_out
def build_tag(conv):
    d = K.int_shape(conv)[2]
    tag = np.zeros((d, d, 2))
    print("tagging in process")
    for i in range(d):
        for j in range(d):
            tag[i, j, 0] = float(int(i % d)) / (d - 1) * 2 - 1
            tag[i, j, 1] = float(int(j % d)) / (d - 1) * 2 - 1
    tag = K.variable(tag)
    tag = K.expand_dims(tag, axis=0)
    batch_size = K.shape(conv)[0]
    tag = K.tile(tag, [batch_size, 1, 1, 1])
    print("tagging done")
    return Input(tensor=tag)
################################# Build Model ###################################################################################
visual_scene = Input((100, 100, 3))
# visual_question = Input((11,))
visual_conv = ConvolutionNetworks()(visual_scene)
tag = build_tag(visual_conv)
visual_conv = Concatenate()([tag, visual_conv])
visual_RN = RelationNetworks(visual_conv)
visual_out = Dense(4, activation='softmax')(visual_RN)
VisualModel = Model(inputs=[tag,visual_scene], outputs=visual_out)
print("model made")
# plot_model(VisualModel, to_file='/home/aakash/Relation_Network/figures/VisualModel1.png')
################################ Create parallel model ###############
# This executes Data Parallelism. Batch is divided equally on all GPUs for computation
try:
    parallel_model = multi_gpu_model(VisualModel, cpu_merge=True, cpu_relocation=True, gpus=2)
    print("Training using multiple GPUs..")
except:
    parallel_model = VisualModel  # fall back to the plain model on a single GPU or CPU
    print("Training using single GPU or CPU..")
################################# Training #################################################################################
workers=multiprocessing.cpu_count()-1
batchsize=32
IMG_SIZE=100
train_df_path="/home/aakash/Relation_Network/training_df.pkl"
valid_df_path="/home/aakash/Relation_Network/validation_df.pkl"
image_dir="/home/aakash/Relation_Network/DL_Dataset"
from keras.optimizers import Adam
lr = 1e-4
adam = Adam(lr=lr)
parallel_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
#Save architecture
NAME = "2_conv_model"
with open(NAME+".json", "w") as json_file:
json_file.write(VisualModel.to_json())
print("model architecture saved as json file")
#create callbacks
# NAME = "{}-conv-{}-nodes-{}-dense-{}".format(conv_layer, layer_size, dense_layer, int(time.time()))
checkpoint = keras.callbacks.ModelCheckpoint(log_dir+'/'+NAME+'.h5', monitor='val_loss',verbose=0, save_best_only=True, save_weights_only=True, mode='auto', period=1)
csv_logger = keras.callbacks.CSVLogger(log_dir+"/"+NAME+".csv", separator=',', append=False)
tensorboard = keras.callbacks.TensorBoard(log_dir=tensorboard_logsdir+'/'+NAME, histogram_freq=0, batch_size=batchsize,
                                          write_graph=True, write_grads=False, write_images=False, embeddings_freq=0,
                                          embeddings_layer_names=None, embeddings_metadata=None, embeddings_data=None,
                                          update_freq='epoch')
training_df=pd.read_pickle(train_df_path)
validation_df=pd.read_pickle(valid_df_path)
datagen=ImageDataGenerator(rescale=1./255)
train_generator = datagen.flow_from_dataframe(dataframe=training_df, directory=image_dir,
                                              x_col="image", y_col="lesion", class_mode="categorical",
                                              target_size=(IMG_SIZE,IMG_SIZE), batch_size=batchsize, shuffle=True)
validation_generator = datagen.flow_from_dataframe(dataframe=validation_df, directory=image_dir,
                                                   x_col="image", y_col="lesion", class_mode="categorical",
                                                   target_size=(IMG_SIZE,IMG_SIZE), batch_size=batchsize)
parallel_model.fit_generator(generator=train_generator,
                             steps_per_epoch=(training_df.shape[0])//batchsize,
                             validation_data=validation_generator,
                             validation_steps=(validation_df.shape[0])//batchsize,
                             epochs=30, verbose=1, callbacks=[checkpoint, csv_logger, tensorboard],
                             use_multiprocessing=True, workers=workers)
The build_tag function returns an Input layer wrapping a tensor (this is my second input).
But when I run this code, it fails with the error shown in this screenshot: https://drive.google.com/file/d/1gGjoO89zwRw_zUQ14sUIrdC7oRKrdVT1/view?usp=sharing
I then made build_tag a Lambda layer, so that it returns just the tensor "tag" and NOT an Input layer, removed the "tag" input from the model, and it started to work (I compared the model architecture before and after converting build_tag into a Lambda layer).
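For reference, a minimal sketch of the Lambda-based variant described above (untested; it reuses ConvolutionNetworks, RelationNetworks and the Keras imports from the code above). The coordinate tag is built inside the graph from the conv tensor itself, so the model keeps a single image input:
def make_tag(conv):
    # build a (d, d, 2) grid of normalized x/y coordinates and tile it across the batch
    d = K.int_shape(conv)[2]
    tag = np.zeros((d, d, 2), dtype='float32')
    for i in range(d):
        for j in range(d):
            tag[i, j, 0] = float(i) / (d - 1) * 2 - 1
            tag[i, j, 1] = float(j) / (d - 1) * 2 - 1
    tag_t = K.expand_dims(K.constant(tag), axis=0)
    return K.tile(tag_t, [K.shape(conv)[0], 1, 1, 1])
visual_scene = Input((100, 100, 3))
visual_conv = ConvolutionNetworks()(visual_scene)
tag = Lambda(make_tag)(visual_conv)  # a tensor, not an Input layer
visual_conv = Concatenate()([tag, visual_conv])
visual_RN = RelationNetworks(visual_conv)
visual_out = Dense(4, activation='softmax')(visual_RN)
VisualModel = Model(inputs=visual_scene, outputs=visual_out)  # single input now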

RNN: Get prediction from a text input after the model is trained

I am new to RNNs and I have been working on a small binary label classifier. I have been able to get a stable model with satisfactory results.
However, I am having a hard time using the model to classify new inputs and I was wondering if any of you could help me. Please see my code below for reference.
Thank you very much.
from tensorflow.keras import preprocessing
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras import models
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping
from matplotlib import pyplot
class tensor_rnn():
    def __init__(self, hidden_layers=3):
        self.data_path = 'C:\\Users\\cmazz\\PycharmProjects\\InvestmentAnalysis_2.0\\Sentiment\\Finance_Articles\\'
        # self.corp_paths = corpora_paths
        self.h_layers = hidden_layers
        self.num_words = []
        good = pd.read_csv(self.data_path + 'GoodO.csv')
        good['Polarity'] = 'pos'
        for line in good['Head'].tolist():
            counter = len(line.split())
            self.num_words.append(counter)
        bad = pd.read_csv(self.data_path + 'BadO.csv')
        bad['Polarity'] = 'neg'
        for line in bad['Head'].tolist():
            counter = len(line.split())
            self.num_words.append(counter)
        self.features = pd.concat([good, bad]).reset_index(drop=True)
        self.features = shuffle(self.features)
        self.max_len = len(max(self.features['Head'].tolist()))
        # self.train, self.test = train_test_split(features, test_size=0.33, random_state=42)
        X = self.features['Head']
        Y = self.features['Polarity']
        le = LabelEncoder()
        Y = le.fit_transform(Y)
        Y = Y.reshape(-1, 1)
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X, Y, test_size=0.30)
        self.tok = preprocessing.text.Tokenizer(num_words=len(self.num_words))
        self.tok.fit_on_texts(self.X_train)
        sequences = self.tok.texts_to_sequences(self.X_train)
        self.sequences_matrix = preprocessing.sequence.pad_sequences(sequences, maxlen=self.max_len)
    def RNN(self):
        inputs = Input(name='inputs', shape=[self.max_len])
        layer = Embedding(len(self.num_words), 30, input_length=self.max_len)(inputs)
        # layer = LSTM(64, return_sequences=True)(layer)
        layer = LSTM(32)(layer)
        layer = Dense(256, name='FC1')(layer)
        layer = Activation('relu')(layer)
        layer = Dropout(0.5)(layer)
        layer = Dense(1, name='out_layer')(layer)
        layer = Activation('sigmoid')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model
    def model_train(self):
        self.model = self.RNN()
        self.model.summary()
        self.model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])  # RMSprop()
    def model_test(self):
        self.history = self.model.fit(self.sequences_matrix, self.Y_train, batch_size=100, epochs=3,
                                      validation_split=0.30, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
        test_sequences = self.tok.texts_to_sequences(self.X_test)
        test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=self.max_len)
        accr = self.model.evaluate(test_sequences_matrix, self.Y_test)
        print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0], accr[1]))
if __name__ == "__main__":
a = tensor_rnn()
a.model_train()
a.model_test()
a.model.save('C:\\\\Users\\cmazz\\PycharmProjects\\'
'InvestmentAnalysis_2.0\\RNN_Model.h5',
include_optimizer=True)
b = models.load_model('C:\\\\Users\\cmazz\\PycharmProjects\\'
'InvestmentAnalysis_2.0\\RNN_Model.h5')
stringy = ['Fund managers back away from Amazon as they cut FANG exposure']
prediction = b.predict(np.array(stringy))
print(prediction)
When I run my code I get the following error:
ValueError: Error when checking input: expected inputs to have shape (39,) but got array with shape (1,)
Based on the ValueError and prediction = b.predict(np.array(stringy)), I think you need to tokenize your input string.
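A minimal sketch of that (untested; it assumes the fitted tokenizer a.tok and a.max_len from the class above are available):
stringy = ['Fund managers back away from Amazon as they cut FANG exposure']
seqs = a.tok.texts_to_sequences(stringy)  # words -> integer indices
padded = preprocessing.sequence.pad_sequences(seqs, maxlen=a.max_len)  # shape (1, 39)
prediction = b.predict(padded)
print(prediction)  # sigmoid output; > 0.5 means positive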
