Good morning, I'm trying to train an LSTM to classify spam vs. not spam, and I came across the following error:
ValueError: Input 0 is incompatible with layer lstm_1: expected ndim = 3, found ndim = 4
Can someone help me understand where the problem is?
my code:
import sys
import pandas as pd
import numpy as np
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
if __name__ == "__main__":
    np.random.seed(7)
    with open('SMSSpamCollection') as file:
        dataset = [[x.split('\t')[0], x.split('\t')[1]] for x in [line.strip() for line in file]]
    data = np.array([dat[1] for dat in dataset])
    labels = np.array([dat[0] for dat in dataset])
    dataVectorizer = CountVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
                                     max_features=5000)
    labelVectorizer = CountVectorizer(analyzer="word",
                                      tokenizer=None,
                                      preprocessor=None,
                                      stop_words=None,
                                      max_features=5000)
    data = dataVectorizer.fit_transform(data).toarray()
    labels = labelVectorizer.fit_transform(labels).toarray()
    vocab = labelVectorizer.get_feature_names()
    print(vocab)
    print(data)
    print(labels)
    data = np.reshape(data, (data.shape[0], 1, data.shape[1]))
    input_dim = data.shape
    tam = len(data[0])
    print(data.shape)
    print(tam)
    model = Sequential()
    model.add(LSTM(tam, input_shape=input_dim))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(data, labels, epochs=100, batch_size=1, verbose=2)
I tried adding another dimension to the data array, but that did not work either.
My SMSSpamCollection file:
ham Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham Ok lar... Joking wif u oni...
spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham U dun say so early hor... U c already then say...
ham Nah I don't think he goes to usf, he lives around here though
spam FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham Even my brother is not like to speak with me. They treat me like aids patent.
...
thanks
The problem lies in the fact that you are including an additional dimension for the samples: input_dim = data.shape is the full 3-D shape, but input_shape must not contain the samples (batch) axis. Try:
input_dim = (data.shape[1], data.shape[2])
This should work.
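For reference, a minimal sketch of the model definition with that fix applied, reusing data and tam from the question:
input_dim = (data.shape[1], data.shape[2])   # (timesteps, features), without the samples axis
model = Sequential()
model.add(LSTM(tam, input_shape=input_dim))  # now receives the expected 3-D input (batch, timesteps, features)
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')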
This is my first time asking a question on Stack Overflow, so sorry if I am not asking in the correct format. Let's say I am working with some extremely long time-step sequences (10000000 steps), with 2701 samples and only one feature, so my input array has shape [2701, 10000000, 1], and my dataset looks like:
[ 2.81143e-01 4.98219e-01 -8.08500e-03 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 1.95077e-01 2.20920e-02 -1.68663e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
...
[ 1.06033e-01 8.96650e-02 -3.20860e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 6.85510e-02 -3.83653e-01 -2.19265e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 2.51404e-01 8.02280e-02 2.84610e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]]
However, from what I have read, LSTM networks usually perform better in the range of about 200-400 time steps. Even ignoring performance, I cannot successfully train on a single sample of shape [1, 10000000, 1]. I believe the network itself is functional, because when I limit the length of each sample to 1500, giving an input of shape [2701, 1500, 1], it finally stops getting stuck at the first epoch. Here is my code, if needed:
from keras.utils import Sequence
import numpy as np
from numpy.lib.format import open_memmap
import gc
import platform
import pandas as pd
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Masking
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC' #A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
stock_price=pd.read_csv("C:/Users/user/Desktop/Final-Year-Project-master/stock_classification_7Days_test.csv",sep=',', dtype={"ID":"string","Class":int})
print(stock_price)
print (platform.architecture())
y_data=[]
x_data=[]
y_data=pd.get_dummies(stock_price['Class'])
def embedded_reader(file_path):
    with open(file_path) as embedded_raw:
        for line in embedded_raw:
            for word in line.split(','):
                try:
                    val = float(word)
                    yield val
                except:
                    pass
        embedded_raw.close()
    gc.collect()

for y in range(len(stock_price)):
    if int(stock_price.at[y, 'Class']) is not None:
        i = stock_price.at[y, 'ID']
        print("Company code current: ", i)
        embedded_current = []
        try:
            gen = embedded_reader("C:/Users/user/Desktop/Final-Year-Project-master/json_test/{}.jsonl".format(i))
            while True:
                val = next(gen)
                embedded_current.append(val)
        except:
            pass
        fp = np.memmap('embedded_array.mymemmap', dtype=np.uint8, mode='w+', shape=(1,))
        fp = np.delete(fp, 0)
        fp = np.concatenate((fp, embedded_current), axis=0)
        fp = np.pad(fp, (0, (10000000 - len(embedded_current))), 'constant', constant_values=(100, 100))
        print(fp)
        x_data.append(fp)
        print(np.shape(x_data))
        del fp
        print("embedded_data current: ", len(embedded_current))
        print("this is number {}".format(y))
        print("-----------------------------")
        gc.collect()
gc.collect()
print(len(x_data))
print(np.shape(x_data))
print("-"*20)
print(np.shape(y_data))
print(np.size(y_data))
X_train, X_test, y_train, y_test = train_test_split(x_data,y_data,test_size=0.2,random_state=0)
print(np.shape(X_train))
print(np.shape(X_test))
X_train=np.array(X_train)
X_test=np.array(X_test)
print(np.shape(X_train))
print(np.shape(X_test))
print(X_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_train.shape[1], 1))
print(np.shape(X_train))
print(np.shape(X_test))
y_train=np.array(y_train)
y_test=np.array(y_test)
print(len(X_test[0]))
print(np.shape(y_train))
model=Sequential()
model.add(Masking(mask_value=100, input_shape=(10000000,1)))
model.add(LSTM(units=1, return_sequences = True, input_shape=(10000000,1)))
model.add(LSTM(units=1,return_sequences=False))
model.add(Dense(5,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
model.fit(X_train,y_train,epochs=50,batch_size=4,verbose=1)
print(model.predict(X_test))
print("class label:", reverse_label(model.predict_classes(X_test)))
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
model.save('my_model')
Some tutorials mention reshaping the array, so I tried to reshape mine into something like [2701*25000, 10000000/25000, 1], but then I got stuck on the problem that the number of x_data samples and y_data samples no longer match. I also saw model.fit_generator mentioned, but that seems to address a huge number of samples, whereas in my case the model is not even working with a single sample (I am new to neural networks, so I am not sure I am understanding this correctly). I have no clue at this point and would really appreciate any help. Thank you.
Edit: just to state my question clearly: any advice on handling such long inputs with an LSTM?
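A minimal sketch of the reshaping idea described above: if each long sequence is split into fixed-length windows, the label for that sequence has to be repeated once per window so that x and y keep the same number of samples. The helper name and the window length of 1500 are hypothetical choices.
def split_into_windows(x, y, window_len=1500):
    # x: (samples, timesteps, features), y: (samples, n_classes)
    n_samples, n_steps, n_feat = x.shape
    n_windows = n_steps // window_len                         # drop the ragged tail for simplicity
    x = x[:, :n_windows * window_len, :]
    x = x.reshape(n_samples * n_windows, window_len, n_feat)  # consecutive windows per sample
    y = np.repeat(y, n_windows, axis=0)                       # one label copy per window
    return x, y

# e.g. X_small, y_small = split_into_windows(X_train, y_train, window_len=1500)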
I'm trying to run voice-recognition code from GitHub HERE that analyzes voice. There is an example in final_results_gender_test.ipynb that illustrates the steps for both training and inference. So I copied and adjusted the inference part and came up with the following code, which uses the trained model for inference only. But I'm not sure why I get this error complaining that This LabelEncoder instance is not fitted yet.
How can I fix the problem? I'm only doing inference, so why do I need the fit?
Traceback (most recent call last):
File "C:\Users\myname\Documents\Speech-Emotion-Analyzer-master\audio.py", line 53, in <module>
livepredictions = (lb.inverse_transform((liveabc)))
File "C:\Users\myname\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py", line 272, in inverse_transform
check_is_fitted(self, 'classes_')
File "C:\Users\myname\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 914, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.
Here is my copied/adjusted code from the notebook:
import os
from keras import regularizers
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Dense, Embedding, Input, Flatten, Dropout, Activation, LSTM
from keras.models import Model, Sequential, model_from_json
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import librosa
import librosa.display
from matplotlib.pyplot import specgram
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
opt = keras.optimizers.rmsprop(lr=0.00001, decay=1e-6)
lb = LabelEncoder()
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Voice_Detection_Model.h5")
print("Loaded model from disk")
X, sample_rate = librosa.load('h04.wav', res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
sample_rate = np.array(sample_rate)
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),axis=0)
featurelive = mfccs
livedf2 = featurelive
livedf2= pd.DataFrame(data=livedf2)
livedf2 = livedf2.stack().to_frame().T
twodim= np.expand_dims(livedf2, axis=2)
livepreds = loaded_model.predict(twodim, batch_size=32, verbose=1)
livepreds1=livepreds.argmax(axis=1)
liveabc = livepreds1.astype(int).flatten()
livepredictions = (lb.inverse_transform((liveabc)))
print(livepredictions)
I was facing the same problem. It is probably too late for you, but I want to give a solution for anyone who still gets this error.
I was using this code from GitHub.
In the README file you can see this note:
NOTE: If you are using the model directly and want to decode the output ranging from 0 to 9 then the following list will help you.
So since he already gave that list, just delete this part from your code:
livepredictions = (lb.inverse_transform((liveabc)))
print(livepredictions)
Since the list of classes is already given, we don't need to fit a LabelEncoder or call inverse_transform.
So instead of those lines, add these. I prefer to use a dictionary and then print from it.
Sentiments = { 0 : "Female_angry",
1 : "Female Calm",
2 : "Female Fearful",
3 : "Female Happy",
4 : "Female Sad",
5 : "Male Angry",
6 : "Male calm",
7 : "Male Fearful",
8 : "Male Happy",
9 : "Male sad"
}
Way 1: Use a list comprehension to get the value from the dictionary.
Result = [emotions for (number,emotions) in Sentiments.items() if liveabc == number]
print(Result)
Way 2: Or simply loop over the dictionary to get the exact value.
for number, emotions in Sentiments.items():
    if liveabc == number:
        print(emotions)
If you use way 1, it will print ['Male Angry'].
If you use way 2, it will print Male Angry.
So the full code will be like this:
from keras.models import model_from_json
import librosa
import numpy as np
import pandas as pd
Sentiments = { 0 : "Female_angry",
1 : "Female Calm",
2 : "Female Fearful",
3 : "Female Happy",
4 : "Female Sad",
5 : "Male Angry",
6 : "Male calm",
7 : "Male Fearful",
8 : "Male Happy",
9 : "Male sad"
}
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Voice_Detection_Model.h5")
print("Loaded model from disk")
X, sample_rate = librosa.load('output10.wav', res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
sample_rate = np.array(sample_rate)
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),axis=0)
featurelive = mfccs
livedf2 = featurelive
livedf2= pd.DataFrame(data=livedf2)
livedf2 = livedf2.stack().to_frame().T
twodim= np.expand_dims(livedf2, axis=2)
livepreds = loaded_model.predict(twodim,
batch_size=32,
verbose=1)
livepreds1=livepreds.argmax(axis=1)
liveabc = livepreds1.astype(int).flatten()
Result = [emotions for (number,emotions) in Sentiments.items() if liveabc == number]
print(Result)
I'm very new to TensorFlow. I've attended an online course, but I still have many questions related to data pre-processing. I would really appreciate it if someone could help me out!
My goal is to train a model that classifies Portuguese nouns into two gender categories (feminine and masculine) based on their internal structure. So, for this, I have a file containing about 4300 nouns and their categories (F and M labels).
First question:
I have opened the nouns file, first tokenized the words, and then padded them. I have put the labels in a separate file. The labels file is a txt list containing the labels 'f' and 'm'. I've converted them into 0 and 1 integers and then converted them into a numpy array. I've also converted the padded nouns into a numpy array. Is that correct?
What is strange is that I have set the number of epochs to 100, but the program keeps training…
Second question:
How can I split my training data and labels into test and test_labels?
My code so far is below:
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize,wordpunct_tokenize
import re
import os
import sys
from pathlib import Path
import numpy as np
import numpy
import tensorflow as tf
while True:
    try:
        file_to_open = Path(input("Please, insert your file path: "))
        with open(file_to_open, 'r', encoding="utf-8") as f:
            words = f.read()
        break
    except FileNotFoundError:
        print("\nFile not found. Better try again")
    except IsADirectoryError:
        print("\nIncorrect Directory path.Try again")
corpus=words.split('\n')
labels = []
new_labels=[]
nouns = []
for i in corpus:
    if i == '0':
        labels.append(i)
    elif i == '1':
        labels.append(i)
    else:
        nouns.append(i)

for x in labels:
    new_labels.append(int(x))

training_labels = numpy.array(new_labels)
training_nouns = []
for w in nouns:
    a = list(w)
    b = ' '.join([str(elem) for elem in a]) + ',' + ' '
    training_nouns.append(b)
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_nouns)
word_index = tokenizer.word_index
nouns_sequences = tokenizer.texts_to_sequences(training_nouns)
padded = pad_sequences(nouns_sequences,maxlen=max_length)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(36, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
training_padded = np.array(padded)
num_epochs = 150
model.fit(training_padded, training_labels, epochs=num_epochs)
If you don't have to use only TensorFlow, you can use scikit-learn's train_test_split function like this (and then continue with TensorFlow for the model):
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(YOUR_DATA, YOUR_LABELS)
see here for more information.
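Applied to the arrays built in the question, a minimal sketch (the test_size and random_state values here are arbitrary choices):
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded nouns and their labels for testing.
train_data, test_data, train_labels, test_labels = train_test_split(
    training_padded, training_labels, test_size=0.2, random_state=42)

model.fit(train_data, train_labels,
          epochs=num_epochs,
          validation_data=(test_data, test_labels))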
I'm working on my senior project at my university and I have only 2 days to fix this problem. I created a hand-gesture recognizer using a CNN in Python. I used 78000 images with 50x50 px values. But I got stuck in the last part of my model: I cannot improve my accuracy. When I start training with 100 epochs, the first 15 epochs show an accuracy of 0.039, which is horrible, so I don't wait for the training to finish. Maybe it happens because of the parameters of Conv2D or the pooling layers, because I don't know how to choose the correct values for Conv2D, pooling, etc.
I'm new to this and could not fix the problem. If you can help me, I will be grateful.
The code I wrote is given below:
from keras.models import Sequential
from keras.layers import Convolution2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import pickle
import cv2
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
from numpy import asarray
DATADIR = "asl_alphabet_train"
CATEGORIES = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]
X_train = []
y_train = []
X_test=[]
y_test=[]
IMG_SIZE=50
def create_training_data():
    for category in CATEGORIES:
        path = os.path.join(DATADIR, category)   # path to the folder for this letter
        class_num = CATEGORIES.index(category)   # numeric class label (0-25)
        for img in tqdm(os.listdir(path)):       # iterate over each image in the folder
            try:
                img_array = cv2.imread(os.path.join(path, img))  # read the image as an array
                # new_array = cv2.resize(img_array, (28, 50))    # resize to normalize data size
                X_train.append(img_array)         # add this image to our training data
                y_train.append(class_num)         # add its class label
            except Exception as e:                # in the interest of keeping the output clean...
                pass

create_training_data()
X_train = asarray(X_train)
y_train = asarray(y_train)
"""
nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))
"""
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2,random_state=0)
N = y_train.size
M = y_train.max()+1
resultArray = np.zeros((N,M),int)
idx = (np.arange(N)*M) + y_train
resultArray.ravel()[idx] = 1
y_train=resultArray
classifier=Sequential()
#convolution step
classifier.add(Convolution2D(filters=96, input_shape=(50,50,3), kernel_size=(11,11), padding='valid',activation="relu"))
#pooling step
classifier.add(MaxPooling2D(pool_size=(2,2)))
#convolution step
classifier.add(Convolution2D(filters=256,kernel_size=(11,11),padding="valid",activation="relu"))
#pooling step
classifier.add(MaxPooling2D(pool_size=(2,2)))
classifier.add(Convolution2D(filters=384,kernel_size=(3,3),padding="valid",activation="relu"))
classifier.add(MaxPooling2D(pool_size=(2,2)))
#flatten step
classifier.add(Flatten())
#Dense(Fully connected step)
classifier.add(Dense(128, activation="relu"))
#Dropout to decrease the possibility of overfitting
classifier.add(Dropout(0.5))
#Dense to determine the output
classifier.add(Dense(26, activation="softmax"))
#compile step
classifier.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
classifier.fit(X_train,y_train,epochs=100,batch_size=32)
filename="CNN_TEST.sav"
pickle.dump(classifier, open(filename, 'wb'))
y_pred=classifier.predict(X_test)
print(y_pred)
I would recommend the following:
1) Reduce the kernel size in the first two convolutional layers of your model.
2) I believe the MaxPooling layer is not necessary after every convolutional layer. Do verify this.
3) A Dropout of 0.5 could drop out a large number of essential neurons; you might want to lower it.
4) Vary the number of epochs and see how your model performs each time.
Plot "train accuracy vs val accuracy" and "train loss vs val loss" at each attempt and see if your model overfits or underfits.
I am following the pretrained_word_embeddings example and am saving the model using the following piece of code:
print('Saving model to disk ...')
model.save('/home/data/pretrained-model.h5')
I am then loading the pretrained model using
pretrained_model = load_model('/home/data/pretrained-model.h5')
Later, I use the following piece of code to predict on a different text altogether:
predict_texts = []  # list of text samples
for predict_name in sorted(os.listdir(PREDICT_TEXT_DATA_DIR)):
    predict_path = os.path.join(PREDICT_TEXT_DATA_DIR, predict_name)
    if os.path.isdir(predict_path):
        for predict_fname in sorted(os.listdir(predict_path)):
            if predict_fname.isdigit():
                predict_fpath = os.path.join(predict_path, predict_fname)
                if sys.version_info < (3,):
                    f = open(predict_fpath)
                else:
                    f = open(predict_fpath, encoding='latin-1')
                predict_text = f.read()
                i = predict_text.find('\n\n')  # skip header
                if 0 < i:
                    predict_text = predict_text[i:]
                predict_texts.append(predict_text)
                f.close()

print('Found %s texts.' % len(predict_texts))
tokenizer.fit_on_texts(predict_texts)
predict_sequences = tokenizer.texts_to_sequences(predict_texts)
predict_data = pad_sequences(predict_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of predict data tensor:', predict_data.shape)
x_predict = predict_data
y_predict = pretrained_model.predict(x_predict)
max_val = np.argmax(y_predict)
print('Category it belongs to : ',max_val)
The problem I am facing now is that each time I run the above piece of code, max_val is a different value.
How can I make the predictions consistent?
I think you should predict one text at a time, not merge the texts from all the files.
The following code, which I tested, works:
from __future__ import print_function
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.text import text_to_word_sequence
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
model = load_model('embedding.h5')
PREDICT_TEXT_DATA_DIR = 'predict_data'
predict_path = os.path.join(PREDICT_TEXT_DATA_DIR, '1.txt')
f = open(predict_path, encoding='utf-8')
predict_text = f.read()
f.close()
texts=[predict_text]
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_predict = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of predict data tensor:', x_predict.shape)
y_predict = model.predict(x_predict)
max_val = np.argmax(y_predict)
print('Category it belongs to : ',max_val)
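A further sketch, as an assumption rather than part of the tested code above: if the goal is predictions that line up with the word indices used during training, a common approach is to save the fitted training Tokenizer and reload it at prediction time instead of fitting a new one on the prediction texts (the file name tokenizer.pickle is hypothetical):
import pickle

# At training time: persist the fitted tokenizer alongside the model.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# At prediction time: reload it and reuse it, without calling fit_on_texts again.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

sequences = tokenizer.texts_to_sequences(texts)
x_predict = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)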