Handling extremely long timestep sequences in LSTM (NLP multi-label classification) - Python

This is my first time asking a question on Stack Overflow, so sorry if I am not asking in the correct format. I am working with extremely long timestep sequences (10,000,000 steps), with 2701 samples and only one feature, so my input array has shape [2701, 10000000, 1] and my dataset looks like
[ 2.81143e-01 4.98219e-01 -8.08500e-03 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 1.95077e-01 2.20920e-02 -1.68663e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
...
[ 1.06033e-01 8.96650e-02 -3.20860e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 6.85510e-02 -3.83653e-01 -2.19265e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]
[ 2.51404e-01 8.02280e-02 2.84610e-01 ... 1.00000e+02 1.00000e+02
1.00000e+02]]
However, from what I have read, LSTM networks usually perform best with roughly 200~400 time steps. Even ignoring performance, I cannot successfully train on even a single sample of shape [1, 10000000, 1]. I believe the network itself is functional, because when I limited the length of each sample to 1500 (giving [2701, 1500, 1]) it finally stopped getting stuck at the first epoch. Below is my code, if needed:
from keras.utils import Sequence
import numpy as np
from numpy.lib.format import open_memmap
import gc
import platform
import pandas as pd
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Masking
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC' #A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
stock_price=pd.read_csv("C:/Users/user/Desktop/Final-Year-Project-master/stock_classification_7Days_test.csv",sep=',', dtype={"ID":"string","Class":int})
print(stock_price)
print (platform.architecture())
y_data=[]
x_data=[]
y_data=pd.get_dummies(stock_price['Class'])
def embedded_reader(file_path):
    with open(file_path) as embedded_raw:
        for line in embedded_raw:
            for word in line.split(','):
                try:
                    val=float(word)
                    yield val
                except:
                    pass
    embedded_raw.close()
    gc.collect()
for y in range(len(stock_price)):
    if int(stock_price.at[y,'Class']) is not None:
        i = stock_price.at[y,'ID']
        print("Company code current: ",i)
        embedded_current=[]
        try:
            gen=embedded_reader("C:/Users/user/Desktop/Final-Year-Project-master/json_test/{}.jsonl".format(i))
            while True:
                val=next(gen)
                embedded_current.append(val)
        except:
            pass
        fp=np.memmap('embedded_array.mymemmap', dtype=np.uint8,mode='w+',shape=(1,))
        fp=np.delete(fp,0)
        fp=np.concatenate((fp,embedded_current),axis=0)
        fp=np.pad(fp, (0,(10000000-len(embedded_current))), 'constant', constant_values=(100, 100))
        print(fp)
        x_data.append(fp)
        print(np.shape(x_data))
        del fp
        print("embedded_data current: ",len(embedded_current))
        print("this is number {}".format(y))
        print("-----------------------------")
        gc.collect()
gc.collect()
print(len(x_data))
print(np.shape(x_data))
print("-"*20)
print(np.shape(y_data))
print(np.size(y_data))
X_train, X_test, y_train, y_test = train_test_split(x_data,y_data,test_size=0.2,random_state=0)
print(np.shape(X_train))
print(np.shape(X_test))
X_train=np.array(X_train)
X_test=np.array(X_test)
print(np.shape(X_train))
print(np.shape(X_test))
print(X_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_train.shape[1], 1))
print(np.shape(X_train))
print(np.shape(X_test))
y_train=np.array(y_train)
y_test=np.array(y_test)
print(len(X_test[0]))
print(np.shape(y_train))
model=Sequential()
model.add(Masking(mask_value=100, input_shape=(10000000,1)))
model.add(LSTM(units=1, return_sequences = True, input_shape=(10000000,1)))
model.add(LSTM(units=1,return_sequences=False))
model.add(Dense(5,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
model.fit(X_train,y_train,epochs=50,batch_size=4,verbose=1)
print(model.predict(X_test))
print("class label:", reverse_label(model.predict_classes(X_test)))
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
model.save('my_model')
Some tutorials mention reshaping the array, so I tried to reshape mine into something like [2701*25000, 10000000/25000, 1], but then I got stuck on the problem that the number of x_data samples and y_data samples no longer match. I also saw model.fit_generator mentioned, but that seems to address the problem of a huge number of samples, whereas in my case the model does not even work on a single sample (I am new to neural networks, so I may be misunderstanding this). I am totally out of ideas and would really appreciate any help, thank you.
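To make the reshape idea concrete, this is roughly the window split I was attempting (a rough sketch only; the window length of 400 and repeating the label once per window are my own guesses):
import numpy as np

def split_into_windows(x, y, window):
    """Split each long sequence into fixed-length windows and repeat its
    label for every window, so x and y keep the same number of samples."""
    n_samples, seq_len, n_features = x.shape
    n_windows = seq_len // window
    x = x[:, :n_windows * window, :]                        # drop the ragged tail
    x = x.reshape(n_samples * n_windows, window, n_features)
    y = np.repeat(y, n_windows, axis=0)                     # one label copy per window
    return x, y

# Toy example with fake data in place of my real [2701, 10000000, 1] array:
x_demo = np.random.rand(4, 2000, 1)
y_demo = np.eye(5)[np.random.randint(0, 5, size=4)]         # one-hot labels, 5 classes
x_win, y_win = split_into_windows(x_demo, y_demo, 400)
print(x_win.shape, y_win.shape)                             # (20, 400, 1) (20, 5)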
Edit: just to state my question clearly: "any advice on handling such long input with an LSTM?"

Related

Data preparation for neural network in Python

I want to learn how to prepare data for training samples in Python. I found a simple example of a neural network that predicts the stock price. At the moment I am not interested in the accuracy of the trained network; I am interested in how to take arbitrary data and prepare it for feeding into the neural network.
As an example, I took stock data over the past 5 years. As planned, the neural network accepts data for the last 50 days as input and predicts the price for the next 5 days. To do this, I read the .csv file and processed the data so that after the transformation I got two dataframes: the first holds the input data and the second the output.
The problem is, no matter what I do, I keep getting errors and so I cannot complete the training. What am I doing wrong? The code is shown below:
import matplotlib.pylab as plt
import torch
import random
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
import pandas_profiling as pprf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import Activation, Input, MaxPooling1D, Dropout
from tensorflow.keras.layers import AveragePooling1D, Conv1D, Flatten
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.utils import plot_model
from IPython.display import display, Image
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
data = pd.read_csv('F:\\YNDX_ME.csv')[::]
data = data.drop('Date',axis=1)
data = data.drop('Adj Close',axis=1)
data = data.drop(np.where(data['Volume'] == 0)[0])
data = data.reset_index(drop=True)
#profiler = pprf.ProfileReport(data)
#profiler.to_file(r'F:\profiling.html')
days_edu = 50
days_pred = 5
df_edu_list = []
for i in range(len(data.index)-days_edu-days_pred+1):
    df_temp = []
    for j in range(days_edu):
        df_temp.extend(data.loc[i+j,:].tolist())
    df_edu_list.append(df_temp)
df_edu_out_list = []
for i in range(len(data.index)-days_edu-days_pred+1):
    df_temp = []
    for j in range(5):
        df_temp.extend(data.loc[i+j+days_edu,:].tolist())
    df_edu_out_list.append(df_temp)
df_edu_train = pd.DataFrame(df_edu_list[:int(len(df_edu_list)*0.8)])
df_edu_val = pd.DataFrame(df_edu_list[int(len(df_edu_list)*0.8):])
df_edu_train_out = pd.DataFrame(df_edu_out_list[:int(len(df_edu_out_list)*0.8)])
df_edu_val_out = pd.DataFrame(df_edu_out_list[int(len(df_edu_out_list)*0.8):])
df_edu_train = normalize(df_edu_train.values)
df_edu_val = normalize(df_edu_val.values)
df_edu_train_out = normalize(df_edu_train_out.values)
df_edu_val_out = normalize(df_edu_val_out.values)
df_edu_train = np.expand_dims(df_edu_train,axis=0)
df_edu_train_out = np.expand_dims(df_edu_train_out,axis=0)
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=5, padding="same", strides=1, input_shape= (959,250),data_format='channels_first'))
model.add(Conv1D(32, 5))
model.add(Dropout(0.3))
model.add(Conv1D(16, 5))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(25, activation=None))
optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(optimizer=optimizer, loss='mae', metrics=['accuracy'])
EPOCHS = 1000
model.fit(df_edu_train, df_edu_train_out, epochs=EPOCHS)
Error:
InvalidArgumentError: Conv2DCustomBackpropFilterOp only supports NHWC.
[[node gradient_tape/sequential/conv1d/Conv1D/Conv2DBackpropFilter
(defined at C:\Users\nick0\anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:464)
]] [Op:__inference_train_function_1046]
Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/sequential/conv1d/Conv1D/Conv2DBackpropFilter:
In[0] sequential/conv1d/Conv1D/ExpandDims (defined at C:\Users\nick0\anaconda3\lib\site-packages\keras\layers\convolutional.py:231)
In[1] gradient_tape/sequential/conv1d/Conv1D/ShapeN:
In[2] gradient_tape/sequential/conv1d/Conv1D/Reshape:
Update:
Changed data_format='channels_first' to data_format='channels_last'. The training began, but as far as I understand, the training ran on the entire training set at once, i.e. the neural network thought there was only one example and trained on just that. How do I make the neural network take each row in turn? Is each row essentially a separate example?
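For clarity, I suspect the single-example behaviour comes from the np.expand_dims calls, which turn the whole training table into one sample of shape (1, 959, 250). Something along these lines is probably what is needed instead (a guess on my part, assuming 5 features per day and 50 days per sample):
import numpy as np

days_edu = 50   # days of history per sample
n_features = 5  # Open, High, Low, Close, Volume (assumed)

# Instead of adding a batch dimension of 1, give every row its own
# (timesteps, features) shape so Keras sees each row as a separate example.
x_train = df_edu_train.reshape(-1, days_edu, n_features)   # -> (n_samples, 50, 5)
y_train = df_edu_train_out                                 # -> (n_samples, 25)

# The first Conv1D would then take input_shape=(days_edu, n_features)
# instead of (959, 250), and model.fit(x_train, y_train, ...) trains per row.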

The accuracy problem of hand sign gestures recognition with using CNN in Python

I'm working on my senior project at my university and I have only 2 days to fix this problem. I created a hand gesture recognition model using a CNN in Python. I used 78,000 images with 50x50px values. But I got stuck in the last part of my model: I cannot improve my accuracy. When I start to train the data with 100 epochs, the first 15 epochs show 0.039 accuracy, which is horrible, so I do not wait for the training to finish. Maybe it happens because of the values in the Conv2D or pooling layers, because I don't know how to choose the correct values for Conv2D, pooling, etc.
I'm new and I could not fix the problem. If you help me, I will be grateful.
The code I wrote is given below:
from keras.models import Sequential
from keras.layers import Convolution2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import pickle
import cv2
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
from numpy import asarray
DATADIR = "asl_alphabet_train"
CATEGORIES = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]
X_train = []
y_train = []
X_test=[]
y_test=[]
IMG_SIZE=50
def create_training_data():
    for category in CATEGORIES:
        path = os.path.join(DATADIR, category)  # path to the folder for this letter
        class_num = CATEGORIES.index(category)  # numeric label for this letter (0-25)
        for img in tqdm(os.listdir(path)):  # iterate over each image in the folder
            try:
                img_array = cv2.imread(os.path.join(path, img))  # read the image as an array
                #new_array = cv2.resize(img_array, (28, 50))  # resize to normalize data size
                X_train.append(img_array)  # add the image to our training data
                y_train.append(class_num)  # add the matching label
            except Exception as e:  # in the interest of keeping the output clean...
                pass
create_training_data()
X_train = asarray(X_train)
y_train = asarray(y_train)
"""
nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))
"""
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2,random_state=0)
N = y_train.size
M = y_train.max()+1
resultArray = np.zeros((N,M),int)
idx = (np.arange(N)*M) + y_train
resultArray.ravel()[idx] = 1
y_train=resultArray
classifier=Sequential()
#convolution step
classifier.add(Convolution2D(filters=96, input_shape=(50,50,3), kernel_size=(11,11), padding='valid',activation="relu"))
#pooling step
classifier.add(MaxPooling2D(pool_size=(2,2)))
#convolution step
classifier.add(Convolution2D(filters=256,kernel_size=(11,11),padding="valid",activation="relu"))
#pooling step
classifier.add(MaxPooling2D(pool_size=(2,2)))
classifier.add(Convolution2D(filters=384,kernel_size=(3,3),padding="valid",activation="relu"))
classifier.add(MaxPooling2D(pool_size=(2,2)))
#flatten step
classifier.add(Flatten())
#Dense(Fully connected step)
classifier.add(Dense(output_dim=128,activation="relu"))
#Dropout to decrease the possibility of overfitting
classifier.add(Dropout(0.5))
#Dense to determine the output
classifier.add(Dense(output_dim=26,activation="softmax"))
#compile step
classifier.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
classifier.fit(X_train,y_train,epochs=100,batch_size=32)
filename="CNN_TEST.sav"
pickle.dump(classifier, open(filename, 'wb'))
y_pred=classifier.predict(X_test)
print(y_pred)
I would recommend the following:
1) Reduce the kernel size in the first two convolutional layers of your model.
2) I believe the MaxPooling layer is not necessary after every convolutional layer. Do verify this.
3) A Dropout of 0.5 could drop a large number of essential neurons; you might want to lower it.
4) Vary the number of epochs and see how your model performs each time.
Plot "train accuracy vs val accuracy" and "train loss vs val loss" at each attempt and see whether your model overfits or underfits, as in the sketch below.

Keras model predicting wrong values(accuracy: 0.0000e+00)

Cue the cliché "This is my first Keras project", but alas, it is the truth. I apologize in advance for any cringeworthy beginner mistakes.
How is the data setup
Column A: We capture the time a given train is scheduled to depart in 24-hour time format.
Column B: An integer representation of the given train's destination, e.g. California == 2, New York == 0.
Column C: The track assigned to the given train.
Screenshot of data setup
GOAL
Using this data, can we predict the track number from the time and location?
Current Attempt
# multivariate one step problem
from numpy import array
from numpy import hstack
from numpy import insert
from numpy import zeros,newaxis
from numpy import reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import pandas as pd
file_name = "DATA_DUMP.csv"
destination = 0
departure = 1622
#extract values
raw_data = pd.read_csv(file_name)
data = raw_data
in_seq1 = array([data['TIME'].values])
in_seq2 = array([data['LOCATION'].values])
result = array([data['TRACK'][0:-1].values])
# reshape series
in_seq1 = in_seq1.reshape((in_seq1.shape[1],len(in_seq1)))
in_seq2 = in_seq2.reshape((in_seq2.shape[1],len(in_seq2)))
result = result.reshape((result.shape[1],len(result)))
dataset = hstack((in_seq1, in_seq2))
result = insert(result,0,0)
result = result.reshape((len(result),1))
# define generator
n_features = dataset.shape[1]
n_input = 1
generator = TimeseriesGenerator(dataset, result, length=n_input, batch_size=1)
for i in range(len(generator)):
    x, y = generator[i]
    print('%s => %s' % (x, y))
# define model
model = Sequential()
model.add(LSTM(100, activation='sigmoid', input_shape=(n_input, n_features)))
model.add(Dense(1))
model.compile(optimizer=Adam(lr=0.00001), loss='mse',metrics=['accuracy'])
# fit model
model.fit_generator(generator, steps_per_epoch=1, epochs=500, verbose=2)
# make a one step prediction out of sample
raw_array = array([1507,3]) #predict arrival at 15:07 destination 3, what track will it be?
x_input = array(raw_array).reshape((1,n_input,n_features))
yhat = model.predict(x_input, verbose=1)
print(yhat)
The Problem
Although my code runs, I am getting extremely inaccurate predictions. I'm assuming this is due to my large loss. Any help in getting this model up and running would be greatly appreciated.
Epoch 500/500
1/1 - 0s - loss: 424.2032 - accuracy: 0.0000e+00

Invalid Argument Error while training in Keras

I am trying to make a movie recommendation model using keras:
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
from keras.layers import Input, Embedding, Dot, Flatten
rating = pd.read_csv("./ratings.csv",usecols=[0,1,2])
users = len(rating.userId.unique())
movies = len(rating.movieId.unique())
embed_size = 3
train, test = train_test_split(rating, test_size=0.2)
movie_input = Input(shape=[1], name="movie_in")
movie_embed = Embedding(movies, embed_size, name="movie_embed")(movie_input)
movie_vector = Flatten(name="flatten_movies")(movie_embed)
user_input = Input(shape=[1], name="user_in")
user_embed = Embedding(users, embed_size, name="user_embed")(user_input)
user_vector = Flatten(name="flatten_users")(user_embed)
prod = Dot(axes=-1, name="dot-product")([movie_vector, user_vector])
model = keras.Model(inputs=[user_input, movie_input], outputs=prod)
model.compile(optimizer='adam', loss='mse')
model.fit(x=[train.userId, train.movieId], y=train.rating,epochs=10,
verbose=0)
When I try to train the model I am getting the following error:
tensorflow.python.framework.errors_impl.InvalidArgumentError:
indices[15,0]= 7438 is not in [0, 5000)
[[{{node movie_embed/embedding_lookup}} = GatherV2[Taxis=DT_INT32,
Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:#training/Adam/Assign_2"],
_device="/job:localhost/replica:0/task:0/device:CPU:0"]
(movie_embed/embeddings/read, movie_embed/Cast,
training/Adam/gradients/movie_embed/embedding_lookup_grad/concat/axis)]]
But most of the online tutorials use the same code, it works properly for them.
Your movie_embed embedding layer (basically a lookup table) has 5000 rows, so it expects integers between 0 and 5000 as input. You are giving it 7438 as input, which causes the error. There are probably 5000 unique values in rating.movieId, but apparently also values outside the interval [0, 5000). You'll need to map your train.movieId integers (and, for the same reason, train.userId) onto contiguous indices in this interval to make it work, for example as sketched below.
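A minimal sketch of that remapping, assuming you are happy to use pandas.factorize to assign contiguous 0-based codes (do this on the full ratings frame before the train/test split so train and test share the same mapping):
import pandas as pd

rating = pd.read_csv("./ratings.csv", usecols=[0, 1, 2])

# factorize replaces the raw IDs with contiguous codes 0..n_unique-1,
# which is exactly the index range the Embedding layers expect.
rating["userId"], user_levels = pd.factorize(rating["userId"])
rating["movieId"], movie_levels = pd.factorize(rating["movieId"])

users = rating["userId"].nunique()    # matches Embedding(users, ...)
movies = rating["movieId"].nunique()  # matches Embedding(movies, ...)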

Exploding Gradient on fully connected layer

I am trying to learn on the MNIST dataset using a deep learning model.
My model has the following structure:
Input (28*28*1)
Conv2D (14*14*32)
Conv2D (7*7*64), flattened
FC (3136*1024)
FC (1024*10)
10-class prediction of MNIST
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
mnist = tf.keras.datasets.mnist
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
y_train,y_test=one_hot(y_train),one_hot(y_test)
x_train=np.reshape(x_train,[x_train.shape[0],28,28,1])
x_test=np.reshape(x_test,[x_test.shape[0],28,28,1])
x_dataset=tf.data.Dataset.from_tensor_slices(x_train)
y_dataset=tf.data.Dataset.from_tensor_slices(y_train)
train_dataset=tf.data.Dataset.zip((x_dataset,y_dataset)).repeat().batch(50)
iterator=tf.data.Iterator.from_structure(train_dataset.output_types,train_dataset.output_shapes)
next_element=iterator.get_next()
training_init_op=iterator.make_initializer(train_dataset)
x_testds=tf.data.Dataset.from_tensor_slices(x_test)
y_testds=tf.data.Dataset.from_tensor_slices(y_test)
testds=tf.data.Dataset.zip((x_testds,y_testds)).repeat().batch(2000)
valid_inti_op=iterator.make_initializer(testds)
##%%##
def one_hot(y_train):
    y_train1=np.zeros((y_train.shape[0],10))
    for i in range(y_train.shape[0]):
        y_train1[i][y_train[i]]=1
    return y_train1
def conv_layer(input,channels_in,channels_out,name="conv"):
    with tf.name_scope(name):
        input=tf.cast(input,tf.float32)
        w=tf.Variable(tf.truncated_normal([5,5,channels_in,channels_out],stddev=0.1),name="W")
        b=tf.Variable(tf.truncated_normal([channels_out],stddev=0.1),name="B")
        conv=tf.nn.conv2d(input,w,strides=[1,1,1,1],padding="SAME")
        act=tf.nn.relu(conv+b)
        tf.summary.histogram("weights",w)
        tf.summary.histogram("biases",b)
        tf.summary.histogram("activation",act)
        return act
def fc_layer(input,channels_in,channels_out,name="fc"):
    with tf.name_scope(name):
        w=tf.Variable(tf.truncated_normal([channels_in,channels_out],stddev=0.1),name="W")
        b=tf.Variable(tf.zeros([channels_out]),name="B")
        act=tf.nn.relu(tf.matmul(input,w)+b)
        tf.summary.histogram("weights",w)
        tf.summary.histogram("biases",b)
        tf.summary.histogram("activation",act)
        return act
conv1=conv_layer(next_element[0],1,32,"conv1")
pool1=tf.nn.max_pool(conv1,ksize=[1,2,2,1],strides=[1,2,2,1],padding="SAME",name="pool1")
conv2=conv_layer(pool1,32,64,"conv2")
pool2=tf.nn.max_pool(conv2,ksize=[1,2,2,1],strides=[1,2,2,1],padding="SAME",name="pool2")
flattened=tf.reshape(pool2,[-1,7*7*64])
fc1=fc_layer(flattened,7*7*64,1024,"fc1")
logits=fc_layer(fc1,1024,10,"fc2")
##%%##
with tf.name_scope("cross_entropy"):
cross_entropy=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,labels=next_element[1]))
with tf.name_scope("train"):
train_step=tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope("accuracy"):
correct_prediction=tf.equal(tf.argmax(logits,1),tf.argmax(next_element[1],1))
accuracy=tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
##%%
sess=tf.Session()
tf.summary.scalar('cross_entropy',cross_entropy)
tf.summary.scalar('accuracy',accuracy)
tf.summary.image('input',next_element[0])
merged_summary=tf.summary.merge_all()
writer=tf.summary.FileWriter("D:/work/ML/tensorboard/MNIST/deep/4")
writer.add_graph(sess.graph)
##%%
sess.run(tf.global_variables_initializer())
sess.run(training_init_op)
for i in range(600):
    s=sess.run(merged_summary)
    if(i%5==0):
        writer.add_summary(s,i)
    print(i,end="\r")
sess.run(valid_inti_op)
for i in range(1,6):
    s1=sess.run(merged_summary)
    writer.add_summary(s1,601+i)
My accuracy and cross_entropy are stuck. After trying to use TensorBoard, the issue seems to be that the weights of my FC layers are stuck at very large values, even though I have initialised them to 0. If this really is the error, then I don't know how to fix it, and if it isn't, then I don't know what the error is.
Adding a sess.run(train_step) call after feeding in the data fixed my code.
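For reference, a minimal sketch of that fix applied to the training loop from the question (only the loop body changes):
sess.run(tf.global_variables_initializer())
sess.run(training_init_op)
for i in range(600):
    # Run the optimizer together with the summaries; without train_step
    # the weights never update and the logged metrics stay stuck.
    _, s = sess.run([train_step, merged_summary])
    if i % 5 == 0:
        writer.add_summary(s, i)
    print(i, end="\r")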
