I built a multiple-input, single-output LSTM that estimates the total price from a dataset of daily hotel room rates by month, but the model does not perform well. The model code and a link to the dataset are below.
# Read the hotels CSV into a DataFrame; the bare name on the last line is an
# expression whose value is shown when run as a notebook cell.
data = pd.read_csv("/content/drive/My Drive/hotels.csv")
data
enter image description here
# Keep only the four columns the model uses and summarise them.
new_data = data.loc[:, ['date', 'days', 'price', 'total']]
new_data.info()

# The date strings are '/'-separated; keep only the part before the first '/'.
date = new_data.date.values
dates = [raw_date.split('/')[0] for raw_date in date]
new_data['date'] = dates
new_data
enter image description here
# Cast every column to float32, then print the frame summary again.
new_data = new_data.astype('float32')
new_data.info()
enter image description here
import pickle

# Write inside a `with` block so the data is flushed and the handle closed
# before the file is read back.
with open(b"Hotels.obj", "wb") as filehandler:
    pickle.dump(new_data, filehandler)

# NOTE(review): the write uses a relative path and the read an absolute one;
# they refer to the same file only when the working directory is /content.
with open("/content/Hotels.obj", 'rb') as file:
    object_file = pickle.load(file)
object_file
enter image description here
from math import sqrt
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Concatenate
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop,Adam
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import datetime
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from packaging import version
# Print the installed TensorFlow version and fail if its major version is below 2.
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
"This notebook requires TensorFlow 2.0 or above."
file = open('/content/Hotels.obj', 'rb')

# The first 76% of the rows are used for training, the remainder for testing.
train_size = int(len(object_file) * 0.76)
test_size = len(object_file) - train_size

# Each column as an (n_rows, 1) array, the layout MinMaxScaler expects.
days = object_file["days"].values.reshape(-1, 1)
price = object_file["price"].values.reshape(-1, 1)
total = object_file["total"].values.reshape(-1, 1)
date = object_file["date"].values.reshape(-1, 1)

# One scaler per column. Re-fitting a single scaler on each column in turn
# leaves it holding only the range of the column fitted last, so it could not
# be used to un-scale predictions of `total`.
days_ = MinMaxScaler(feature_range=(0, 1)).fit_transform(days)
price_ = MinMaxScaler(feature_range=(0, 1)).fit_transform(price)
date_ = MinMaxScaler(feature_range=(0, 1)).fit_transform(date)

# `scaler` is the scaler of the target column; it is the one to use with
# inverse_transform on model predictions.
scaler = MinMaxScaler(feature_range=(0, 1))
total_ = scaler.fit_transform(total)

# Inputs are shaped (samples, timesteps=1, features=1); targets are (samples, 1).
days_train = days_[:train_size].reshape(train_size, 1, 1)
days_test = days_[train_size:].reshape(test_size, 1, 1)
date_train = date_[:train_size].reshape(train_size, 1, 1)
date_test = date_[train_size:].reshape(test_size, 1, 1)
price_train = price_[:train_size].reshape(train_size, 1, 1)
price_test = price_[train_size:].reshape(test_size, 1, 1)
total_train = total_[:train_size].reshape(train_size, 1)
total_test = total_[train_size:].reshape(test_size, 1)
def buildModel(dataLength, labelLength):
    """Build and compile the three-input, single-output LSTM regressor.

    Each of the `date`, `days` and `price` inputs has shape (1, 1) and feeds
    its own 100-unit LSTM; the three LSTM outputs are concatenated and passed
    to one Dense layer with `labelLength` units.

    Args:
        dataLength: Not used by the function; kept so existing calls still work.
        labelLength: Number of units in the output layer.

    Returns:
        A compiled `tf.keras.Model` taking the inputs in the order
        [date, days, price].
    """
    date = tf.keras.Input(shape=(1, 1), name='date')
    days = tf.keras.Input(shape=(1, 1), name='days')
    price = tf.keras.Input(shape=(1, 1), name='price')

    dateLayers = LSTM(100, return_sequences=False)(date)
    daysLayers = LSTM(100, return_sequences=False)(days)
    priceLayers = LSTM(100, return_sequences=False)(price)

    output = tf.keras.layers.concatenate(
        inputs=[dateLayers, daysLayers, priceLayers], axis=1)
    output = Dense(labelLength, activation='relu',
                   name='weightedAverage_output_3')(output)

    model = Model(inputs=[date, days, price], outputs=[output])
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
    # The target is continuous, so report mean absolute error; accuracy only
    # makes sense for classification.
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model
# `file` was opened before the preprocessing step; loading inside `with`
# closes the handle as soon as the pickled frame has been read.
with file:
    object_file = pickle.load(file)

# Timestamped directory so each run writes its own TensorBoard logs.
logdir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

rnn = buildModel(train_size, 1)
rnn.fit(
    [date_train, days_train, price_train],
    [total_train],
    validation_data=([date_test, days_test, price_test], [total_test]),
    epochs=1,
    batch_size=10,
    callbacks=[tensorboard_callback],
)

result = rnn.predict([date_test, days_test, price_test])
# NOTE(review): this returns totals in their original units only if `scaler`
# was last fitted on the `total` column — verify where `scaler` is fitted.
scaler.inverse_transform(result)
enter image description here
When I increase the number of epochs, the model overfits, and I can't get the result I want. How can I fix this?
Data set link : https://www.kaggle.com/leomauro/argodatathon2019#hotels.csv
Your results are poor because your metrics is accuracy. If I understand correctly, you're predicting a continuous variable — you're not classifying. So, it makes no sense to look at accuracy.
Use a regression metric instead, such as mae (mean absolute error). I think you'll be satisfied with your model's performance then.
Re-scaling your target makes no sense here. It's the inner workings of the neural network that prefer an input between 0 and 1.
Related
I'm trying to build a prediction model using the Cleveland dataset from kaggle.com. I want to use ShuffleNet in place of the deep learning model shown below. All the examples I have seen use image datasets, so I need guidance on how to go about this with non-image datasets like the Cleveland or SAHeart heart disease datasets.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score
from keras.models import Sequential
from keras.layers import Dense
############# main ################
if __name__ == '__main__':
    lc = "heart.csv"
    dataset = pd.read_csv(lc)

    # Every column except "target" is a predictor.
    predictors = dataset.drop("target", axis=1)
    target = dataset["target"]
    X_train, X_test, Y_train, Y_test = train_test_split(
        predictors, target, test_size=0.20, random_state=0)

    # One hidden layer; the input width follows the number of predictor
    # columns instead of being hard-coded.
    model = Sequential()
    model.add(Dense(11, activation='relu', input_dim=predictors.shape[1]))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, Y_train, epochs=300)

    # Predict: round each sigmoid output to a 0/1 class label.
    Y_pred_nn = model.predict(X_test)
    Y_pred_nn = [round(x[0]) for x in Y_pred_nn]

    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # swapping the arguments of precision_score yields recall instead.
    score_nn = round(accuracy_score(Y_test, Y_pred_nn) * 100, 2)
    precision_score_nn = precision_score(Y_test, Y_pred_nn, average=None)

    # Print Accuracy score
    print("The accuracy score achieved using Neural Network is: "+str(score_nn)+" %")
    print("The precision score achieved using Neural Network is: "+str(precision_score_nn)+" %")
I'm trying to learn TensorFlow basics and wrote code to predict student performance scores using this CSV from Kaggle.
But I have this error
The error is
ValueError
Data cardinality is ambiguous:
x sizes: 1000
y sizes: 3
Make sure all arrays contain the same number of samples.
File
"C:\Users\w1234\algorithm.py\tensor\tensorflow\students_performance.py",
line 30, in model.fit(np.array(x_data), np.array(y_data),
epochs = 100)
Could you help me? How can I change the samples size?
The codes
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import os
import numpy as np
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

data = pd.read_csv("C:/Users/w1234/algorithm.py/tensor/tensorflow/students_performance.csv").dropna()

# x_data: one four-element list per CSV row.
feature_names = ['gender',
                 'parental level of education',
                 'lunch',
                 'test preparation course']
x_data = [[record[name] for name in feature_names]
          for _, record in data.iterrows()]

# y_data: a list of three whole columns.
# NOTE(review): np.array(y_data) therefore has a leading dimension of 3 while
# np.array(x_data) has one entry per row — this is the x/y size mismatch the
# quoted "Data cardinality is ambiguous" error reports.
y_data = [data['math score'].values,
          data['reading score'].values,
          data['writing score']]

hidden_a = Dense(64, activation='relu')
hidden_b = Dense(32, activation='relu')
head = Dense(1, activation='sigmoid', name = 'output')
model = Sequential([hidden_a, hidden_b, head])

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = 'accuracy')
model.fit(np.array(x_data), np.array(y_data), epochs = 100)
Typically machine learning algorithms work with numeric matrices or tensors and hence most feature engineering techniques deal with converting raw data into some numeric representations which can be easily understood by these algorithms.
From your code it seems like you are trying to predict the output for race/ethnicity which is the output variable.
gender, parental level of education, lunch, test preparation course are all categorical columns with dtype as object, we must convert these columns to numerical columns, hence I have used one-hot encoding.
Please find the working code below:
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import os
import numpy as np
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
data = pd.read_csv("/content/StudentsPerformance.csv")
data = data.dropna()

# y_data is the output variable
y_data = data.pop("race/ethnicity")
# x_data are the input variables or the features on which y_data is depended
x_data = data

categorical_cols = ['gender', 'parental level of education', 'lunch', 'test preparation course']

# One-hot encoding. astype returns a new frame, so its result has to be
# assigned; casting to float32 gives Keras a single numeric dtype.
x_data = pd.get_dummies(x_data, columns=categorical_cols).astype('float32')
y_data = pd.get_dummies(y_data).astype('float32')

# Each row belongs to exactly one class, so the output layer is a softmax
# trained with categorical cross-entropy. The number of output units follows
# the number of one-hot label columns.
model = Sequential([Dense(64, activation='relu'),
                    Dense(32, activation='relu'),
                    Dense(y_data.shape[1], activation='softmax', name='output')])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_data, y_data, epochs=100)
Let us know if the issue still persists. Thanks!
I am trying to build the following model, but I get an error when I finally train the model and try to get its accuracy. It gets stuck when I feed the training data into my linear model to train it.
Here is the whole code->
# Importing all needed libraries to build the model->
import tensorflow as tf
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow.compat.v2.feature_column as fc
print("All modules imported.")
# Load the training and evaluation CSVs and split off the label column.
training_data = pd.read_csv("Real estate.csv")
testing_data = pd.read_csv("Real estate eval.csv")
y_train = training_data.pop("Y house price of unit area")
y_test = testing_data.pop("Y house price of unit area")
print(y_train)
print(y_test)

# NOTE(review): "age" does not follow the "X<n> ..." naming of the other
# columns — confirm it matches the CSV header.
numerical_colunms = [
    "No",
    "X1 transaction date",
    "age",
    "X3 distance to the nearest MRT station",
    "X4 number of convenience stores",
    "X5 latitude",
    "X6 longitude",
]

# Echo each column name, then build one float32 numeric feature column per name.
for feature_name in numerical_colunms:
    print(feature_name)
feature_colunms = [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numerical_colunms
]
print(feature_colunms)
# Build an input function that splits the data into batches and repeats it
# for the requested number of epochs.
def make_input_fn(data_df, y_df, num_epochs=10, shuffle=True, batch_size=32):
    """Return a zero-argument function that builds a tf.data.Dataset.

    Args:
        data_df: Feature table; converted with `dict(data_df)` to a mapping
            from column name to column.
        y_df: Labels, one per row of `data_df`.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: Whether to shuffle (buffer of 1000) before batching.
        batch_size: Number of examples per batch.

    Returns:
        A function that, when called, returns the dataset of
        (features, labels) batches.
    """
    def input_function():
        # Features and labels must be passed as ONE (features, labels) tuple;
        # a second positional argument is interpreted as the op `name`.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), y_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds

    # Return the function object itself; the estimator calls it later.
    return input_function
# Prepare the input functions for the training and evaluation data.
training_data_input = make_input_fn(training_data, y_train)
# Evaluation makes a single pass and does not need shuffling.
testing_data_input = make_input_fn(testing_data, y_test, num_epochs=1, shuffle=False)

# The label ("Y house price of unit area") is a continuous value, so this is a
# regression problem: use a linear regressor rather than a classifier.
linear_model = tf.estimator.LinearRegressor(feature_columns=feature_colunms)

# Train the model, then evaluate it.
linear_model.train(training_data_input)
results = linear_model.evaluate(testing_data_input)
clear_output()
# A regressor's evaluation reports losses, not accuracy.
print(f"The average loss (MSE) of the model is >> {results['average_loss']}")
The main error is showing here-
linear_model.train(training_data_input)
I've just recently started dabbling with Keras and I'm having a hell of a time trying to understand how to format my data to be used by Keras.
I'm currently trying to use a TimeDistributed ConvLSTM1D layer to predict stock prices. I've used a walk-forward approach to generate my dataset.
In creating the dataset, each data point comprises of 21 features for 5 days. I've arranged these data points into sequences of 10. The shapes for my test data and labels are as follows:
X.shape = (3467, 10, 5, 21)
Y.shape = (3467, 10)
From my understanding of a TimeDistributed(ConvLSTM1D) layer, I would need the following shape: (samples, time, rows, channels)
I assume samples should be 10 and time should be 5. I'm not clear on how I should reshape my array to account for rows and channels. Do I have 21 rows of 1 channel, or 1 row of 21 channels? Does it even matter?
Below is my code:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import sys
import warnings
from datetime import datetime, timedelta
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from keras.models import Sequential, model_from_json
from keras.layers import Conv2D, Conv1D, MaxPooling1D, Bidirectional,LSTM,Dropout,TimeDistributed,Flatten, ConvLSTM1D, ConvLSTM2D
from keras.layers import Dense, RepeatVector, LeakyReLU
from keras.layers import BatchNormalization
from keras.callbacks import TensorBoard
from talib import DEMA, ROCR, ATR, RSI, SMA, ADX, CCI, MACD, OBV, SAR, EMA, T3
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib.ticker import AutoMinorLocator, FixedLocator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression
from sklearn import preprocessing
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #stop using GPU since there's a problem
def setup_TA(dataframe):
    """Add TA-Lib indicator columns, rescale them, and drop incomplete rows.

    The frame must provide 'Open', 'High', 'Low', 'Close' and 'Volume'
    columns. It is modified in place (columns added and rescaled, rows with
    NaN removed).

    Args:
        dataframe: Price/volume DataFrame to extend.

    Returns:
        The same data with a fresh 0..n-1 index (a new DataFrame produced by
        `reset_index`).
    """
    high, low, close = dataframe['High'], dataframe['Low'], dataframe['Close']

    # Add these indicators
    dataframe['DEMA'] = DEMA(close)
    dataframe['ROCR'] = ROCR(close)
    dataframe['ATR'] = ATR(high, low, close)
    dataframe['RSI'] = RSI(close)
    dataframe['SMA'] = SMA(close)
    dataframe['ADX'] = ADX(high, low, close)
    dataframe['CCI'] = CCI(high, low, close)
    dataframe['MACD'], dataframe['MACD Signal'], dataframe['MACD Hist'] = MACD(close)
    dataframe['OBV'] = OBV(close, dataframe['Volume'])
    dataframe['SAR'] = SAR(high, low)
    dataframe['EMA'] = EMA(close)
    dataframe['T3'] = T3(close)

    # Make everything relative to the close price as far as possible.
    # These three indicators are divided by 100.
    for name in ('RSI', 'CCI', 'ADX'):
        dataframe[name] = dataframe[name] / 100
    # Price-level columns are expressed as a ratio to the close.
    # 'Close' itself is never modified here, so the order does not matter.
    for name in ('SAR', 'ATR', 'Open', 'High', 'Low', 'DEMA', 'SMA', 'EMA', 'T3'):
        dataframe[name] = dataframe[name] / dataframe['Close']
    dataframe['OBV'] = dataframe['OBV'] / dataframe['Volume']
    dataframe['ROCR'] = dataframe['ROCR'] - 1

    # Remove any rows with NaN. dropna removes exactly the rows that contain
    # a NaN, without a Python-level loop over the rows.
    dataframe.dropna(inplace=True)
    dataframe = dataframe.reset_index(drop=True)
    return dataframe
def feature_selection(dataframe):
    """Print univariate f_regression scores of every feature against 'Labels'.

    Prints, in order: the feature indices sorted by ascending score, the
    matching feature names, the raw scores, and all column names of
    `dataframe`.

    Args:
        dataframe: DataFrame holding the feature columns plus a 'Labels'
            column used as the regression target.
    """
    # `columns=` is the keyword form of drop(..., axis=1); the positional
    # axis argument is rejected by recent pandas versions.
    features = dataframe.drop(columns=['Labels'])

    selector = SelectKBest(score_func=f_regression, k='all')
    selector.fit(features.to_numpy(), dataframe['Labels'].to_numpy())

    # Indices of the features from lowest to highest score.
    ranking = selector.scores_.argsort()
    print(ranking)
    # Index the feature columns (without 'Labels') so positions line up with
    # the scores wherever 'Labels' sits in the frame.
    print(features.columns[ranking])
    print(selector.scores_)
    print(list(dataframe))
# Columns: Open, High, Low, Close, Volume, compound, msg count, DEMA, ROCR, ATR,
# RSI, SMA, ADX, CCI, MACD, MACD Signal, MACD Hist, OBV, SAR, EMA, T3
raw_data = pd.read_csv('./Test/test.csv', index_col=0)
ta_df = setup_TA(raw_data)

# presumably shifts msg count by one so the log ratio below never sees a
# zero — TODO confirm
ta_df['msg count'] += 1

# Replace each series by the log of its ratio to the previous row.
for column in ('Close', 'Volume', 'msg count'):
    ta_df[column] = np.log(ta_df[column] / ta_df[column].shift(1))

# The first row has no predecessor, so its log ratios are NaN: drop it.
ta_df = ta_df.drop(index=0).reset_index(drop=True)

# Standardise these columns to zero mean and unit standard deviation.
for column in ('Close', 'Volume', 'compound', 'msg count', 'OBV'):
    ta_df[column] = (ta_df[column] - ta_df[column].mean()) / ta_df[column].std()

# The label of a row is the next row's (standardised) Close; the last row has
# no next row and is dropped.
ta_df['Labels'] = ta_df['Close'].shift(-1)
ta_df = ta_df[:-1]

# Model inputs: everything except the date and the label. `columns=` is the
# keyword form of drop(..., axis=1), which recent pandas versions require.
clean_df = ta_df.drop(columns=['Date', 'Labels'])
# Create samples: each sample is SEQ_LEN consecutive windows, each window
# WINDOW consecutive rows; window i of sample k starts at row k + i.
SEQ_LEN = 10
WINDOW = 5

# Convert once to NumPy instead of looking up every row through pandas inside
# the loops. Rows are addressed by position, which matches the 0..n-1 index
# the frame has after the earlier reset_index.
feature_rows = clean_df.to_numpy()
label_values = ta_df['Labels'].to_numpy()

X = []
Y = []
for k in range(len(clean_df) - SEQ_LEN - WINDOW):
    window_starts = range(k, k + SEQ_LEN)
    X.append([feature_rows[start:start + WINDOW] for start in window_starts])
    # The label of a window is the label of its last row.
    Y.append([label_values[start + WINDOW - 1] for start in window_starts])

pd.set_option('display.max_rows', None)
X = np.array(X)
Y = np.array(Y)
#X = X.reshape(X.shape[0],X.shape[1],X.shape[2],1,X.shape[3])
#Y = Y.reshape(Y.shape[0],Y.shape[1],1)
print(X.shape)
print(Y.shape)

# shuffle=False keeps the rows in their original order for the split.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, shuffle=False)
# Two TimeDistributed ConvLSTM1D layers, a per-step Flatten and Dense(21),
# then a final Dense(1); trained with MSE loss and the Adam optimizer.
model = Sequential()
# NOTE(review): input_shape is given to the wrapped ConvLSTM1D rather than to
# TimeDistributed — confirm it takes effect there.
# NOTE(review): input_shape=(10,5,21,1) has four per-sample dimensions, while
# the X.shape reported in the question, (3467, 10, 5, 21), has three after the
# sample axis; the commented-out X.reshape earlier in the script would add one
# (as (…, 1, 21), not (…, 21, 1)) — confirm which layout is intended.
model.add(TimeDistributed(ConvLSTM1D(filters = 21, kernel_size=(3), padding='same', return_sequences=True, input_shape=(10,5,21,1))))
model.add(TimeDistributed(ConvLSTM1D(filters = 21, kernel_size=(3), padding='same', return_sequences=False)))
model.add(TimeDistributed((Flatten())))
model.add(TimeDistributed(Dense(21)))
# NOTE(review): this Dense(1) follows a TimeDistributed layer, so the output
# presumably keeps a trailing axis of size 1 while Y is (samples, 10) — confirm
# the two shapes agree.
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam', metrics = ['mse'])
# shuffle=False: batches are taken in the stored order.
history = model.fit(train_X, train_Y, validation_data=(test_X, test_Y), epochs = 100, batch_size = 64, shuffle=False, verbose=1)
I am trying to build an Android app that performs text classification using the AverageWordVecModelSpec provided by TensorFlow Lite Model Maker.
I'm using books content to test if my apps works. There are 3 books I've provided for this experiment. Here's the code:
!pip install git+https://github.com/tensorflow/examples.git#egg=tensorflow-examples[model_maker]
import numpy as np
import os
import tensorflow as tf
assert tf.__version__.startswith('2')
from tensorflow_examples.lite.model_maker.core.data_util.text_dataloader import TextClassifierDataLoader
from tensorflow_examples.lite.model_maker.core.task.model_spec import AverageWordVecModelSpec
from tensorflow_examples.lite.model_maker.core.task import text_classifier
# Root folder holding the 'train' and 'test' sub-folders read below.
data_path = '/content/drive/My Drive/datasetps'
model_spec = AverageWordVecModelSpec()
# Training data is loaded with two explicit class labels.
train_data = TextClassifierDataLoader.from_folder(os.path.join(data_path, 'train'), model_spec=model_spec, class_labels=['categorya', 'categoryb'])
# NOTE(review): the test loader is given no class_labels — confirm it resolves
# the same labels, in the same order, as the training loader.
test_data = TextClassifierDataLoader.from_folder(os.path.join(data_path, 'test'), model_spec=model_spec, is_training=False, shuffle=False)
# Train, evaluate on the test split, and export into the current directory.
model = text_classifier.create(train_data, model_spec=model_spec)
loss, acc = model.evaluate(test_data)
model.export(export_dir='.')
It works when i only use 2 classes/books (same as examples provided by tensorflow team):
It works normally, although the accuracy is low because I only took 20 sample pages per book as the dataset.
You can see that i have rational loss value here,
But i have a problem when i've try to add the 3rd class:
# Same two loaders, with 'categoryc' added to the training class labels.
train_data = TextClassifierDataLoader.from_folder(os.path.join(data_path, 'train'), model_spec=model_spec, class_labels=['categorya', 'categoryb', 'categoryc'])
test_data = TextClassifierDataLoader.from_folder(os.path.join(data_path, 'test'), model_spec=model_spec, is_training=False, shuffle=False)
Here's the training result involving 3rd class:
enter image description here
You can see that the loss value is greater than 1, which does not seem reasonable to me.
I've tried to find which line of code (from Tensorflow Model Maker) that i should change to solve it and ended up to this question in this forum.
So is it possible to have multiclass model for textclassifier using
AverageWordVecModelSpec TFlite model maker?
It is possible. I would suggest to encode your labels first and then follow the workflow:
from tflite_model_maker import model_spec
from tflite_model_maker import text_classifier
from tflite_model_maker import TextClassifierDataLoader
from tflite_model_maker import ExportFormat
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_excel('data_set.xls')
col = ['sentence', 'your_label']
# Copy so the column assignments below write to an independent frame rather
# than to a slice of the frame that was read.
df = df[col].copy()

# Encoding happens here: each distinct label becomes an integer code.
# The codes are read from the same column that was made categorical.
df['your_label'] = pd.Categorical(df['your_label'])
df['label'] = df['your_label'].cat.codes

train, test = train_test_split(df, test_size=0.2)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

spec = model_spec.get('average_word_vec')

train_data = TextClassifierDataLoader.from_csv(
    filename='train.csv',
    text_column='sentence',
    label_column='label',
    model_spec=spec,
    delimiter=',',
    is_training=True)
test_data = TextClassifierDataLoader.from_csv(
    filename='test.csv',
    text_column='sentence',
    label_column='label',
    model_spec=spec,
    delimiter=',',
    is_training=False)

model = text_classifier.create(train_data, model_spec=spec, batch_size=5, epochs=4)

# NOTE(review): no quantization config is passed to export(); if one is
# wanted, build it from names this snippet actually imports and pass it
# explicitly — confirm the API for the installed tflite_model_maker version.
# NOTE(review): only the label and vocab files are requested here — confirm
# whether the .tflite model itself should also be listed in export_format.
model.export(export_dir='average_word_vec/', export_format=[ExportFormat.LABEL, ExportFormat.VOCAB])