Resnet for Text data - python

Hi I want to use ResNet for Text data. I tried to look some code example lot of other data at the end I wrote the following code. But I'm not sure it's the correct way for ResNet or not.
NOTE::: this part is optional if i recieve an opinion on it. it will be great but I'm going to try it once the above one is corrected. if it is correct way then I want it to implement it in this way ----> ResNet should contain 18 layers in total whereas these layers should be divided into four stages and each stage should consist of two convolutional blocks. Each convolutional block should contain two convolutional layers with batch normalization and ReLU non_linearity in-between. Then, ResNet should pass the output from the convolutional layers to two fully-connected layers that will use the reduced data to classify the initial data to a given website class. Last but not least, you should use Adam optimizer and categorical cross-entropy (typically used for multi-class classification problems). Make sure that you identify and use the optimal hyper-parameters for your ResNet.
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
class ResNet_class():
def __init__(self):
# Cross-Validate
self.no_of_folds = int(input('enter no of K_fold: '))
self.kf = KFold(self.no_of_folds, shuffle=True, random_state=42) # Use for KFold classification
self.EPOCHS = int(input('enter no of epochs: '))
def check_test(self):
df = pd.read_csv(
"https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
na_values=['NA','?'])
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)
df = pd.concat([df,pd.get_dummies(df['product'],prefix="product")],axis=1)
df.drop('product', axis=1, inplace=True)
med = df['income'].median()
df['income'] = df['income'].fillna(med)
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])
x_columns = df.columns.drop('age').drop('id')
x = df[x_columns].values
y = df['age'].values
oos_y = []
oos_pred = []
fold = 0
for train, test in self.kf.split(x):
fold += 1
print(f"Fold #{fold}")
x_train = x[train]
y_train = y[train]
x_test = x[test]
y_test = y[test]
model = Sequential()
model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=0,
epochs=self.EPOCHS)
pred = model.predict(x_test)
oos_y.append(y_test)
oos_pred.append(pred)
score = np.sqrt(metrics.mean_squared_error(pred, y_test))
print(f"Fold score (RMSE): {score}")
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred, oos_y))
print(f"Final, out of sample score (RMSE): {score}")
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat([df, oos_y, oos_pred], axis=1)
resnet = ResNet_class()
resnet.check_test()

Related

tokenizer output unable to be converted to numpy array

I've been following the following tutorial to try and understand LSTMs and tensorflow a bit more. From running, it the training of the model goes smoothly, but when I try to use the trained tokenizer on the test data and then convert it to a numpy array, it doesn't work and I'm not really sure what the problem is. The relevant portion that goes wrong is below:
# test model
x_test = np.array(tokenizer.texts_to_sequences([str(txt) for txt in df_test['text'].values]))
The error it presents is as below:
Traceback (most recent call last):
File "/Users/pranavnair/Documents/Code/wpd/wpd.py", line 85, in <module>
x_test = np.array(x_test_data)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10824,) + inhomogeneous part.
I've tried using np.hstack instead of np.array, and that doesn't fix it. Would appreciate any help at all, thanks in advance.
Full code below for reference
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras import utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers import Embedding
from keras.optimizers import Adam
# set random seed for reproducibility
RANDOM_SEED = 4
np.random.seed(RANDOM_SEED)
# import datasets
df_neut = pd.read_csv("./input/good.csv")
df_prom = pd.read_csv("./input/promotional.csv")
# clean up data to only include text
df_prom = df_prom.drop(df_prom.columns[1:], axis=1)
df_neut = df_neut.drop(df_neut.columns[1:], axis=1)
# combine datasets
df_neut.insert(1, 'label', 0) # neutral labels
df_prom.insert(1, 'label', 1) # promotional labels
# merge dataframes
df = pd.concat((df_neut, df_prom), ignore_index=True, axis=0)
# randomize order of dataframes
df = df.reindex(np.random.permutation(df.index))
# split into training and testing datasets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# perform data preprocessing using keras tokenizer
text_data = [str(txt) for txt in df_train['text'].values] # convert text data to strings
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', lower=True) # create tokenizer
tokenizer.fit_on_texts(text_data) # make dictionary
# vectorize dataset
x_train = tokenizer.texts_to_sequences(text_data)
# Max number of words in each sequence
MAX_SEQUENCE_LENGTH = 400
# pad sequence lengths
x_train = utils.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)
# get test labels
y_train = df_train['label'].values
# create sequential model
model = Sequential()
# create embedding layer
EMBEDDING_DIM = 100
model.add(Embedding(MAX_NB_WORDS+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
# add LSTM layer to model
model.add(LSTM(80))
# setup model layers
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# setup binary classification via binary cross entropy loss
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# train for two epochs
EPOCHS = 4
BATCH_SIZE = 64
history = model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.15)
# test model
x_test = np.array(tokenizer.texts_to_sequences([str(txt) for txt in df_test['text'].values]))
x_test = utils.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)
y_test = np.array(df_test['label'].values)
# evaluate model
scores = model.evaluate(x_test, y_test, batch_size=128)
print("The model has a test loss of %.2f and a test accuracy of %.1f%%" % (scores[0], scores[1]*100))

How to integrate keras model with sequential backward selection code?

I am trying to integrate a Keras deep neural network as a classifier within code for sequential backward feature selection in Python. (Originally, I tried to wrap the Keras deep neural network within Scikeras to use within scikit-learn's built in sequential feature selection models, but I kept getting error messages).
I found this code from scratch for sequential backward feature selection (taken from https://vitalflux.com/sequential-backward-feature-selection-python-example/), and have been trying to integrate a Keras model in the code to replace the "estimator" within the function but I keep getting this error: ValueError: Input 0 of layer "sequential_410" is incompatible with the layer: expected shape=(None, 45), found shape=(None, 44)
Here is the code that I have so far for the sequential backward feature selection and the deep neural network:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier, KerasRegressor
# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone
class SequentialBackwardSearch():
'''
Instantiate with Estimator and given number of features
'''
def __init__(self, estimator, k_features):
self.estimator = clone(estimator)
self.k_features = k_features
'''
X_train - Training data Pandas dataframe
X_test - Test data Pandas dataframe
y_train - Training label Pandas dataframe
y_test - Test data Pandas dataframe
'''
def fit(self, X_train, X_test, y_train, y_test):
dim = X_train.shape[1]
self.indices_ = tuple(range(dim))
self.subsets_ = [self.indices_]
score = self._calc_score(X_train.values, X_test.values,
y_train.values, y_test.values, self.indices_)
self.scores_ = [score]
'''
Iterate through all the dimensions until k_features is reached
At the end of loop, dimension count is reduced by 1
'''
while dim > k_features:
scores = []
subsets = []
'''
Iterate through different combinations of features, train the model,
record the score
'''
for p in combinations(self.indices_, r=dim - 1):
score = self._calc_score(X_train.values, X_test.values, y_train.values, y_test.values, p)
scores.append(score)
subsets.append(p)
#
# Get the index of best score
#
best_score_index = np.argmax(scores)
#
# Record the best score
#
self.scores_.append(scores[best_score_index])
#
# Get the indices of features which gave best score
#
self.indices_ = subsets[best_score_index]
#
# Record the indices of features for best score
#
self.subsets_.append(self.indices_)
dim -= 1 # Dimension is reduced by 1
'''
Transform training, test data set to the data set
havng features which gave best score
'''
def transform(self, X):
return X.values[:, self.indices_]
'''
Train models with specific set of features
indices - indices of features
'''
def _calc_score(self, X_train, X_test, y_train, y_test, indices):
self.estimator.fit(X_train[:, indices], y_train.ravel())
y_pred = self.estimator.predict(X_test[:, indices])
score = accuracy_score(y_test, y_pred)
return score
# ===============================================
# Keras deep neural network
def dnn():
model = keras.Sequential([
layers.Dense(20, activation='relu', input_shape = (X_train.shape[1])),
layers.Dropout(0.3),
layers.Dense(20, activation='relu'),
layers.Dropout(0.3),
layers.Dense(1, activation='sigmoid'),
])
model.compile(
optimizer='adam',
loss='binary_crossentropy',
metrics=['binary_accuracy']
)
early_stopping = keras.callbacks.EarlyStopping(
patience=5,
min_delta=0.001,
restore_best_weights=True,
)
history = model.fit(
X_train, y_train,
validation_data=(X_test, y_test),
batch_size=512,
callbacks=[early_stopping],
)
history_df = pd.DataFrame(history.history)
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()));
history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")
return model
keras_clf = KerasClassifier(dnn,
epochs=5,
verbose=False)
keras_clf._estimator_type = "classifier"
And this is the code I have for trying to integrate them together:
k_features = 5
#
# Instantiate SequentialBackwardSearch
#
sbs = SequentialBackwardSearch(keras_clf, k_features)
#
# Fit the data to determine the k_features which give the
# most optimal model performance
#
sbs.fit(X_train, X_test, y_train, y_test)
#
# Transform the training data set to dataset having k_features
# giving most optimal model performance
#
X_train_kfeatures = sbs.transform(X_train)
#
# Transform the test data set to dataset having k_features
#
X_test_kfeatures = sbs.transform(X_test)
sbs.indices_
X_train.columns[[sbs.indices_]] # sbs is an instance of SequentialBackwardSearch class
I am wondering whether this is even possible (integrating a neural network to the existing code for sequential backward feature selection) or if there's anything I can do to get it to run and output the top 5 features from the training dataset. I have tried to address the error message by altering the input shape of the neural network, but I believe it is correct (45 features). Any help or advice would be welcome!
This should work with SciKeras!
I had to clean up your code / fix some bugs. I first did a "sanity check" using Scikit-Learn's MLPClassfier, then I ran it against an MLPClassfier created using Keras. Details may differ for more complex model architectures, but this shows that it does work.
import numpy as np
# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone
class SequentialBackwardSearch:
'''
Instantiate with Estimator and given number of features
'''
def __init__(self, estimator, k_features):
self.estimator = clone(estimator)
self.k_features = k_features
'''
X_train - Training data Pandas dataframe
X_test - Test data Pandas dataframe
y_train - Training label Pandas dataframe
y_test - Test data Pandas dataframe
'''
def fit(self, X_train, X_test, y_train, y_test):
dim = X_train.shape[1]
self.indices_ = tuple(range(dim))
self.subsets_ = [self.indices_]
score = self._calc_score(X_train, X_test,
y_train, y_test, self.indices_)
self.scores_ = [score]
'''
Iterate through all the dimensions until k_features is reached
At the end of loop, dimension count is reduced by 1
'''
while dim > self.k_features:
scores = []
subsets = []
'''
Iterate through different combinations of features, train the model,
record the score
'''
for p in combinations(self.indices_, r=dim - 1):
score = self._calc_score(X_train, X_test, y_train, y_test, p)
scores.append(score)
subsets.append(p)
#
# Get the index of best score
#
best_score_index = np.argmax(scores)
#
# Record the best score
#
self.scores_.append(scores[best_score_index])
#
# Get the indices of features which gave best score
#
self.indices_ = subsets[best_score_index]
#
# Record the indices of features for best score
#
self.subsets_.append(self.indices_)
dim -= 1 # Dimension is reduced by 1
'''
Transform training, test data set to the data set
havng features which gave best score
'''
def transform(self, X):
return X.values[:, self.indices_]
'''
Train models with specific set of features
indices - indices of features
'''
def _calc_score(self, X_train, X_test, y_train, y_test, indices):
self.estimator.fit(X_train[:, indices], y_train.ravel())
y_pred = self.estimator.predict(X_test[:, indices])
score = accuracy_score(y_test, y_pred)
return score
# Sklearn MLPClassifier
from sklearn.neural_network import MLPClassifier
estimator = MLPClassifier()
search = SequentialBackwardSearch(estimator, 1)
X = np.random.randint(0, 2, size=(100, 5))
y = X[:, -1]
search.fit(X, X, y, y)
assert list(search.indices_) == [4]
# SciKeras MLPClassifier
# see https://www.adriangb.com/scikeras/stable/notebooks/MLPClassifier_MLPRegressor.html
import tensorflow.keras as keras
from scikeras.wrappers import KerasClassifier
class KerasMLPClassifier(KerasClassifier):
def __init__(
self,
hidden_layer_sizes=(100, ),
optimizer="adam",
optimizer__learning_rate=0.001,
epochs=200,
verbose=0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_layer_sizes = hidden_layer_sizes
self.optimizer = optimizer
self.epochs = epochs
self.verbose = verbose
def _keras_build_fn(self, compile_kwargs):
model = keras.Sequential()
inp = keras.layers.Input(shape=(self.n_features_in_))
model.add(inp)
for hidden_layer_size in self.hidden_layer_sizes:
layer = keras.layers.Dense(hidden_layer_size, activation="relu")
model.add(layer)
if self.target_type_ == "binary":
n_output_units = 1
output_activation = "sigmoid"
loss = "binary_crossentropy"
elif self.target_type_ == "multiclass":
n_output_units = self.n_classes_
output_activation = "softmax"
loss = "sparse_categorical_crossentropy"
else:
raise NotImplementedError(f"Unsupported task type: {self.target_type_}")
out = keras.layers.Dense(n_output_units, activation=output_activation)
model.add(out)
model.compile(loss=loss, optimizer=compile_kwargs["optimizer"])
return model
estimator2 = KerasMLPClassifier()
search2 = SequentialBackwardSearch(estimator2, 1)
search2.fit(X, X, y, y)
assert list(search2.indices_) == [4]
Notebook version (can't promise this will be around forever): https://colab.research.google.com/drive/1EWxT3GWZsqhftz4f7W5GsXNe_SPtva4H#scrollTo=chU7wLn1BTU1

I just trained my first ML model based on the titanic dataset from kaggle.I am getting an RMSE value of ~0.4 is it good?

Please Note : I trained my model only on the basis of numerical columns and not the string columns
And please suggest some resources to go further into machine learning as I really like this subject.
Thank you
Here is the code and gives the following output :-
train rmse: 0.42
test rmse: 0.43
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd
import matplotlib.pyplot as plt
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dftest = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
dftrain.loc[dftrain['fare'] == 0, 'fare'] = 34.85
plt.plot(list(dftrain.age), list(dftrain.fare), '.',markersize = 1)
dftrain = dftrain.drop(['sex', 'class', 'deck','embark_town', 'alone'], axis =1 )
X = dftrain.loc[:, dftrain.columns != 'survived']
y = dftrain.loc[:, 'survived']
model = Sequential()
model.add(Dense(128, activation = 'relu', input_dim = 4))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam' , loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X, y , epochs = 200)
dftest = dftest.drop(['sex', 'class', 'deck','embark_town', 'alone'], axis =1 )
A = dftest.loc[:, dftest.columns != 'survived']
b = dftest.loc[:, 'survived']
from sklearn.metrics import mean_squared_error
import numpy as np
train_pred = model.predict(X)
train_rmse = np.sqrt(mean_squared_error(y, train_pred))
test_pred = model.predict(A)
test_rmse = np.sqrt(mean_squared_error(b, test_pred))
print("train rmse: {:0.2f}".format(train_rmse))
print("test rmse: {:0.2f}".format(test_rmse))```
First, root mean square error might not be a good score to look at in classification problems in the first place. For reasons why, refer to either this post or this stats stack exchange post.
Second, you're training a somewhat large neural network (with many parameters) compared to the available amount of training data (there were only 2224 passengers and crew members). When you have a comparable number of parameters in your model to the amount of training data, you run a risk of overfitting. Refer to this tutorial to learn what you can find about your model from looking at training/validation loss curves and how you can combat over/under fitting. You can experiment with different learning rates, number of epochs, batch sizes, normalization methods etc.
You might also want to take a look at other metrics like accuracy score and precision and recall

Saving TensorFlow model (ensemble) to one pickle file, instead of multiple pickles, in Python

I am looking to deploy a 50 model ensemble for a regression problem, with each model being a Keras.Sequential Neural Network.
Below is a (simplified to 3 models) version of my code, that runs and works fine.
However, I don't want to create a pickle file for every individual model, and so is there a way of creating a class with a list of all the models, resulting in me having to save/load only one pickle file?
from __future__ import absolute_import, division, print_function
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
train = pd.read_csv("Training Data.csv").fillna(0)
X_train = train.drop(['ID_NUMBER','DATE','X','Y','Z'],axis=1)
Y_train = train[['X','Y','Z']]
EPOCHS = 1500
BATCH_SIZE = 256
#Defining the 3 layered Neural Network
def build_model():
model = keras.Sequential([
keras.layers.Dense(1000, activation=tf.nn.softplus,
input_shape=(X_train.shape[1],)),
keras.layers.Dense(500, activation=tf.nn.softplus),
keras.layers.Dense(3)
])
model.compile(loss='mse',optimizer='adam', metrics=['mse'])
return model
model0 = build_model()
# Store training stats
history0 = model0.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
validation_split=0.0, verbose=1)
model1 = build_model()
# Store training stats
history1 = model1.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
validation_split=0.0, verbose=1)
model2 = build_model()
# Store training stats
history2 = model2.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
validation_split=0.0, verbose=1)
model0.save("model0.pkl")
model1.save("model1.pkl")
model2.save("model2.pkl")
For making new predictions, my code would look something like this:
#Loading Models
model0 = tf.keras.models.load_model("model0.pkl")
model1 = tf.keras.models.load_model("model1.pkl")
model2 = tf.keras.models.load_model("model2.pkl")
#Finding Weights (based on train score)
train_nn_predictions = model0.predict(X_train)
train['X'],train['Y'],train['Z'] = train_nn_predictions[:,0],train_nn_predictions[:,1],train_nn_predictions[:,2]
nn0 = #training score metric (irrelevant to show how it is calculated here)
print("Average Train Score for Model 0 is:",nn0)
train_nn_predictions = model1.predict(X_train)
train['X'],train['Y'],train['Z'] = train_nn_predictions[:,0],train_nn_predictions[:,1],train_nn_predictions[:,2]
nn1 = #training score metric (irrelevant to show how it is calculated here)
print("Average Train Score for Model 1 is:",nn1)
train_nn_predictions = model2.predict(X_train)
train['X'],train['Y'],train['Z'] = train_nn_predictions[:,0],train_nn_predictions[:,1],train_nn_predictions[:,2]
nn2 = #training score metric (irrelevant to show how it is calculated here)
print("Average Train Score for Model 2 is:",nn2)
#Apply the weightings for each of the models
w0,w1,w2 = 1/nn0,1/nn1,1/nn2
#New Predictions
new_record = np.array([my variables])
target_predictions = (w0*model0.predict(new_record)+w1*model1.predict(new_record)+w2*model2.predict(new_record))/(w0+w1+w2)
You could try to:
Merge all the models using layers.concatenate. It will create an output for all the 50 models. For more details regarding the code check:
According to https://keras.io/api/layers/merging_layers/concatenate/:
x = np.arange(20).reshape(2, 2, 5)
y = np.arange(20, 30).reshape(2, 1, 5)
tf.keras.layers.Concatenate(axis=1)([x, y])
Use KerasPickleWrapper to pickle it.

Unexpected results when using tfrecords loaded using tf.data.Dataset.list_files() with shuffle argument

I'm hoping to get clarification on how the shuffle argument in tf.data.Dataset.list_files() works. The documentation states that when shuffle=True, the filenames will be shuffled randomly. I've made model predictions using a tfrecords dataset that has been loaded using tf.data.Dataset.list_files(), and I would've expected the accuracy metric to be the same no matter the order of the files (i.e. whether shuffle is True or False), but am seeing otherwise.
Is this expected behavior or is there something wrong with my code or intepretation? I have reproducible example code below.
Oddly, as long as tf.random.set_random_seed() is set initially (and it seems it doesn't even matter what seed value is set), then the predictions results are the same no matter whether shuffle is True or False in list_files().
tensorflow==1.13.1, keras==2.2.4
Thanks for any clarifications!
Edit: re-thinking it through and wondering if Y = [y[0] for _ in range(steps) for y in sess.run(Y)] is a separate and independent call?
# Fit and Save a Dummy Model
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn import datasets, metrics
seed = 7
np.random.seed(seed)
tf.random.set_random_seed(seed)
dataset = datasets.load_iris()
X = dataset.data
Y = dataset.target
dummy_Y = np_utils.to_categorical(Y)
# 150 rows
print(len(X))
model = Sequential()
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, dummy_Y, epochs=10, batch_size=10, verbose=2)
model.save('./iris/iris_model')
predictions = model.predict(X)
predictions = np.argmax(predictions, axis=1)
# returns accuracy = 0.3466666666666667
print(metrics.accuracy_score(y_true=Y, y_pred=predictions))
Split dataset into multiple tfrecords files so we can reload it with list_files() later:
numrows = 15
for i, j in enumerate(range(0, len(X), numrows)):
with tf.python_io.TFRecordWriter('./iris/iris{}.tfrecord'.format(i)) as writer:
for x, y in zip(X[j:j+numrows, ], Y[j:j+numrows, ]):
features = tf.train.Features(feature=
{'X': tf.train.Feature(float_list=tf.train.FloatList(value=x)),
'Y': tf.train.Feature(int64_list=tf.train.Int64List(value=[y]))
})
example = tf.train.Example(features=features)
writer.write(example.SerializeToString())
At this point, I exit (ipython) and restart again:
import numpy as np
import tensorflow as tf
from keras.models import load_model
from sklearn import metrics
model = load_model('./iris/iris_model')
batch_size = 10
steps = int(150/batch_size)
file_pattern = './iris/iris*.tfrecord'
feature_description = {
'X': tf.FixedLenFeature([4], tf.float32),
'Y': tf.FixedLenFeature([1], tf.int64)
}
def _parse_function(example_proto):
return tf.parse_single_example(example_proto, feature_description)
def load_data(filenames, batch_size):
raw_dataset = tf.data.TFRecordDataset(filenames)
dataset = raw_dataset.map(_parse_function)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(2)
iterator = dataset.make_one_shot_iterator()
record = iterator.get_next()
return record['X'], record['Y']
def get_predictions_accuracy(filenames):
X, Y = load_data(filenames=filenames, batch_size=batch_size)
predictions = model.predict([X], steps=steps)
predictions = np.argmax(predictions, axis=1)
print(len(predictions))
with tf.Session() as sess:
Y = [y[0] for _ in range(steps) for y in sess.run(Y)]
print(metrics.accuracy_score(y_true=Y, y_pred=predictions))
# No shuffle results:
# Returns expected accuracy = 0.3466666666666667
filenames_noshuffle = tf.data.Dataset.list_files(file_pattern=file_pattern, shuffle=False)
get_predictions_accuracy(filenames_noshuffle)
# Shuffle results, no seed value set:
# Returns UNEXPECTED accuracy (non-deterministic value)
filenames_shuffle_noseed = tf.data.Dataset.list_files(file_pattern=file_pattern, shuffle=True)
get_predictions_accuracy(filenames_shuffle_noseed)
# Shuffle results, seed value set:
# Returns expected accuracy = 0.3466666666666667
# It seems like it doesn't even matter what seed value you set, as long as you you set it
seed = 1000
tf.random.set_random_seed(seed)
filenames_shuffle_seed = tf.data.Dataset.list_files(file_pattern=file_pattern, shuffle=True)
get_predictions_accuracy(filenames_shuffle_seed)

Categories

Resources