Hello, I have a problem with GridSearchCV: it works perfectly on the MNIST dataset but not on my own data, and I don't know why.
# df = pd.read_csv('bank-full.csv', sep=';')
# print(df.head())
# print(df.shape)
# print(df.columns)
# print(df.info)
# df.columns = [col.replace('"', '') for col in df.columns]
#
# df.drop(columns=['day', 'poutcome'], axis=1, inplace=True)
# print(df.head())
# print(df.shape)
#
# le = preprocessing.LabelEncoder()
# df.job = le.fit_transform(df.job)
# df.education = le.fit_transform(df.education)
# df.housing = le.fit_transform(df.housing)
# df.loan = le.fit_transform(df.loan)
# # df.poutcome = le.fit_transform(df.poutcome)
# df.month = le.fit_transform(df.month)
# df.contact = le.fit_transform(df.contact)
# df.marital = le.fit_transform(df.marital)
# df.default = le.fit_transform(df.default)
# df.y = le.fit_transform(df.y)
#
# print(df.head())
#
# X = df.iloc[:, 0:14]
# y = df.iloc[:, 14]
# X = np.array(X, dtype="float64")
# y = np.array(y, dtype="float64")
#
# scaler = Normalizer()
# X = scaler.fit_transform(X)
#
# x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
# model = LogisticRegression(penalty='l2', max_iter=1000)
# model.fit(x_train, y_train)
# prediction = model.predict(x_test)
# from sklearn.metrics import accuracy_score
# print("ACC: {} ".format(accuracy_score(y_test, prediction)))
#
# print(x_train.shape)
#
# nn = Sequential()
# nn.add(Dense(120, input_dim=14, activation='relu'))
# nn.add(Dense(240, activation='relu'))
# nn.add(Dense(1))
# nn.add(Activation('sigmoid'))
#
# nn.compile(loss=keras.losses.binary_crossentropy,
#            optimizer='sgd',
#            metrics=['accuracy'])
#
# nn.fit(x_train, y_train,
#        batch_size=10,
#        epochs=10,
#        verbose=1,
#        validation_data=(x_test, y_test))
#
# loss_acc = nn.evaluate(x_test, y_test, verbose=0)
# print('Test loss:', loss_acc[0])
# print('Test accuracy:', loss_acc[1])
data = bm.load_data('bank-full.csv')
data = bm.preprocess_data(data)
X, y = bm.split_data(data)
scaler = Normalizer()
X = scaler.fit_transform(X)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)

start = time()
model = KerasClassifier(build_fn=nnmodel.create_model())
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = np.array([50, 100, 150])
batches = np.array([5, 10, 20])
param_grid = dict(optimizer=optimizers, nb_epoch=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(x_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
print("total time:", time() - start)
The commented section is just a simple Keras model that works perfectly, but when I try GridSearchCV on this model (the uncommented code above), I get these errors: https://pastebin.com/mhJLSXAS. For example, if I run this program https://www.kaggle.com/shujunge/gridsearchcv-with-keras it works perfectly, but on my data it doesn't. Does anybody know why?
Scikit-learn builds a new model each time, so your script has to build a classifier with the specific parameters inside the grid search. That means you have to pass the method itself as an argument, not the result of calling it.
nnmodel.create_model is presumably your function that creates a new model based on the parameters, so try changing:
build_fn=nnmodel.create_model()
To:
build_fn=nnmodel.create_model
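For reference, a minimal sketch of how the corrected setup might look, assuming nnmodel.create_model(optimizer=..., init=...) accepts the same keyword arguments that appear in param_grid (the grid search passes them to build_fn when it builds each model):

# Sketch only: assumes nnmodel.create_model builds and compiles a Keras model
# whose keyword arguments match the keys in param_grid.
model = KerasClassifier(build_fn=nnmodel.create_model, verbose=0)  # note: no parentheses

param_grid = dict(
    optimizer=['rmsprop', 'adam'],
    init=['glorot_uniform', 'normal', 'uniform'],
    epochs=[50, 100, 150],   # Keras 2.x wrappers expect 'epochs' rather than 'nb_epoch'
    batch_size=[5, 10, 20],
)

grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(x_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Also note that recent scikit-learn releases removed grid_scores_ in favour of cv_results_, so the reporting loop at the end of the question may need updating as well.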
The model is as below:
inputs_1 = keras.Input(shape=(10081,1))
layer1 = Conv1D(64,14)(inputs_1)
layer2 = layers.MaxPool1D(5)(layer1)
layer3 = Conv1D(64, 14)(layer2)
layer4 = layers.GlobalMaxPooling1D()(layer3)
inputs_2 = keras.Input(shape=(85,))
layer5 = layers.concatenate([layer4, inputs_2])
layer6 = Dense(128, activation='relu')(layer5)
layer7 = Dense(2, activation='softmax')(layer6)
model_2 = keras.models.Model(inputs=[inputs_1, inputs_2], outputs=[layer7])
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:10166], df[['Result_cat','Result_cat1']].values, test_size=0.2)
X_train = X_train.to_numpy()
X_train = X_train.reshape([X_train.shape[0], X_train.shape[1], 1])
X_train_1 = X_train[:,0:10081,:]
X_train_2 = X_train[:,10081:10166,:].reshape(736,85)
X_test = X_test.to_numpy()
X_test = X_test.reshape([X_test.shape[0], X_test.shape[1], 1])
X_test_1 = X_test[:,0:10081,:]
X_test_2 = X_test[:,10081:10166,:].reshape(185,85)
adam = keras.optimizers.Adam(lr = 0.0005)
model_2.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['acc'])
history = model_2.fit([X_train_1,X_train_2], y_train, epochs = 120, batch_size = 256, validation_split = 0.2, callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)])
Questions:
1) The data is 921 rows x 10166 columns. Each row is an observation (the first 10080 columns are a time series; the remaining columns are other statistical features). According to the model, is the input data split into inputs_1 and inputs_2 randomly?
2) I am thinking about doing k-fold cross-validation and splitting the input data into inputs_1 and inputs_2. What is a good way to do this? Thanks.
By splitting only the indexes:
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=False)

ID_Inp = np.array(range(nSamples))
ID_Out = np.array(range(nSamples))
Inputs = [Input1, Input2]

for IDs_Train, IDs_Test in kfold.split(ID_Inp, ID_Out):
    Fold_Train_Input1, Fold_Train_Input2 = Input1[IDs_Train], Input2[IDs_Train]
    Fold_Train_OutPut = Output[IDs_Train]
    Fold_Test_Input1, Fold_Test_Input2 = Input1[IDs_Test], Input2[IDs_Test]
    Fold_Test_OutPut = Output[IDs_Test]
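If it helps, a minimal sketch of training and scoring the two-input model inside each fold; build_model() is a hypothetical helper that re-creates and compiles the network from the question so that every fold starts from fresh weights:

for IDs_Train, IDs_Test in kfold.split(ID_Inp, ID_Out):
    fold_model = build_model()  # hypothetical helper wrapping the Conv1D/Dense model above
    fold_model.fit([Input1[IDs_Train], Input2[IDs_Train]], Output[IDs_Train],
                   epochs=120, batch_size=256,
                   validation_data=([Input1[IDs_Test], Input2[IDs_Test]], Output[IDs_Test]))
    loss, acc = fold_model.evaluate([Input1[IDs_Test], Input2[IDs_Test]], Output[IDs_Test], verbose=0)
    print("fold accuracy:", acc)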
####################
When trying to build my data set, I get the error "TypeError: 'set' object is not subscriptable".
dataDir = '/content/drive/My Drive/Colab Notebooks/HW 3/'  # Directory with input files
trainFile = 'q2train.csv'  # Training examples
labelFile = 'q2label.csv'  # Test label
validFile = 'q2valid.csv'  # Valid files

train = pd.read_csv(dataDir + trainFile)
valid = pd.read_csv(dataDir + validFile)
label = pd.read_csv(dataDir + labelFile)

data_sets = {
    'train',
    'label',
    'valid'}
def get_data(data_set_name, test_prop=0.2, seed=2019):
    """returns data for training, testing, and data characteristics"""
    data = data_sets[data_set_name]
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_prop,
                                                        random_state=seed)
    nF = X.shape[1]          # number of features
    nC = len(np.unique(y))   # number of classes
    nTrain, nTest = len(y_train), len(y_test)
    print("\nData set: %s" % data_set_name)
    print("\tNumber of features %d" % nF)
    print("\tNumber of output classes = %d" % nC)
    print("\tNumber of training examples = %d" % nTrain)
    print("\tNumber of testing examples = %d" % nTest)
    return X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest

for name in data_set:
    X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest = get_data(name)
Any help would be appreciated, thanks in advance.
Use a dictionary:
train = pd.read_csv(dataDir + trainFile)
valid = pd.read_csv(dataDir + validFile)
label = pd.read_csv(dataDir + labelFile)

data_sets = {
    'train': train,
    'label': label,
    'valid': valid
}
Then data_sets[data_set_name] will retrieve the dataset you want.
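With that change, the loop at the end of the question only needs to iterate over the dictionary (note that for name in data_set: is also a typo for data_sets); a minimal sketch:

for name in data_sets:
    X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest = get_data(name)

Keep in mind that the dictionary entries are plain DataFrames, not scikit-learn Bunch objects, so the data.data, data.target line inside get_data will still fail; you would need to select your actual feature and label columns from the DataFrame there instead.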
I am trying to use hyperas to optimize my keras model but I keep getting NameError: processing (function_name) is not defined. I have already looked at this and this example from hyperas and done exactly that. It doesn't seem to work for me.
This is my code:
def processing():
    df = pd.read_json('balanced_all.json')

    def label(df):
        if df['rating'] < 3:
            return 0
        if df['rating'] > 3:
            return 1

    df['label'] = df.apply(lambda df: label(df), axis=1)
    df = df[['review_text', 'label']]

    maxlen = 100
    max_words = 2000

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(df['review_text'].values)
    sequences = tokenizer.texts_to_sequences(df['review_text'].values)
    word_index = tokenizer.word_index
    sequences = pad_sequences(sequences, maxlen=maxlen)
    labels = pd.get_dummies(df['label']).values

    glove_dir = '/home/uttam/Documents/Thesis/Glove'
    embeddings_index = {}
    f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    embedding_dim = 100
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    return sequences, labels, embedding_matrix
def data():
    sequences = processing()[0]
    labels = processing()[1]
    x_train, x_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.33, random_state=42)
    return x_train, y_train, x_test, y_test
def create_model(x_train, y_train, x_test, y_test):
    embedding_dim = 100
    max_words = 2000
    embedding_matrix = processing()[2]

    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=100))
    model.add(LSTM(128))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense(2, activation='sigmoid'))
    model.layers[0].set_weights([embedding_matrix])
    model.layers[0].trainable = False

    model.compile(optimizer={{choice(['rmsprop', 'adam', 'sgd'])}}, loss='binary_crossentropy', metrics=['acc'])

    result = model.fit(x_train, y_train, epochs=20, batch_size={{choice([64, 128])}}, validation_split=0.2)
    model.save('pre_trained_glove_model.h5')

    validation_acc = np.amax(result.history['val_acc'])
    print('Best validation acc of epoch:', validation_acc)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}
if __name__ == '__main__':
    best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials())
    x_train, y_train, x_test, y_test = data()
    print("Evaluation of best performing model:")
    print(best_model.evaluate(x_test, y_test))
    print("Best performing model chosen hyper-parameters:")
    print(best_run)
I don't even need the intermediate function; I had to create it because hyperas didn't find the global variable. For example, if I had a variable x outside the hyperas function, say outside create_model(), it would say NameError: x is not defined.
I need this because, as you can see, I am using a pre-trained GloVe embedding. I can't put everything in either data() or create_model(). For example, data() needs the variables sequences and labels, and create_model() needs the variable embedding_matrix, so there is no way (as far as I know) to split everything between the two functions.
The only way this worked for me was by putting everything in both data() and create_model(), which is definitely not efficient and not the way to do it.
A little bit late, but for future reference: you are right, hyperas doesn't recognize global variables. You can pass the function in a list of functions to minimize:
best_run, best_model = optim.minimize(model=create_model,
                                      data=data,
                                      functions=[processing],  # <<
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials())
As you mentioned, if you need to pass a global variable to hyperas, you can choose one of these options:
Using data():
def data():
    ## ... my code ...
    return x_train, y_train, x_test, y_test, foo

def create_model(x_train, y_train, x_test, y_test, foo):
or define a new function and pass it in the list of functions:
def my_funct():
    return foo

def data():
    return x_train, y_train, x_test, y_test

def create_model(x_train, y_train, x_test, y_test):
    foo = my_funct()

best_run, best_model = optim.minimize(model=create_model,
                                      data=data,
                                      functions=[my_funct],  # << foo
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials())
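Applied to the code in the question, a minimal sketch using the first option (returning the extra object from data()) could look like this; it also calls processing() only once instead of three times:

def data():
    sequences, labels, embedding_matrix = processing()   # single call, all three results
    x_train, x_test, y_train, y_test = train_test_split(sequences, labels,
                                                        test_size=0.33, random_state=42)
    return x_train, y_train, x_test, y_test, embedding_matrix

def create_model(x_train, y_train, x_test, y_test, embedding_matrix):
    # ... build, fit, and return the model exactly as before, but use the
    # embedding_matrix argument instead of calling processing() again ...
    ...

best_run, best_model = optim.minimize(model=create_model,
                                      data=data,
                                      functions=[processing],
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials())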
I have the following code and sample, which works fine and does exactly what I want:
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
import datetime
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
valid_set_size_percentage = 3
test_set_size_percentage = 3
seq_len = 5 # choose sequence length
df = pd.read_csv("Test.csv", encoding = 'utf-16',sep=',',index_col = 0)
df.head()
def normalize_data(df):
    cols = list(df.columns.values)
    min_max_scaler = sklearn.preprocessing.MinMaxScaler()
    df = pd.DataFrame(min_max_scaler.fit_transform(df.values))
    df.columns = cols
    return df
def load_data(stock, seq_len):
    data_raw = stock.as_matrix()  # convert to numpy array
    data = []
    print(data_raw.shape)
    for index in range(len(data_raw) - seq_len):
        data.append(data_raw[index: index + seq_len])
    data = np.array(data)

    valid_set_size = int(np.round(valid_set_size_percentage / 100 * data.shape[0]))
    test_set_size = int(np.round(test_set_size_percentage / 100 * data.shape[0]))
    train_set_size = data.shape[0] - (valid_set_size + test_set_size)

    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, :4]
    x_valid = data[train_set_size:train_set_size + valid_set_size, :-1, :]
    y_valid = data[train_set_size:train_set_size + valid_set_size, -1, :4]
    x_test = data[train_set_size + valid_set_size:, :-1, :]
    y_test = data[train_set_size + valid_set_size:, -1, :4]

    return [x_train, y_train, x_valid, y_valid, x_test, y_test]
df_stock = df.copy()
cols = list(df_stock.columns.values)
print('df_stock.columns.values = ', cols)
df_stock_norm = df_stock.copy()
df_stock_norm = normalize_data(df_stock_norm)
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(df_stock_norm, seq_len)
print(y_train[:2])
print('x_train.shape = ',x_train.shape)
print('y_train.shape = ', y_train.shape)
print('Inputs = ',x_train.shape[2])
print('Outputs = ', y_train.shape[1])
print('x_valid.shape = ',x_valid.shape)
print('y_valid.shape = ', y_valid.shape)
print('x_test.shape = ', x_test.shape)
print('y_test.shape = ',y_test.shape)
index_in_epoch = 0
perm_array = np.arange(x_train.shape[0])
np.random.shuffle(perm_array)

def get_next_batch(batch_size):
    global index_in_epoch, x_train, perm_array
    if index_in_epoch > x_train.shape[0]:
        start = 0  # start next epoch
        index_in_epoch = 0  # batch_size
    start = index_in_epoch
    index_in_epoch += batch_size
    end = index_in_epoch
    return x_train[perm_array[start:end]], y_train[perm_array[start:end]]
n_steps = seq_len - 1
n_inputs = x_train.shape[2]
n_neurons = 100
n_outputs = y_train.shape[-1]
n_layers = 2
learning_rate = 0.001
batch_size = 10
n_epochs = 100
train_set_size = x_train.shape[0]
test_set_size = x_test.shape[0]
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None,n_outputs])
layers = [tf.contrib.rnn.LSTMCell(num_units=n_neurons,
                                  activation=tf.nn.leaky_relu, use_peepholes=True)
          for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])
outputs = outputs[:,n_steps-1,:] # keep only last output of sequence
loss = tf.reduce_mean(tf.squared_difference(outputs, y)) # loss function = mean squared error
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for iteration in range(int(n_epochs * train_set_size / batch_size)):
        x_batch, y_batch = get_next_batch(batch_size)  # fetch the next training batch
        sess.run(training_op, feed_dict={X: x_batch, y: y_batch})
        if iteration % int(1 * train_set_size / batch_size) == 0:
            mse_train = loss.eval(feed_dict={X: x_train, y: y_train})
            mse_valid = loss.eval(feed_dict={X: x_valid, y: y_valid})
            mse_test = loss.eval(feed_dict={X: x_test, y: y_test})
            print('%.2f epochs: MSE train/valid/test = %.3f/%.3f/%.3f' % (
                iteration * batch_size / train_set_size, mse_train, mse_valid, mse_test))
            try:
                save_path = saver.save(sess, "modelfile\\model" + str(iteration) + ".ckpt")
            except Exception as e:
                print(e)
                if not os.path.exists("modelfile\\"):
                    os.makedirs("modelfile\\")
                save_path = saver.save(sess, "modelfile\\model" + str(iteration) + ".ckpt")
The following is a sample of the data I am trying to run this on (a screenshot of the same data is linked in the original post).
I would like to add the TensorBoard Projector to my code, but I could not figure out how to do it. I want to visualize the different inputs I am giving to my training. I am supplying the following columns and trying to predict the OHLC values:
'o', 'h', 'l', 'c', 'rel1', 'rel2', 'rel3', 'rel4', 'rel5', 'rel6', 'rel7', 'rel8'
I want to visualize the above columns in the Projector to see how they relate to each other in producing the output.
Please let me know what I can do to achieve this.
EDITED:
I have tried the following but cannot see the Projector tab:
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None,n_outputs])
symbols = tf.placeholder(tf.int32, [None, 1], name='stock_labels')
embed_matrix = tf.Variable(
    tf.random_uniform([1, n_inputs], 0.0, 1.0),
    name="embed_matrix"
)
stacked_symbols = tf.tile(symbols, [batch_size,n_steps], name='stacked_stock_labels')
stacked_embeds = tf.nn.embedding_lookup(embed_matrix, stacked_symbols)
# stacked_embeds = tf.nn.embedding_lookup(embed_matrix)
# After concat, inputs.shape = (batch_size, num_steps, input_size + embed_size)
inputs_with_embed = tf.concat([X, stacked_embeds], axis=2, name="inputs_with_embed")
embed_matrix_summ = tf.summary.histogram("embed_matrix", embed_matrix)
And edited the following lines in the session code:
merged_sum = tf.summary.merge_all()
global_step = 0
# Set up the logs folder
writer = tf.summary.FileWriter('logs')
writer.add_graph(sess.graph)
projector_config = projector.ProjectorConfig()
# You can add multiple embeddings. Here we add only one.
added_embed = projector_config.embeddings.add()
added_embed.tensor_name = embed_matrix.name
# Link this tensor to its metadata file (e.g. labels).
shutil.copyfile("logs\\metadata.tsv", "logs\\metadata1.tsv")
added_embed.metadata_path = "metadata.tsv"
# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
# read this file during startup.
projector.visualize_embeddings(writer, projector_config)
sess.run(tf.global_variables_initializer())
if iteration % int(1 * train_set_size / batch_size) == 0:
    global_step += 1
    mse_train = loss.eval(feed_dict={X: x_train, y: y_train})
    mse_valid = loss.eval(feed_dict={X: x_valid, y: y_valid})
    mse_test = loss.eval(feed_dict={X: x_test, y: y_test})
    _, train_merge = sess.run([outputs, merged_sum], feed_dict={X: x_train, y: y_train})
    writer.add_summary(train_merge, global_step=global_step)
Here is the metadata.tsv file.
Please let me know what I missed.
I am working with the dbpedia training set (small_train, small_test). Using estimators with bag_of_words_model gives an accuracy of 0.70.
Tensorflow version 0.9
Why is the following code behaving differently than the estimators?
In estimators, graph_action's _run_with_monitors does the same thing by running the train_op for the given number of steps:
outputs = session.run(tensors, feed_dict=feed_dict)
while the same model implemented as below gives an accuracy of 0.67:
with tf.Graph().as_default():
    sess = tf.Session()
    tf.set_random_seed(1618)

    keys_placeholder = tf.placeholder(tf.int64, shape=(None,), name='keys')
    keys = tf.identity(keys_placeholder)
    labels_placeholder = tf.placeholder(tf.int64, shape=(None,), name='labels')
    input_placeholder = tf.placeholder(tf.int64, shape=(None, 10), name='input')

    inputs = {'key': keys_placeholder.name, 'inputs_': input_placeholder.name}
    tf.add_to_collection('inputs', json.dumps(inputs))

    target = tf.one_hot(labels_placeholder, 15, 1, 0)
    word_vectors = learn.ops.categorical_variable(input_placeholder, n_classes=n_words,
                                                  embedding_size=EMBEDDING_SIZE, name='words')
    features = tf.reduce_max(word_vectors, reduction_indices=1)
    prediction, loss = learn.models.logistic_regression(features, target)
    prediction = tf.argmax(prediction, 1)
    train_op = tf.contrib.layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(),
        optimizer='Adam', learning_rate=0.01)

    outputs = {'key': keys.name, 'prediction': prediction.name}
    tf.add_to_collection('outputs', json.dumps(outputs))

    init = tf.initialize_all_variables()
    sess.run(init)

    for step in xrange(100):
        fill_feed_dict = {
            keys_placeholder: np.array(range(len(x_train))),
            labels_placeholder: y_train,
            input_placeholder: x_train}
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss], feed_dict=fill_feed_dict)
        duration = time.time() - start_time

    print("Done Training")
    y_predicted = sess.run(prediction, feed_dict={input_placeholder: x_test})
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
I am reading the data as follows:
global n_words
training_set = pandas.read_csv('dbpedia_data/dbpedia_csv/train_small.csv', header=None)
testing_set = pandas.read_csv('dbpedia_data/dbpedia_csv/test_small.csv', header=None)
x_train = training_set[2]
y_train = training_set[0]
x_test = testing_set[2]
y_test = testing_set[0]
# Process vocabulary
np.random.seed(1)
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
x_train = np.array(list(vocab_processor.fit_transform(x_train)))
x_test = np.array(list(vocab_processor.transform(x_test)))
n_words = len(vocab_processor.vocabulary_)