feature_columns = []
for feature_name in train.columns.tolist():
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))
# Use the entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use batching.
        dataset = dataset.batch(NUM_EXAMPLES)
        return dataset
    return input_fn
# Training and evaluation input functions.
train_input_fn = make_input_fn(X_train, y_train)
eval_input_fn = make_input_fn(X_test, y_test, shuffle=False, n_epochs=1)

n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)
est.train(train_input_fn, max_steps=100)
result = est.evaluate(eval_input_fn)
result
I built a boosted decision tree model. Everything seems to work: it trains and evaluates on the validation data. But I just can't run it on the test sample:

test_input_fn = tf.data.Dataset.from_tensors(dict(X))
prediction = list(est.predict(test_input_fn))

Here is the tutorial I followed:
https://www.tensorflow.org/tutorials/estimator/boosted_trees
and this is where I read about all the parameters. I just can't figure out how to get predictions on the test sample:
https://www.tensorflow.org/api_docs/python/tf/estimator/BoostedTreesClassifier
The fix is to build the test input function with the same make_input_fn used for training and evaluation, since est.predict expects an input function, not a dataset. Labels aren't used during prediction, so a placeholder such as test.index will do:

test_input_fn = make_input_fn(test, test.index, shuffle=False, n_epochs=1)
preds = est.predict(test_input_fn)
preds = [pred['class_ids'][0] for pred in preds]
pd.DataFrame({'PassengerId': dataTest.PassengerId,
              'Survived': preds}).to_csv('submission.csv', index=False)
!head submission.csv
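Alternatively, since predict() never looks at labels, you can use a label-free input function. A minimal sketch (my own variation on the tutorial code, assuming test is a pandas DataFrame with the same feature columns as train):

def make_predict_input_fn(X):
    def input_fn():
        # Features only: predict() does not need labels.
        return tf.data.Dataset.from_tensor_slices(dict(X)).batch(NUM_EXAMPLES)
    return input_fn

preds = [pred['class_ids'][0] for pred in est.predict(make_predict_input_fn(test))]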
I have tried many improvements, like increasing the number of epochs, using better loss functions and optimizers, deepening the network, and shuffling the dataset, but still to no avail. This problem has been bothering me for a long time; thanks for your help. Below is my code.

Load and process the dataset (updated):
def Iris_Reader(dataset):
    train_data, test_data, train_label, test_label = train_test_split(dataset.data, dataset.target, test_size=0.4)
    # scaler = StandardScaler()
    # train_data = scaler.fit_transform(train_data)
    # test_data = scaler.transform(test_data)
    return torch.FloatTensor(train_data), torch.LongTensor(train_label), torch.FloatTensor(test_data), torch.LongTensor(test_label)
Define the classifier
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        # 4-3-3 network
        self.model = nn.Sequential(
            nn.Linear(4, 3),
            nn.ReLU(),
            nn.Linear(3, 3),
        )
        # SGD optimiser
        self.optimiser = torch.optim.SGD(self.parameters(), lr=0.1)
        # cross-entropy loss function
        self.loss_fn = nn.CrossEntropyLoss()
        self.counter = 0
        self.progress = []

    def forward(self, input):
        return self.model(input)

    # note: this overrides nn.Module.train(), which normally toggles training mode
    def train(self, input, target):
        output = self.forward(input)
        loss = self.loss_fn(output, target)
        self.counter += 1
        self.progress.append(loss.item())
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

    # plot loss
    def plot_loss(self):
        plt.figure(dpi=100)
        plt.ylim([0, 1.0])
        plt.yticks([0, 0.25, 0.5, 1.0])
        plt.scatter(x=[i for i in range(len(self.progress))], y=self.progress, marker='.', alpha=0.2)
        plt.grid('on')
        plt.show()
TRAIN
C = Classifier()
epochs = 10
dataset = datasets.load_iris()
for epoch in range(epochs):
    train_data, train_label, _, _ = Iris_Reader(dataset)
    for i, j in zip(train_data, train_label):
        C.train(i, j)
TEST
score = 0
num = 0
# for epoch in range(epochs):
_, _, test_data, test_label = Iris_Reader(dataset)
for i, j in zip(test_data, test_label):
    output = C.forward(i).detach().argmax()
    if output == j:
        # print(C.forward(i).detach(), j)
        score += 1
    num += 1
print(score, num, round(score/num, 3))
OUTPUT: 53 60 0.883
There are a bunch of problems here:
First, you seem to shuffle the data and labels independently, rendering the dataset useless.
Also, you recreate the dataset inside the loop every epoch, wasting CPU time pointlessly.
Overall, the dataset creation can be shortened to something like this:

def Iris_Reader(dataset):
    train_data, test_data, train_label, test_label = sklearn.model_selection.train_test_split(dataset.data, dataset.target, test_size=0.2)
    return torch.FloatTensor(train_data), torch.LongTensor(train_label), torch.FloatTensor(test_data), torch.LongTensor(test_label)

and the call should be moved outside the training loop.
Next, MSELoss() is suited for regression; for classification, CrossEntropyLoss() is the default choice.
Using sigmoid as the activation in an intermediate layer is not the best choice either, especially with a small number of epochs; ReLU should converge much better.
Last but not least, your loss chart would look much cleaner if the values were averaged per epoch.
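For instance, a minimal sketch of that averaging (my own addition; it assumes the epochs variable from the training loop and the per-sample losses recorded in C.progress):

losses = C.progress
samples_per_epoch = len(losses) // epochs
epoch_means = [
    sum(losses[k:k + samples_per_epoch]) / samples_per_epoch
    for k in range(0, samples_per_epoch * epochs, samples_per_epoch)
]
plt.plot(epoch_means)
plt.show()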
Update: here is an implementation that ensures the target has the same size as the network output, with additional feature scaling:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def Iris_Reader(dataset):
    label = nn.functional.one_hot(torch.LongTensor(dataset.target), num_classes=3).float()
    train_data, test_data, train_label, test_label = train_test_split(dataset.data, label, test_size=0.2)
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    return torch.FloatTensor(train_data), train_label, torch.FloatTensor(test_data), test_label
Oh, and you should probably also remove the final Sigmoid(), since CrossEntropyLoss() applies log-softmax anyway.
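As a quick sanity check (my own illustration, not from the original post), CrossEntropyLoss really is log-softmax followed by NLLLoss, so the network should output raw logits:

import torch
import torch.nn as nn

logits = torch.randn(5, 3)
target = torch.tensor([0, 2, 1, 1, 0])
ce = nn.CrossEntropyLoss()(logits, target)
nll = nn.NLLLoss()(nn.functional.log_softmax(logits, dim=1), target)
print(torch.allclose(ce, nll))  # True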
I'm using Tensorflow to train a network to predict the third item in a list of numbers.
When I train, the network appears to train quite well and do well on both the training and test set. However, when I evaluate its performance myself, it seems to be doing quite poorly.
For example, at the end of training, Tensorflow says that the validation loss is about 2.1e-5, but when I compute it myself, I get about 0.17. What am I doing wrong?
Here's code that can be run on Google Colab:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

def create_dataset(k=5, n=2, example_amount=200):
    '''Create a dataset of numbers where the goal is to always output the nth number'''
    # UPGRADE: this could be done better with numpy to just generate all the examples at once
    example_amount = 1000  # note: this overrides the example_amount argument
    x = []
    y = []
    ans = [x, y]
    for i in range(example_amount):
        example_x = np.random.rand(k)
        example_y = example_x[n]
        x.append(example_x)
        y.append(example_y)
    return ans

def tensorize(tensor_like) -> tf.Tensor:
    '''Turn stuff into tensors'''
    return tf.convert_to_tensor(tensor_like, dtype=tf.float32)

def split_dataset(dataset, train_split=0.8, random_state=42):
    '''
    Takes in a list (or tuple) where index 0 contains the inputs and index 1 contains the outputs.
    Outputs x_train, x_test, y_train, y_test, train_indexes, test_indexes, all as tf.Tensor.
    '''
    indices = np.arange(len(dataset[0]))
    return tuple([tensorize(data) for data in train_test_split(dataset[0], dataset[1], indices, train_size=train_split, random_state=random_state)])
# how many numbers in each example
K = 5
# the index of the solution
N = 2
# how many examples
EXAMPLE_AMOUNT = 20000
# what percentage of the examples are in the training set
TRAIN_SPLIT = 0.5
# how long to train for
epochs = 50
dataset = create_dataset(K, N, EXAMPLE_AMOUNT)
x_train, x_test, y_train, y_test, train_indexes, test_indexes = split_dataset(dataset, train_split=TRAIN_SPLIT)
model_input = tf.keras.layers.Input(shape=(K,), name="input")
model_dense1 = tf.keras.layers.Dense(10, name="dense1")(model_input)
model_dense2 = tf.keras.layers.Dense(10, name="dense2")(model_dense1)
model_output = tf.keras.layers.Dense(1, name="output")(model_dense2)
model = tf.keras.Model(inputs=model_input, outputs=model_output)
model.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
history = model.fit(x=x_train, y=y_train, validation_data=(x_test, y_test), epochs=epochs)
# the validation loss as Tensorflow computes it
print(history.history["val_loss"][-1]) # 2.1036579710198566e-05
# the validation loss as I compute it
val_loss = tf.math.reduce_mean(tf.keras.losses.MSE(y_test, model.predict(x_test))).numpy()
print(val_loss) # 0.1655631
What you're missing is the shape of y_test:

y_test.numpy().shape
# (500,) <-- this is what causes the behaviour

model.predict(x_test) has shape (500, 1), so inside the MSE the two broadcast against each other to shape (500, 500), and you end up averaging every prediction against every label.
Simply reshape it like:
val_loss = tf.math.reduce_mean(tf.keras.losses.MSE(y_test.numpy().reshape(-1,1), model.predict(x_test))).numpy()
print(val_loss) # 1.1548506e-05
Also:
history.history["val_loss"][-1] # 1.1548506336112041e-05
Or you can flatten() both arrays when calculating it:
val_loss = tf.math.reduce_mean(tf.keras.losses.MSE(y_test.numpy().flatten(), model.predict(x_test).flatten())).numpy()
print(val_loss) # 1.1548506e-05
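To see why the shape matters, here is a quick illustration of the broadcasting (my own addition, with NumPy standing in for the tensors):

import numpy as np

a = np.zeros((500,))     # shaped like y_test
b = np.zeros((500, 1))   # shaped like model.predict(x_test)
print((a - b).shape)     # (500, 500): every label is paired with every prediction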
I want to append every other predicted image, and the corresponding real one, in my CNN code, but I am not sure how to implement it.
The code is as below:
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False,
                         num_workers=num_workers, pin_memory=True)
test_pred = []
test_real = []
model.eval()
with torch.no_grad():
    for data in test_loader:
        x_test_batch, y_test_batch = data[0].to(device, dtype=torch.float), data[1].to(device, dtype=torch.float)
        y_test_pred = model(x_test_batch)
        mse_val_loss = criterion(y_test_batch, y_test_pred, x_test_batch, mse)
        mae_val_loss = criterion(y_test_batch, y_test_pred, x_test_batch, l1loss)
        mse_val_losses.append(mse_val_loss.item())
        mae_val_losses.append(mae_val_loss.item())
        N_test.append(len(x_test_batch))
        test_pred.append(y_test_pred[::2])   # every other predicted image in the batch
        test_real.append(y_test_batch[::2])  # every other real image in the batch
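If you then want single tensors rather than lists of batch slices, a minimal sketch (my own suggestion) is to concatenate after the loop:

test_pred = torch.cat(test_pred, dim=0)   # every other predicted image, across all batches
test_real = torch.cat(test_real, dim=0)   # the matching real images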
I have read somewhere that you should not use data augmentation on your validation set, only on your training set.
My problem is this:
I have a dataset with a small number of training samples, and I want to use data augmentation.
I split the dataset into training and test sets and use data augmentation on the training set. I then use StratifiedKFold on the training set, which returns a train index and a test index. But if I use X_train[test_index] as my validation set, it contains some augmented images, and I don't want that.
Is there any way to do data augmentation on the training set and still do cross-validation?
Here is my code (I haven't done the data augmentation yet, but I would love a way to keep the test_index samples separate from the augmented training samples):
kfold = StratifiedKFold(n_splits=5, shuffle=True)
i = 1
for train_index, test_index in kfold.split(X_train, y_train):
    dataset_train = tf.data.Dataset.from_tensor_slices(
        (X_train[train_index], y_train.iloc[train_index])).shuffle(len(X_train[train_index]))
    dataset_train = dataset_train.batch(512, drop_remainder=True).repeat()
    dataset_test = tf.data.Dataset.from_tensor_slices(
        (X_train[test_index], y_train.iloc[test_index])).shuffle(len(X_train[test_index]))
    dataset_test = dataset_test.batch(32, drop_remainder=True).take(steps_per_epoch).repeat()
    model_1 = deep_neural()
    print('-' * 120)
    print('\n')
    print(f'Training for fold {i} ...')
    print('Training on {} samples.........Validating on {} samples'.format(
        len(X_train[train_index]), len(X_train[test_index])))
    checkpoint = tf.keras.callbacks.ModelCheckpoint(get_model_name(i),
                                                    monitor='val_loss', verbose=1,
                                                    save_best_only=True, mode='min')
    history = model_1.fit(dataset_train, steps_per_epoch=len(X_train[train_index]) // BATCH_SIZE,
                          epochs=4, validation_data=dataset_test,
                          validation_steps=1, callbacks=[csv_logger, checkpoint])
    scores = model_1.evaluate(X_test, y_test, verbose=0)
    pred_classes = model_1.predict(X_test).argmax(1)
    f1score = f1_score(y_test, pred_classes, average='macro')
    print('\n')
    print(f'Score for fold {i}: {model_1.metrics_names[0]} of {scores[0]}; '
          f'{model_1.metrics_names[1]} of {scores[1]*100}; F1 Score of {f1score}%')
    print('\n')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])
    f1score_per_fold.append(f1score)
    tf.keras.backend.clear_session()
    gc.collect()
    del model_1
    i = i + 1
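One way to keep the validation fold clean is to augment inside the loop, after the split, and only on the training fold. A minimal sketch (my own suggestion; augment_fn is a hypothetical per-image augmentation function, not from the original code):

for train_index, test_index in kfold.split(X_train, y_train):
    dataset_train = (tf.data.Dataset
                     .from_tensor_slices((X_train[train_index], y_train.iloc[train_index]))
                     .map(lambda x, y: (augment_fn(x), y))  # augmentation applied only here
                     .shuffle(len(train_index))
                     .batch(512, drop_remainder=True)
                     .repeat())
    # the validation fold is built without the map(), so it stays un-augmented
    dataset_val = (tf.data.Dataset
                   .from_tensor_slices((X_train[test_index], y_train.iloc[test_index]))
                   .batch(32))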
I have created a network using high-level TF APIs such as tf.estimator.
Training and evaluating work fine and produce output. However, when predicting on new data, get_inputs() requires label_data and batch_size.
The error is: TypeError: get_inputs() missing 2 required positional arguments: 'label_data' and 'batch_size'
How can I resolve this so I can make a prediction?
Here is my code:
predictTest = [0.34, 0.65, 0.88]
predictTest is just a test and won't be my real prediction data.
get_inputs(), where the error is thrown:
def get_inputs(feature_data, label_data, batch_size, n_epochs=None, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices(
        (feature_data, label_data))
    dataset = dataset.repeat(n_epochs)
    if shuffle:
        dataset = dataset.shuffle(len(feature_data))
    dataset = dataset.batch(batch_size)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
Prediction inputs:
def predict_input_fn():
    return get_inputs(
        predictTest,
        n_epochs=1,
        shuffle=False
    )
Predicting:
predict = estimator.predict(predict_input_fn)
print("Prediction: {}".format(list(predict)))
I worked out that I must create a new get_inputs() function for prediction.
If I use the get_inputs() that train and evaluate use, it expects label data it won't get.
get_inputs:
def get_inputs(feature_data, label_data, batch_size, n_epochs=None, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices(
        (feature_data, label_data))
    dataset = dataset.repeat(n_epochs)
    if shuffle:
        dataset = dataset.shuffle(len(feature_data))
    dataset = dataset.batch(batch_size)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
Make a new function called get_pred_inputs that doesn't require label_data or batch_size:
def get_pred_inputs(feature_data, n_epochs=None, shuffle=False):
    dataset = tf.data.Dataset.from_tensor_slices(feature_data)
    dataset = dataset.repeat(n_epochs)
    if shuffle:
        dataset = dataset.shuffle(len(feature_data))
    dataset = dataset.batch(1)
    # An input_fn may return a features-only dataset; predict() needs no labels.
    features = dataset
    return features
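With that, prediction can be wired up like this (a sketch under the same assumptions as the code above):

predictions = estimator.predict(
    input_fn=lambda: get_pred_inputs(predictTest, n_epochs=1, shuffle=False))
print("Prediction: {}".format(list(predictions)))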
Testing a model comes in two flavours:
1) If you want accuracy, recall, etc., you need to provide labels for the test data; if you don't provide labels, you will get an error.
2) If you just want to run the model without calculating accuracy, you don't need labels, but the prediction input function has to be different.