I'm using the TensorFlow tf.data.Dataset API as my input pipeline, as follows:
train_dataset = tf.data.Dataset.from_tensor_slices((trn_X, trn_y))
train_dataset = train_dataset.map(_trn_parse_function, num_parallel_calls=12)
train_dataset = train_dataset.shuffle(buffer_size=1000).repeat(args.num_epochs)  # .batch(args.batch_size)
train_dataset = train_dataset.apply(tf.contrib.data.batch_and_drop_remainder(args.batch_size))
train_dataset = train_dataset.prefetch(buffer_size=600)
val_dataset = tf.data.Dataset.from_tensor_slices((val_X, val_y))
val_dataset = val_dataset.map(_val_parse_function, num_parallel_calls=4)
val_dataset = val_dataset.repeat(1)
val_dataset = val_dataset.apply(tf.contrib.data.batch_and_drop_remainder(args.batch_size))
val_dataset = val_dataset.prefetch(buffer_size=200)
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, train_dataset.output_types, train_dataset.output_shapes)
images, labels = iterator.get_next()
train_iter = train_dataset.make_initializable_iterator()
val_iter = val_dataset.make_initializable_iterator()
Then I use this code to switch between the training and validation datasets:
# Define training and validation handles
training_handle = sess.run(train_iter.string_handle())
validation_handle = sess.run(val_iter.string_handle())
sess.run(train_iter.initializer)
sess.run(val_iter.initializer)
...
loss = sess.run([train_op], feed_dict={handle: training_handle,
                                       is_training: True})
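Switching to the validation side is then the same run with the other handle (a sketch; val_loss_op is a stand-in for whatever op you evaluate during validation):

# Re-initialize the validation iterator, then feed its handle
sess.run(val_iter.initializer)
val_loss = sess.run([val_loss_op], feed_dict={handle: validation_handle,
                                              is_training: False})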
After training, I save the weights and then freeze the graph from a saved checkpoint (.meta) into the .pb format. Subsequently, I run the optimize_for_inference.py tool provided in the TensorFlow repo. This script requires the input node names to be defined, and I am unable to determine which is the correct input node for the graph. Here are the nodes in my graph:
['Variable/initial_value',
'Variable',
'Variable/Assign',
'Variable/read',
'increment_global_step/value',
'increment_global_step',
'Placeholder',
'is_training',
'tensors/component_0',
'tensors/component_1',
'num_parallel_calls',
'batch_size',
'count',
'buffer_size',
'OneShotIterator',
'IteratorToStringHandle',
'IteratorGetNext',
....
....
'output/Softmax']
The output nodes can be easily determined, but not the input nodes.
handle = tf.placeholder(tf.string, shape=[]) is your input, so the tensor is most likely 'Placeholder:0'.
However, it would make more sense to write:
handle = tf.placeholder(tf.string, shape=[], name="input_placeholder")
then you know for sure.
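With a named placeholder, the tool invocation is unambiguous. A sketch (the file names are placeholders, and the flag names should be double-checked against your TF checkout):

# optimize a frozen graph, naming the input and output nodes explicitly
python -m tensorflow.python.tools.optimize_for_inference \
    --input frozen_graph.pb \
    --output optimized_graph.pb \
    --input_names input_placeholder \
    --output_names output/Softmax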
I am using DistilBERT to do sentiment analysis on my dataset. The dataset contains text and a label for each row which identifies whether the text is a positive or negative movie review (eg: 1 = positive and 0 = negative). Here is the code from the huggingface documentation (https://huggingface.co/transformers/custom_datasets.html?highlight=imdb)
#This dataset can be explored in the Hugging Face model hub (IMDb), and can be alternatively downloaded with the 🤗 Datasets library with load_dataset("imdb").
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xf aclImdb_v1.tar.gz
#This data is organized into pos and neg folders with one text file per example. Let’s write a function that can read this in.
from pathlib import Path
def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            # use '==' rather than 'is': identity comparison on strings is unreliable
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
import torch
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
# Now that our datasets are ready, we can fine-tune a model either with the
# 🤗 Trainer/TFTrainer or with native PyTorch/TensorFlow. See training.
# Fine-tuning with Trainer
# The steps above prepared the datasets in the way that the Trainer expects.
# Now all we need to do is create a model to fine-tune, define the
# TrainingArguments/TFTrainingArguments and instantiate a Trainer/TFTrainer.
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset      # evaluation dataset
)
trainer.train()
# We can also train with native PyTorch/TensorFlow
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
model.eval()
I want to test this model on a new piece of data. So, I have a dataframe which contains a piece of text/review in each row, and I want to predict its label. Does anyone know how I would go about doing that? I apologize, I am very new to this and would greatly appreciate any help! I tried taking in text, cleaning it, and then doing
prediction = model.predict(text)
and I got an error saying DistilBERT has no attribute .predict.
If you just want to use the model, you can use the corresponding pipeline:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
Then you can use it:
classifier("I hate this book")
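By default this downloads a stock pretrained sentiment model. To run your own fine-tuned model through the same interface, you can pass the model and tokenizer in explicitly (a sketch, reusing the model and tokenizer objects from the training code above):

# point the pipeline at the fine-tuned model instead of the default one
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
classifier("I hate this book")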
The code that you've shared from the documentation essentially covers the training and evaluation loop. Beware that your shared code contains two ways of fine-tuning: once with the Trainer, which also includes evaluation, and once with native PyTorch/TF, which contains just the training portion and not the evaluation portion.
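As an aside, if you stay on the Trainer route, it can generate test-set predictions directly; a sketch (Trainer.predict returns a named tuple whose predictions field holds the logits):

raw_pred = trainer.predict(test_dataset)
test_logits = raw_pred.predictions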
Here is how the native method can be tweaked to generate predictions on the test set:
# Wrap the test set in a DataLoader so we iterate over batches,
# not individual samples
test_loader = DataLoader(test_dataset, batch_size=64)

# Put model in evaluation mode
model.eval()

# Tracking variables for storing ground truth and predictions
predictions, true_labels = [], []

# Prediction loop
for batch in test_loader:
    # Unpack the inputs from our dataloader and move to GPU/accelerator
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass; without labels, the first output is the logits
        # (with labels, outputs[0] would be the loss instead)
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
After the execution of this loop, predictions will contain the logits, i.e., the raw unnormalized scores from the model (run them through a softmax if you need an actual probability distribution).
You can use the following to pick the label with the maximum score from the logits and produce a classification report:

import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Combine the results across all batches
flat_predictions = np.concatenate(predictions, axis=0)
# For each sample, pick the label (0 or 1) with the higher score
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list
flat_true_labels = np.concatenate(true_labels, axis=0)
# Accuracy
print(accuracy_score(flat_true_labels, flat_predictions))
# Classification report
print(classification_report(flat_true_labels, flat_predictions))
For a more elegant way of performing predictions, you can create a BERTModel class that contains methods and variables for handling the tokenization, creation of dataloaders, running the predictions, etc.
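For the original question of labelling a dataframe of new reviews, a minimal sketch along those lines (it reuses model, tokenizer and device from above; df and its 'text' column are hypothetical names for your data):

def predict_texts(texts, batch_size=32):
    model.eval()
    preds = []
    for i in range(0, len(texts), batch_size):
        # Tokenize a batch of raw strings and move the tensors to the device
        enc = tokenizer(texts[i:i + batch_size], truncation=True,
                        padding=True, return_tensors='pt').to(device)
        with torch.no_grad():
            logits = model(**enc)[0]
        preds.extend(logits.argmax(dim=-1).cpu().tolist())
    return preds

df['predicted_label'] = predict_texts(df['text'].tolist())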
You can try code like this example: Link-BERT
Arrange your dataset according to what the BERT model expects; in section D of that link, you can just change the model name and swap in your own dataset.
I have this saved model and I want to restore it. After restoring, I want to evaluate it on a new dataset which I am feeding with a TensorFlow tf.data input pipeline.
import tensorflow as tf
from tfwrappers.tf_dataset import Dataset

tf.reset_default_graph()
with tf.Session() as sess:
    new_saver = tf.train.import_meta_graph('my_deep_model_2017.ckpt.meta')
    new_saver.restore(sess, tf.train.latest_checkpoint('./'))
    print("Restored Operations from MetaGraph:")
    g = tf.get_default_graph()

    batch_size = 128
    num_steps = 4
    train_init_op, test_init_op, Xtest, ytest = Dataset(year=2017, batch_size=batch_size).build_iterator()

    accuracy_update_op = g.get_tensor_by_name('LSTM/Accuracy/accuracy/update_op:0')
    accuracy = g.get_tensor_by_name('LSTM/Accuracy/accuracy/value:0')
    auc_update_op = g.get_tensor_by_name('LSTM/AUC/auc/update_op:0')
    auc = g.get_tensor_by_name('LSTM/AUC/auc/value:0')

    total_test_batch = int((400000 / (num_steps * batch_size)) + 1)
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()
    sess.run(test_init_op)
    for _ in range(total_test_batch):
        sess.run([auc_update_op, accuracy_update_op])
    accuracy_test = sess.run(accuracy)
    AUC_test = sess.run(auc)
    print("Test accuracy: {:>.2%}".format(accuracy_test), "Test AUC: {:>.2%}".format(AUC_test))
The error I get is FailedPreconditionError: GetNext() failed because the iterator has not been initialized. However, I do initialize the iterator with sess.run(test_init_op).
The Dataset module I have is pretty basic, with a Python generator reading data points from an SQL database and creating a Dataset object.
def build_iterator(self):
    with tf.name_scope("Data"):
        train_generator = PairGenerator(sql='*SQL QUERY 1*'.format(self.year), max_rows=1600400)
        validation_generator = PairGenerator(sql='*SQL QUERY 2*'.format(self.year), max_rows=400000)
        train_dataset = tf.data.Dataset.from_generator(
            lambda: train_generator, (tf.float32, tf.int32),
            (tf.TensorShape([self.num_steps, self.num_inputs]), tf.TensorShape([self.num_steps])))
        train_dataset = train_dataset.apply(tf.contrib.data.map_and_batch(
            map_func=lambda *x: (x[0], tf.cast(tf.one_hot(x[1], self.num_classes), tf.int32)),
            batch_size=self.batch_size, num_parallel_calls=self.num_parallel_calls,
            drop_remainder=False)).prefetch(self.prefetch_batch_buffer).repeat(self.num_epochs)
        validation_dataset = tf.data.Dataset.from_generator(
            lambda: validation_generator, (tf.float32, tf.int32),
            (tf.TensorShape([self.num_steps, self.num_inputs]), tf.TensorShape([self.num_steps])))
        validation_dataset = validation_dataset.apply(tf.contrib.data.map_and_batch(
            map_func=lambda *x: (x[0], tf.cast(tf.one_hot(x[1], self.num_classes), tf.int32)),
            batch_size=self.batch_size, num_parallel_calls=self.num_parallel_calls,
            drop_remainder=False)).prefetch(self.prefetch_batch_buffer).repeat(self.num_epochs)
        iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
        training_init_op = iterator.make_initializer(train_dataset, name='training_init_op')
        validation_init_op = iterator.make_initializer(validation_dataset, name='validation_init_op')
        X, y = iterator.get_next(name='get_next_datapoint')
        return training_init_op, validation_init_op, X, y
Most of the solutions out there are about restoring an iterator and feeding it a new dataset; I could not come up with a working solution from those.
EDIT: I forgot to say that this saved model was trained with another tf.data Dataset object.
Summary: according to the documentation, Keras model.fit() should accept a tf.data Dataset as input (I am using TF version 1.12.0). I can train my model if I run the training steps manually, but calling model.fit() on the same model gives an error I cannot resolve.
Here is a sketch of what I did: my dataset, which is too big to fit in memory, consists of many files, each with a different number of rows of (100 features, label). I'd like to use tf.data to build my data pipeline:
def data_loader(filename):
    '''load a single data file with many rows'''
    features, labels = load_hdf5(filename)
    ...
    return features, labels

def make_dataset(filenames, batch_size):
    '''read files one by one, pick individual rows, batch them and repeat'''
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.map(  # Problem here! See edit for solution
        lambda filename: tuple(tf.py_func(data_loader, [filename], [tf.float32, tf.float32])))
    dataset = dataset.flat_map(
        lambda features, labels: tf.data.Dataset.from_tensor_slices((features, labels)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat()
    dataset = dataset.prefetch(1000)
    return dataset
_BATCH_SIZE = 128
training_set = make_dataset(training_files, batch_size=_BATCH_SIZE)
I'd like to try a very basic logistic regression model:
inputs = tf.keras.layers.Input(shape=(100,))
outputs = tf.keras.layers.Dense(1, activation='softmax')(inputs)
model = tf.keras.Model(inputs, outputs)
If I train it manually everything works fine, e.g.:
labels = tf.placeholder(tf.float32)
loss = tf.reduce_mean(tf.keras.backend.categorical_crossentropy(labels, outputs))
train_step = tf.train.GradientDescentOptimizer(.05).minimize(loss)

iterator = training_set.make_one_shot_iterator()
next_element = iterator.get_next()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    for i in range(training_size // _BATCH_SIZE):
        x, y = sess.run(next_element)
        train_step.run(feed_dict={inputs: x, labels: y})
However, if I instead try to use model.fit like this:
model.compile('adam', 'categorical_crossentropy', metrics=['acc'])
model.fit(training_set.make_one_shot_iterator(),
          steps_per_epoch=training_size // _BATCH_SIZE,
          epochs=1,
          verbose=1)
I get the error ValueError: Cannot take the length of Shape with unknown rank. inside Keras's _standardize_user_data function.
I have tried quite a few things but could not resolve the issue. Any ideas?
Edit: based on #kvish's answer, the solution was to change the map from a lambda to a function that specifies the correct tensor dimensions, e.g.:
def data_loader(filename):
    def loader_impl(filename):
        features, labels, _ = load_hdf5(filename)
        ...
        return features, labels

    features, labels = tf.py_func(loader_impl, [filename], [tf.float32, tf.float32])
    features.set_shape((None, 100))
    labels.set_shape((None, 1))
    return features, labels
and now, all that's needed is to call this function from map:
dataset = dataset.map(data_loader)
tf.py_func probably produces a tensor with an unknown shape, which Keras cannot infer. We can set the shape of the tensor it returns using the set_shape(your_shape) method, and that helps Keras infer the shape of the result.
I have a very simple question. I have a Keras model (TF backend) defined for classification. I want to dump the training images fed into my model during training for debugging purposes. I am trying to create a custom callback that writes Tensorboard image summaries for this.
But how can I obtain the real training data inside the callback?
Currently I am trying this:
class TensorboardKeras(Callback):
    def __init__(self, model, log_dir, write_graph=True):
        self.model = model
        self.log_dir = log_dir
        self.session = K.get_session()

        tf.summary.image('input_image', self.model.input)
        self.merged = tf.summary.merge_all()

        if write_graph:
            self.writer = tf.summary.FileWriter(self.log_dir, K.get_session().graph)
        else:
            self.writer = tf.summary.FileWriter(self.log_dir)

    def on_batch_end(self, batch, logs=None):
        summary = self.session.run(self.merged, feed_dict={})
        self.writer.add_summary(summary, batch)
        self.writer.flush()
But I am getting the error:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'input_1' with dtype float and shape [?,224,224,3]
There must be a way to see what models get as input, right?
Or maybe I should try another way to debug it?
You don't need callbacks for this. All you need to do is implement a function that yields an image and its label as a tuple. The flow_from_directory function has a parameter called save_to_dir which could satisfy all of your needs; in case it doesn't, here is what you can do:
def trainGenerator(batch_size, train_path, image_size, seed=1):
    # preprocessing, see https://keras.io/preprocessing/image/ for details
    image_datagen = ImageDataGenerator(horizontal_flip=True)

    # create image generator, see https://keras.io/preprocessing/image/#flow_from_directory for details
    # note: save_prefix only takes effect if save_to_dir is also set
    train_generator = image_datagen.flow_from_directory(
        train_path,
        class_mode="categorical",
        target_size=image_size,  # expects a (height, width) tuple
        batch_size=batch_size,
        save_prefix="augmented_train",
        seed=seed)

    for (batch_imgs, batch_labels) in train_generator:
        # do other stuff such as dumping images or further augmenting images
        yield (batch_imgs, batch_labels)

t_generator = trainGenerator(32, "./train_data", (224, 224))
model.fit_generator(t_generator, steps_per_epoch=10, epochs=1)
When following the TensorFlow image classification tutorial, it first caches the bottleneck of each image (see the cache_bottlenecks() function).
I have rewritten the training using TensorFlow's Estimator, which really simplified all the code. However, I want to cache the bottleneck features here as well.
Here is my model_fn. I want to cache the results of the dense layer so I can make changes to the actual training without having to compute the bottlenecks each time.
How can I accomplish that?
def model_fn(features, labels, mode, params):
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    num_classes = len(params['label_vocab'])

    module = hub.Module(params['module_spec'], trainable=is_training and params['train_module'])
    bottleneck_tensor = module(features['image'])

    with tf.name_scope('final_retrain_ops'):
        logits = tf.layers.dense(bottleneck_tensor, units=num_classes, trainable=is_training)  # save this?

    def train_op_fn(loss):
        optimizer = tf.train.AdamOptimizer()
        return optimizer.minimize(loss, global_step=tf.train.get_global_step())

    head = tf.contrib.estimator.multi_class_head(n_classes=num_classes, label_vocabulary=params['label_vocab'])
    return head.create_estimator_spec(
        features, mode, logits, labels, train_op_fn=train_op_fn
    )
TF cannot cache intermediate results the way your code is written. You should:
Export the bottlenecks to a file from the raw net.
Use the bottleneck results as input and train another net on your data.
To expand on what #Feng said:
see TFRecords and TFExamples and Load Images
Something like this should work (untested):
# Serialize the data into two tfrecord files
tf.enable_eager_execution()
feature_extractor = ...
features_file = tf.python_io.TFRecordWriter('features.tfrec')
label_file = tf.python_io.TFRecordWriter('labels.tfrec')

for images, labels in dataset:
    features = feature_extractor(images)
    # .numpy() turns the serialized tensor into the bytes the writer expects
    features_file.write(tf.serialize_tensor(features).numpy())
    label_file.write(tf.serialize_tensor(labels).numpy())

# Parse the files and zip them together
def parse(dtype, shape):
    def _parse(x):
        result = tf.parse_tensor(x, out_type=dtype)
        result = tf.reshape(result, shape)
        return result
    return _parse

AUTOTUNE = tf.data.experimental.AUTOTUNE  # or a fixed number of parallel calls

features_ds = tf.data.TFRecordDataset('features.tfrec')
features_ds = features_ds.map(parse(tf.float32, FEATURE_SHAPE), num_parallel_calls=AUTOTUNE)
labels_ds = tf.data.TFRecordDataset('labels.tfrec')
# assumes a LABEL_SHAPE analogous to FEATURE_SHAPE
labels_ds = labels_ds.map(parse(tf.float32, LABEL_SHAPE), num_parallel_calls=AUTOTUNE)

# zip takes a single (nested) structure of datasets
ds = tf.data.Dataset.zip((features_ds, labels_ds))
ds = ds.unbatch().shuffle(...).repeat().batch(...).prefetch(...)  # fill in buffer/batch sizes
You might also be able to do it using Dataset.cache, but I'm not 100% sure of the details.
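For what it's worth, a minimal sketch of the Dataset.cache approach might look like this (assuming dataset yields (image, label) pairs and feature_extractor is the hub module from the question; the cache file path is arbitrary):

bottlenecks = dataset.map(lambda img, lbl: (feature_extractor(img), lbl))
# After the first full pass, batches are served from the on-disk cache file
# instead of recomputing the bottlenecks
bottlenecks = bottlenecks.cache('/tmp/bottleneck_cache')
bottlenecks = bottlenecks.shuffle(1000).repeat().batch(32).prefetch(1)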