Using a custom generator with zip to train a model in Python

I wanted to use a custom generator function to train the model. I return 3 different outputs using yield and store them in separate variables with the zip function. I don't know whether it is possible to use the yielded values directly without storing them in separate variables, since two of them are used as inputs to model.fit() and the remaining one is the output. Whenever I call the generator function and try to store the yielded values, it overloads memory and restarts the kernel.
I have attached my Kaggle link as well as the generator code below.
Kaggle notebook link
def generator(photo, caption):
    # n_samples = 0
    # X = []
    # y_in = []
    # y_out = []
    for k, vv in caption.items():  # k is the image key (name), vv is the list of captions for that image
        for v in vv:  # loop over each caption in the list and encode it
            for i in range(1, len(v)):  # encode each word of the current caption
                in_seq = [v[:i]]  # the encoded part before the current word
                out_seq = v[i]  # the current word
                in_seq = pad_sequences(in_seq, maxlen=MAX_LEN, padding='post', truncating='post')[0]  # pad to MAX_LEN
                out_seq = to_categorical([out_seq], num_classes=VOCAB_SIZE)[0]  # one-hot encode the target word
                yield [photo[k], in_seq], out_seq  # values used to train the model

# Whenever I run this line the kernel dies because it runs out of memory:
X, y_in, y_out = zip(*generator(images_features, captions_dict))  # THIS IS THE ONE CAUSING ISSUES

model.fit([X, y_in], y_out, batch_size=50, epochs=50)  # training the model
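One way to avoid the memory blow-up (zip(*generator(...)) forces every sample to be materialized at once) is to wrap the generator in a tf.data.Dataset so batches are drawn lazily during training. The sketch below is an illustration under assumptions, not code tested against the notebook: it needs TF 2.4+ for output_signature, FEATURE_DIM is a placeholder for the size of your photo feature vectors, and the generator should yield tuples ((photo, in_seq), out_seq) rather than lists so the structure matches the signature.

import tensorflow as tf

FEATURE_DIM = 2048  # assumption: length of each photo feature vector; adjust to your encoder

dataset = tf.data.Dataset.from_generator(
    lambda: generator(images_features, captions_dict),
    output_signature=(
        (tf.TensorSpec(shape=(FEATURE_DIM,), dtype=tf.float32),   # photo[k]
         tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.int32)),        # padded in_seq
        tf.TensorSpec(shape=(VOCAB_SIZE,), dtype=tf.float32),     # one-hot out_seq
    ),
)

# Batches are built on the fly, so the whole dataset never sits in memory at once.
dataset = dataset.batch(50).prefetch(tf.data.AUTOTUNE)
model.fit(dataset, epochs=50)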

Related

How to use custom function with tensorflow dataset API?

I am new to TensorFlow's tf.data.Dataset and I am trying to use it on data that I loaded into a pandas DataFrame, as follows.
Load the input data (df_input):
   id   messages                Label
0  11   I am not driving home   0
1  11   Please pick me up       1
2  103  The car already park    1
3  103  No need for ticket      0
4  104  I will buy a car        1
5  104  I will buy truck        1
Then I preprocess the text and apply TextVectorization as follows:
text_vectorizer = layers.TextVectorization(max_tokens=20, output_mode="int", output_sequence_length=6)
text_vectorizer.adapt(df_input.message.values.tolist())

def encode(texts):
    encoded_texts = text_vectorizer(texts)
    return encoded_texts.numpy()

train_data = encode(df_input.message.values)  ## This is the training data
train_label = tf.keras.utils.to_categorical(df_input.label.values, 2)  ## These are the labels
Then I build the training dataset from the preprocessed data with tf.data.Dataset as follows:
train_dataset_df = (
    tf.data.Dataset.from_tensor_slices((train_data, train_label))
    .shuffle(1000)
    .batch(2)
)
My question is how I can transform the data in every training epoch by applying my custom function to the training data. I saw a usage example in this post that performs the transformation via the .map function:
train_dataset = train_dataset.batch(2).map(lambda x, y: (text_vectorizer(x), y))
My goal is to apply my custom function as follows (which reorders the words in text data):
def order_augment_sent(Sentence):
    words = Sentence.split(" ")
    words.sort()
    newSentence = " ".join(words)
    return newSentence

train_dataset_ds = (
    tf.data.Dataset.from_tensor_slices((train_data, train_label))
    .shuffle(1000)
    .batch(2)
    .map(lambda x, y: (order_augment_sent(x), y))
)
But I get this error:
AttributeError: 'Tensor' object has no attribute 'split'
Or, if I apply my other custom function, I get:
TypeError: To be compatible with tf.function, Python functions must return zero or more Tensors or ExtensionTypes or None values; in compilation of <function _tf_if_stmt.<locals>.aug_body at 0124f565>, found return value of type WarningException, which is not a Tensor or ExtensionType.
I am not sure how to do this, and I would appreciate any idea or solution.
The parameters your lambda function receives are already tokens from the vectorizer, so they are ints. If you want to reorder the text data, you need to do it before the TextVectorization step.
So you should add the TextVectorization layer to your model; your map function will then see the raw strings, and you can reorder each sentence before the vectorization happens.
Here is an almost-working example; you just need to edit the order_augment_sent function with the logic you need. I don't know what kind of sorting you want to do, so you will probably have to write a custom sort with NumPy inside https://www.tensorflow.org/api_docs/python/tf/py_function
import tensorflow as tf
import numpy as np

train_data = ["I am not driving home", "Please pick me up", "The car already park",
              "No need for ticket", "I will buy a car", "I will buy truck"]
train_label = [0, 1, 1, 0, 1, 1]

text_dataset = tf.data.Dataset.from_tensor_slices(train_data)

max_features = 5000  # Maximum vocab size.
max_len = 4          # Sequence length to pad the outputs to.

# Create the layer.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len)

# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
vectorize_layer.adapt(train_data)

# Create the model that uses the vectorize text layer
model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices.
model.add(vectorize_layer)

def apply_order_augment_sent(s):
    Sentence = s.decode('utf-8')
    words = Sentence.split(" ")
    words.sort()
    newSentence = " ".join(words)
    return newSentence

def order_augment_sent(x: np.ndarray, y: np.ndarray):
    new_x = []
    for i in range(len(x)):
        new_x.append(np.array([apply_order_augment_sent(x[i])]))
    print('new', new_x, y)
    return new_x, y

train_dataset_ds = tf.data.Dataset.from_tensor_slices((train_data, train_label))
train_dataset_ds = train_dataset_ds.shuffle(1000).batch(32)
train_dataset_ds = train_dataset_ds.map(lambda item1, item2: tf.numpy_function(
    order_augment_sent, [item1, item2], [tf.string, tf.int32]))

list(train_dataset_ds.as_numpy_iterator())
model.predict(train_dataset_ds)
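One caveat with the tf.numpy_function map above, worth noting as an aside: it drops static shape information, and Keras sometimes refuses to fit/predict on tensors of unknown shape. If you run into that, a small follow-up map can restore the shapes; the shapes below are an assumption based on the batch-of-single-string layout this example produces.

def set_shapes(x, y):
    # Restore the static shapes lost by tf.numpy_function.
    x.set_shape([None, 1])  # batch of single-string rows, matching the (1,) string input
    y.set_shape([None])     # batch of integer labels
    return x, y

train_dataset_ds = train_dataset_ds.map(set_shapes)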

TensorFlow 1.13.1 tf.data: map multiple images together with a single row

I'm building my tf.data dataset with multiple inputs (images and numerical/categorical data). The problem I am having is that multiple images correspond to the same row in my pd.DataFrame. I am doing regression.
So how (even when shuffling all the inputs) do I ensure that each image gets mapped to the correct row?
Again, say I have 10 rows and 100 images, with 10 images corresponding to a particular row. Now we shuffle the dataset, and we want to make sure that the shuffled images all still correspond to their respective rows.
I am using tf.data.Dataset to do this. I also have a directory structure such that the folder name corresponds to an element in the DataFrame, which is what I was thinking of using if I knew how to do the mapping.
That is, folder1 would appear in the df with columns like dir_name, feature1, feature2, .... Naturally, the dir_names should not be passed as data into the model to fit on.
# images
path_ds = tf.data.Dataset.from_tensor_slices(paths)
image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

# numerical & categorical features; first remove the dirs
x_train_input = X_train[X_train.columns.difference(['dir_name'])]
x_train_input = np.expand_dims(x_train_input, axis=1)
text_ds = tf.data.Dataset.from_tensor_slices(x_train_input)

# labels; y_train's cols are: 'label' and 'dir_name'
label_ds = tf.data.Dataset.from_tensor_slices(
    tf.cast(y_train['label'], tf.float32))

# test creation of dataset without prior shuffling
xtrain_ = tf.data.Dataset.zip((image_ds, text_ds))
model_ds = tf.data.Dataset.zip((xtrain_, label_ds))

# Shuffling
BATCH_SIZE = 64
# Setting a shuffle buffer size as large as the dataset ensures that
# the data is completely shuffled
ds = model_ds.shuffle(buffer_size=len(paths))
ds = ds.repeat()
ds = ds.batch(BATCH_SIZE)
# prefetch lets the dataset fetch batches in the background while the
# model is training
# ds = ds.prefetch(buffer_size=AUTOTUNE)
ds = ds.prefetch(buffer_size=BATCH_SIZE)
My solution would be to use TFRecords for storing the data and preserving its integrity. This also opens the door to other efficiencies.
What the code below does:
1. Create dummy data. Everything needs to be an array with the same datatype used in the _parse_function. You can change that dtype; just make sure you change it for your data too.
2. Create a dictionary that holds the arrays by name.
3. Create a feature_dimensions object that holds the shape of every array.
4. Create the TFRecords file by looping over the data dict. You can create one large file or many small ones; this is a good starting point either way.
5. Declare the functions for generating the dataset. You can add and modify whatever logic you want there. The key, however, is that these functions use the feature_dimensions object to remember how to put the data back together.
6. Create a dataset.
7. Generate a sample. The result is a dictionary with a batch-size worth of data.
You should be able to run this sample code by itself with no issues; then make the changes you need for your own problem.
import tensorflow as tf
import pandas as pd
import numpy as np
from functools import partial

# Create dummy data, TODO replace with your own logic
# 10 images per row in DF
images_per_example = 10
examples = 200

# Save name for TFRecords, you can create multiple and pass a list of the names as well
save_name = "my_tfrecords.tfrecords"

# DF, dataframe with random categorical data
x_data = pd.DataFrame(data=(np.random.normal(size=(examples, 50)) > 0).astype(np.float32))
y_data = np.random.uniform(0, 1, size=(examples,)).reshape(-1, 1).astype(np.float32)

def load_and_preprocess_image(file):
    # For dummy purposes generating instead of loading
    img = np.random.uniform(high=255, low=0, size=(15, 15))
    return (img / 255.).astype(np.float32)

# I would preprocess your images prior to creating the tfrecords file
img_data = np.array([[load_and_preprocess_image("add_logic") for j in range(images_per_example)]
                     for k in range(examples)])

# Prepare for tfrecords
data_dict = dict()
data_dict["images"] = img_data       # Already an array
data_dict["x_data"] = x_data.values  # Ensure it's an array
data_dict["y_data"] = y_data         # Already an array

# Remember the dimensions for later restoration, replacing number of examples with -1
feature_dimensions = {k: v.shape for k, v in data_dict.items()}
feature_dimensions = {k: tuple([-1] + list(v[1:])) for k, v in feature_dimensions.items()}

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

writer = tf.python_io.TFRecordWriter(save_name)

# Create TFRecords file
for i in range(examples):
    example_dict = dict()  # New dictionary for each single example
    for name, data in data_dict.items():
        # if name == "images":
        #     break
        example_dict[name] = data[i]
    # Define the features of your tfrecord
    feature = {k: _bytes_feature(tf.compat.as_bytes(v.tostring())) for k, v in example_dict.items()}
    # Serialize to string and write to file
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()

# Declare functions for creating dataset
def _parse_function(proto, feature_dimensions_: dict):
    # define your tfrecord again. Remember that you saved your image as a string.
    keys_to_features = {k: tf.FixedLenFeature([], tf.string) for k in feature_dimensions_.keys()}
    # Load one example
    parsed_features = tf.parse_single_example(proto, keys_to_features)
    # Split data
    for k, v in parsed_features.items():
        parsed_features[k] = tf.decode_raw(v, tf.float32)
    return parsed_features

def create_tf_dataset(file_paths: str, feature_dimensions_: dict, batch_size=64):
    # This works with arrays as well
    dataset = tf.data.TFRecordDataset(file_paths)
    # Maps the parser on every filepath in the array. You can set the number of parallel loaders here
    parse_function = partial(_parse_function, feature_dimensions_=feature_dimensions_)
    dataset = dataset.map(parse_function, num_parallel_calls=1)
    # This dataset will go on forever
    dataset = dataset.repeat()
    # Set the number of datapoints you want to load and shuffle
    dataset = dataset.shuffle(batch_size)  # Put whatever you want here
    # Set the batchsize
    dataset = dataset.batch(batch_size)
    # Set up a pipeline
    dataset = dataset.prefetch(batch_size)  # Put whatever you want here
    # Create an iterator
    iterator = dataset.make_one_shot_iterator()
    # Create your tf representation of the iterator
    parsed_features = iterator.get_next()
    # Reshape arrays and cast to float
    for k, v in parsed_features.items():
        parsed_features[k] = tf.reshape(v, feature_dimensions_[k])
    for k, v in parsed_features.items():
        parsed_features[k] = tf.cast(v, tf.float32)
    return parsed_features

# Create dataset
ds = create_tf_dataset(save_name, feature_dimensions, batch_size=64)

# The final result is a dictionary with the names used above
sample = tf.Session().run(ds)
print("Sample Length:", len(sample))
print("Sample Keys:", sample.keys())
print("images shape:", sample["images"].shape)
print("x_data shape:", sample["x_data"].shape)
print("y_data shape:", sample["y_data"].shape)
Printed Results
Sample Length: 3
Sample Keys: dict_keys(['images', 'x_data', 'y_data'])
images shape: (64, 10, 15, 15)
x_data shape: (64, 50)
y_data shape: (64, 1)

Memory utilization much higher than it should be

I'm using a simple method to extract descriptors from images and save them to disk into a .csv file. I have around 1M images and my network returns 512 features per image (float32).
Therefore, I estimate that by the end of the loop I would have 1e6 * 512 * 32/4 / 1e9 = 4.1 GB. However, I observed that it uses more than twice that much memory.
index is a string and class_id is an int64, so I don't think they are the culprits here.
I have already tried using gc.collect() without any success. Do you think my code is leaving references behind?
Here is the method:
def prepare_gallery(self, data_loader, TTA, pbar=False, dump_path=None):
    '''Compute embeddings for a data_loader and store it in model.
    This is required before predicting to a test set.
    New entries should be removed from data before calling this function
    to avoid inferring on useless images.
    data_loader: A linear loader containing the database that test is
    compared against.'''
    self.set_mode('valid')
    self.net.cuda()
    n_iter = len(data_loader.dataset) / data_loader.batch_size
    if pbar:
        loader = tqdm(enumerate(data_loader), total=n_iter)
    else:
        loader = enumerate(data_loader)

    # Run inference and get embeddings
    feat_list = []
    index_list = []
    class_list = []
    for i, (index, im, class_id) in loader:
        with torch.no_grad():
            feat = tta(self.net, im)
            # Returns something like np.random.random((32, 512))
        feat_list.extend(feat)
        index_list.extend(index)
        class_list.extend(class_id.item())

    if dump_path is not None:
        np.save(dump_path + '_ids', index_list)
        np.save(dump_path + '_cls', class_list)
        np.save(dump_path + '_feat', feat_list)
    return np.asarray(index_list), np.asarray(feat_list), np.asarray(class_list)
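No answer is included here, but as a sanity check on the arithmetic: 1M x 512 float32 values occupy about 1e6 * 512 * 4 / 1e9 ≈ 2 GB as one contiguous array (float64 would be ≈ 4 GB). Accumulating roughly a million separate objects in Python lists adds per-object overhead on top of the raw buffers, and np.asarray(feat_list) at the end temporarily keeps both the list and the new array alive. Below is a hedged sketch of a lower-overhead variant of the loop, assuming feat arrives as a (batch_size, 512) NumPy array, as the comment in the question suggests.

# Sketch: inside prepare_gallery, replacing the list accumulation (names reused from above).
n_images = len(data_loader.dataset)
feat_dim = 512                                              # assumption: feature size
feats = np.empty((n_images, feat_dim), dtype=np.float32)    # ~2 GB for 1e6 rows

row = 0
for i, (index, im, class_id) in loader:
    with torch.no_grad():
        feat = tta(self.net, im)                            # assumed shape: (batch_size, feat_dim)
    batch = np.asarray(feat, dtype=np.float32)              # force float32 (avoids silent float64)
    feats[row:row + len(batch)] = batch
    row += len(batch)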

How can I get the indices of the data used in every batch?

I need to save the indices of the data that are used in every mini-batch.
For example if my data is:
x = np.array([[1.1], [2.2], [3.3], [4.4]])
and the first mini-batch is [1.1] and [3.3], then I want to store 0 and 2 (since [1.1] is the 0th observations and [3.3] is the 2nd observation).
I am using tensorflow in eager execution with the keras.sequential APIs.
As far as I can tell from reading the source code, this information is not stored anywhere so I was unable to do this with a callback.
I am currently solving my problem by creating an object that stores the indices.
class IndexIterator(object):
    def __init__(self, n, n_epochs, batch_size, shuffle=True):
        data_ix = np.arange(n)
        if shuffle:
            np.random.shuffle(data_ix)
        self.ix_batches = np.array_split(data_ix, np.ceil(n / batch_size))
        self.batch_indices = []

    def generate_arrays(self, x, y):
        batch_ixs = np.arange(len(self.ix_batches))
        while 1:
            np.random.shuffle(batch_ixs)
            for batch in batch_ixs:
                self.batch_indices.append(self.ix_batches[batch])
                yield (x[self.ix_batches[batch], :], y[self.ix_batches[batch], :])

data_gen = IndexIterator(n=32, n_epochs=100, batch_size=16)
dnn.fit_generator(data_gen.generate_arrays(x, y),
                  steps_per_epoch=2,
                  epochs=100)

# This is what I am looking for
print(data_gen.batch_indices)
Is there no way to do this using a tensorflow callback?
Not sure if this will be more efficient than your solution, but it is certainly more general.
If you have training data with n indices, you can create a secondary Dataset that contains only these indices and zip it with the "real" dataset.
For example:
real_data = tf.data.Dataset ...  # your existing dataset
indices = tf.data.Dataset.from_tensor_slices(tf.range(data_set_length))
total_dataset = tf.data.Dataset.zip((real_data, indices))

# Perform optional pre-processing ops.
iterator = total_dataset.make_one_shot_iterator()

# Next line yields `(original_data_element, index)`
item_and_index_tuple = iterator.get_next()
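Since the question mentions eager execution with keras.Sequential, here is a minimal self-contained sketch of the same zip idea consumed eagerly (TF 2.x, or TF 1.x with eager execution enabled), using the toy x from the question plus hypothetical labels; tf.data.Dataset.enumerate() is another built-in way to attach indices.

import numpy as np
import tensorflow as tf

x = np.array([[1.1], [2.2], [3.3], [4.4]], dtype=np.float32)
y = np.array([[0.0], [1.0], [0.0], [1.0]], dtype=np.float32)  # hypothetical labels

real_data = tf.data.Dataset.from_tensor_slices((x, y))
indices = tf.data.Dataset.from_tensor_slices(tf.range(len(x), dtype=tf.int64))
total_dataset = tf.data.Dataset.zip((real_data, indices)).shuffle(len(x)).batch(2)

batch_indices = []
for (batch_x, batch_y), batch_ix in total_dataset:  # eager iteration
    batch_indices.append(batch_ix.numpy())          # indices used in this batch
    # ... run your training step on (batch_x, batch_y) here ...

print(batch_indices)  # e.g. [array([0, 2]), array([3, 1])]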

Matrix factorization-based recommendation using TensorFlow

I am new to TensorFlow and am exploring recommendation systems with it. I have looked at a few sample codes on GitHub, and most of them are essentially the same as the following:
https://github.com/songgc/TF-recomm/blob/master/svd_train_val.py
But the question is: how do I pick the top recommendations for user U1 in the above code?
If there is any sample code or approach, please share. Thanks.
It is a little difficult! Basically, when svd returns, it closes the session, and the tensors lose their values (you still keep the graph). There are a few options:
Save the model to a file and restore it later;
Don't put the session in a with tf.Session() as sess: .... block, and instead return the session;
Do the user processing inside the with ... block
The worst option is option 3: you should train your model separately from using it. The best approach is to save your model and weights somewhere, then restore the session. However, you are still left with the question of how you use this session object once you have recovered it. To demonstrate just that part, I am going to solve this problem using option 3, assuming that you know how to restore a session.
def svd(train, test):
    samples_per_batch = len(train) // BATCH_SIZE
    iter_train = dataio.ShuffleIterator([train["user"],
                                         train["item"],
                                         train["rate"]],
                                        batch_size=BATCH_SIZE)
    iter_test = dataio.OneEpochIterator([test["user"],
                                         test["item"],
                                         test["rate"]],
                                        batch_size=-1)
    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
    item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
    rate_batch = tf.placeholder(tf.float32, shape=[None])
    infer, regularizer = ops.inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,
                                           device=DEVICE)
    global_step = tf.contrib.framework.get_or_create_global_step()
    _, train_op = ops.optimization(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(logdir="/tmp/svd/log", graph=sess.graph)
        print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
        errors = deque(maxlen=samples_per_batch)
        start = time.time()
        for i in range(EPOCH_MAX * samples_per_batch):
            users, items, rates = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users, item_batch: items, rate_batch: rates})
            pred_batch = clip(pred_batch)
            errors.append(np.power(pred_batch - rates, 2))
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, items, rates in iter_test:
                    pred_batch = sess.run(infer, feed_dict={user_batch: users, item_batch: items})
                    pred_batch = clip(pred_batch)
                    test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                end = time.time()
                test_err = np.sqrt(np.mean(test_err2))
                print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, test_err, end - start))
                train_err_summary = make_scalar_summary("training_error", train_err)
                test_err_summary = make_scalar_summary("test_error", test_err)
                summary_writer.add_summary(train_err_summary, i)
                summary_writer.add_summary(test_err_summary, i)
                start = end

        # Get the top rated movie for user #1 for every item in the set
        userNumber = 1
        user_prediction = sess.run(infer, feed_dict={user_batch: np.array([userNumber]), item_batch: np.array(range(ITEM_NUM))})

        # The index number is the same as the item number. Orders from lowest
        # (least recommended) to highest (most recommended).
        index_rating_order = np.argsort(user_prediction)
        print("Top ten recommended items for user {} are".format(userNumber))
        print(index_rating_order[-10:][::-1])  # at the end, reverse the list

        # If you want to include the score:
        items_to_choose = index_rating_order[-10:][::-1]
        for item, score in zip(items_to_choose, user_prediction[items_to_choose]):
            print("{}: {}".format(item, score))
The only changes I made begin at the first commented line. To emphasize again, best practice would be to train in this function, but to actually make your predictions separately.
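For completeness, since option 1 (save, then restore) is recommended above but not shown, here is a minimal TF 1.x sketch of that part. It assumes the graph (user_batch, item_batch, infer, ...) has been rebuilt exactly as in svd() before restoring, and the checkpoint path is only an example.

import numpy as np
import tensorflow as tf

saver = tf.train.Saver()  # create after the model variables are defined

# At the end of training, inside the `with tf.Session() as sess:` block:
#     saver.save(sess, "/tmp/svd/model.ckpt")

# Later, after rebuilding the same graph:
with tf.Session() as sess:
    saver.restore(sess, "/tmp/svd/model.ckpt")
    userNumber = 1
    user_prediction = sess.run(
        infer,
        feed_dict={user_batch: np.array([userNumber]),
                   item_batch: np.arange(ITEM_NUM)})
    top_ten = np.argsort(user_prediction)[-10:][::-1]
    print("Top ten recommended items for user {}: {}".format(userNumber, top_ten))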
