How to use custom function with tensorflow dataset API? - python

I am new to TensorFlow's tf.data.Dataset and I am trying to use it on my data that I loaded with pandas dataframe as follows:
Load the input data (df_input):
id messages Label
0 11 I am not driving home 0
1 11 Please pick me up 1
2 103 The car already park 1
3 103 No need for ticket 0
4 104 I will buy a car 1
5 104 I will buy truck 1
And I do preprocess and apply text Vectorization as follows:
# Build the vectorizer and learn its vocabulary from the raw messages.
# Column names follow the table shown above: "messages" and "Label".
text_vectorizer = layers.TextVectorization(max_tokens=20, output_mode="int", output_sequence_length=6)
text_vectorizer.adapt(df_input["messages"].values.tolist())


def encode(texts):
    """Vectorize `texts` with `text_vectorizer` and return a NumPy array."""
    encoded_texts = text_vectorizer(texts)
    return encoded_texts.numpy()


train_data = encode(df_input["messages"].values)  # This is the training data
train_label = tf.keras.utils.to_categorical(df_input["Label"].values, 2)  # These are the labels
Then I am using the preprocess data in the training model by using the TensorFlow tf.data.Dataset function as follows:
# Training pipeline: one element per (features, label) pair, shuffled through
# a 1000-element buffer and grouped into batches of 2.
train_dataset_df = (
tf.data.Dataset.from_tensor_slices((train_data, train_label))
.shuffle(1000)
.batch(2)
)
My question is how I can transform the data in every training epoch by applying my custom function to the training data. I saw a usage example of performing the transformation via .map function from here to this post:
# Batch in pairs, then run text_vectorizer on the features of each batch inside map().
train_dataset = train_dataset.batch(2).map(lambda x, y: (text_vectorizer(x), y))
My goal is to apply my custom function as follows (which reorders the words in text data):
def order_augment_sent(Sentence):
    """Return `Sentence` with its space-separated words in sorted order.

    Expects a plain Python ``str``. Words are split on single spaces, sorted
    with the default string ordering (so uppercase sorts before lowercase),
    and joined back with single spaces.
    """
    return " ".join(sorted(Sentence.split(" ")))
# NOTE: train_data is the output of encode(), so map() hands
# order_augment_sent a tensor rather than a Python str.
train_dataset_ds = (
tf.data.Dataset.from_tensor_slices((train_data, train_label))
.shuffle(1000)
.batch(2)
.map(lambda x, y: (order_augment_sent(x), y))
)
But I am getting error as:
AttributeError: 'Tensor' object has no attribute 'split'
Or, if I apply my other custom function, I get:
TypeError: To be compatible with tf.function, Python functions must return zero or more Tensors or ExtensionTypes or None values; in compilation of <function _tf_if_stmt.<locals>.aug_body at 0124f565>, found return value of type WarningException, which is not a Tensor or ExtensionType.
I am not sure how I can do this and I will appreciate it if you have any idea or solution to help me.

The arguments your lambda function receives are the already-vectorized tokens, so they are integers, not strings. If you want to reorder the text data, you need to do it before applying the text_vectorizer.
So you should add the TextVectorization layer to your model; that way your map function receives the raw strings and you can reorder the sentence before TextVectorization is applied.
Here is an almost-working example. You just need to edit the order_augment_sent function with the code you need; I didn't know what kind of sorting you want to do, so you will probably have to write a custom sort with numpy. See https://www.tensorflow.org/api_docs/python/tf/py_function
import tensorflow as tf
import numpy as np
# Raw sentences and their integer labels (kept as plain Python lists).
train_data = ["I am not driving home", "Please pick me up", "The car already park", " No need for ticket", "I will buy a car", "I will buy truck"]
train_label = [0,1,1,0,1,1]
# Text-only dataset; not referenced again in the code shown here.
text_dataset = tf.data.Dataset.from_tensor_slices(train_data)
max_features = 5000 # Maximum vocab size.
max_len = 4 # Sequence length to pad the outputs to.
# Create the layer.
vectorize_layer = tf.keras.layers.TextVectorization(
max_tokens=max_features,
output_mode='int',
output_sequence_length=max_len)
# Now that the vocab layer has been created, call `adapt` to create the
# vocabulary. Here the raw list of sentences is passed directly rather than
# the text-only dataset built above.
vectorize_layer.adapt(train_data)
# Create the model that uses the vectorize text layer
model = tf.keras.models.Sequential()
# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices.
model.add(vectorize_layer)
def apply_order_augment_sent(s):
    """Decode a UTF-8 byte string and return its words in sorted order.

    Args:
        s: A ``bytes`` object holding one UTF-8 encoded sentence.

    Returns:
        A ``str`` whose single-space-separated words are sorted with the
        default string ordering and re-joined with single spaces.
    """
    sentence = s.decode('utf-8')
    return " ".join(sorted(sentence.split(" ")))
def order_augment_sent(x: np.ndarray, y: np.ndarray):
    """Reorder the words of every sentence in a batch.

    Args:
        x: Batch of byte-string sentences (one entry per example).
        y: Labels for the batch; passed through untouched.

    Returns:
        A tuple ``(new_x, y)`` where ``new_x`` is a list holding one
        single-element array per input sentence.
    """
    # Each sentence is wrapped in its own one-element array.
    new_x = [np.array([apply_order_augment_sent(item)]) for item in x]
    print('new', new_x, y)
    return (new_x, y)
# Build the pipeline from the raw strings so map() sees text, not token ids.
train_dataset_ds = tf.data.Dataset.from_tensor_slices((train_data, train_label))
train_dataset_ds = train_dataset_ds.shuffle(1000).batch(32)
# tf.numpy_function runs the Python function on each batch; the last argument
# declares the output dtypes (strings for the sentences, int32 for the labels).
train_dataset_ds = train_dataset_ds.map(lambda item1, item2: tf.numpy_function(
order_augment_sent, [item1, item2], [tf.string, tf.int32]))
# Iterates the dataset once; the resulting list is discarded.
list(train_dataset_ds.as_numpy_iterator())
model.predict(train_dataset_ds)

Related

Weighted average of embedding layer using Universal sentence encoder

In my dataframe, I have two columns viz text and score.Text is list of strings eg. [table,chair] and similarly score is list of numbers eg. [0.4,0.2].I am trying to use universal sentence encoder to take weighted average inside the keras model.No of text in the list might be different for different rows of the dataframe.
(0.4* UniversalEncoder('table') + 0.2*UniversalEncoder('chair'))/(0.4+0.2) -like this.
# Two inputs per sample: the text and its weight.
text_input = layers.Input(shape=(1,), name='text')
weight = layers.Input(shape=(1,), name='w')
# my_lambda_func returns a list; its first entry is the embedding tensor.
embedding = layers.Lambda(my_lambda_func)([text_input, weight])
embedding[0].set_shape((None, 512))
mul = 0.23
num_neuron = int(512 * mul)
text_output = layers.Dense(num_neuron, input_shape=(512,))(embedding[0])
# NOTE: all_inputs must already exist as a list before this point.
all_inputs.append(text_input)
all_inputs.append(weight)
preds = layers.Dense(1, activation="sigmoid")(text_output)
model = Model(inputs=all_inputs, outputs=preds)
lr = 0.0001
model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=["AUC"],
)
# Build the training arrays before fitting (they were previously defined
# only after the fit call that uses them).
text_train = np.array(list(df_train.text))
score_train = np.array(list(df_train.score))
# Keras' fit() takes `epochs`, not `epoch`.
model.fit([text_train, score_train], df_train["y"], epochs=60)
def UniversalEncoder(x, weight):
    """Return the unit-normalized weighted average of the embeddings of `x`.

    Args:
        x: Texts to embed; passed straight to ``embed``.
        weight: Weights for the average along axis 0 of the embeddings.

    Returns:
        A flattened NumPy array: the weighted average of ``embed(x)`` over
        axis 0, divided by its Euclidean norm.
    """
    emb_vec = embed(x)
    vec = np.average(emb_vec, axis=0, weights=weight).flatten()
    vec = vec / np.linalg.norm(vec)
    return np.array(vec)
def my_lambda_func(x):
    """Call UniversalEncoder through tf.py_function.

    Args:
        x: Sequence whose first two entries are the text and weight tensors.

    Returns:
        The result of ``tf.py_function`` declared with a single float32 output.
    """
    return tf.py_function(UniversalEncoder, [x[0], x[1]], [tf.float32])
Embed() is universal sentence encoder which gives us 512-dimensional vector for given string.
I am getting "ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray)" during fitting the model.Please help me with this issue.Thanks in advance.

Word-embedding does not provide expected relations between words

I am trying to train a word embedding to a list of repeated sentences where only the subject changes. I expected that the generated vectors corresponding the subjects provide a strong correlation after training as it is expected from a word embedding. However, the angle between the vectors of subjects is not always larger than the angle between subjects and a random word.
Man is going to write a very long novel that no one can read.
Woman is going to write a very long novel that no one can read.
Boy is going to write a very long novel that no one can read.
The code is based on pytorch tutorial:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
class EmbedTrainer(nn.Module):
    """N-gram language model: embed the context tokens, predict the next token.

    Args:
        d_vocab: Vocabulary size (rows of the embedding, size of the output).
        d_embed: Dimension of each token embedding.
        d_context: Number of context tokens fed to ``forward``.
    """

    def __init__(self, d_vocab, d_embed, d_context):
        super().__init__()
        self.embed = nn.Embedding(d_vocab, d_embed)
        self.fc_1 = nn.Linear(d_embed * d_context, 128)
        self.fc_2 = nn.Linear(128, d_vocab)

    def forward(self, x):
        """Return log-probabilities of shape (1, d_vocab) for one context."""
        x = self.embed(x).view((1, -1))  # flatten after embedding
        x = self.fc_2(F.relu(self.fc_1(x)))
        return F.log_softmax(x, dim=1)
# Corpus: the same sentence three times, differing only in the subject.
text = " ".join(["{} is going to write a very long novel that no one can read.".format(x) for x in ["Man", "Woman", "Boy"]])
text_split = text.split()
# (two context words, next word) training pairs.
trigrams = [([text_split[i], text_split[i+1]], text_split[i+2]) for i in range(len(text_split)-2)]
dic = list(set(text.split()))
tok_to_ids = {w: i for i, w in enumerate(dic)}
tokens_text = text.split(" ")
d_vocab, d_embed, d_context = len(dic), 10, 2

# --- Train ---
loss_func = nn.NLLLoss()
model = EmbedTrainer(d_vocab, d_embed, d_context)
print(model)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
losses = []
epochs = 10
for epoch in range(epochs):
    total_loss = 0
    for input, target in trigrams:
        tok_ids = torch.tensor([tok_to_ids[tok] for tok in input], dtype=torch.long)
        target_id = torch.tensor([tok_to_ids[target]], dtype=torch.long)
        model.zero_grad()
        log_prob = model(tok_ids)
        #if total_loss == 0: print("train ", log_prob, target_id)
        loss = loss_func(log_prob, target_id)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Report and record the summed loss once per epoch.
    print(total_loss)
    losses.append(total_loss)

# Collect the learned embedding rows for the words to compare.
embed_map = {}
for word in ["Man", "Woman", "Boy", "novel"]:
    embed_map[word] = model.embed.weight[tok_to_ids[word]]
    print(word, embed_map[word])
def angle(a, b):
    """Return the cosine of the angle between two 1-D tensors.

    Despite the name, the value returned is the cosine similarity
    ``a.b / (|a| |b|)``, not the angle itself.

    Args:
        a: Tensor; detached and converted to NumPy before use.
        b: Tensor; detached and converted to NumPy before use.
    """
    from numpy.linalg import norm

    a, b = a.detach().numpy(), b.detach().numpy()
    return np.dot(a, b) / norm(a) / norm(b)
# angle() returns dot(a, b) / (|a| |b|), so values nearer 1 mean more similar vectors.
print("man.woman", angle(embed_map["Man"], embed_map["Woman"]))
print("man.novel", angle(embed_map["Man"], embed_map["novel"]))
I expected that the generated vectors corresponding the subjects provide a strong correlation after training as it is expected from a word embedding
I don't really think you'll achieve that kind of result with only 3 sentences and like 40 iterations in 10 epochs (plus most of the data in your 40 iterations is repeated).
Maybe try downloading a couple of the free datasets out there, or try your own data with a proven model such as a gensim model.
I'll give you the code for training a gensim model, so you can test your dataset on another model and see if the problem comes from your data or from your model.
I've tested similar gensim models on datasets with millions of sentences and it worked like a charm, for smaller datasets you might want to change the parameters.
from gensim.models import Word2Vec
from multiprocessing import cpu_count
# Training corpus: a text file with one sentence per line.
corpus_path = 'eachLineASentence.txt'
vecSize = 300
winSize = 5
# Leave one CPU core free for the rest of the system.
numWorkers = cpu_count()-1
epochs = 20
minCount = 5
skipGram = False
modelName = f'mymodel.model'
# NOTE(review): the `size` and `iter` keywords match the gensim 3.x API; newer
# gensim releases appear to name them `vector_size` and `epochs` - confirm
# against the installed version.
model = Word2Vec(corpus_file=corpus_path,
size=vecSize,
window=winSize,
min_count=minCount,
workers=numWorkers,
iter=epochs,
sg=skipGram)
model.save(modelName)
P.S. I don't think it's a good idea to use the built-in name input as a variable name in your code, since it shadows the built-in function.
It's most probably the training size. Training a 128d embedding is definitely overkill. Rule of thumb from the Google Developers blog:
Why is the embedding vector size 3 in our example? Well, the following "formula" provides a general rule of thumb about the number of embedding dimensions:
embedding_dimensions = number_of_categories**0.25
That is, the embedding vector dimension should be the 4th root of the number of categories. Since our vocabulary size in this example is 81, the recommended number of dimensions is 3:
3 = 81**0.25

Using custom generator with zip to train a model in python

I wanted to use a custom generator function to train the model. I am returning 3 different outputs using yield and storing them in separate variables using the zip function. I don't know if it is possible to use the yielded values directly without storing them in separate variables, since 2 variables are used as input in model.fit() and the remaining 1 is the output. Whenever I call the generator function to store the yielded values, it causes a memory overload and restarts the kernel.
I have attached my kaggle link as well as the generator code below.
Kaggle notebook link
def generator(photo, caption):
    """Yield one training sample per (caption prefix, next word) pair.

    Args:
        photo: Mapping from image key to that image's features.
        caption: Mapping from image key to a list of encoded captions.

    Yields:
        ``([photo[k], in_seq], out_seq)`` where ``in_seq`` is the caption
        prefix padded/truncated to ``MAX_LEN`` and ``out_seq`` is the next
        word one-hot encoded over ``VOCAB_SIZE`` classes.
    """
    for k, vv in caption.items():  # k: image key (name), vv: caption list for that image
        for v in vv:  # each encoded caption of this image
            for i in range(1, len(v)):  # every position after the first word
                in_seq = [v[:i]]  # encoded words before the current word
                out_seq = v[i]  # the current word
                # Pad (or truncate) the prefix at the end, up to MAX_LEN entries.
                in_seq = pad_sequences(in_seq, maxlen=MAX_LEN, padding='post', truncating='post')[0]
                # One-hot encode the target word.
                out_seq = to_categorical([out_seq], num_classes=VOCAB_SIZE)[0]
                yield [photo[k], in_seq], out_seq  # one sample for training the model
#Whenever I run this line the kernel dies because of less memory
# NOTE(review): zip(*generator(...)) consumes the whole generator, so every
# sample is held in memory at once. The generator yields 2-tuples
# ([photo, in_seq], out_seq), so unpacking into three names would not match.
X, y_in, y_out = zip(*generator(images_features, captions_dict)) # THIS IS THE ONE CAUSING ISSUES
model.fit([X, y_in], y_out, batch_size=50, epochs=50) #training the model

TensorFlow model's input

I am new to TensorFlow. My task is to predict some values (in this case, speed). If I use one value for the model input (l0), then everything is fine; I can train it and make predictions:
# Load the ARFF file; `meta` describes the attributes listed below.
dataset, meta = arff.loadarff('data.arff')
# meta: 'XYZ'
# TIMESTAMP_ms's type is numeric
# SPEED_KMH's type is numeric
# POWER_W's type is numeric
# CURRENT_A's type is numeric
# VOLTAGE_V's type is numeric
# TORQUE_Nm's type is numeric
# CADENCE_RPM's type is numeric
# Pull the columns of interest out as float arrays.
speed = np.array(dataset[:]['SPEED_KMH'], dtype=float)
cadence = np.array(dataset[:]['CADENCE_RPM'], dtype=float)
power = np.array(dataset[:]['POWER_W'], dtype=float)
torque = np.array(dataset[:]['TORQUE_Nm'], dtype=float)
# Create model
l0 = tf.keras.layers.Dense(units=4, input_shape=[1]) #with one input all ok. BUT HOW TO USE n-Input?
l1 = tf.keras.layers.Dense(units=4)
l2 = tf.keras.layers.Dense(units=1)
model = tf.keras.Sequential([l0, l1, l2])
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.01))
# Single feature (cadence) as input, speed as the regression target.
model.fit(cadence, speed, epochs=500, verbose=True)
...
model.predict([<some_val>])
BUT, when I tried to add several values to the input layer to increase the accuracy of the model, I have a problem:
...
# Build one [cadence, power, torque] row per sample.
# NOTE(review): train_data is a plain Python list of lists when passed to fit().
train_data = []
for i in range(len(dataset)):
    train_data.append([cadence[i], power[i], torque[i]])
...
l0 = tf.keras.layers.Dense(units=4, input_shape=[3])
...
model.fit(train_data, speed, epochs=1, verbose=True)
ValueError: Failed to find data adapter that can handle input: ( containing values of types {'(
Please, help me transfer multiple values to the input layer l0 of the model?
One way of using multiple inputs for a model is to use Tensorflow's functional API. It allows you to set multiple inputs which you can concatenate together later on in your model.
# One scalar input per feature.
input1 = tf.keras.layers.Input(shape=(1, ))
input2 = tf.keras.layers.Input(shape=(1,))
input3 = tf.keras.layers.Input(shape=(1,))
# Join the three inputs along axis 1 before the dense stack.
mergeLayer = tf.keras.layers.Concatenate(axis=1)([input1, input2, input3])
dense1 = tf.keras.layers.Dense(4)(mergeLayer)
dense2 = tf.keras.layers.Dense(4)(dense1)
output = tf.keras.layers.Dense(1)(dense2)
# The model takes the three inputs as a list and produces a single output.
model = tf.keras.models.Model([input1, input2, input3], output)
Now you can try merging your data together into one list and calling the fit() method on the new model.
For some more information on the functional API, you can go to the docs.
The Keras Functional API

How can I get the indices of the data used in every batch?

I need to save the indices of the data that are used in every mini-batch.
For example if my data is:
x = np.array([[1.1], [2.2], [3.3], [4.4]])
and the first mini-batch is [1.1] and [3.3], then I want to store 0 and 2 (since [1.1] is the 0th observations and [3.3] is the 2nd observation).
I am using tensorflow in eager execution with the keras.sequential APIs.
As far as I can tell from reading the source code, this information is not stored anywhere so I was unable to do this with a callback.
I am currently solving my problem by creating an object that stores the indices.
class IndexIterator(object):
    """Generate mini-batches while recording the row indices each batch used.

    Args:
        n: Number of observations.
        n_epochs: Accepted for interface compatibility; not used.
        batch_size: Target number of observations per batch.
        shuffle: If true, shuffle the indices once before splitting them.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, n, n_epochs, batch_size, shuffle=True):
        data_ix = np.arange(n)
        if shuffle:
            np.random.shuffle(data_ix)
        # ceil(n / batch_size) near-equal groups of indices, fixed for the
        # lifetime of this object.
        n_batches = int(np.ceil(n / batch_size))
        self.ix_batches = np.array_split(data_ix, n_batches)
        # Index arrays of every batch handed out so far, in order.
        self.batch_indices = []

    def generate_arrays(self, x, y):
        """Yield ``(x_batch, y_batch)`` forever, logging each batch's indices.

        The order of the batches is reshuffled at the start of every pass;
        the membership of each batch does not change.
        """
        batch_ixs = np.arange(len(self.ix_batches))
        while True:
            np.random.shuffle(batch_ixs)
            for batch in batch_ixs:
                rows = self.ix_batches[batch]
                self.batch_indices.append(rows)
                yield (x[rows, :], y[rows, :])
# 32 observations in batches of 16 gives two batches, matching steps_per_epoch=2.
data_gen = IndexIterator(n=32, n_epochs=100, batch_size=16)
dnn.fit_generator(data_gen.generate_arrays(x, y),
steps_per_epoch=2,
epochs=100)
# This is what I am looking for
print(data_gen.batch_indices)
Is there no way to do this using a tensorflow callback?
Not sure if this will be more efficient than your solution, but is certainly more general.
If you have training data with n indices you can create a secondary Dataset that contains only these indices and zip it with the "real" dataset.
I.E.
real_data = ...  # placeholder: build your tf.data.Dataset here
# One index per element of real_data (data_set_length is its length).
indices = tf.data.Dataset.from_tensor_slices(tf.range(data_set_length))
# Pair every element with its index so the index travels through the pipeline.
total_dataset = tf.data.Dataset.zip((real_data, indices))
# Perform optional pre-processing ops.
iterator = total_dataset.make_one_shot_iterator()
# Next line yields `(original_data_element, index)`
item_and_index_tuple = iterator.get_next()

Categories

Resources