In a model with an embedding layer and SimpleRNN layer, I would like to compute the partial derivative dh_t/dh_0 for each step t.
Below is the structure of my model, including imports and data preprocessing.
Toxic comment train data available: https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification/data?select=jigsaw-toxic-comment-train.csv
GloVe 6B 100d embeddings available: https://nlp.stanford.edu/projects/glove/
### 1. Imports
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
### 2. Text data tokenisation and GloVe-100d embeddings:
def data_pp():
    """Load the toxic-comment CSV, split it, and tokenise/pad the comments.

    Returns:
        Tuple ``(xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok)``:
        padded train sequences, train labels, padded test sequences, test
        labels, vocabulary size plus one, the padding length, and the fitted
        tokenizer.
    """
    train = pd.read_csv('/Users/Toxic comment data/jigsaw-toxic-comment-train.csv')
    # Only the binary 'toxic' label is used; drop the finer-grained labels.
    train.drop(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
               axis=1, inplace=True)
    # Work on the first 12000 rows only.
    train = train.iloc[:12000, :]
    xtr, xte, ytr, yte = train_test_split(
        train['comment_text'].values,
        train['toxic'].values,
        stratify=train['toxic'].values,
        random_state=42, test_size=0.2, shuffle=True)
    # Tokenise data (the tokenizer is fitted on train and test texts together).
    tok = text.Tokenizer(num_words=None)
    tok.fit_on_texts(list(xtr) + list(xte))
    input_dim = len(tok.word_index) + 1
    # Padding length: the largest whitespace-separated word count of any comment.
    input_length = train['comment_text'].apply(lambda x: len(str(x).split())).max()
    xtr_seq = tok.texts_to_sequences(xtr)
    xte_seq = tok.texts_to_sequences(xte)
    xtr_pad = sequence.pad_sequences(xtr_seq, maxlen=input_length)
    xte_pad = sequence.pad_sequences(xte_seq, maxlen=input_length)
    print('Shape of tokenised training input:', xtr_pad.shape)
    return xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok
xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok= data_pp()
# Word embeddings
def embed_mat(input_dim, output_dim, tok, glove_path='/Users/GloVe/glove.6B.100d.txt'):
    """Build the embedding weight matrix from a GloVe text file.

    Args:
        input_dim: Number of rows of the matrix; every index in
            ``tok.word_index`` must be smaller than this.
        output_dim: Embedding width; must match the vectors stored in the
            file (100 for GloVe 100d).
        tok: Fitted tokenizer; only its ``word_index`` mapping is read.
        glove_path: Path of the GloVe file, one ``word v1 v2 ...`` entry per
            line. Defaults to the original hard-coded location.

    Returns:
        Array of shape ``(input_dim, output_dim)``. Rows of words that are
        missing from the GloVe file stay all-zero.
    """
    embedding_dict = {}
    # `with` closes the file even if parsing fails; the encoding is explicit
    # so the result does not depend on the platform default.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embedding_dict[values[0]] = asarray(values[1:], dtype='float32')
    Emat = zeros((input_dim, output_dim))
    for word, i in tok.word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            Emat[i] = embedding_vector
    print('Embedding weight matrix has shape:', Emat.shape)
    return Emat
output_dim = 100  # width of the GloVe 100d vectors
# The tokenizer returned by data_pp() is bound to `tok` (not `took`).
Emat = embed_mat(input_dim, output_dim, tok)
### 3. Define model and compute gradients:
# You can let it run for a few steps and stop the process. Then inspect the first step h_t, h_0 and the computed dh_t/dh_0.
# For the case in my comment, you can remove the for-loop over the steps t, comment out ht, and compute tape.gradient(states, h0) instead.
batch_size = 100
inp = Input(batch_shape=(batch_size, input_length), name='input')
# Frozen embedding layer initialised from the GloVe matrix Emat.
emb_out = Embedding(input_dim, output_dim, input_length=input_length,
                    weights=[Emat], trainable=False, name='embedding')(inp)
rnn = SimpleRNN(200, return_sequences=True, return_state=False, stateful=True, name='simpleRNN')
# Random initial hidden state h_0, one row of 200 units per sample in the batch.
h0 = tf.convert_to_tensor(np.random.uniform(size=(batch_size, 200)).astype(np.float32))
rnn_allstates = rnn(emb_out, initial_state=h0)
model_rnn = Model(inputs=inp, outputs=rnn_allstates, name='model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
ds = tf.data.Dataset.from_tensor_slices((xtr_pad[:100], ytr[:100])).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]
grads_allsteps = []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        # One fresh tape per step t; h0 is a plain tensor, so it is watched explicitly.
        with tf.GradientTape() as tape:
            tape.watch(h0)
            et = embedding_layer(x_batch_train)
            states = rnn_layer(et, initial_state=h0)  # (100, 1403, 200)
            ht = states[:, t, :]
        grad_t = tape.gradient(ht, h0)  # (100, 200)
        print('Computed gradient dht/dh0 at step ', t+1, 'in batch', b+1)
        grads_allsteps.append(grad_t)
At each step t, h_t has shape (100,200), h_0 has shape (100,200). However tape.gradient(ht, h0) returns None for every t. Below is the result of the first step:
# Inspect only the first step (t = 0): print h_t, h_0 and the computed gradient.
for t in range(1):
    with tf.GradientTape() as tape:
        tape.watch(h0)
        et = embedding_layer(x_batch_train)
        #tape.watch(et)
        states = rnn_layer(et, initial_state=h0)  # (100, 1403, 200)
        ht = states[:, t, :]
    print(ht)
    print(h0)
    grad_t = tape.gradient(ht, h0)
    tf.print(grad_t)
>>
# h_t:
tf.Tensor(
[[ 0.25634336 0.5259362 0.60045886 ... -0.4978792 0.62755316
0.09803997]
[ 0.58387524 0.26037565 0.5646103 ... 0.31233114 0.4853201
0.10877549]
[ 0.17190906 0.68681747 -0.32054633 ... -0.6139967 0.48944488
0.06301598]
...
[ 0.1985917 -0.11821499 -0.47709295 ... -0.05718012 0.16089934
0.20585683]
[ 0.73872745 0.503326 0.25224414 ... -0.5771631 0.03748894
0.09212588]
[-0.6597108 -0.43926442 -0.23546427 ... 0.26760277 0.28221437
-0.4039318 ]], shape=(100, 200), dtype=float32)
# h_0:
tf.Tensor(
[[0.51580787 0.51664346 0.70773274 ... 0.45973232 0.7760376 0.48297063]
[0.61048764 0.26038417 0.60392565 ... 0.7426153 0.15507504 0.57494944]
[0.11859739 0.33591187 0.68375146 ... 0.59409297 0.5302879 0.28876984]
...
[0.12401487 0.39376178 0.9850304 ... 0.21582918 0.9592233 0.5257605 ]
[0.9401199 0.2157638 0.6445949 ... 0.36316434 0.5799403 0.3749675 ]
[0.37230062 0.18162128 0.0739954 ... 0.21624395 0.66291 0.7807376 ]], shape=(100, 200), dtype=float32)
# dh_t/dh_0:
None
There seems to be some difficulty for GradientTape to watch this h_0 and perform the gradient computation. I have successfully used GradientTape to watch the inputs e_t to the RNN layer and computed the gradients dh_t/de_t, but this does not really provide much information about the quality of model fitting.
How can I use it to watch the fixed-time quantity h_0, and thus compute the gradient dh_t/dh_0? Thanks in advance for any help.
Reproducible test case:
### 1. Imports
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
### 2. Simulated data and gradient computation:
batch_size = 100; input_length = 5
# Simulated integer token ids in [0, 500) and random targets.
xtr_pad = tf.random.uniform((batch_size, input_length), maxval=500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))
inp = Input(batch_shape=(batch_size, input_length), name='input')
emb_out = Embedding(500, 100, input_length=input_length, trainable=False, name='embedding')(inp)
rnn = SimpleRNN(200, return_sequences=True, return_state=False, stateful=True, name='simpleRNN')
h0 = tf.convert_to_tensor(np.random.uniform(size=(batch_size, 200)).astype(np.float32))
rnn_allstates = rnn(emb_out, initial_state=h0)
model_rnn = Model(inputs=inp, outputs=rnn_allstates, name='model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]
grads_allsteps = []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        # One fresh tape per step t; the whole model is called inside the tape.
        with tf.GradientTape() as tape:
            tape.watch(h0)
            states = model_rnn(x_batch_train)
            ht = states[:, t, :]
        grad_t = tape.gradient(ht, h0)
        print('Computed gradient dht/dh0 at step ', t+1, 'in batch', b+1)
        grads_allsteps.append(grad_t)
Something interesting: the first-step gradient is computed and looks fine. The rest are Nones.
grads_allsteps
>>
[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 1.2307187 , -1.0343404 , 0.52859926, ..., -0.09879799,
-1.1407609 , -0.7241671 ],
[ 1.142821 , -1.312029 , 0.37148148, ..., 0.2300478 ,
-1.1440411 , -0.36673146],
[ 1.2778691 , -1.2225235 , 0.69951147, ..., 0.17701946,
-1.2816343 , -0.52648413],
...,
[ 1.1717036 , -1.2444504 , 0.5874837 , ..., -0.13161334,
-1.3752006 , -0.376719 ],
[ 1.1333262 , -1.0013355 , 0.3363382 , ..., -0.22350994,
-1.299541 , -0.5073889 ],
[ 1.18489 , -0.90809333, 0.55045474, ..., -0.10550319,
-1.0866506 , -0.58325446]], dtype=float32)>, None, None, None, None]
You could maybe try using tf.gradients. Also rather use tf.Variable for h0:
# Your imports
#-------
### 2. Simulated data and gradient computation:
batch_size = 100; input_length = 5
xtr_pad = tf.random.uniform((batch_size, input_length), maxval=500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))
inp = Input(batch_shape=(batch_size, input_length), name='input')
emb_out = Embedding(500, 100, input_length=input_length, trainable=False, name='embedding')(inp)
rnn = SimpleRNN(200, return_sequences=True, return_state=False, stateful=True, name='simpleRNN')
# h0 is a tf.Variable here rather than a constant tensor.
h0 = tf.Variable(tf.random.uniform((batch_size, 200)))
rnn_allstates = rnn(emb_out, initial_state=h0)
model_rnn = Model(inputs=inp, outputs=rnn_allstates, name='model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]


# tf.gradients is only valid in a graph context, so the function must be
# wrapped in tf.function; calling it eagerly raises an error.
@tf.function
def calculate_t_gradients(t, x, h0):
    """Return the gradients of the step-``t`` slice of ``model_rnn(x)`` w.r.t. ``h0``.

    Args:
        t: Index of the time step to slice out of the model output.
        x: Input batch passed to ``model_rnn``.
        h0: Tensor or variable to differentiate with respect to.

    Returns:
        The list returned by ``tf.gradients`` (one entry for ``h0``).
    """
    return tf.gradients(model_rnn(x)[:, t, :], h0)


grads_allsteps = []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        grads_allsteps.append(calculate_t_gradients(t, x_batch_train, h0))
print(grads_allsteps)
[[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 1.2034059 , -0.46448404, 0.6272926 , ..., -0.40906236,
0.07618493, 0.6338958 ],
[ 1.2781916 , -0.20411322, 0.6174417 , ..., -0.31636393,
-0.23417974, 0.67499626],
[ 1.113218 , -0.65086263, 0.63425934, ..., -0.66614366,
-0.07726163, 0.53647137],
...,
[ 1.3399608 , -0.54088974, 0.6213518 , ..., 0.00831087,
-0.14397278, 0.2614633 ],
[ 1.213171 , -0.42787278, 0.60535026, ..., -0.56198204,
-0.09142771, 0.6212783 ],
[ 1.1901733 , -0.5743524 , 0.36872283, ..., -0.42522985,
-0.0861398 , 0.495057 ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.3487598 , 1.2738569 , -0.48500937, ..., 0.6011117 ,
-0.20381093, 0.45596513],
[ 0.37931004, 1.2778724 , -0.8682532 , ..., 0.8170228 ,
0.1456329 , 0.23715591],
[ 0.5984771 , 0.92434835, -0.8879645 , ..., 0.38756457,
-0.17436962, 0.47174054],
...,
[ 0.61081064, 0.99631476, -0.5104377 , ..., 0.5042721 ,
0.02844866, 0.34626445],
[ 0.7126102 , 1.0205276 , -0.60710275, ..., 0.49418694,
-0.16092762, 0.41363668],
[ 0.8581749 , 1.1259711 , -0.5824491 , ..., 0.45388597,
-0.16205123, 0.72434616]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 3.8507193e-01, 1.2925258e+00, 1.2027258e+00, ...,
3.2430276e-01, 2.2319333e-01, -2.5218868e-01],
[ 5.9262186e-01, 1.4497797e+00, 1.2479483e+00, ...,
4.6175608e-01, 2.5466472e-01, -2.4279505e-01],
[ 2.5734475e-01, 1.4562432e+00, 1.1020679e+00, ...,
6.6081107e-01, 1.9841105e-01, -2.5595558e-01],
...,
[ 5.1541841e-01, 1.6206543e+00, 9.6205616e-01, ...,
7.2725344e-01, 2.5501373e-01, -7.7709556e-04],
[ 4.4518453e-01, 1.6381552e+00, 1.0112666e+00, ...,
5.5238277e-01, 2.4137528e-01, -2.6242572e-01],
[ 6.6721851e-01, 1.5826726e+00, 1.1282607e+00, ...,
3.2301426e-01, 2.2295776e-01, 1.1724380e-01]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.14262576, 0.578709 , 0.1149607 , ..., 0.1229499 ,
-0.42344815, 0.8837458 ],
[-0.09711604, 0.04376438, -0.11737494, ..., 0.00389774,
0.01737173, 0.17246482],
[ 0.24414796, 0.30101255, -0.12234146, ..., -0.04850931,
-0.31790918, 0.21326394],
...,
[-0.20562285, 0.21999156, 0.02703794, ..., -0.03547464,
-0.59052145, 0.04695258],
[ 0.2087476 , 0.46558812, -0.18172565, ..., -0.01167884,
-0.20868361, 0.09055485],
[-0.22442941, 0.16119067, 0.10854454, ..., 0.14752978,
-0.32307786, 0.343314 ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[-1.1414615 , 0.37376842, -1.0230722 , ..., 0.60619426,
0.22550163, -0.6948315 ],
[-1.0124328 , 0.27892357, -0.96915233, ..., 0.7048603 ,
-0.15284726, -0.6734605 ],
[-0.8542529 , 0.25970122, -0.90076745, ..., 0.8825682 ,
-0.02474228, -0.55014515],
...,
[-0.89430666, 0.68327624, -1.0109956 , ..., 0.31722566,
-0.23703958, -0.6766514 ],
[-0.8633691 , 0.28742114, -0.9896866 , ..., 0.98315084,
0.0115847 , -0.55474746],
[-0.7229766 , 0.62417865, -1.2342371 , ..., 0.85149145,
-0.04468453, -0.60606724]], dtype=float32)>]]
You need to make sure the stateful parameter of the SimpleRNN is False, because according to the docs:
If True, the last state for each sample at index i in a batch will be
used as initial state for the sample of index i in the following
batch.
So, your code will also calculate gradients for each timestep if you set stateful to False.
I found a way to run GradientTape repeatedly. Calling `del tape` after saving the statistic into a list reduces the burden on the GPU.
emb_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]
n_steps = 40
dhtdh0_rnn = []
for t in range(n_steps):
    # One fresh tape per step t, watching h0 explicitly.
    with tf.GradientTape() as tape:
        tape.watch(h0)
        et = emb_layer(xtr_pad[:100])
        ht_all = rnn_layer(et, initial_state=[h0])
        ht = ht_all[:, t, :]
    dhtdh0_t = tape.gradient(ht, h0)
    # Mean absolute gradient over axes 0 and 1: one scalar per step.
    grad_agg = tf.reduce_mean(abs(dhtdh0_t), [0, 1])
    print('step', t+1, 'done')
    # Store the log of the aggregated gradient magnitude.
    dhtdh0_rnn.append(np.log(grad_agg))
    # Drop the tape explicitly once its statistic has been stored.
    del tape
I am working in Python with Keras. I learned in my theory study that in a neural network the weights exist only between the input layer and a hidden layer, or between hidden layers.
I wrote this code, where I added two layers:
# First Dense layer ("Layer"), with a custom kernel constraint applied to its weights.
NN.add(Dense(4, input_shape=array_input.shape, activation='relu', name="Layer", kernel_constraint=changeWeight()))
# Second Dense layer ("Output").
NN.add(Dense(4, activation='relu', name="Output"))
NN.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.3), metrics=['accuracy'])
print(NN.summary())
# The input is also used as the target.
a = NN.fit(array_input, array_input, epochs=100)
# Print each layer's name followed by the list returned by get_weights().
for lay in NN.layers:
    print(lay.name)
    print(lay.get_weights())
I think that one is the hidden layer (the one named "Layer") and the other is the output layer. The problem is that when I print "lay.get_weights()" there are two arrays of weights, one for each layer, like this:
[array([[-1.5516974 , -1.600516 , -0. , 0. ],
[-0. , -2.1766946 , 0.32734624, -0. ],
[-0. , -0. , 0.32156652, -0.812184 ],
[-0. , -0. , -0. , -0.7288372 ]],
dtype=float32), array([-1.8015273, -1.801546 , -0.1462403, 0. ], dtype=float32)]
Output
[array([[-1.5045888 , -0.14155084, -0.29977936, -0.0492779 ],
[-1.2379107 , -0.44411597, -0.41499865, -0.2560569 ],
[ 1.2397875 , -0.3541401 , 1.2223543 , 1.5617256 ],
[ 0.18388063, 0.44298917, -0.2201969 , -0.1165269 ]],
dtype=float32), array([-0.82720596, 0. , 1.1942271 , 1.7084894 ], dtype=float32)]
Can someone explain to me where the problem is? Am I misunderstanding the Keras API?
https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer#get_weights
get_weights() returns a list containing the layer's weight matrix and its bias vector.
Each of your inputs is connected to the first layers. So the weight matrix has a shape of (input.shape, number of neurons in the current layer) and the bias vector has a shape of (number of neurons in the current layer, ).
Therefore, without knowing what your input array contains, I know that this array has a shape of (4,).
For the second layer, the same process is repeated
weight : (number of neurons of the last layer, number of neurons of the current layer)
bias : (number of neurons of the current layer,)
Try this example:
NN = Sequential()
# 3 inputs -> 2 units: the kernel printed below is (3, 2), the bias (2,).
NN.add(Dense(2, input_shape=(3,), activation='relu', name="Layer"))
# 2 units -> 4 units: the kernel printed below is (2, 4), the bias (4,).
NN.add(Dense(4, activation='relu', name="Output"))
for lay in NN.layers:
    print(lay.name)
    print(lay.get_weights())
Output:
Layer
[array([[-0.674668 , -0.34347552],
[ 0.63090587, 0.8558588 ],
[-0.5063792 , -0.23311883]], dtype=float32), array([0., 0.], dtype=float32)]
Output
[array([[-0.07787323, 0.22444701, 0.52729607, 0.07616615],
[-0.5380094 , -0.3146367 , -0.73177123, -0.9248886 ]],
dtype=float32), array([0., 0., 0., 0.], dtype=float32)]
Graphical representation :
I created a Conv1D model with Keras, and it raised a ValueError during training. I don't quite understand where the model is having problems.
I tried to change the data shape to (3780,6,1), but it still reported: expected conv1d_46_input to have 3 dimensions, but got array with shape (3780, 6)
def baseline_model():
    """Build and compile a small Conv1D regression model.

    The network is Conv1D (1 filter, kernel size 5, tanh) on inputs of
    shape (6, 1), followed by max-pooling, flattening and a Dense layer
    with 2 outputs. It is compiled with mean-squared-error loss and the
    Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    model = models.Sequential()
    # Each sample must have shape (6, 1), so the input batch must be 3-D.
    model.add(layers.Conv1D(1, 5, input_shape=(6, 1), activation="tanh"))
    model.add(layers.MaxPool1D(pool_size=2))
    model.add(layers.core.Flatten())
    model.add(layers.Dense(2))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    return model
# df is pandas DataFrame
# Features: six columns of df as a 2-D float64 array (one row per sample).
X = np.array(df[['rp', 'x', 'y', 'class', 'at', 'dt']], dtype=np.float64)
# Targets: two columns of df as a 2-D float64 array.
y = np.array(df[['ap', 'dp']], dtype=np.float64)
# The two expand_dims calls below are disabled, so X stays 2-D here.
# X = np.expand_dims(X, -1)
# y = np.expand_dims(y, -1)
# Hold out 20% of the rows for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
mode = baseline_model()
# Train the model, using the held-out split as validation data.
history = mode.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test))
X=np.array([[-69.3078, 0. , 1. , 1. , 90. , 90. ],
[-69.4585, 0. , 2. , 1. , 90. , 90. ],
[-69.4776, 0. , 3. , 1. , 90. , 90. ],
...,
[-65.8291, 35. , 33. , 1. , 90. , 90. ],
[-71.0137, 35. , 34. , 1. , 90. , 90. ],
[-67.2308, 35. , 35. , 1. , 90. , 90. ]])
y=np.array([[ 15.4463, -17.5046],
[ 15.4777, -17.536 ],
[ 15.5092, -17.5675],
...,
[ 15.8361, -17.8944],
[ 15.8809, -17.9392],
[ 15.9259, -17.9842]])
# X,y type is numpy array
# X shape is (4725, 6) ,y shape is (4725, 2)
# X[0] shape is (6,) , y[0] shape is (2,)
ValueError: Error when checking input: expected conv1d_46_input to have 3 dimensions, but got array with shape (3780, 6)
Your first Conv layer is expecting a data Dim that looks like (Batch_size, dim1, dim2)
Here you have a X shape of (3780, 6), so 3780 arrays of dim 6, but you need to have 3780 arrays of dim (6, 1).
In order to do that, you can simply expand the dim of your X, by doing that :
X = np.expand_dims(X, -1)
This will add the extra dimension that you need.
I have a Keras model. If I get one layer of my model and then call sess.run(layer.weights[0]) and layer.get_weights()[0] I obtain different results.
>>> layer.get_weights()[0]
array([[ 0.05829928, -0.01113867, 0.07874133, ..., -0.0832592 ,
-0.02382897, 0.02150916],
[-0.06571103, 0.06946308, 0.01161512, ..., -0.01296931,
-0.01047098, 0.08497558],
[-0.06404617, 0.01107556, 0.07584237, ..., -0.04085402,
-0.00671811, -0.04153195],
...,
[-0.0100356 , -0.05815255, 0.05809011, ..., 0.0594966 ,
-0.0635704 , -0.04289378],
[-0.01073305, -0.0400929 , -0.01252703, ..., -0.00287437,
0.08347356, 0.04667132],
[-0.03608105, 0.05812681, -0.0146297 , ..., -0.0673831 ,
-0.00531388, -0.02482456]], dtype=float32)
>>> sess.run(layer.weights[0])
array([[-0.03271605, 0.02013551, 0.05350242, ..., 0.06657993,
0.08541366, -0.01483627],
[-0.02411069, -0.03852968, 0.02710939, ..., -0.00030499,
0.07864482, 0.04452118],
[-0.00293329, -0.01251988, -0.01190369, ..., 0.06554652,
-0.01539454, 0.08236458],
...,
[ 0.04456077, -0.00256501, 0.01785846, ..., -0.03573522,
0.00770979, -0.05544731],
[-0.00415177, -0.01014608, -0.0684113 , ..., -0.05186068,
0.04402267, 0.03113024],
[-0.05103095, -0.06083905, -0.0098877 , ..., -0.00747809,
-0.035869 , -0.03331041]], dtype=float32)
Why? I found this other question on Stackoverflow but I don't really understand the given answer.
layer.weights is a tensor variable, so it has to be evaluated in a session.
If you use the session from keras.backend then you should get the same values.
from keras import backend as K
K.get_session().run(layer.weights[0])
which is essentially what Keras does inside get_weights() method.
https://github.com/keras-team/keras/blob/ad578c4c19444af9d1f0e0d51a8283eb0db1a264/keras/engine/base_layer.py#L1061
https://github.com/keras-team/keras/blob/ad578c4c19444af9d1f0e0d51a8283eb0db1a264/keras/backend/tensorflow_backend.py#L2652
In the other linked question, the user got different results because a new session was created and all the variables initialized with init_op.