I'm working on a Permutational Equivariant Layer for Keras based on this paper https://arxiv.org/pdf/1612.04530.pdf and previous work by Josef Ondrej found here.
The layer itself is a Keras Model consisting of multiple layers:
from keras import backend as K
from keras import losses
from keras.layers import Average, Add, Concatenate, Maximum, Input, Dense, Lambda
from keras.models import Model
from keras.engine.topology import Layer
def PermutationEquivariant(input_shape, layer_size, tuple_dim = 2, reduce_fun = "sum", dense_params = {}):
    """
    Implements a permutation equivariant layer.

    Each batch in our data consists of `input_shape[0]` observations,
    each with `input_shape[1]` features.

    Args:
        input_shape -- A pair of `int` - (number of observations in one batch x
            number of features of each observation). The batch dimension is not included.
        layer_size -- An `int`. Size of the dense layer applied to each tuple of observations.
        tuple_dim -- An `int`, how many observations to put in one tuple.
        reduce_fun -- A `string`, type of function to "average" over all tuples starting with the same index.

    Returns:
        g -- A Keras Model - the permutation equivariant layer.
            It consists of one tuple layer that creates all possible `tuple_dim`-tuples
            of observations, sorted on an axis along which the first index is constant.
            The same dense layer is applied to every tuple, and then some symmetric pooling function
            (for example mean or maximum) is applied across all tuples with the same first index.
    """
    inputs = Input(shape=input_shape)  ## input_shape: batch_size x row x col

    ## SeperatedTuples layer
    x = SeperatedTuples(tuple_dim, input_shape = input_shape)(inputs)  ## out_shape: batch_size x row x row ** (tuple_dim-1) x tuple_dim*col

    ## Dense layer -- the same dense layer is reused for each tuple
    dense_input_shape = (tuple_dim*input_shape[1], )  # batch_size x tuple_dim*col
    dense_layer = Dense(input_shape = dense_input_shape, units=layer_size, **dense_params)

    # iterate through rows
    x_i_list = []
    for i in range(input_shape[0]):
        xi_j_list = []
        # apply the dense layer to each tuple whose first index equals i
        # (here we could also use a 1x1 convolution; instead of reusing
        # the dense layer for each tuple, we would be reusing the kernels)
        for j in range(input_shape[0] ** (tuple_dim-1)):
            input_ij = Lambda(lambda x : x[:,i,j,:], output_shape=(tuple_dim*input_shape[-1],))(x)  ## out_shape: batch_size x tuple_dim*col
            xi_j_list += [dense_layer(input_ij)]  ## xi_j_list shape: row x batch_size x layer_size

        ## Pooling layer
        # Pool the dense outputs of all tuples whose first index equals i, to out_shape: batch_size x layer_size
        # note that axis=0 in the commented Lambda versions because there the row axis comes before the batch_size axis
        # the Lambda wrapper keeps the output a Keras tensor
        if reduce_fun == "mean":
            pooling_layer = Average()
            #pooling_layer = Lambda(lambda x : K.mean(x, axis = 0))
        elif reduce_fun == "max":
            pooling_layer = Maximum()
            #pooling_layer = Lambda(lambda x : K.max(x, axis = 0))
        elif reduce_fun == "sum":
            pooling_layer = Add()
            #pooling_layer = Lambda(lambda x : K.sum(x, axis = 0))
        else:
            raise ValueError("Invalid value for argument `reduce_fun` provided.")

        xi = pooling_layer(xi_j_list)  ## xi shape: batch_size x layer_size
        x_i_list += [xi]  ## x_i_list shape: row x batch_size x layer_size

    # Concatenate the results of each row
    x = Lambda(lambda x : K.stack(x, axis=1), output_shape = (input_shape[0], layer_size))(x_i_list)  ## out_shape: batch_size x row x layer_size

    model = Model(inputs=inputs, outputs=x)
    return model
class SeperatedTuples(Layer):
    """
    Creates all possible tuples of rows of a 2D tensor, with an additional axis
    along which the first elements are constant.

    In the case of tuple_dim = 2, from one input batch:
        x_1,
        x_2,
        ...
        x_n,
    where x_i are rows of the tensor, it creates a 3D output tensor:
        [[x_1 | x_1, x_1 | x_2 ... x_1 | x_n],
         [x_2 | x_1, x_2 | x_2 ... x_2 | x_n],
         ...
         ... x_n | x_n]]

    Args:
        tuple_dim -- An `int`. Dimension of one tuple (i.e. how many rows from the input
            tensor to combine to create a row in the output tensor).
        input_shape -- A `tuple` of `int`. In the most frequent case where our data
            has shape (batch_size x num_rows x num_cols) this should be (num_rows x num_cols).
    """

    def __init__(self, tuple_dim = 2, **kwargs):
        self.tuple_dim = tuple_dim
        super(SeperatedTuples, self).__init__(**kwargs)

    def create_indices(self, n, k = 2):
        """
        Creates all integer-valued coordinate k-tuples in a k-dimensional hypercube with edge size n.
        For example, n = 4, k = 2
        returns [[0, 0], [0, 1], [0, 2], [0, 3],
                 [1, 0], [1, 1], [1, 2], [1, 3],
                 ...
                 [3, 0], [3, 1], [3, 2], [3, 3]]

        Args:
            n -- An `int`, edge size of the hypercube.
            k -- An `int`, dimension of the hypercube.

        Returns:
            indices_n_k -- A `list` of `list` of `int`. Each inner list represents the coordinates of one integer point
                in the hypercube.
        """
        if k == 0:
            indices_n_k = [[]]
        else:
            indices_n_k_minus_1 = self.create_indices(n, k-1)
            indices_n_k = [[i] + indices_n_k_minus_1[c] for i in range(n) for c in range(n**(k-1))]
        return indices_n_k

    def create_seperated_indices(self, n, k = 2):
        """
        Same as create_indices, except that there is an additional axis along which the first value of the tuples is constant.
        For example, n = 4, k = 2
        returns [[[0, 0], [0, 1], [0, 2], [0, 3]],
                 [[1, 0], [1, 1], [1, 2], [1, 3]],
                 ...
                 [[3, 0], [3, 1], [3, 2], [3, 3]]]
        shape: row x row x k
        """
        indices = self.create_indices(n, k)
        seperated_indices = [indices[i:i + n] for i in range(0, len(indices), n)]
        return seperated_indices

    def build(self, input_shape):
        # Create indexing tuple
        self.gathering_indices = self.create_seperated_indices(input_shape[-2], self.tuple_dim)
        super(SeperatedTuples, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x):
        """
        input_dim : batch_size x rows x cols
        output_dim : batch_size x rows x rows ** (tuple_dim-1) x cols * tuple_dim
        """
        stacks_of_tuples = K.map_fn(
            fn = lambda z :  ## z shape: row x col
                K.stack(
                    [K.concatenate(
                        [K.reshape(
                            K.gather(z, i),  ## shape: tuple_dim x col
                            shape = (1, -1)
                        )  ## shape: 1 x tuple_dim*col
                         for i in indices  # i-dim: tuple_dim, indices-shape: row x tuple_dim
                        ],  ## shape: row x 1 x tuple_dim*col
                        axis = 0
                    )  ## shape: row x tuple_dim*col
                     for indices in self.gathering_indices  # gathering_indices-shape: row x row x tuple_dim
                    ],
                    axis = 0),  ## shape: row x row x tuple_dim*col
            elems = x  ## shape: batch_size x row x col
        )  ## shape: batch_size x row x row x tuple_dim*col
        return stacks_of_tuples

    def compute_output_shape(self, input_shape):
        """
        input_shape: batch_size x rows x cols
        output_shape: batch_size x rows x rows ** (tuple_dim-1) x cols * tuple_dim
        """
        output_shape = list(input_shape)
        output_shape[-1] = output_shape[-1] * self.tuple_dim
        output_shape[-2] = output_shape[-2] ** self.tuple_dim
        return tuple(output_shape)
When testing the PermutationEquivariant layer all alone, everything seems to work fine (run 1). However, when I try to incorporate it in a larger model, the outputs just repeat themselves (run 2).
from keras.models import Model
from keras.layers import Input, Lambda
import numpy as np
# parameters for Permutational Equivariant layer
input_shape = (2,5)
dense_params = {'kernel_initializer': 'glorot_normal', 'bias_initializer': 'glorot_normal', 'activation': 'tanh'}
sample = np.random.random((1,) + input_shape)
# run 1: Using only the PermutationEquivariant layer as a model by itself seems to work
model_1 = PermutationEquivariant(input_shape=input_shape, layer_size=10, tuple_dim=2, reduce_fun="sum", dense_params = dense_params)
model_1.compile(optimizer='sgd', loss='categorical_crossentropy')
print("model_1: \n", model_1.predict(sample))
#model_1:
#[[[-1.0494264 -1.6808903 1.2861781 -0.90004706 1.6178854
# 1.6686234 -1.5724193 1.2454509 0.3730019 -1.4580158 ]
# [-1.3904197 -1.467866 1.0848606 -1.2094728 1.6304723
# 1.6369174 -1.4074551 0.58116794 0.292305 -1.7162979 ]]]
# run 2: Incorporating the PermutationEquivariant layer inside another model makes the output constant along the first axis
inputs = Input(shape=input_shape)
x = PermutationEquivariant(input_shape=input_shape, layer_size=10, tuple_dim=2, reduce_fun="sum", dense_params = dense_params)(inputs)
model_2 = Model(inputs=inputs,outputs = x)
model_2.compile(optimizer='sgd', loss='categorical_crossentropy')
print("model_2: \n", model_2.predict(sample))
#model_2:
# [[[ 0.72823656 1.2213255 -0.28404936 1.4711846 -0.49544945
# 1.7930243 -0.7502286 1.892496 -1.675402 -0.2252224 ]
# [ 0.72823656 1.2213255 -0.28404936 1.4711846 -0.49544945
# 1.7930243 -0.7502286 1.892496 -1.675402 -0.2252224 ]]]
I have tried both Theano and TensorFlow as backends, with the same result. Does anybody have an idea why it behaves differently inside another model / what I am missing? I appreciate any help!
I am trying to implement mini-batch gradient descent for logistic regression. However, when I test it on my data set with labels {-1, 1}, the prediction is almost always either 1 or -1, leaving me with a test score of around 50% (the true labels are split roughly 50/50 between -1 and 1), while the target is above 95%.
Can anyone help spot the mistake(s) in my code below?
def logistic(z):
    """
    Helper function
    Computes the logistic function 1/(1+e^{-x}) for each entry in the input vector z.

    Args:
        z: numpy array shape (,d)
    Returns:
        logi: numpy array shape (,d) each entry transformed by the logistic function
    """
    logi = np.zeros(z.shape)
    logi = np.array([1 / (1 + np.exp(-z[i])) for i in range(len(z))])
    assert logi.shape == z.shape
    return logi


class LogisticRegressionClassifier():
    def __init__(self):
        self.w = None

    def fit(self, X, y, w=None, lr=0.1, batch_size=16, epochs=10):
        """
        Run mini-batch gradient descent for logistic regression,
        using batch_size data points to compute the gradient in each step.

        Args:
            X: np.array shape (n,d) dtype float32 - Features
            y: np.array shape (,n) dtype int32 - Labels
            w: np.array shape (,d) dtype float32 - Initial parameter vector
            lr: scalar - learning rate for gradient descent
            batch_size: number of elements to use in a minibatch
            epochs: number of scans through the data
        Sets:
            w: numpy array shape (,d) learned weight vector w
            history: list/np.array len epochs
        """
        if w is None: w = np.zeros(X.shape[1])
        history = []
        n = np.size(X, 0)
        for i in range(epochs):
            b = batch_size
            X_ = np.copy(X)
            X_shuf = np.take(X_, np.random.permutation(X_.shape[0]), axis=0, out=X_)
            for i in range(n//b):
                sample = X_shuf[b*i:(i+1)*b]
                g = (1/b)*sum([-y[i]*sample[i,:]*sigmoid(-y[i]*np.dot(w,sample[i,:])) for i in range(b)])
                w = np.array(w - lr*g)
            history.append(w)
        self.w = w
        self.history = history
        return w

    def predict(self, X):
        """ Classify each data element in X

        Args:
            X: np.array shape (n,d) dtype float - Features
        Returns:
            p: numpy array shape (n, ) dtype int32, class predictions on X (0, 1)
        """
        z = np.dot(X, self.w.T)
        print(z)
        out = logistic(z)
        return out

    def score(self, X, y):
        """ Compute model accuracy on data X with labels y

        Args:
            X: np.array shape (n,d) dtype float - Features
            y: np.array shape (n,) dtype int - Labels
        Returns:
            s: float, number of correct predictions divided by n.
        """
        s = 0
        n = np.size(X, 0)
        pred = self.predict(X)
        pred_labels = []
        for i in range(n):
            if pred[i] > 0.5:
                pred_labels += [1]
            if pred[i] <= 0.5:
                pred_labels += [-1]
        for i in range(n):
            if pred_labels[i] == y[i]:
                s += 1
        return s / n
You forgot to shuffle the labels alongside the training data. If you have
[3, 1] [-1]
[2, 3] [ 1]
then after you shuffle the training data, the labels get mismatched:
[2, 3] [-1]
[3, 1] [ 1]
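For illustration, a small self-contained sketch of the fix (the array values are just the toy rows above, and the variable names are mine, not taken from the question's code): draw one index permutation and apply it to both arrays, so each row keeps its label.
import numpy as np

X = np.array([[3, 1], [2, 3]])
y = np.array([-1, 1])

# One permutation of row indices, applied to features and labels together
perm = np.random.permutation(X.shape[0])
X_shuf, y_shuf = X[perm], y[perm]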
I want to elementwise multiply a dense tensor with shape [n, n, k] with a sparse tensor that has the shape [n, n, 1]. I want the values from the sparse tensor to be repeated along the last axis (the one of size k), as they would be if I used a dense tensor instead and relied on implicit broadcasting.
However, the SparseTensor.__mul__ operation does not support broadcasting the sparse operand, and I couldn't find an operator to explicitly broadcast the sparse tensor. How can I achieve this?
If you do not want to just convert the sparse tensor to dense, you can select the right values from the dense tensor and build a sparse result directly, something like this:
import tensorflow as tf
import numpy as np

with tf.Graph().as_default(), tf.Session() as sess:
    # Input data
    x = tf.placeholder(tf.float32, shape=[None, None, None])
    y = tf.sparse.placeholder(tf.float32, shape=[None, None, 1])
    # Indices of sparse tensor without third index coordinate
    indices2 = y.indices[:, :-1]
    # Values of dense tensor corresponding to sparse tensor values
    x_sp = tf.gather_nd(x, indices2)
    # Values of the resulting sparse tensor
    res_vals = tf.reshape(x_sp * tf.expand_dims(y.values, 1), [-1])
    # Shape of the resulting sparse tensor
    res_shape = tf.shape(x, out_type=tf.int64)
    # Make sparse tensor indices
    k = res_shape[2]
    v = tf.size(y.values)
    # Add third coordinate to existing sparse tensor coordinates
    idx1 = tf.tile(tf.expand_dims(indices2, 1), [1, k, 1])
    idx2 = tf.tile(tf.range(k), [v])
    res_idx = tf.concat([tf.reshape(idx1, [-1, 2]), tf.expand_dims(idx2, 1)], axis=1)
    # Make sparse result
    res = tf.SparseTensor(res_idx, res_vals, res_shape)
    # Dense value for testing
    res_dense = tf.sparse.to_dense(res)
    # Dense operation for testing
    res_dense2 = x * tf.sparse.to_dense(y)
    # Test
    x_val = np.arange(48).reshape(4, 4, 3)
    y_val = tf.SparseTensorValue([[0, 0, 0], [2, 3, 0], [3, 1, 0]], [1, 2, 3], [4, 4, 1])
    res_dense_val, res_dense2_val = sess.run((res_dense, res_dense2),
                                             feed_dict={x: x_val, y: y_val})
    print(np.allclose(res_dense_val, res_dense2_val))
    # True
For any 2D tensor like
[[2,5,4,7],
[7,5,6,8]],
I want to apply softmax to the top k elements in each row and then construct a new tensor by replacing all the other elements with 0.
The result should be the softmax of the top k (here k=2) elements of each row, [[7,5],[8,7]],
which is thus
[[0.880797,0.11920291],
[0.7310586,0.26894143]]
and then a new tensor reconstructed according to the indices of the top k elements in the original tensor; the final result should be
[[0,0.11920291,0,0.880797],
[0.26894143,0,0,0.7310586]].
Is it possible to implement this kind of masked softmax in tensorflow? Many thanks in advance!
Here is how you can do that:
import tensorflow as tf
# Input data
a = tf.placeholder(tf.float32, [None, None])
num_top = tf.placeholder(tf.int32, [])
# Find top elements
a_top, a_top_idx = tf.nn.top_k(a, num_top, sorted=False)
# Apply softmax
a_top_sm = tf.nn.softmax(a_top)
# Reconstruct into original shape
a_shape = tf.shape(a)
a_row_idx = tf.tile(tf.range(a_shape[0])[:, tf.newaxis], (1, num_top))
scatter_idx = tf.stack([a_row_idx, a_top_idx], axis=-1)
result = tf.scatter_nd(scatter_idx, a_top_sm, a_shape)
# Test
with tf.Session() as sess:
    result_val = sess.run(result, feed_dict={a: [[2, 5, 4, 7], [7, 5, 6, 8]], num_top: 2})
    print(result_val)
Output:
[[0. 0.11920291 0. 0.880797 ]
[0.26894143 0. 0. 0.7310586 ]]
EDIT:
Actually, there is a function that does what you intend more directly, tf.sparse.softmax. However, it requires a SparseTensor as input, and I'm not sure it would be faster, since it has to figure out which sparse values go together in the softmax. The good thing about this function is that you could have a different number of elements to softmax in each row, but in your case that does not seem to be important. Anyway, here is an implementation with it, in case you find it useful.
import tensorflow as tf
a = tf.placeholder(tf.float32, [None, None])
num_top = tf.placeholder(tf.int32, [])
# Find top elements
a_top, a_top_idx = tf.nn.top_k(a, num_top, sorted=False)
# Flatten values
sparse_values = tf.reshape(a_top, [-1])
# Make sparse indices
shape = tf.cast(tf.shape(a), tf.int64)
a_row_idx = tf.tile(tf.range(shape[0])[:, tf.newaxis], (1, num_top))
sparse_idx = tf.stack([a_row_idx, tf.cast(a_top_idx, tf.int64)], axis=-1)
sparse_idx = tf.reshape(sparse_idx, [-1, 2])
# Make sparse tensor
a_top_sparse = tf.SparseTensor(sparse_idx, sparse_values, shape)
# Reorder sparse tensor
a_top_sparse = tf.sparse.reorder(a_top_sparse)
# Softmax
result_sparse = tf.sparse.softmax(a_top_sparse)
# Convert back to dense (or you can keep working with the sparse tensor)
result = tf.sparse.to_dense(result_sparse)
# Test
with tf.Session() as sess:
    result_val = sess.run(result, feed_dict={a: [[2, 5, 4, 7], [7, 5, 6, 8]], num_top: 2})
    print(result_val)
# Same as before
Let's say you have a weights tensor w with shape (None, N).
Find the minimum value of the top k elements
top_kw = tf.math.top_k(w, k=10, sorted=False)[0]
min_w = tf.reduce_min(top_kw, axis=1, keepdims=True)
Generate a boolean mask for the weights tensor
mask_w = tf.greater_equal(w, min_w)
mask_w = tf.cast(mask_w, tf.float32)
Compute custom softmax using the mask
w = tf.multiply(tf.exp(w), mask_w) / tf.reduce_sum(tf.multiply(tf.exp(w), mask_w), axis=1, keepdims=True)
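Stitching the steps above together, a minimal runnable sketch (my own assembly, in TF 1.x graph mode, with k hard-coded to 2 so it matches the example input from the question):
import tensorflow as tf

w = tf.placeholder(tf.float32, [None, None])

# Threshold = smallest of the top-k values per row (k = 2 here)
top_kw = tf.math.top_k(w, k=2, sorted=False)[0]
min_w = tf.reduce_min(top_kw, axis=1, keepdims=True)

# Keep only entries that reach the threshold
mask_w = tf.cast(tf.greater_equal(w, min_w), tf.float32)

# Masked softmax: zero out the discarded entries before normalizing
masked_sm = tf.multiply(tf.exp(w), mask_w) / tf.reduce_sum(
    tf.multiply(tf.exp(w), mask_w), axis=1, keepdims=True)

with tf.Session() as sess:
    print(sess.run(masked_sm, feed_dict={w: [[2, 5, 4, 7], [7, 5, 6, 8]]}))
    # [[0.         0.11920291 0.         0.880797  ]
    #  [0.26894143 0.         0.         0.7310586 ]]
Note that with this mask, ties at the k-th largest value let more than k elements through, which may or may not be what you want.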
This is how I define a neural network:
import tensorflow as tf

class MyFun:
    def __init__(self, x, y, sizes, activations, scope):
        with tf.variable_scope(scope):
            last_out = tf.concat([x, y], axis=1)
            for l, size in enumerate(sizes):
                last_out = tf.layers.dense(last_out, size, activation=activations[l])
            self.vars = tf.trainable_variables(scope=scope)
            self.output = last_out
I need to preprocess the inputs x and y (both placeholders) into features before feeding them into the network. More specifically, I want to use quadratic features, i.e.,
new_input = [1, x, y, x**2, y**2, cross(x,y)]
where cross(x,y) includes the product between all elements of [x, y], i.e.,
cross(x,y) = [x_1*x_2, x_1*x_3, ..., x_1*y_1, ...]
How can I do it elegantly? Is there an equivalent of sklearn.preprocessing.PolynomialFeatures?
Here is one option:
# Suppose your placeholders are one-dimensional vectors, with sizes 3 and 7:
x = tf.placeholder(tf.float32, shape=[3])
y = tf.placeholder(tf.float32, shape=[7])
# concat the constant 1.0 with x and y:
z = tf.concat((tf.constant(1.0, shape=(1,)), x, y), axis=0)
# construct all products of pairs (z has 1+3+7 entries):
new_input = [z[i]*z[j] for i in range(1+3+7) for j in range(i, 1+3+7)]
# convert the list of tensors to a tensor (optional):
new_input = tf.stack(new_input)
EDIT 1
Extending this to the case where x and y have a batch dimension:
x = tf.placeholder(tf.float32, shape=[None, 3])
y = tf.placeholder(tf.float32, shape=[None, 7])
# use 1.0 + 0*x[:, :1] instead of tf.constant(1.0) so the constant column has the batch dimension
z = tf.concat((1.0 + 0*x[:, :1], x, y), axis=1)
new_input = [z[:, i]*z[:, j] for i in range(1+3+7) for j in range(i, 1+3+7)]
new_input = tf.stack(new_input, 1)
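As a quick sanity check of the batched version above, a sketch with made-up all-ones inputs (it continues from the snippet, so x, y and new_input are the tensors defined there):
import numpy as np

with tf.Session() as sess:
    out = sess.run(new_input, feed_dict={x: np.ones((2, 3)), y: np.ones((2, 7))})
    print(out.shape)  # (2, 66): all pairs i <= j from the 11 entries of [1, x, y]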
I got an error when trying to create a simple binary classification for the XOR case using Theano. It says dimension mismatch, but I can't figure out which variable causes it.
The strange part is that my program works when I change the number of neurons in the last layer: if I use 2 neurons in the last layer, change that layer to a softmax layer, and use the negative log-likelihood cost (multiclass classification style), it works fine.
This is my full code:
import numpy as np
import theano
import theano.tensor as T

class HiddenLayer(object):
    def __init__(self, input, nIn, nOut, is_last, W=None):
        self.input = input
        W_val = np.random.randn(nIn, nOut)*0.001
        b_val = np.zeros((nOut,))
        self.W = theano.shared(np.asarray(W_val, dtype=theano.config.floatX),
                               name='W', borrow=True)
        self.b = theano.shared(np.asarray(b_val, dtype=theano.config.floatX),
                               name='b', borrow=True)
        self.z = T.dot(input, self.W) + self.b
        if(is_last==0):
            self.output = T.switch(self.z < 0, 0, self.z)
        else:
            self.output = T.nnet.sigmoid(self.z)
            self.y_pred = self.output > 0.5
        self.params = [self.W, self.b]

    def cost_function(self, y):
        return -T.mean(y*T.log(self.output)+(1-y)*T.log(1-self.output))

    def errors(self, y):
        return T.mean(T.neq(self.y_pred, y))

alfa = 1
epoch = 1000
neu = 5

inpx = np.array([[1,0],[1,1],[0,0],[0,1]])
inpy = np.array([1,0,0,1])

x = T.fmatrix('x')
y = T.ivector('y')

layer0 = HiddenLayer(
    input = x,
    nIn = 2,
    nOut = neu,
    is_last=0
)

layer1 = HiddenLayer(
    input = layer0.output,
    nIn = neu,
    nOut = 1,
    is_last=1
)

params = layer0.params + layer1.params
cost = layer1.cost_function(y)
grads = T.grad(cost, params)
updates = [(param_i, param_i - alfa * grad_i) for param_i, grad_i in zip(params, grads)]
eror = layer1.errors(y)

train_model = theano.function([x,y], [eror,cost], updates=updates, allow_input_downcast=True)
test_model = theano.function([x,y], [eror,layer1.y_pred], allow_input_downcast=True)

for i in xrange(epoch):
    etr, ctr = train_model(inpx, inpy)
    if i%(epoch/10)==0:
        print etr, ctr

et, pt = test_model(inpx, inpy)
print pt
and the error:
ValueError: Input dimension mis-match. (input[0].shape[1] = 1, input[1].shape[1] = 4)
Apply node that caused the error: Elemwise{neq,no_inplace}(sigmoid.0, DimShuffle{x,0}.0)
Toposort index: 41
Inputs types: [TensorType(float32, matrix), TensorType(int32, row)]
Inputs shapes: [(4L, 1L), (1L, 4L)]
Inputs strides: [(4L, 4L), (16L, 4L)]
Inputs values: [array([[ 0.94264328],
[ 0.99725735],
[ 0.5 ],
[ 0.95675617]], dtype=float32), array([[1, 0, 0, 1]])]
Outputs clients: [[Shape(Elemwise{neq,no_inplace}.0), Sum{acc_dtype=int64}(Elemwise{neq,no_inplace}.0)]]
Thank you in advance for any help.
Your problem is with your y and inpy variables: y is meant to be the expected output of the network. Your network is given a dataset with 4 elements, each having 2 features, so your input matrix has 4 rows and 2 columns. You are therefore expected to have 4 elements in the predicted output, that is 4 rows in your y or inpy matrix; but you are using a vector, which in Theano is a row vector and thus has only one row. You need either to transpose your y vector when computing the cost, or to define your y variable as a matrix and make inpy a (4,1) matrix instead of a (4,) vector (once again, vectors are row vectors in Theano).
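For example, a minimal sketch of the second option, replacing the corresponding lines of the question's code (this is my reading of the suggestion, not tested against the original script):
# Labels as a (4, 1) column matrix so they match the (4, 1) output of the last layer
inpy = np.array([[1], [0], [0], [1]])
# Declare y as a matrix instead of an ivector
y = T.imatrix('y')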
Hope this helps,
Best