I got an error when trying to build a simple binary classifier for the XOR case using Theano. It reports a dimension mismatch, but I can't figure out which variable causes it.
The strange part: the program works when I change the number of neurons in the last layer. If I use 2 neurons in the last layer, make that layer a softmax layer, and use the negative log likelihood (multiclass classification style), the program works fine.
This is my full code:
import numpy as np
import theano
import theano.tensor as T
class HiddenLayer(object):
    def __init__(self, input, nIn, nOut, is_last, W=None):
        self.input = input
        W_val = np.random.randn(nIn, nOut) * 0.001
        b_val = np.zeros((nOut,))
        self.W = theano.shared(np.asarray(W_val, dtype=theano.config.floatX),
                               name='W', borrow=True)
        self.b = theano.shared(np.asarray(b_val, dtype=theano.config.floatX),
                               name='b', borrow=True)
        self.z = T.dot(input, self.W) + self.b
        if(is_last==0):
            self.output = T.switch(self.z < 0, 0, self.z)
        else:
            self.output = T.nnet.sigmoid(self.z)
            self.y_pred = self.output > 0.5
        self.params = [self.W, self.b]

    def cost_function(self, y):
        return -T.mean(y*T.log(self.output) + (1-y)*T.log(1-self.output))

    def errors(self, y):
        return T.mean(T.neq(self.y_pred, y))
alfa = 1
epoch = 1000
neu = 5
inpx = np.array([[1,0],[1,1],[0,0],[0,1]])
inpy = np.array([1,0,0,1])
x = T.fmatrix('x')
y = T.ivector('y')
layer0 = HiddenLayer(
    input = x,
    nIn = 2,
    nOut = neu,
    is_last = 0
)
layer1 = HiddenLayer(
    input = layer0.output,
    nIn = neu,
    nOut = 1,
    is_last = 1
)
params = layer0.params + layer1.params
cost = layer1.cost_function(y)
grads = T.grad(cost, params)
updates = [(param_i, param_i - alfa * grad_i) for param_i, grad_i in zip(params, grads)]
eror = layer1.errors(y)
train_model = theano.function([x,y], [eror,cost],updates=updates,allow_input_downcast=True)
test_model = theano.function([x,y],[eror,layer1.y_pred],allow_input_downcast=True)
for i in xrange(epoch):
    etr, ctr = train_model(inpx, inpy)
    if i % (epoch/10) == 0:
        print etr, ctr

et, pt = test_model(inpx, inpy)
print pt
and the error:
ValueError: Input dimension mis-match. (input[0].shape[1] = 1, input[1].shape[1] = 4)
Apply node that caused the error: Elemwise{neq,no_inplace}(sigmoid.0, DimShuffle{x,0}.0)
Toposort index: 41
Inputs types: [TensorType(float32, matrix), TensorType(int32, row)]
Inputs shapes: [(4L, 1L), (1L, 4L)]
Inputs strides: [(4L, 4L), (16L, 4L)]
Inputs values: [array([[ 0.94264328],
[ 0.99725735],
[ 0.5 ],
[ 0.95675617]], dtype=float32), array([[1, 0, 0, 1]])]
Outputs clients: [[Shape(Elemwise{neq,no_inplace}.0), Sum{acc_dtype=int64}(Elemwise{neq,no_inplace}.0)]]
Thank you in advance for any help.
Your problem is with your y and inpy variables: y is meant to hold the expected output of the network. Your network is given a dataset with 4 examples, each having 2 features, so your input matrix has 4 rows and 2 columns. The predicted output must therefore also have 4 entries, i.e. 4 rows in your y or inpy matrix, but you are using a vector, which in Theano is broadcast as a row vector and thus has only one row. You need either to transpose your y vector when computing the cost, or to define your y variable as a matrix and make inpy a (4,1) matrix instead of a (4,) vector (once again, vectors behave as row vectors in Theano).
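For example, the second option could look like this (a minimal sketch, assuming the rest of the code above stays unchanged):
y = T.fmatrix('y')                  # instead of T.ivector('y')
inpy = np.array([[1],[0],[0],[1]])  # shape (4,1) instead of (4,)
With y and inpy shaped (4,1), they line up with the (4,1) sigmoid output of layer1, so the neq comparison and the cost no longer see a (1,4) row broadcast against a (4,1) column.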
Hope this helps,
Best
Related
I am replicating a PyTorch model in Keras and have problems seeing where the extra dimension comes from.
This is how my code looks so far:
class Attention(tf.keras.Model):
    def __init__(self, input_shape):
        super(Attention, self).__init__()
        in_features = input_shape[-1]
        small_in_features = max(math.floor(in_features/10), 1)
        self.d_k = small_in_features

        query = tf.keras.models.Sequential()
        query.add(tf.keras.layers.Dense(in_features))
        query.add(tf.keras.layers.Dense(small_in_features, activation="tanh"))
        self.query = query

        self.key = tf.keras.layers.Dense(small_in_features)

    def call(self, inp):
        # inp.shape should be (B,N,C)
        q = self.query(inp)  # (B,N,C/10)
        k = self.key(inp)    # (B,N,C/10)
        k = tf.transpose(k)
        print(q)
        print(k)
        x = tf.linalg.matmul(q, k) / math.sqrt(self.d_k)  # (B,N,N)
        x = tf.nn.softmax(x)  # over rows
        x = tf.transpose(x)
        x = tf.linalg.matmul(x, inp)  # (B, N, C)
        return x
But when I add it to my Sequential model, I get this error:
ValueError: Dimensions must be equal, but are 1 and 256 for '{{node attention_19/MatMul}} = BatchMatMulV2[T=DT_FLOAT, adj_x=false, adj_y=false](attention_19/sequential_36/Identity, attention_19/transpose)' with input shapes: [?,256,1], [1,256,?].
I have now printed my 'q' and 'k', and they print out as follows:
Tensor("attention_19/sequential_36/Identity:0", shape=(None, 256, 1), dtype=float32)
Tensor("attention_19/transpose:0", shape=(1, 256, None), dtype=float32)
So they are 3-dimensional, with one dimension unfilled. I don't quite understand why this happens.
How can I "remove" the extra dimension or get this custom layer to work?
Note: the original code seems to use 3-dimensional input, but I want 2-dimensional input.
I am writing a custom layer in TensorFlow 2.0 and I ran into a problem, as follows:
I want to transform a 1D weight array (5x1) into a 2D array (10x10). Suppose I have the indices for mapping from 1D to 2D in weight_index_lst:
weight_id, row, col
1,5,6
2,6,7
3,7,8
4,8,9
5,9,10
The other locations of the 2D array just get a value of 0. Here's my script for the custom layer. My input has shape (10x1). The w_mat receives 0 everywhere that self.w is not assigned.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class mylayer(layers.Layer):
    def __init__(self, weight_index_lst, **kwargs):
        super(mylayer, self).__init__(**kwargs)
        self.weight_index_lst = weight_index_lst

    def build(self):
        self.w = self.add_weight(shape = (5,1),
                                 initializer = 'he_normal',
                                 trainable = True)

    def call(self, inputs):
        ct = 0
        w_mat = tf.Variable(np.zeros((21, 21)), dtype='float32', trainable=False)
        for i in range(20):
            i1 = self.weight_index_lst[i,1]  # row index
            i2 = self.weight_index_lst[i,2]  # column index
            w_mat[i1,i2].assign(self.w[ct,0])  # problem with no gradient provided
            # or w_mat[i1,i2] = self.w[ct,0]  # resource variable cannot be assigned
            ct = ct + 1
        y = tf.matmul(w_mat, inputs)
        return y
I could have declared a (10x10) weight array, but my model requires the other weights to be 0 and not trainable.
If you specifically want to create a new layer with trainable weights, then the resolution to your problem (no gradients propagating through assign) is to express everything as symbolic tensor operations; TF can then propagate the gradients. One way to do so is to create a 1D tensor with the weights you want to train, prepend a non-trainable constant tensor holding 0.0, and then use tf.gather to select either the needed weight or the constant zero for each of the n**2 elements of the matrix you multiply the layer's input by. Since all operations are symbolic tensor operations, TF propagates gradients with no problems. Code for this approach below:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

class mylayer(layers.Layer):
    def __init__(self, n, weight_index_lst, **kwargs):
        super(mylayer, self).__init__(**kwargs)
        self.weight_index_lst = weight_index_lst
        self.n = n

    def build(self, input_shape):
        self.w = self.add_weight(shape = (len(self.weight_index_lst),),
                                 initializer = 'he_normal',
                                 trainable = True)

    def call(self, inputs):
        const_zero = tf.constant([0.], dtype=tf.float32)
        const_zero_and_weights = tf.concat([const_zero, self.w], axis=0)
        ct = 1  # start with 1 since 0 means take the non-trainable 0. from const_zero_and_weights
        selector = np.zeros((self.n ** 2), dtype=np.int32)  # indices
        for i, j in self.weight_index_lst:
            selector[i * self.n + j] = ct
            ct = ct + 1
        t_ind = tf.constant(selector, dtype=tf.int32)
        w_flattened = tf.gather(const_zero_and_weights, t_ind)
        w_matrix = tf.reshape(w_flattened, (self.n, self.n))
        y = tf.matmul(w_matrix, inputs)
        return y

m = tf.keras.Sequential([
    layers.Dense(21**2, input_shape=(45,)),
    layers.Reshape(target_shape=(21,21)),
    mylayer(21, [(4,5), (5,6), (6,7), (7,8), (8,9)]),
])
m.summary()
You don't need to create a trainable layer for this. Consider just using a non-trainable Lambda layer:
def select_as_needed(x, wrc, n):
    selector = np.zeros(n * n, dtype=np.int32)  # index of the input element we want to select in each cell (0 otherwise)
    mask = np.zeros(n * n, dtype=np.float32)    # 0./1. tensor with ones only at the positions where we put a selected element
    for w, r, c in wrc:
        selector[r * n + c] = w
        mask[r * n + c] = 1.0
    t_ind = tf.constant(selector, dtype=tf.int32)
    t_mask = tf.constant(mask, dtype=tf.float32)
    # without the mask, the 0-index value of the input would go to all positions
    # for which we didn't select anything
    return tf.gather(x, t_ind, axis=1) * t_mask

wrc = [(0,4,5), (1,5,6), (2,6,7), (3,7,8), (4,8,9)]  # same as your table, but 0-based
n = 10
model = tf.keras.models.Sequential([
    # ... your stuff
    tf.keras.layers.Dense(5, 'linear'),  # output of 5 neurons (or whatever else produces 5 outputs per sample)
    tf.keras.layers.Lambda(select_as_needed, arguments={'wrc': wrc, 'n': n}),
    tf.keras.layers.Reshape(target_shape=(n, n)),
])
I am trying to reproduce the results generated by the LSTMCell from TensorFlow to be sure that I know what it does.
Here is my TensorFlow code:
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
timesteps = 7
num_input = 4
X = tf.placeholder("float", [None, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.random.normal(size = (1, 7, num_input))
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
    print e
Here is its output:
[[-0.13285545 -0.13569424 -0.23993783]]
[[-0.04818152 0.05927373 0.2558436 ]]
[[-0.13818116 -0.13837864 -0.15348436]]
[[-0.232219 0.08512601 0.05254192]]
[[-0.20371495 -0.14795329 -0.2261929 ]]
[[-0.10371902 -0.0263292 -0.0914975 ]]
[[0.00286371 0.16377522 0.059478 ]]
And here is my own implementation:
n_steps, _ = X.shape
h = np.zeros(shape = self.hid_dim)
c = np.zeros(shape = self.hid_dim)
for i in range(n_steps):
    x = X[i,:]
    vec = np.concatenate([x, h])
    #vec = np.concatenate([h, x])
    gs = np.dot(vec, self.kernel) + self.bias
    g1 = gs[0*self.hid_dim : 1*self.hid_dim]
    g2 = gs[1*self.hid_dim : 2*self.hid_dim]
    g3 = gs[2*self.hid_dim : 3*self.hid_dim]
    g4 = gs[3*self.hid_dim : 4*self.hid_dim]
    I = vsigmoid(g1)
    N = np.tanh(g2)
    F = vsigmoid(g3)
    O = vsigmoid(g4)
    c = c*F + I*N
    h = O * np.tanh(c)
    print h
And here is its output:
[-0.13285543 -0.13569425 -0.23993781]
[-0.01461723 0.08060743 0.30876374]
[-0.13142865 -0.14921292 -0.16898363]
[-0.09892188 0.11739943 0.08772941]
[-0.15569218 -0.15165766 -0.21918869]
[-0.0480604 -0.00918626 -0.06084118]
[0.0963612 0.1876516 0.11888081]
As you might notice I was able to reproduce the first hidden vector, but the second one and all the following ones are different. What am I missing?
I examined this link, and your code is almost perfect, but you forgot to add the forget_bias value (default 1.0) in the line F = vsigmoid(g3). It should actually be F = vsigmoid(g3 + self.forget_bias), or in your case, since the bias is 1, F = vsigmoid(g3 + 1).
Here is my implementation with numpy:
import numpy as np
import tensorflow as tf
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
batch=1
timesteps = 7
num_input = 4
X = tf.placeholder("float", [batch, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.reshape(range(28),[batch, timesteps, num_input])
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
    print(e)

print("\nmy imp\n")
#my impl
def sigmoid(x):
    return 1/(1+np.exp(-x))

kernel, bias = sess.run([lstm._kernel, lstm._bias])
f_b_ = lstm._forget_bias
c, h = np.zeros([batch, num_input-1]), np.zeros([batch, num_input-1])
for step in range(timesteps):
    inpt = np.split(x_val, 7, 1)[step][0]
    lstm_mtrx = np.matmul(np.concatenate([inpt, h], 1), kernel) + bias
    i, j, f, o = np.split(lstm_mtrx, 4, 1)
    c = sigmoid(f + f_b_)*c + sigmoid(i)*np.tanh(j)
    h = sigmoid(o)*np.tanh(c)
    print(h)
output:
[[ 0.06964055 -0.06541953 -0.00682676]]
[[ 0.005264 -0.03234607 0.00014838]]
[[ 1.617855e-04 -1.316892e-02 8.596722e-06]]
[[ 3.9425286e-06 -5.1347450e-03 7.5078127e-08]]
[[ 8.7508155e-08 -1.9560163e-03 6.3853928e-10]]
[[ 1.8867894e-09 -7.3784427e-04 5.8551406e-12]]
[[ 4.0385355e-11 -2.7728223e-04 5.3957669e-14]]
my imp
[[ 0.06964057 -0.06541953 -0.00682676]]
[[ 0.005264 -0.03234607 0.00014838]]
[[ 1.61785520e-04 -1.31689185e-02 8.59672610e-06]]
[[ 3.94252745e-06 -5.13474567e-03 7.50781122e-08]]
[[ 8.75080644e-08 -1.95601574e-03 6.38539112e-10]]
[[ 1.88678843e-09 -7.37844070e-04 5.85513438e-12]]
[[ 4.03853841e-11 -2.77282006e-04 5.39576024e-14]]
Tensorflow uses the glorot_uniform() function to initialize the LSTM kernel, which samples weights from a random uniform distribution. We need to fix a value for the kernel to get reproducible results:
import tensorflow as tf
import numpy as np
np.random.seed(0)
timesteps = 7
num_input = 4
x_val = np.random.normal(size = (1, timesteps, num_input))
num_units = 3
def glorot_uniform(shape):
    limit = np.sqrt(6.0 / (shape[0] + shape[1]))
    return np.random.uniform(low=-limit, high=limit, size=shape)
kernel_init = glorot_uniform((num_input + num_units, 4 * num_units))
My implementation of the LSTMCell (well, it is actually just slightly rewritten TensorFlow code):
def sigmoid(x):
    return 1. / (1 + np.exp(-x))

class LSTMCell():
    """Long short-term memory unit (LSTM) recurrent network cell.
    """
    def __init__(self, num_units, initializer=glorot_uniform,
                 forget_bias=1.0, activation=np.tanh):
        """Initialize the parameters for an LSTM cell.
        Args:
            num_units: int, The number of units in the LSTM cell.
            initializer: The initializer to use for the kernel matrix. Default: glorot_uniform
            forget_bias: Biases of the forget gate are initialized by default to 1
                in order to reduce the scale of forgetting at the beginning of
                the training.
            activation: Activation function of the inner states. Default: np.tanh.
        """
        # Inputs must be 2-dimensional.
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._activation = activation
        self._initializer = initializer

    def build(self, inputs_shape):
        input_depth = inputs_shape[-1]
        h_depth = self._num_units
        self._kernel = self._initializer(shape=(input_depth + h_depth, 4 * self._num_units))
        self._bias = np.zeros(shape=(4 * self._num_units))

    def call(self, inputs, state):
        """Run one step of LSTM.
        Args:
            inputs: input numpy array, must be 2-D, `[batch, input_size]`.
            state: a tuple of numpy arrays, both `2-D`, with column sizes `c_state` and
                `m_state`.
        Returns:
            A tuple containing:
            - A `2-D, [batch, output_dim]`, numpy array representing the output of the
              LSTM after reading `inputs` when previous state was `state`.
              Here output_dim is equal to num_units.
            - Numpy array(s) representing the new state of LSTM after reading `inputs` when
              the previous state was `state`. Same type and shape(s) as `state`.
        """
        num_proj = self._num_units
        (c_prev, m_prev) = state
        input_size = inputs.shape[-1]
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        lstm_matrix = np.hstack([inputs, m_prev]).dot(self._kernel)
        lstm_matrix += self._bias
        i, j, f, o = np.split(lstm_matrix, indices_or_sections=4, axis=0)
        # Diagonal connections
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
             self._activation(j))
        m = sigmoid(o) * self._activation(c)
        new_state = (c, m)
        return m, new_state

X = x_val.reshape(x_val.shape[1:])
cell = LSTMCell(num_units, initializer=lambda shape: kernel_init)
cell.build(X.shape)
state = (np.zeros(num_units), np.zeros(num_units))
for i in range(timesteps):
    x = X[i,:]
    output, state = cell.call(x, state)
    print(output)
Produces output:
[-0.21386017 -0.08401277 -0.25431477]
[-0.22243588 -0.25817422 -0.1612211 ]
[-0.2282134 -0.14207162 -0.35017249]
[-0.23286737 -0.17129192 -0.2706512 ]
[-0.11768674 -0.20717363 -0.13339118]
[-0.0599215 -0.17756104 -0.2028935 ]
[ 0.11437953 -0.19484555 0.05371994]
While your Tensorflow code, if you replace the second line with
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units, initializer = tf.constant_initializer(kernel_init))
returns:
[[-0.2138602 -0.08401276 -0.25431478]]
[[-0.22243595 -0.25817424 -0.16122109]]
[[-0.22821338 -0.1420716 -0.35017252]]
[[-0.23286738 -0.1712919 -0.27065122]]
[[-0.1176867 -0.2071736 -0.13339119]]
[[-0.05992149 -0.177561 -0.2028935 ]]
[[ 0.11437953 -0.19484554 0.05371996]]
Here is a blog which will answer any conceptual questions related to LSTMs. It seems that there is a lot that goes into building an LSTM from scratch!
Of course, this answer doesn't solve your question; it just gives a direction.
In terms of linear algebra, there may be a dimension mismatch in the multiplication involving I*N, affecting the output, since an n x m matrix dotted with an m x p matrix gives an n x p output.
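As a quick numpy illustration of that shape rule (a hedged example of my own, not taken from the code above):
import numpy as np
A = np.random.randn(4, 3)   # n x m
B = np.random.randn(3, 2)   # m x p
print(np.dot(A, B).shape)   # (4, 2), i.e. n x p
# element-wise products such as I*N instead require matching (broadcastable) shapes,
# so a mismatch there surfaces as a broadcasting error rather than a silently reshaped output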
I'm getting a runtime error:
RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
and can't figure out how to fix it.
The error appears to refer to the line:
i_enc = F.normalize(input =i_batch, p=2, dim=1, eps=1e-12) # (batch, K, feat_dim)
I'm trying to encode image features (batch x 36 x 2048) by applying an L2 norm. Below is the full code for the section.
def forward(self, q_batch, i_batch):
    # batch size = 512
    # q -> 512(batch)x14(length)
    # i -> 512(batch)x36(K)x2048(f_dim)

    # one-hot -> glove
    emb = self.embed(q_batch)
    output, hn = self.gru(emb.permute(1, 0, 2))
    q_enc = hn.view(-1, self.h_dim)

    # image encoding with l2 norm
    i_enc = F.normalize(input=i_batch, p=2, dim=1, eps=1e-12)  # (batch, K, feat_dim)

    q_enc_copy = q_enc.repeat(1, self.K).view(-1, self.K, self.h_dim)

    q_i_concat = torch.cat((i_enc, q_enc_copy), -1)
    q_i_concat = self.non_linear(q_i_concat, self.td_W, self.td_W2)  # 512 x 36 x 512
    i_attention = self.att_w(q_i_concat)  # 512x36x1
    i_attention = F.softmax(i_attention.squeeze(), 1)
    # weighted sum
    i_enc = torch.bmm(i_attention.unsqueeze(1), i_enc).squeeze()  # (batch, feat_dim)

    # element-wise multiplication
    q = self.non_linear(q_enc, self.q_W, self.q_W2)
    i = self.non_linear(i_enc, self.i_W, self.i_W2)
    h = torch.mul(q, i)  # (batch, hid_dim)

    # output classifier
    # BCE with logits loss
    score = self.c_Wo(self.non_linear(h, self.c_W, self.c_W2))
    return score
I would appreciate any help.
Thanks
I would suggest checking the shape of i_batch (e.g. print(i_batch.shape)), as I suspect i_batch has only 1 dimension (e.g. shape [N]).
This would explain why PyTorch complains that you can normalize only over dimension #0, while you are asking for the operation to be done over dimension #1 (cf. dim=1).
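A quick way to see this (a hedged sketch with made-up shapes, not the asker's actual data):
import torch
import torch.nn.functional as F

i_batch = torch.randn(512, 36, 2048)              # (batch, K, feat_dim): dim=1 exists, so this works
out = F.normalize(i_batch, p=2, dim=1, eps=1e-12)
print(out.shape)                                   # torch.Size([512, 36, 2048])

flat = torch.randn(2048)                           # only dimension 0 exists
try:
    F.normalize(flat, p=2, dim=1, eps=1e-12)
except (RuntimeError, IndexError) as e:
    print(e)  # Dimension out of range (expected to be in range of [-1, 0], but got 1)
So if the error appears, the tensor reaching F.normalize likely has fewer dimensions than dim=1 expects.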
I am trying to model a CNN for a 1-D signal, but I am unable to understand the rank error.
My program goes like this:
#Weights
def init_weights(shape):
    init_random_dist = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(init_random_dist)

#Bias
def init_bias(shape):
    init_bias = tf.constant(0.1, shape=shape)
    return tf.Variable(init_bias)

def conv1d(x, W):
    #x is the input acceleration data and W is the corresponding weight
    x = tf.cast(x, tf.float32)
    tf.nn.conv1d(x, W, stride=1, padding='VALID')

def convolution_layer(input_x, shape):
    w = init_weights(shape)
    b = init_bias([shape[3]])
    return tf.nn.relu(conv1d(input_x, w) + b)
Now placeholders
x = tf.placeholder(tf.float32,shape=[1,1,200,1])
y_true = tf.placeholder(tf.float32,shape=[None,6])
While creating the 1st layer using con_layer_1 = convolution_layer(x, shape=[1,20,1,32]) I get a rank ValueError which I'm unable to debug. The error statement is:
ValueError: Shape must be rank 4 but is rank 5 for 'conv1d_20/Conv2D' (op: 'Conv2D') with input shapes: [1,1,1,200,1], [1,1,20,1,32].
The input and weight shapes for nn.conv1d are not right. The input to nn.conv1d should be of size [batch_size, input_length, input_channels] and the weights matrix should be of size [filter_size, input_channels, output_channels]. So you need to change your code to:
def convolution_layer(input_x, shape):
    w = init_weights(shape)
    b = init_bias([shape[2]])
    return tf.nn.relu(conv1d(input_x, w) + b)

x = tf.placeholder(tf.float32, shape=[1,200,1])
y_true = tf.placeholder(tf.float32, shape=[None,6])

con_layer_1 = convolution_layer(x, shape=[20,1,32])
Note: You should try to use the tf.layers API, which takes care of the weight creation and so on.
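For example, a minimal sketch of that tf.layers alternative (assuming TF 1.x and the same 200-sample, 1-channel input as above):
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 200, 1])  # (batch, input_length, input_channels)
# tf.layers.conv1d creates and initializes the kernel and bias variables for you
con_layer_1 = tf.layers.conv1d(inputs=x, filters=32, kernel_size=20,
                               strides=1, padding='valid', activation=tf.nn.relu)
# con_layer_1 has shape (batch, 181, 32) with 'valid' padding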