Trying to get similar results on same dataset with Keras and PyTorch.
Data
from numpy import array
from numpy import hstack
from sklearn.model_selection import train_test_split
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
X, y = list(), list()
for i in range(len(sequences)):
# find the end of this pattern
end_ix = i + n_steps
# check if we are beyond the dataset
if end_ix > len(sequences):
break
# gather input and output parts of the pattern
seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
X.append(seq_x)
y.append(seq_y)
return array(X), array(y)
def get_data():
# define input sequence
in_seq1 = array([x for x in range(0,500,10)])/1
in_seq2 = array([x for x in range(5,505,10)])/1
out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])
# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns
dataset = hstack((in_seq1, in_seq2, out_seq))
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# convert into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X_train,x_test,Y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle=False)
return X_train,x_test,Y_train, y_test
Keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!!change this!!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# define model
model = Sequential()
model.add(LSTM(1024, activation='relu',
input_shape=(n_timesteps, n_features),
kernel_initializer='uniform',
recurrent_initializer='uniform'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1))
opt = keras.optimizers.Adam(lr=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=keras.optimizers.K.epsilon(),
decay=0.0,
amsgrad=False)
model.compile(optimizer=opt, loss='mse')
# fit model
model.fit(X_train, Y_train, epochs=200, verbose=1,validation_data=(x_test,y_test))
yhat = model.predict(x_test, verbose=0)
mean_squared_error(y_test, yhat)
PyTorch - module class
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!! change this !!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
class MV_LSTM(torch.nn.Module):
def __init__(self,n_features,seq_length):
super(MV_LSTM, self).__init__()
self.n_features = n_features # number of parallel inputs
self.seq_len = seq_length # number of timesteps
self.n_hidden = 1024 # number of hidden states
self.n_layers = 1 # number of LSTM layers (stacked)
self.l_lstm = torch.nn.LSTM(input_size = n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
# self.l_linear1 = torch.nn.Linear(512, 512)
self.l_linear2 = torch.nn.Linear(512, 1)
def init_hidden(self, batch_size):
# even with batch_first = True this remains same as docs
hidden_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
cell_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
# lstm_out(with batch_first = True) is
# (batch_size,seq_len,num_directions * hidden_size)
# for following linear layer we want to keep batch_size dimension and merge rest
# .contiguous() -> solves tensor compatibility error
x = lstm_out.contiguous().view(batch_size,-1)
x = F.relu(x)
x = F.relu(self.l_linear(x))
# x = F.relu(self.l_linear1(x))
x = self.l_linear2(x)
return x
PyTorch - init and train
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss()
import keras # for epsilon constant
optimizer = torch.optim.Adam(mv_net.parameters(),
lr=1e-3,
betas=[0.9,0.999],
eps=keras.optimizers.K.epsilon(),
weight_decay=0,
amsgrad=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mv_net.to(device)
train_episodes = 200
batch_size = 32
eval_batch_size = 32
for t in range(train_episodes):
# TRAIN
mv_net.train()
for b in range(0,len(X_train),batch_size):
inpt = X_train[b:b+batch_size,:,:]
target = Y_train[b:b+batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
loss = criterion(output.view(-1), y_batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
# EVAL
mv_net.eval()
mv_net.init_hidden(eval_batch_size)
acc = 0
for b in range(0,len(x_test),eval_batch_size):
inpt = x_test[b:b+eval_batch_size,:,:]
target = y_test[b:b+eval_batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
acc += mean_squared_error(y_batch.cpu().detach().numpy(), output.view(-1).cpu().detach().numpy())
print('step:' , t , 'train loss:' , round(loss.item(),3),'eval acc:',round(acc/len(x_test),3))
mv_net.init_hidden(len(x_test))
val = torch.tensor(x_test,dtype=torch.float32).to(device)
otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().detach().numpy()))
Results
Keras produces test MSE almost 0, but PyTorch about 6000, which is way too different
I have tried couple tweaks in PyTorch code, but none got me anywhere close to similar keras, even with identical optim params
I cant see what is wrong with (kinda tutorialic) PyTorch code
I know it is almost one year too late. But I came across the same problem and I think the problem is the following. From the keras documentation it says:
return_sequences: Boolean. Whether to return the last output in the
output sequence, or the full sequence.
this basically means that the input shape of your self.l_linear needs to be torch.nn.Linear(1024, 512) instead of self.n_hidden*self.seq_len, 512.
Now you also need to do the same as keras does and only use the last output in your forward pass:
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
x = lstm_out[:,-1]
x = torch.nn.functional.relu(x)
x = torch.nn.functional.relu(self.l_linear(x))
x = self.l_linear2(x)
return x
when I run your example (which I needed to tweak a bit to get it run) I get very similar training losses.
Keras:
38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259
PyTorch:
step: 199 train loss: 41.043 eval acc: 1142.688
I hope this helps others having a similar problem.
PS also note that keras is resetting the hidden state (stateful=False) by default.
Related
I got an assignment and stuck with it while going down the rabbit hole of learning PyTorch, LSTM and cnn.
Provided the well known MNIST library I take combinations of 4 numbers and per combination it falls down into one of 7 labels.
eg:
1111 label 1 (follow a constant trend)
1234 label 2 increasing trend
4321 label 3 decreasing trend
...
7382 label 7 decreasing trend - increasing trend - decreasing trend
The shape of my tensor after loading of the tensor become (3,4,28,28) where the 28 comes from the MNIST image's width and height. 3 is the batch size and 4 is the channels (4 images).
I'm somewhat stuck with how to pass this into a PyTorch backed LSTM and CNN as basically all Google searches lead to articles where simply one image is passed in.
I was thinking of reshaping it to 1 long array of (pixel values) where I put all of the values of the first image row by row (28) after each other, then appended by the same approach for the second, third and fourth image. So that would make 4 * 28 * 28 = 3136.
Is my way of thinking on how to tackle this a correct one or should I rethink? I'm rather new to this all and looking for some guidance on how to go forward. I've been reading loads of articles, YT videos, ... but all seem to touch the basic stuff or alternatives of the same subject.
I have written some code but running it gives errors.
import numpy as np
import torch
import torch.nn as nn
from torch import optim, softmax
from sklearn.model_selection import train_test_split
#dataset = sequences of 4 MNIST images each
#datalabels =7
#Data
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.data_label, test_size=0.15,
random_state=42)
#model
class Mylstm(nn.Module):
def __init__(self, input_size, hidden_size, n_layers, n_classes):
super(Mylstm, self).__init__()
self.input_size = input_size
self.n_layers = n_layers
self.hidden_size = hidden_size
self.lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True)
# readout layer
self.fc = nn.Linear(hidden_size, n_classes)
def forward(self, x):
# Initialize hidden state with zeros
h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_size).requires_grad_()
# initialize the cell state:
c0 = torch.zeros(self.n_layers, x.size(0), self.hidden_size).requires_grad_()
out, (h_n, h_c) = self.lstm(x, (h0.detach(), c0.detach()))
x = h_n[-1, :, 1]
x = self.fc(x)
x = softmax(x, dim=1)
return x
#Hyperparameters
input_size = 28
hidden_size = 256
sequence_length = 28
n_layers = 2
n_classes = 7
learning_rate = 0.001
model = Mylstm(input_size, hidden_size, n_layers, n_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
#training
bs = 0
num_epochs = 5
batch_size=3
if np.mod(x_train.shape[0], batch_size) == 0.0:
iter = int(x_train.shape[0] / batch_size)
else:
iter = int(x_train.shape[0] / batch_size) + 1
bs = 0
for i in range(iter):
sequences = x_test[bs:bs + batch_size, :]
labels = y_test[bs:bs + batch_size]
test_images = dataset.load_images(sequences)
bs += batch_size
for epoch in range(num_epochs):
for i in range(iter):
sequences = x_train[bs:bs + batch_size, :]
labels = y_train[bs:bs + batch_size]
input_images = dataset.load_images(sequences)
bs += batch_size
images=(torch.from_numpy(input_images)).view(batch_size,4,-1)
labels=torch.from_numpy(labels)
optimizer.zero_grad()
output = model(images)
# calculate Loss
loss = criterion(output, labels)
loss.backward()
optimizer.step()
The error I'm currently getting is:
RuntimeError: input.size(-1) must be equal to input_size. Expected 28, got 784
Change your input size from 28 to 784. (784=28*28).
Input size argument is the number of features in one element of the sequence, so the number of feature of an mnist image, so the number of pixels which is width*hight of the image.
I am trying to solve the very simple non-linear problem. It is XOR gate.
I my school knowledge. XOR can be solve by using 2 input nodes, 2 hidden layer nodes. And 1 output. It is binary classification problem.
I generate the 1000 of random integer number it is 0 or 1 and then do backpropagation. But for some unknown reason my network has not learned anything. The training accuracy is constant at 50.
# coding: utf-8
import matplotlib
import torch
import torch.nn as nn
from torch.autograd import Variable
matplotlib.use('TkAgg') # My buggy OSX 10.13.6 requires this
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from tqdm import tqdm
import random
N = 1000
batch_size = 10
epochs = 40
hidden_size = 2
output_size = 1
lr = 0.1
def return_xor(N):
tmp_x = []
tmp_y = []
for i in range(N):
a = (random.randint(0, 1) == 1)
b = (random.randint(0, 1) == 1)
if (a and not b) or (not a and b):
q = True
else:
q = False
input_features = (a, b)
output_class = q
tmp_x.append(input_features)
tmp_y.append(output_class)
return tmp_x, tmp_y
# In[495]:
# Training set
x, y = return_xor(N)
x = torch.tensor(x, dtype=torch.float, requires_grad=True)
y = torch.tensor(y, dtype=torch.float, requires_grad=True)
# Test dataset
x_test, y_test = return_xor(100)
x_test = torch.tensor(x_test)
y_test = torch.tensor(y_test)
class MyDataset(Dataset):
"""Define my own `Dataset` in order to use `Variable` with `autograd`"""
def __init__(self, x, y):
self.x = x
self.y = y
def __getitem__(self, index):
return self.x[index], self.y[index]
def __len__(self):
return len(self.x)
dataset = MyDataset(x, y)
test_dataset = MyDataset(x_test, y_test)
print(dataset.x.shape)
print(dataset.y.shape)
# Make data iterable by loading to a loader. Shuffle, batch_size kwargs put them here in order to remind I myself
train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
print(f"They are {len(train_loader)} batches in the dataset")
shown = 0
for (x, y) in train_loader:
if shown == 1:
break
print(f"{x.shape} {x.dtype}")
print(f"{y.shape} {y.dtype}")
shown += 1
class MyModel(nn.Module):
"""
Binary classification
2 input nodes
2 hidden nodes
1 output node
"""
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.fc1 = torch.nn.Linear(input_size, hidden_size)
self.fc2 = torch.nn.Linear(hidden_size, output_size)
self.sigmoid = torch.nn.Sigmoid()
def forward(self, out):
out = self.fc1(out)
out = self.fc2(out)
out = self.sigmoid(out)
return out
# Create my network
net = MyModel(dataset.x.shape[1], hidden_size, output_size)
CUDA = torch.cuda.is_available()
if CUDA:
net = net.cuda()
criterion = torch.nn.BCELoss(reduction='elementwise_mean')
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
# Train the network
correct_train = 0
total_train = 0
for epoch in range(epochs):
for i, (batches, labels) in enumerate(train_loader):
batcesh = Variable(batches.float())
labels = Variable(labels.float())
output = net(batches) # Forward pass
optimizer.zero_grad()
loss = criterion(output, labels.view(10, 1))
loss.backward()
optimizer.step()
total_train += labels.size(0)
correct_train += (predicted == labels.long()).sum()
if (i + 1) % 10 == 0:
print(f"""
Epoch {epoch+1}/{epochs},
Iteration {i+1}/{len(dataset)//batch_size},
Training Loss: {loss.item()},
Training Accuracy: {100*correct_train/total_train}
""")
Solution:
I did initialized weight, Adaptive learning rate
https://github.com/elcolie/nnbootcamp/blob/master/Study-XOR.ipynb
I am not sure what results you are getting, as the code you have posted in the question doesn't work (It gives errors with pytorch 0.4.1 like predicted not defined etc). But syntax issues apart, there are other problems.
Your model is not actually two layer as it does not use non-linearity after the first output. Effectively this is one layer network and to fix that you can modify your model's forward as follows:
def forward(self, out):
out = torch.nn.functional.relu(self.fc1(out))
out = self.fc2(out)
out = self.sigmoid(out)
return out
You can try sigmoid or tanh non-linearity as well... but the non-linearity is a must. This should fix the problem.
I also see that you are using only 2 hidden units. This might be restrictive and you might want to increase that to something like 5 or 10.
I am trying to make hourly predictions using a recurrent neural network using TensorFlow and Keras in Python.I have assigned my inputs of the neural network to be (None, None, 5) shown in my .
However, I am getting the errorː
ValueError: Error when checking input: expected gru_3_input to have shape (None, None, 10) but got array with shape (1, 4, 1) My MVCE code isː
%matplotlib inline
#!pip uninstall keras
#!pip install keras==2.1.2
import tensorflow as tf
import pandas as pd
from pandas import DataFrame
import math
#####Create the Recurrent Neural Network###
model = Sequential()
model.add(GRU(units=5,
return_sequences=True,
input_shape=(None, num_x_signals)))
## This line is going to map the above 512 values to just 1 (num_y_signal)
model.add(Dense(num_y_signals, activation='sigmoid'))
if False:
from tensorflow.python.keras.initializers import RandomUniform
# Maybe use lower init-ranges.##### I may have to change these during debugging####
init = RandomUniform(minval=-0.05, maxval=0.05)
model.add(Dense(num_y_signals,
activation='linear',
kernel_initializer=init))
warmup_steps = 5
def loss_mse_warmup(y_true, y_pred):
#
# Ignore the "warmup" parts of the sequences
# by taking slices of the tensors.
y_true_slice = y_true[:, warmup_steps:, :]
y_pred_slice = y_pred[:, warmup_steps:, :]
# These sliced tensors both have this shape:
# [batch_size, sequence_length - warmup_steps, num_y_signals]
# Calculate the MSE loss for each value in these tensors.
# This outputs a 3-rank tensor of the same shape.
loss = tf.losses.mean_squared_error(labels=y_true_slice,
predictions=y_pred_slice)
loss_mean = tf.reduce_mean(loss)
return loss_mean
optimizer = RMSprop(lr=1e-3) ### This is somthing related to debugging
model.compile(loss=loss_mse_warmup, optimizer=optimizer)#### I may have to make the output a singnal rather than the whole data set
print(model.summary())
model.fit_generator(generator=generator,
epochs=20,
steps_per_epoch=100,
validation_data=validation_data)
I am not sure why this could be, but i believe it could something to do with reshaping my training and testing data. ɪ have also attached my full error message to my code to make the problem reproducible.
I'm unsure about the correctness but here it is:
%matplotlib inline
#!pip uninstall keras
#!pip install keras==2.1.2
import tensorflow as tf
import pandas as pd
from pandas import DataFrame
import math
import numpy
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
import datetime
from keras.layers import Input, Dense, GRU, Embedding
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
datetime = [datetime.datetime(2012, 1, 1, 1, 0, 0) + datetime.timedelta(hours=i) for i in range(10)]
X=np.array([2.25226244,1.44078451,0.99174488,0.71179491,0.92824542,1.67776948,2.96399534,5.06257161,7.06504245,7.77817664
,0.92824542,1.67776948,2.96399534,5.06257161,7.06504245,7.77817664])
y= np.array([0.02062136,0.00186715,0.01517354,0.0129046 ,0.02231125,0.01492537,0.09646542,0.28444476,0.46289928,0.77817664
,0.02231125,0.01492537,0.09646542,0.28444476,0.46289928,0.77817664])
X = X[1:11]
y= y[1:11]
df = pd.DataFrame({'date':datetime,'y':y,'X':X})
df['t']= [x for x in range(10)]
df['X-1'] = df['X'].shift(-1)
x_data = df['X-1'].fillna(0)
y_data = y
num_data = len(x_data)
#### training and testing split####
train_split = 0.6
num_train = int(train_split*num_data)
num_test = num_data-num_train## number of observations in test set
#input train test
x_train = x_data[0:num_train].reshape(-1, 1)
x_test = x_data[num_train:].reshape(-1, 1)
#print (len(x_train) +len( x_test))
#output train test
y_train = y_data[0:num_train].reshape(-1, 1)
y_test = y_data[num_train:].reshape(-1, 1)
#print (len(y_train) + len(y_test))
### number of input signals
num_x_signals = x_data.shape[0]
# print (num_x_signals)
## number of output signals##
num_y_signals = y_data.shape[0]
#print (num_y_signals)
####data scalling'###
x_scaler = MinMaxScaler(feature_range=(0,1))
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(x_test)
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(y_test)
def batch_generator(batch_size, sequence_length):
"""
Generator function for creating random batches of training-data.
"""
# Infinite loop. providing the neural network with random data from the
# datase for x and y
while True:
# Allocate a new array for the batch of input-signals.
x_shape = (batch_size, sequence_length, num_x_signals)
x_batch = np.zeros(shape=x_shape, dtype=np.float16)
# Allocate a new array for the batch of output-signals.
y_shape = (batch_size, sequence_length, num_y_signals)
y_batch = np.zeros(shape=y_shape, dtype=np.float16)
# Fill the batch with random sequences of data.
for i in range(batch_size):
# Get a random start-index.
# This points somewhere into the training-data.
idx = np.random.randint(num_train - sequence_length)
# Copy the sequences of data starting at this index.
x_batch[i] = x_train_scaled[idx:idx+sequence_length]
y_batch[i] = y_train_scaled[idx:idx+sequence_length]
yield (x_batch, y_batch)
batch_size =20
sequence_length = 2
generator = batch_generator(batch_size=batch_size,
sequence_length=sequence_length)
x_batch, y_batch = next(generator)
#########Validation Set Start########
def batch_generator(batch_size, sequence_length):
"""
Generator function for creating random batches of training-data.
"""
# Infinite loop. providing the neural network with random data from the
# datase for x and y
while True:
# Allocate a new array for the batch of input-signals.
x_shape = (batch_size, sequence_length, num_x_signals)
x_batch = np.zeros(shape=x_shape, dtype=np.float16)
# Allocate a new array for the batch of output-signals.
y_shape = (batch_size, sequence_length, num_y_signals)
y_batch = np.zeros(shape=y_shape, dtype=np.float16)
# Fill the batch with random sequences of data.
for i in range(batch_size):
# Get a random start-index.
# This points somewhere into the training-data.
idx = np.random.randint(num_train - sequence_length)
# Copy the sequences of data starting at this index.
x_batch[i] = x_test_scaled[idx:idx+sequence_length]
y_batch[i] = y_test_scaled[idx:idx+sequence_length]
yield (x_batch, y_batch)
validation_data= next(batch_generator(batch_size,sequence_length))
# validation_data = (np.expand_dims(x_test_scaled, axis=0),
# np.expand_dims(y_test_scaled, axis=0))
#Validation set end
#####Create the Recurrent Neural Network###
model = Sequential()
model.add(GRU(units=5,
return_sequences=True,
input_shape=(None, num_x_signals)))
## This line is going to map the above 512 values to just 1 (num_y_signal)
model.add(Dense(num_y_signals, activation='sigmoid'))
if False:
from tensorflow.python.keras.initializers import RandomUniform
# Maybe use lower init-ranges.##### I may have to change these during debugging####
init = RandomUniform(minval=-0.05, maxval=0.05)
model.add(Dense(num_y_signals,
activation='linear',
kernel_initializer=init))
warmup_steps = 5
def loss_mse_warmup(y_true, y_pred):
#
# Ignore the "warmup" parts of the sequences
# by taking slices of the tensors.
y_true_slice = y_true[:, warmup_steps:, :]
y_pred_slice = y_pred[:, warmup_steps:, :]
# These sliced tensors both have this shape:
# [batch_size, sequence_length - warmup_steps, num_y_signals]
# Calculate the MSE loss for each value in these tensors.
# This outputs a 3-rank tensor of the same shape.
loss = tf.losses.mean_squared_error(labels=y_true_slice,
predictions=y_pred_slice)
loss_mean = tf.reduce_mean(loss)
return loss_mean
optimizer = RMSprop(lr=1e-3) ### This is somthing related to debugging
model.compile(loss=loss_mse_warmup, optimizer=optimizer)#### I may have to make the output a singnal rather than the whole data set
print(model.summary())
model.fit_generator(generator=generator,
epochs=20,
steps_per_epoch=100,
validation_data=validation_data)
I've only changed part of code between validation set start and validation set end.
I am doing a sequence to label learning model in PyTorch. I have two sentences and I am classifying whether they are entailed or not (SNLI dataset). I concatenate two 50 word sentences together (sometimes padded) into a vector of length 100. I then send in minibatches into word embeddings -> LSTM -> Linear layer. I am doing cross entropy loss but I need a vector of [mini_batch, C] to go into the CrossEntropyLoss function. Instead I still have the 100 words in my vector as [mini_batch, 100, C]
Here is my model:
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
super(myLSTM, self).__init__()
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(mlp_d, 1024)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
#self.sm = nn.CrossEntropyLoss()
def display(self):
for param in self.parameters():
print(param.data.size())
def filter_params(self):
# Might not be compatible with python 3
#self.parameters = filter(lambda p: p.requires_grad, self.parameters())
pass
def init_hidden(self):
# Need to init hidden weights in LSTM
pass
def forward(self, sentence):
print(sentence.size())
embeds = self.embedding(sentence)
print(embeds.size())
out, _ = self.lstm(embeds)
print(out.size())
out = self.mlp(out)
return out
My training sequences with output:
batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
criterion = nn.CrossEntropyLoss()
num_epochs = 50
from torch.autograd import Variable
from torch import optim
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
res = np.concatenate((s1,s2), axis=1) # Attach two sentences into 1 input vector
input_tensor = torch.from_numpy(res).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
data, target = Variable(input_tensor), Variable(target_tensor)
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(data)
print("Output size: ")
print(output.size())
print("Target size: ")
print(target.size())
# Calculate loss with respect to training labels
loss = criterion(output, target)
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
#ADAM_optimizer.step()
output:
Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size:
torch.Size([3, 100, 1024])
Target size:
torch.Size([3])
error:
ValueError: Expected 2 or 4 dimensions (got 3)
EDITED -------------------------------------------------------------------
I have now got my model training but I am getting low accuracy. Is there an issue with my LSTM outputs being concatenated and then condensed to a smaller tensor to go through my linear layer?
New Model:
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
super(myLSTM, self).__init__()
self.num_layers = lstm_layers
self.hidden_size = h_size
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(2 * h_size * 2, num_classes)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
def forward(self, s1, s2):
# Set initial states
#h0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda() # 2 for bidirection
#c0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda()
batch_size = s1.size()[0]
embeds_1 = self.embedding(s1)
embeds_2 = self.embedding(s2)
_, (h_1_last, _) = self.lstm(embeds_1)#, (h0, c0)) #note the change here. Last hidden state is taken
_, (h_2_last, _) = self.lstm(embeds_2)#, (h0, c0))
concat = torch.cat( (h_1_last, h_2_last), dim=2) #double check the dimension
concat = concat.view(batch_size, -1)
scores = self.mlp(concat)
return scores
New Training
batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
num_epochs = 10
model.train()
if cuda:
model = model.cuda()
criterion = criterion.cuda()
from torch.autograd import Variable
from torch import optim
epoch_losses = []
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))
# Batch loss aggregator
losses = []
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor)
s2 = Variable(s2_tensor)
target = Variable(target_tensor)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(s1,s2)
# Calculate loss with respect to training labels
loss = criterion(output, target)
losses.append(loss.data[0])
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
# concat losses to epoch losses
epoch_losses += losses
training with tensor sizes printed:
Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propogation
torch.Size([64, 3])
Evaluation
def eval_model(model, mode='dev'):
file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'
dev_data, _ = obtain_data(file_name)
dev_n_data = vocab.process_data(dev_data)
print("Length of data: {}".format(len(dev_n_data)))
eval_batch_size = 1024
model.eval()
total = len(dev_n_data)
hit = 0
correct = 0
# Batch dev eval
for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):
s1, s2, y = convert_to_numpy(dev_n_data[start:end])
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor, volatile=True)
s2 = Variable(s2_tensor, volatile=True)
target = Variable(target_tensor, volatile=True)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
output = model.forward(s1,s2)
loss = criterion(output, target)
#print("output size: {}".format(output.size()))
#print("target size: {}".format(target.size()))
pred = output.data.max(1)[1] # get the index of the max log-probability
#print(pred[:5])
#print(output[:])
correct += pred.eq(target.data).cpu().sum()
return correct / float(total)
eval_model(model)
I think there is an issue in a way you are trying to solve an entailment problem.
Maybe you can do it this way:
design your module to accept two sentences as input
embed both of them with your embeddings
encode them using the LSTM module.
now you have two fixed length vector representations of two sentences. Simpliest thing to do is to just concatenate them
together.
Add liner layer on top to evaluate scores for each entailment class (3 I suppose)
apply softmax to get a proper probability distribution
So your model can look like this (double check the dimensions):
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, num_classes = 3):
super(myLSTM, self).__init__()
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(2*h_size*2, num_classes) #<- change here
def forward(self, sentence1, sentence2):
embeds_1 = self.embedding(sentence1)
embeds_2 = self.embedding(sentence2)
_, (h_1_last, _) = self.lstm(embeds_1) #note the change here. Last hidden state is taken
_, (h_2_last, _) = self.lstm(embeds_2)
concat = torch.concat([h_1_last, h_2_last], dim=1) #double check the dimension
scores = self.mlp(concat)
probas = F.softmax(scores) #from torch.functional ...
Then you can play around with adding more hidden layers or thinking how combining two sentences can be done in more intelligent way (attention, etc).
Double check what CrossEntropyLoss accepts as input and target and adjust (is it unnormalized class scores or probability distribution). Check http://pytorch.org/docs/master/nn.html#lstm for LSTM module documentation to clarify what LSTM returns (do you need hidden states for every word or just the representation after the last one).
I am new to TensorFlow and neural networks. I am trying to build a neural network that can classify images in the CIFAR-10 dataset.
Here is my code:
import tensorflow as tf
import pickle
import numpy as np
import random
image_size= 32*32*3 # because 3 channels
n_classes = 10
lay1_size = 50
batch_size = 100
def unpickle(filename):
with open(filename,'rb') as f:
data = pickle.load(f, encoding='latin1')
x = data['data']
y = data['labels']
# shuffle the data
z = list(zip(x,y))
random.shuffle(z)
x, y = zip(*z)
x = x[:batch_size]
y = y[:batch_size]
# covert decimals to one hot arrays
y = np.eye(n_classes)[[y]]
return x, y
# set up network
def add_layer(inputs, in_size, out_size, activation_function=None):
W = tf.Variable(tf.random_normal([in_size, out_size]), dtype=tf.float32)
b = tf.Variable(tf.zeros([1,out_size]) + 0.1, dtype=tf.float32)
Wx_plus_b = tf.matmul(inputs, W) + b
if activation_function is None:
output = Wx_plus_b
else:
output = activation_function(Wx_plus_b)
return output
def compute_accuracy(v_xs, v_ys):
global prediction
y_pre = sess.run(prediction, feed_dict={xs:v_xs})
correct_prediction = tf.equal(tf.argmax(y_pre,1), tf.argmax(v_ys, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
result = sess.run(accuracy, feed_dict={xs:v_xs, ys:v_ys})
return result
xs = tf.placeholder(tf.float32, [None,image_size])
ys = tf.placeholder(tf.float32)
lay1 = add_layer(xs, image_size, lay1_size, activation_function=tf.nn.tanh)
lay2 = add_layer(lay1, lay1_size, lay1_size, activation_function=tf.nn.tanh)
prediction = add_layer(lay2, lay1_size, n_classes, activation_function=tf.nn.softmax)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(ys*tf.log(prediction), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
#train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# run network
sess = tf.Session()
sess.run(tf.initialize_all_variables())
x_test, y_test = unpickle('test_batch')
for i in range(1000):
x_train, y_train = unpickle('data_batch_1')
sess.run(train_step, feed_dict={xs:x_train,ys:y_train})
if i % 50 == 0:
print(compute_accuracy(x_test, y_test))
sess.close()
I am using two hidden layers with 50 nodes in each layer. I am running 1,000 cycles, where in each cycle I shuffle data in the dataset and pick the first 100 images of that shuffle to train on.
I am consistently getting ~0.1 accuracy, the machine is not learning at all.
When I modify the code to use the MNIST dataset instead of the CIFAR-10 dataset I get ~0.87 accuracy.
I took code from an MNIST tutorial and am trying to modify it to classify CIFAR-10 data.
I can't figure out what's wrong here. How do I get my algorithm to learn?