I am trying to implement Ridge Regression in pytorch, defining the loss function and plotting said function over different iterations. The only issue is, I keep getting an error code: mat1 and mat2 shapes cannot be multiplied (1000x10 and 1x1). I would like to convert the second matrix to a 1x10 in order to complete the code but I can't seem to get it to work.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Problem size: n observations of p correlated features.
n = 1000
p = 10

# Equicorrelated design: unit variance on the diagonal, `val` everywhere else.
mean = np.zeros((p))
val = 0.8
cov = np.ones((p, p)) * val
cov = cov + np.eye(p) * (1 - val)

# Fixed seed so the simulated data set is reproducible.
np.random.seed(10)
X = np.random.multivariate_normal(mean, cov, n)

# True coefficients as a (10, 1) column: five ones followed by five zeros.
theta_true = np.concatenate((np.ones((5, 1)), np.zeros((5, 1))), axis=0)

# Noise with a tridiagonal covariance (1 on the diagonal, 0.4 next to it),
# drawn as a single (1, n) sample and scaled by `delta`.
delta = 0.5
Sigma = np.eye(n, n, k=-1) * 0.4 + np.eye(n, n) * 1 + np.eye(n, n, k=1) * 0.4
mean = np.zeros(n)
e = np.random.multivariate_normal(mean, Sigma, 1)

# Linear model. The matrix product must be `@`; with `#` the rest of the line
# is a comment and y silently becomes X itself, with shape (n, p).
y = X @ theta_true + delta * e.T
import torch
# Wrap the NumPy arrays as torch tensors and cast them with .float() for training.
X_t = torch.from_numpy(X).float()
y_t = torch.from_numpy(y).float()
Sigma_t = torch.from_numpy(Sigma).float()
import torch.nn as nn
import torch.nn.functional as F
class MyLinear(nn.Module):
    """A single affine layer wrapped as a module.

    Args:
        in_features: number of columns of the tensor passed to ``forward``.
            Defaults to 1, the value that was previously hard-coded; pass the
            real feature count (e.g. ``MyLinear(10)`` for a 1000x10 input).
        out_features: number of outputs per row. Defaults to 1.
    """

    def __init__(self, in_features=1, out_features=1):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the affine map to ``x`` of shape (batch, in_features)."""
        return self.linear(x)
def L2_norm(model):
    """Return the sum of squared entries of the model's first parameter tensor."""
    first_param = list(model.parameters())[0]
    return first_param.pow(2).sum()
def L1_norm(model):
    """Return the sum of absolute entries of the model's first parameter tensor."""
    first_param = list(model.parameters())[0]
    return first_param.abs().sum()
def ridge_loss(y_pred, y_true, model, lambda_):
    """Return the MSE between y_pred and y_true plus lambda_ times L2_norm(model)."""
    penalty = lambda_ * L2_norm(model)
    data_term = F.mse_loss(y_pred, y_true)
    return data_term + penalty
import matplotlib.pyplot as plt
# Model, optimizer and hyper-parameters for the ridge fit.
model = MyLinear()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Weight of the L2 penalty passed to ridge_loss.
lambda_ = 0.1
num_epochs = 1000
# Loss value recorded at every iteration, plotted below.
loss_values = []
for epoch in range(num_epochs):
# Clear the gradients left over from the previous iteration.
optimizer.zero_grad()
# Full-batch forward pass on the whole design matrix.
y_pred = model(X_t)
loss = ridge_loss(y_pred, y_t, model, lambda_)
loss_values.append(loss.item())
loss.backward()
optimizer.step()
# Plot the recorded loss against the iteration index.
plt.plot(loss_values)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Ridge Regression Loss over Iterations')
plt.show()
I tried changing the theta_true definition to transform the matrix but the same error occurred.
theta_true = np.concatenate((np.ones((5,1)), np.zeros((5,1)))).reshape(10, 1)
Your Linear layer in MyLinear (line 37) is what is causing the issue.
self.linear = nn.Linear(1, 1)
means 1 input channel, one output channel, but x, as you have it here has shape (1000, 10), meaning it has 10 channels. So you will need to change that line to
self.linear = nn.Linear(10, 1)
that will do the trick, here is the image I get with that change:
Related
I'm trying to fit a small dataset (just 7x1 in size) with a 3-layer perceptron model, but the loss doesn't converge. I'm new to machine learning; can someone please give me a hint on how to adjust my code?
import torch
import torch.nn as nn
import torch.nn.functional as F
# Seven (x, y) samples, stored as 7x1 float column tensors.
vec_shape = [7, 1]
x_0 = [500, 1000, 2000, 4000, 5000, 8000, 10000]
y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]
# Inputs are log-transformed; targets are left on their raw scale.
x = torch.tensor(x_0).reshape(vec_shape).float().log()
y = torch.tensor(y_0).reshape(vec_shape).float()
class Net(nn.Module):
    """Three-layer perceptron: Linear -> ReLU -> Linear -> sigmoid -> Linear."""

    def __init__(self, n_input, n_hidden, n_output):
        super().__init__()
        self.hidden1 = nn.Linear(n_input, n_hidden)
        self.hidden2 = nn.Linear(n_hidden, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, input):
        """Run ``input`` through the three layers and return the raw output."""
        first = F.relu(self.hidden1(input))
        second = torch.sigmoid(self.hidden2(first))
        return self.predict(second)

    def weight_init(self):
        """Re-draw every Linear layer's weight and bias from a standard normal."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.normal_(layer.weight.data)
            nn.init.normal_(layer.bias.data)
# 1 input, 10 hidden units per layer, 1 output; weights re-drawn by weight_init().
net = Net(1,10,1)
net.weight_init()
# print(net)
optimizer = torch.optim.SGD(net.parameters(),lr = 0.1)
loss_func = torch.nn.MSELoss()
# 500 full-batch gradient steps on the seven samples.
for t in range(500):
prediction = net(x)
loss = loss_func(prediction, y)
# Clear old gradients before backpropagating the new loss.
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Report the loss every 50 iterations.
if(t%50 == 0):
print('Loss = %.4f' % loss.data)
I tried to expand the model or shrink it, but both changes don't work.
Rescaling and normalization are key in machine learning. Your setup is pretty good and you apply some rescaling, but simply not enough. With the limited number of data points you have, the range is way too large. So, just like you do with x_0, apply torch.log to y_0. You can always scale the predictions back after training. Below you can find the adapted code; I changed the following:
torch.log on y_0
Learning rate to 0.01
Number of iterations to 50000
Added a print statement to show rescaling of predictions
import torch
import torch.nn as nn
import torch.nn.functional as F
# Seven (x, y) samples, stored as 7x1 float column tensors.
vec_shape = [7, 1]
x_0 = [500, 1000, 2000, 4000, 5000, 8000, 10000]
y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]
# Both inputs and targets are log-transformed; predictions are mapped back
# with torch.exp after training.
x = torch.tensor(x_0).reshape(vec_shape).float().log()
y = torch.tensor(y_0).reshape(vec_shape).float().log()  # modified
class Net(nn.Module):
    """Perceptron with two hidden Linear layers (ReLU, then sigmoid) and a Linear output."""

    def __init__(self, n_input, n_hidden, n_output):
        super().__init__()
        self.hidden1 = nn.Linear(n_input, n_hidden)
        self.hidden2 = nn.Linear(n_hidden, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, input):
        """Return the network output for ``input``."""
        return self.predict(torch.sigmoid(self.hidden2(F.relu(self.hidden1(input)))))

    def weight_init(self):
        """Overwrite weights and biases of all Linear layers with N(0, 1) samples."""
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.normal_(module.weight.data)
            nn.init.normal_(module.bias.data)
# 1 input, 10 hidden units per layer, 1 output; weights re-drawn by weight_init().
net = Net(1,10,1)
net.weight_init()
# print(net)
optimizer = torch.optim.SGD(net.parameters(),lr = 0.01) # modified
loss_func = torch.nn.MSELoss()
# Full-batch gradient steps on the log-scaled targets.
for t in range(50000): # modified
prediction = net(x)
loss = loss_func(prediction, y)
# Clear old gradients before backpropagating the new loss.
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Report the loss every 50 iterations.
if(t%50 == 0):
print('Loss = %.4f' % loss.data)
# torch.exp undoes the torch.log applied to the targets, giving predictions on the raw scale.
print(torch.exp(net(x))) # added
I also recommend normalizing your dataset after the logarithmic rescaling, for instance by subtracting the mean and dividing by the standard deviation.
I have implemented and trained a neural network in Pytorch, however, I am interested in the derivative of the neural network parameters with respect to the input.
I have searched extensively for a procedure that would allow evaluating the derivative of the weights with respect to a given input, but I did not find anything.
I know that I can compute the gradients of a function in the following way.
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)
But How would I do that with a trained neural network instead of a function Q?
Thanks in advance.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.stats import norm
from numpy import linalg as la
import numpy.random as npr
from tabulate import tabulate
from matplotlib import pyplot as plt
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
#from torchvision import datasets, transforms
from torch.autograd import Variable
# In[2]:
import numpy as np
from scipy.stats import norm
from numpy import linalg as la
import numpy.random as npr
from tabulate import tabulate
from matplotlib import pyplot as plt
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
#from torchvision import datasets, transforms
from torch.autograd import Variable
from torch import optim
# In[3]:
# Simulation parameters: number of paths, the two time points, strike, spot and volatilities.
nSimul = 32768
T1 = 1.0
T2 = 2.0
K = 110.0
spot = 100.0
vol = 0.2
vol0 = 0.5 # vol is increased over the 1st period so we have more points in the wings
# simulate all Gaussian returns (N1, N2) first
# returns: matrix of shape [nSimul, TimeSteps=2]
returns = np.random.normal(size=[nSimul,2])
# generate paths, step by step, and not path by path as customary
# this is to avoid slow Python loops, using NumPy's optimized vector functions instead
# generate the vector of all scenarios for S1, of shape [nSimul]
# Lognormal step from spot over [0, T1] with volatility vol0.
S1 = spot * np.exp(-0.5*vol0*vol0*T1 + vol0*np.sqrt(T1)*returns[:,0])
# generate the vector of all scenarios for S2, of shape [nSimul]
# Lognormal step from S1 over [T1, T2] with volatility vol.
S2 = S1 * np.exp(-0.5*vol*vol*(T2-T1) + vol*np.sqrt(T2-T1)*returns[:,1])
# training set, X and Y are both vectors of shape [nSimul]
X = S1
# Target is the payoff max(S2 - K, 0) computed on each path.
Y = np.maximum(0, S2 - K)
# Evaluation grid: 100 evenly spaced values from 20 to 200, reshaped to a column.
xAxis = np.linspace(20, 200, 100)
xAxis=xAxis.reshape(-1,1)
# In[4]:
#Normalization of the simulated data:
# Standardise inputs and targets; the four statistics stay at module level
# because predict() reads them to normalise inputs and de-normalise outputs.
meanX, stdX = np.mean(X), np.std(X)
meanY, stdY = np.mean(Y), np.std(Y)
# Column vectors of shape (-1, 1), as expected by the first Linear layer.
normX = ((X - meanX) / stdX).reshape(-1, 1)
normY = ((Y - meanY) / stdY).reshape(-1, 1)
# In[5]:
class NeuralNetwork(nn.Module):
    """Small fully connected regressor.

    Layer widths are inputsize -> 3 -> 5 -> 3 -> outputsize, with an ELU
    activation after every layer except the last.

    Args:
        inputsize: number of input features per sample.
        outputsize: number of outputs per sample.
    """

    def __init__(self, inputsize, outputsize):
        super().__init__()
        # The attribute name is kept (although the activations are ELU, not
        # ReLU) so that existing state_dict keys remain valid.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputsize, 3),
            nn.ELU(),
            nn.Linear(3, 5),
            nn.ELU(),
            nn.Linear(5, 3),
            nn.ELU(),
            nn.Linear(3, outputsize),
        )
        # The former `w = torch.empty(0, 1); nn.init.normal_(w)` was removed:
        # it initialised a zero-element tensor that was never used again.

    def forward(self, x):
        """Return the network output for a batch ``x`` of shape (batch, inputsize)."""
        return self.linear_relu_stack(x)
# In[6]:
# Network dimensions and training hyper-parameters.
inputDim = 1 # takes variable 'x'
outputDim = 1 # takes variable 'y'
learningRate = 0.05
epochs = 10000
#weight=torch.empty(3)
model = NeuralNetwork(inputDim, outputDim)
##### For GPU #######
# Move the model to the GPU when one is available.
if torch.cuda.is_available():
model.cuda()
# In[7]:
#criterion = torch.nn.MSELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
# In[ ]:
def ridge_loss(outputs, labels):
    """Return the mean squared error between ``outputs`` and ``labels``.

    Despite the name, no L2 penalty on the weights is added here; the value
    is the plain mean of the squared differences.
    """
    # The original computed this expression but never returned it, so every
    # call evaluated to None.
    return torch.mean((outputs - labels) ** 2)
# In[ ]:
# In[9]:
#Adam optmization
# Mean-squared-error criterion used by the training loop.
criterion = torch.nn.MSELoss()
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# NOTE(review): the learning rate is written out here (0.05) rather than read from `learningRate`.
optimizer = optim.Adam(model.parameters(), lr=0.05)
# In[10]:
# Build the training tensors once. The data never changes between epochs, so
# converting (and copying to the GPU) inside the loop only repeated the same work.
# Plain tensors are used; the torch.autograd.Variable wrapper is deprecated.
if torch.cuda.is_available():
    inputs = torch.from_numpy(normX).cuda().float()
    labels = torch.from_numpy(normY).cuda().float()
else:
    inputs = torch.from_numpy(normX).float()
    labels = torch.from_numpy(normY).float()

for epoch in range(epochs):
    # Clear gradient buffers so gradients from the previous epoch do not accumulate.
    optimizer.zero_grad()
    # Forward pass on the full training set.
    outputs = model(inputs)
    # Loss for the predicted output.
    loss = criterion(outputs, labels)
    print(loss)
    # Gradients with respect to the parameters.
    loss.backward()
    # Parameter update.
    optimizer.step()
    print('epoch {}, loss {}'.format(epoch, loss.item()))
# In[11]:
def predict(xs):
    """Return model predictions for ``xs`` on the original (de-normalised) scale.

    ``xs`` is a NumPy array of raw inputs. It is normalised with the
    module-level ``meanX``/``stdX``, passed through ``model`` as a column of
    float values, and the output is mapped back with ``meanY``/``stdY``.

    Returns:
        A 1-D NumPy array with one prediction per input value.
    """
    # first, normalize
    nxs = (xs - meanX) / stdX
    batch = torch.from_numpy(nxs.reshape(-1, 1)).float()
    # forward feed through ANN; gradients are not needed for prediction
    with torch.no_grad():
        if torch.cuda.is_available():
            # The original spelled this branch `nxs.rehape(...)`, which raised
            # AttributeError whenever a GPU was present.
            batch = batch.cuda()
        nys = model(batch).cpu().numpy()
    # de-normalize output
    ys = meanY + stdY * nys
    # the model returns shape [len(xs), 1]; flatten it to a vector
    return np.reshape(ys, [-1])
# In[13]:
def BlackScholes(S0, r, sigma, T, K):
    """Return the Black-Scholes call value N(d1)*S0 - N(d2)*K*exp(-r*T).

    S0 is the spot, r the rate, sigma the volatility, T the time to expiry
    and K the strike.
    """
    vol_sqrt_t = sigma * np.sqrt(T)
    d1 = 1 / vol_sqrt_t * (np.log(S0 / K) + (r + sigma**2 / 2) * T)
    d2 = d1 - vol_sqrt_t
    discounted_strike = K * np.exp(-r * T)
    return norm.cdf(d1) * S0 - norm.cdf(d2) * discounted_strike
def BlackScholesCallDelta(S0, r, sigma, T, K):
    """Return N(d1), the Black-Scholes call delta, for the given inputs."""
    vol_sqrt_t = sigma * np.sqrt(T)
    log_moneyness = np.log(S0 / K)
    d1 = 1 / vol_sqrt_t * (log_moneyness + (r + sigma**2 / 2) * T)
    return norm.cdf(d1)
# np.vectorize wrappers so the two pricing functions can be called with array arguments.
BlackScholes_vec=np.vectorize(BlackScholes)
BlackScholesCallDelta_vec=np.vectorize(BlackScholesCallDelta)
# In[14]:
# Black-Scholes reference prices and network predictions on the evaluation grid.
BS_price = BS_prices = BlackScholes_vec(S0=xAxis, r=0, sigma=0.2, T=1.0, K=110.0)
predicted = predict(xAxis)
# NOTE(review): this overwrites the simulated S1 vector with the constant 1,
# which is then shown in the centre title as the initial price; confirm intended.
S1 = 1
line_learn = plt.plot(xAxis, predicted, label="Neural Regression")
line_BS = plt.plot(xAxis, BS_price, label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Option Price")
# Two titles on the same axes: one right-aligned, one centred.
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
# Save before show(): once show() returns the figure may already be closed,
# and savefig would then write an empty page.
plt.savefig("deephedge.pdf")
plt.show()
# In[15]:
# Forward finite differences of the predicted prices along the grid:
# one slope per pair of neighbouring grid points.
Prices_rg_mc_diff = []
for i in range(len(xAxis) - 1):
    price_change = predicted[i + 1] - predicted[i]
    spot_change = xAxis[i + 1] - xAxis[i]
    delta = price_change / spot_change
    Prices_rg_mc_diff.append(delta)
# In[16]:
# Black-Scholes reference deltas on the evaluation grid.
BS_delta = BlackScholesCallDelta(S0=xAxis, r=0, sigma=0.2, T=1.0, K=110.0)
predicted = predict(xAxis)
S1 = 1
# The finite-difference list has one entry fewer than the grid, hence the [1:] slices.
line_learn = plt.plot(xAxis[1:], Prices_rg_mc_diff, label="Neural Regression")
line_BS = plt.plot(xAxis[1:], BS_delta[1:], label="Black-Scholes")
plt.xlabel("Spot Price")
# This figure shows deltas, not prices, so the axis is labelled accordingly.
plt.ylabel("Option Delta")
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
# Save before show(): once show() returns the figure may already be closed,
# and savefig would then write an empty page.
# NOTE(review): same file name as the price plot, so this overwrites it.
plt.savefig("deephedge.pdf")
plt.show()
# In[17]:
# NOTE(review): `model` is the NeuralNetwork module; the class in this file defines no
# backward() method (backward is called on tensors such as `loss` elsewhere in the file),
# so this call is expected to fail.
model.backward(retain_graph=True)
# In[ ]:
# NOTE(review): NeuralNetwork defines no `weight` attribute; its layers live inside
# `linear_relu_stack`, so this lookup on the class is expected to fail as well.
print(NeuralNetwork.weight.grad)
# In[ ]:
def predict(xs):
    """Return model predictions for ``xs`` on the original (de-normalised) scale.

    ``xs`` is a NumPy array of raw inputs. It is normalised with the
    module-level ``meanX``/``stdX``, passed through ``model`` as a column of
    float values, and the output is mapped back with ``meanY``/``stdY``.

    Returns:
        A 1-D NumPy array with one prediction per input value.
    """
    # first, normalize
    nxs = (xs - meanX) / stdX
    batch = torch.from_numpy(nxs.reshape(-1, 1)).float()
    # forward feed through ANN; gradients are not needed for prediction
    with torch.no_grad():
        if torch.cuda.is_available():
            # The original spelled this branch `nxs.rehape(...)`, which raised
            # AttributeError whenever a GPU was present.
            batch = batch.cuda()
        nys = model(batch).cpu().numpy()
    # de-normalize output
    ys = meanY + stdY * nys
    # the model returns shape [len(xs), 1]; flatten it to a vector
    return np.reshape(ys, [-1])
# In[21]:
# Attempt to backpropagate an MSE loss between the predicted prices and the grid values.
# NOTE(review): torch.from_numpy is documented as taking only the ndarray, so passing
# requires_grad=True is expected to raise TypeError; verify against the torch docs.
# NOTE(review): `predicted` comes out of predict(), which runs under torch.no_grad() and
# returns a NumPy array, so no graph back to the model is attached to it.
c3=torch.from_numpy((predicted.reshape(-1,1)), requires_grad=True)
c4=torch.from_numpy(xAxis, requires_grad=True)
#c5=torch.Tensor(c3)
#c6=torch.Tensor(c4)
loss = criterion(c3,c4) # calculating loss
loss.backward()
# In[28]:
# The results of the next three expressions are not assigned to any name.
torch.tensor(predicted.reshape(-1,1), requires_grad=True)
torch.tensor(xAxis, requires_grad=True)
criterion(torch.tensor(predicted.reshape(-1,1), requires_grad=True),torch.tensor(xAxis, requires_grad=True))
# `loss` is not reassigned above, so this refers to whatever `loss` was bound to earlier.
loss.backward()
You need to explicitly set requires_grad=True when you create a tensor. And to calculate a gradient, you first need to apply some operation to the tensor.
Here is an example:
import torch
# Leaf tensor that autograd tracks.
x = torch.rand(2, 2, requires_grad=True)
# out = mean(3 * (x + 2)^2), built step by step so every operation is recorded.
y = x.add(2)
z = y.mul(y).mul(3)
out = torch.mean(z)
# Backpropagate from the scalar; the gradient is stored on the leaf as x.grad.
out.backward()
print(x.grad)
Output:
tensor([[3.3720, 3.4302],
[3.4030, 3.3605]])
In this way you are using torch.autograd to calculate the gradient for tensor x. See autograd for more.
And for neural network you can simply use the network and backward it afterward.
A neural network Example:
import torch
import torch.nn as nn
import torch.nn.functional as f
x = torch.rand(2, 2)

# define a neural network
network = nn.Sequential(
    nn.Linear(2, 100),
    nn.Linear(100, 2),
)

pred = network(x)
# Mean absolute error. torch.nn.functional names this l1_loss; there is no mae_loss.
loss = f.l1_loss(pred, x)  # calculating loss
loss.backward()

# Update weights with gradients: one gradient-descent step with step size 0.1.
# The update is done in place under no_grad(); assigning a plain tensor to
# `.weight` is rejected by nn.Module, and replacing the weight by the scaled
# gradient would discard the current weights.
with torch.no_grad():
    network[0].weight -= 0.1 * network[0].weight.grad
    network[1].weight -= 0.1 * network[1].weight.grad
Note: I didn't put any activation function in the network for the sake of simplicity.
Example of backward() using torch.nn.MSELoss():
import torch
from torch.nn import MSELoss
criterion = MSELoss()
# Leaf tensor; b depends on it, and it is also used as the target.
a = torch.tensor([1., 2.], requires_grad=True)
b = a.pow(2)
loss = criterion(input=b, target=a)
# Gradients flow back to `a` through both arguments of the loss.
loss.backward()
print(a.grad)
Output:
tensor([0., 6.])
I am new to tensorflow-2 and I am starting my learning curve with the following simple Linear-Regression model:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Make data
# Sample count and the slope/intercept of the line the data is drawn around.
num_samples, w, b = 20, 0.5, 2
xs = np.asarray(range(num_samples))
# Noisy line: x*w + b plus one np.random.normal() draw per point.
ys = np.asarray([x*w + b + np.random.normal() for x in range(num_samples)])
xts = tf.convert_to_tensor(xs, dtype=tf.float32)
# NOTE(review): yts is converted from xs, not ys, so the targets equal the inputs.
yts = tf.convert_to_tensor(xs, dtype=tf.float32)
plt.plot(xs, ys, 'ro')
class Linear(tf.keras.Model):
    """Scalar linear model ``w * inputs + b`` with two trainable variables.

    Args:
        name: Keras model name. Defaults to 'linear'.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, name='linear', **kwargs):
        # Forward the caller's name; the original always passed the literal
        # 'linear' and silently ignored this argument.
        super().__init__(name=name, **kwargs)
        # Slope starts at 0 and intercept at 1; both are trainable.
        self.w = tf.Variable(0, True, name="w", dtype=tf.float32)
        self.b = tf.Variable(1, True, name="b", dtype=tf.float32)

    def call(self, inputs):
        """Return ``w * inputs + b``."""
        return self.w * inputs + self.b
class Custom(tf.keras.callbacks.Callback):
    """Every 20th epoch, plot the current fit and print that epoch's loss."""

    def on_epoch_end(self, epoch, logs=None):
        """Plot predictions on ``xts`` and report ``logs['loss']`` every 20 epochs."""
        if epoch % 20 == 0:
            preds = self.model.predict(xts)
            plt.plot(xs, preds, label='{} {:7.2f}'.format(epoch, logs['loss']))
            # The original format string had no placeholder for the loss, so
            # the value passed to format() was never printed.
            print('The average loss for epoch {} is {:7.2f}.'.format(epoch, logs['loss']))
# NOTE(review): `x` is not referenced again in this script.
x = tf.keras.Input(dtype=tf.float32, shape=[])
#model = tf.keras.Sequential([tf.keras.layers.Dense(units=1, input_shape=[1])])
model = Linear()
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='MSE')
# 250 epochs in mini-batches of 4; the Custom callback adds a curve to the plot during training.
model.fit(x=xts, y=yts, verbose=1, batch_size=4, epochs=250, callbacks=[Custom()])
plt.legend()
plt.show()
For a reason I don't understand it seems like my model is not fitting the curve.
I also tried with keras.layers.Dense(1) and I had the same exact result.
Also it seems like the results don't correspond to a proper loss function, as around epoch 120 the model should have less loss than on 250.
Can you maybe help me understand what I am doing wrong?
Thanks a lot!
There is a small bug in your code as xts and yts are identical to each other, i.e. you wrote
xts = tf.convert_to_tensor(xs, dtype=tf.float32)
yts = tf.convert_to_tensor(xs, dtype=tf.float32)
instead of
xts = tf.convert_to_tensor(xs, dtype=tf.float32)
yts = tf.convert_to_tensor(ys, dtype=tf.float32)
which is why the loss doesn't make sense. Once this has been fixed the results are as expected, see the plot below.
I tried to implement multivariate linear regression from scratch and it actually works pretty well. When I was testing it with a toy dataset, though, the predictions were sometimes 'NaN'. I know the possible reasons for NaN, but I couldn't work out which one causes it in my script.
Note: with 0.0001 learning rate and 1.000.000 iterations, I got a really good line for the dataset though, when learning rate is 0.001 and the number of iterations is 1.000.000, the predictions were NaN.
Here is the script:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np
class MultivariateLinearRegression():
    """Linear regression with a bias term, fitted by batch gradient descent."""

    def __init__(self, learning_rate, learning_algorithm, epoch_num):
        """Store the step size, the algorithm label and the number of iterations."""
        self.learning_rate = learning_rate
        # Attribute name kept exactly as spelled in the original.
        self.learning_algoritm = learning_algorithm
        self.epoch_num = epoch_num
        self.theta = 0
        self.training_sample = 0

    def train(self, X, Y):
        """Fit ``theta`` on (X, Y) and return the list of theta values, one per iteration.

        A 1-D ``X`` is treated as a single feature column. A column of ones is
        appended to ``X`` so the last entry of ``theta`` acts as the bias.
        """
        Y = Y.reshape((Y.size, 1))
        if X.ndim == 1:
            X = X.reshape((X.size, 1))
        ones_column = np.ones([X.shape[0], 1])
        X = np.concatenate((X, ones_column), 1)
        self.training_sample, n_params = X.shape
        self.theta = np.zeros([n_params, 1])
        cost_history = []
        for _ in range(self.epoch_num):
            residual = np.subtract(X.dot(self.theta), Y)
            raw_gradient = np.transpose(X).dot(residual)
            step = (self.learning_rate / self.training_sample) * raw_gradient
            self.theta = np.subtract(self.theta, step)
            # Despite the list's name, the stored value is the current theta.
            cost_history.append(self.theta)
        return cost_history

    def predict(self, X):
        """Return the prediction for one sample given as a sequence of feature values."""
        features = np.array(X)
        if features.ndim == 1:
            features = features.reshape((features.size, 1))
        bias = np.ones([1]).reshape((1, 1))
        # Stack the bias entry under the feature column, then take the row-vector product with theta.
        augmented = np.concatenate((features, bias))
        return np.transpose(augmented).dot(self.theta)[0]  # [63,1]
# Load the two-column dataset as a NumPy array.
# NOTE(review): read_csv is called with default header handling, so the first line of
# the file is taken as column names; confirm the file actually has a header row.
datas = pd.read_csv('pattern_recognition_data.txt').to_numpy()
# Rows 0-24 are used for training, rows 25-28 for testing; column 0 is X, column 1 is Y.
X = datas[0:25,0]
Y = datas[0:25:,1]
X_test = datas[25:29, 0]
Y_test = datas[25:29, 1]
mlr = MultivariateLinearRegression(0.001, 'gradient descent', 1000000) # 0.0001 ve 1.000.000
mlr.train(X, Y)
# Predict the test points one at a time; predict() takes a single sample.
Y_pred = []
for x in X_test:
print('X : ', x)
Y_pred.append(mlr.predict([x]))
# Training points as blue squares, test predictions as a red line.
plt.plot(X, Y, 'bs')
plt.plot(X_test, Y_pred, 'r')
Thanks in advance
The dataset:
39,144
47,220
45,138
47,145
65,162
46,142
67,170
42,124
67,158
56,154
64,162
56,150
59,140
34,110
42,128
48,130
45,135
17,114
20,116
19,124
36,136
50,142
39,120
21,120
44,160
53,158
63,144
29,130
25,125
69,175
This code attempts to utilize a custom implementation of dropout :
%reset -f
import torch
import torch.nn as nn
# import torchvision
# import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
num_epochs = 1000
number_samples = 10
from sklearn.datasets import make_moons
from matplotlib import pyplot
from pandas import DataFrame
# generate 2d classification dataset
# Draw `number_samples` points from make_moons with noise 0.1.
X, y = make_moons(n_samples=number_samples, noise=0.1)
# scatter plot, dots colored by class value
# Pair every point with its index so its label can be looked up by position.
x_data = [a for a in enumerate(X)]
# The first half of the points forms the training set; the index is dropped again.
x_data_train = x_data[:int(len(x_data) * .5)]
x_data_train = [i[1] for i in x_data_train]
x_data_train
y_data = [y[i[0]] for i in x_data]
y_data_train = y_data[:int(len(y_data) * .5)]
y_data_train
# Test set: the first half of the reversed lists, i.e. points taken from the end.
x_test = [a[1] for a in x_data[::-1][:int(len(x_data) * .5)]]
y_test = [a for a in y_data[::-1][:int(len(y_data) * .5)]]
# Convert to tensors: features cast with .float(), class labels with .long().
x = torch.tensor(x_data_train).float() # <2>
print(x)
y = torch.tensor(y_data_train).long()
print(y)
x_test = torch.tensor(x_test).float()
print(x_test)
y_test = torch.tensor(y_test).long()
print(y_test)
# Custom dropout module. The constructor validates and stores the drop probability `p`.
# NOTE(review): forward() does not apply a mask to `input`; see the notes on its return line.
class Dropout(nn.Module):
def __init__(self, p=0.5, inplace=False):
# print(p)
super(Dropout, self).__init__()
# Reject probabilities outside [0, 1].
if p < 0 or p > 1:
raise ValueError("dropout probability has to be between 0 and 1, "
"but got {}".format(p))
self.p = p
self.inplace = inplace
def forward(self, input):
print(list(input.shape))
# NOTE(review): `dropout_percent` is not defined anywhere in the visible code (the rate
# is stored as self.p); np.ones is given a shape tuple whose second entry is an array;
# and the returned value is a NumPy array that never multiplies `input`.
return np.random.binomial([np.ones((len(input),np.array(list(input.shape))))],1-dropout_percent)[0] * (1.0/(1-self.p))
# Text form such as "Dropout(p=0.5)", with ", inplace" appended when self.inplace is set.
def __repr__(self):
inplace_str = ', inplace' if self.inplace else ''
return self.__class__.__name__ + '(' \
+ 'p=' + str(self.p) \
+ inplace_str + ')'
class MyLinear(nn.Linear):
    """nn.Linear whose weight is passed through the custom Dropout on every forward call."""

    def __init__(self, in_feats, out_feats, drop_p, bias=True):
        super().__init__(in_feats, out_feats, bias=bias)
        self.custom_dropout = Dropout(p=drop_p)

    def forward(self, input):
        """Apply the linear map using the weight returned by ``custom_dropout``."""
        dropped_weight = self.custom_dropout(self.weight)
        return F.linear(input, dropped_weight, self.bias)
# Wrap the tensors as datasets and iterate them in shuffled mini-batches of 2.
my_train = data_utils.TensorDataset(x, y)
train_loader = data_utils.DataLoader(my_train, batch_size=2, shuffle=True)
my_test = data_utils.TensorDataset(x_test, y_test)
# The test loader must iterate the held-out set; the original wrapped `my_train`
# here, so `my_test` was built but never used.
test_loader = data_utils.DataLoader(my_test, batch_size=2, shuffle=True)
# Device configuration
# Everything runs on the CPU.
device = 'cpu'
print(device)
# Hyper-parameters
# Two input features per point and two output classes.
input_size = 2
hidden_size = 100
num_classes = 2
learning_rate = 0.0001
# NOTE(review): `pred` is not referenced by the model or training code that follows.
pred = []
# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
    """One-hidden-layer classifier: MyLinear (with drop rate ``p``) -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes, p):
        super().__init__()
        # The first layer is the custom linear layer that receives the drop rate.
        self.fc1 = MyLinear(input_size, hidden_size, p)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
# p=0.9 is handed through NeuralNet to the custom dropout of its first layer.
model = NeuralNet(input_size, hidden_size, num_classes, p=0.9).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
# Number of mini-batches per epoch, used only in the progress message.
total_step = len(train_loader)
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader):
# Move tensors to the configured device
# Each batch is reshaped to rows of 2 features.
images = images.reshape(-1, 2).to(device)
labels = labels.to(device)
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Progress message on every 100th epoch.
if (epoch) % 100 == 0:
print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, total_step, loss.item()))
Custom dropout is implemented as :
# Custom dropout module. The constructor validates and stores the drop probability `p`.
# NOTE(review): forward() does not apply a mask to `input`; see the notes on its return line.
class Dropout(nn.Module):
def __init__(self, p=0.5, inplace=False):
# print(p)
super(Dropout, self).__init__()
# Reject probabilities outside [0, 1].
if p < 0 or p > 1:
raise ValueError("dropout probability has to be between 0 and 1, "
"but got {}".format(p))
self.p = p
self.inplace = inplace
def forward(self, input):
print(list(input.shape))
# NOTE(review): `dropout_percent` is not defined anywhere in the visible code (the rate
# is stored as self.p); np.ones is given a shape tuple whose second entry is an array;
# and the returned value is a NumPy array that never multiplies `input`.
return np.random.binomial([np.ones((len(input),np.array(list(input.shape))))],1-dropout_percent)[0] * (1.0/(1-self.p))
# Text form such as "Dropout(p=0.5)", with ", inplace" appended when self.inplace is set.
def __repr__(self):
inplace_str = ', inplace' if self.inplace else ''
return self.__class__.__name__ + '(' \
+ 'p=' + str(self.p) \
+ inplace_str + ')'
class MyLinear(nn.Linear):
    """Linear layer that routes its weight matrix through the custom Dropout before use."""

    def __init__(self, in_feats, out_feats, drop_p, bias=True):
        super().__init__(in_feats, out_feats, bias=bias)
        self.custom_dropout = Dropout(p=drop_p)

    def forward(self, input):
        """Return ``F.linear`` of ``input`` with the dropout-processed weight and the stored bias."""
        weight_after_dropout = self.custom_dropout(self.weight)
        return F.linear(input, weight_after_dropout, self.bias)
It seems I've implemented the dropout function incorrectly ? :
np.random.binomial([np.ones((len(input),np.array(list(input.shape))))],1-dropout_percent)[0] * (1.0/(1-self.p))
How to modify in order to correctly utilize dropout ?
These posts were useful in getting to this point :
Hinton's Dropout in 3 Lines of Python :
https://iamtrask.github.io/2015/07/28/dropout/
Making a Custom Dropout Function : https://discuss.pytorch.org/t/making-a-custom-dropout-function/14053/2
It seems I've implemented the dropout function incorrectly?
np.random.binomial([np.ones((len(input),np.array(list(input.shape))))],1-dropout_percent)[0] * (1.0/(1-self.p))
In fact, the above implementation is known as Inverted Dropout. Inverted Dropout is how Dropout is implemented in practice in the various deep learning frameworks.
What is inverted dropout?
Before jump into the inverted dropout, it can be helpful to see how Dropout works for a single neuron:
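Since during the train phase a neuron is kept on with probability q (= 1 - p), during the testing phase we have to emulate the behavior of the ensemble of networks used in the training phase. To this end, the authors suggest scaling the activation function by a factor of q during the test phase in order to use the expected output produced in the training phase as the single output required in the test phase (Section 10, Multiplicative Gaussian Noise). Thus, writing $a(\cdot)$ for the activation function and $X_i \sim \mathrm{Bernoulli}(q)$ for the keep mask: in the train phase $O_i = X_i \, a\left(\sum_k w_k x_k + b\right)$, and in the test phase $O_i = q \, a\left(\sum_k w_k x_k + b\right)$.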
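Inverted dropout is a bit different. This approach consists of scaling the activations during the training phase, leaving the test phase untouched. The scale factor is the inverse of the keep probability, 1/(1-p) = 1/q. Thus, in the train phase $O_i = \frac{1}{q} X_i \, a\left(\sum_k w_k x_k + b\right)$, and in the test phase $O_i = a\left(\sum_k w_k x_k + b\right)$.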
Inverted dropout lets you define the model once and just change a parameter (the keep/drop probability) to run train and test on the same model. Direct dropout, instead, forces you to modify the network during the test phase, because if you don't multiply the output by q the neuron will produce values that are higher than the ones expected by the successive neurons (thus the following neurons can saturate or explode): that's why inverted dropout is the more common implementation.
References:
Dropout Regularization, coursera by Andrew NG
What is inverted dropout?
Dropout: scaling the activation versus inverting the dropout
Analysis of Dropout
How to implement inverted dropout in PyTorch?
class MyDropout(nn.Module):
    """Inverted dropout.

    In training mode each element is kept with probability ``1 - p`` and the
    kept elements are scaled by ``1 / (1 - p)``; in eval mode the input is
    returned unchanged.

    Args:
        p: probability of dropping an element, in [0, 1].

    Raises:
        ValueError: if ``p`` is outside [0, 1].
    """

    def __init__(self, p: float = 0.5):
        super().__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
        self.p = p

    def forward(self, X):
        """Return ``X`` with inverted dropout applied (training) or unchanged (eval)."""
        if not self.training or self.p == 0:
            return X
        if self.p == 1:
            # Everything is dropped; returning zeros avoids the division by
            # zero that 1 / (1 - p) would otherwise raise.
            return torch.zeros_like(X)
        keep_prob = 1 - self.p
        # Draw the mask on the same device as X so GPU inputs work too.
        mask = torch.rand(X.shape, device=X.device) < keep_prob
        return X * mask / keep_prob
How to implement in Numpy?
import numpy as np
# Probability of keeping each entry.
pKeep = 0.8
weights = np.ones([1, 5])
# Boolean keep-mask: one uniform draw per entry, kept when the draw is below pKeep.
n_rows, n_cols = weights.shape
binary_value = np.random.rand(n_rows, n_cols) < pKeep
# Zero the dropped entries, then rescale by 1/pKeep (the inverted dropout step).
res = np.multiply(weights, binary_value) / pKeep
print(res)
How to implement in Tensorflow?
import tensorflow as tf
# NOTE(review): enable_eager_execution, random_uniform and div look like TensorFlow 1.x
# names; confirm against the installed TensorFlow version before running.
tf.enable_eager_execution()
# A tensor of ones makes the kept and dropped entries easy to read in the output.
weights = tf.ones(shape=[1, 5])
keep_prob = 0.8
# Shift a uniform sample by keep_prob so that taking floor() yields a 0/1 mask.
random_tensor = keep_prob
random_tensor += tf.random_uniform(weights.shape)
# 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
binary_tensor = tf.floor(random_tensor)
# Scale by 1/keep_prob and zero out the dropped entries.
ret = tf.div(weights, keep_prob) * binary_tensor
print(ret)
Implementation with Torch and bernoulli..
def forward(self, x):
    """Apply the affine map ``x @ W^T + bias`` and, in training mode, a Bernoulli mask.

    Reads ``self.W``, ``self.bias``, ``self.training`` and ``self.keep_prob``.
    NOTE(review): kept outputs are not rescaled by 1 / keep_prob here.
    """
    # The matrix product must be `@`; with `#` the rest of the line is a
    # comment and the layer returns its input unchanged.
    output = x @ self.W.t() + self.bias
    if self.training:
        # One independent keep/drop draw per output element.
        sample = torch.distributions.bernoulli.Bernoulli(self.keep_prob).sample(output.size())
        print(sample)
        return output * sample
    return output