I've trained a simple machine learning model, a polynomial regression. The pseudocode of prediction function is as follows:
def f(x):
"""
x is a np.ndarray of shape (m, )
"""
# X is stacked of x ** 0, x ** 1, x ** 2, ..., x ** (n - 1) by rows
# X is of shape of (m, n)
# m is the number of training examples
X = generate(x)
Y = np.dot(X, W)
return Y
W is trained parameters. Here the shape of Y is (m, 1), but if I return Y.squeeze(), say of shape (m,), I get a very different standard deviation on the test set, say 70 for the former and 8 for the latter.
I use random initialisation, but I've trained and tested many times, the std of the squeezed version is much smaller. So I just wonder why.
I just show the complete codes below, and you can test by yourself. My questions are in line 90 and line 91
# python: 3.5.2
# encoding: utf-8
# numpy: 1.14.1
import numpy as np
import matplotlib.pyplot as plt
def load_data(filename):
xys = []
with open(filename, 'r') as f:
for line in f:
xys.append(map(float, line.strip().split()))
xs, ys = zip(*xys)
return np.asarray(xs), np.asarray(ys)
def evaluate(ys, ys_pred):
std = np.sqrt(np.mean(np.abs(ys - ys_pred) ** 2))
return std
def linear_regression(x_train, y_train, n=2, learning_rate=0.0005, epochs=1000, l2=0, Print=False):
"""
This target function is: y = b + w1 * x^1 + w2 * x^2 + ...
also y = b + np.dot(w.T, x)
:param x_train: np.ndarray
:param y_train: np.ndarray
:return: a trained model (as a function), trained by x_train and y_train
"""
# get the number of train e.g.
m = x_train.shape[0]
# set and initialize parameters here
# intercept
b = np.float64(-10)
# weights
w = np.float64(np.random.randn(n, 1))
# convert the x_train matrix to a design matrix
X = np.zeros((n, m), dtype=np.float64)
for i in range(n):
X[i, :] = x_train ** (i + 1)
X = np.float64(X)
Y = np.float64(np.reshape(y_train, newshape=(1, m)))
# if plot of the training process is needed
costs = []
# train on the dataset
for epoch in range(epochs):
# compute the gradient of cost on w
Z = b + np.dot(w.T, X)
dZ = Z - Y
dw = 1./m * np.dot(X, dZ.T)
db = 1./m * np.squeeze(np.sum(dZ))
# update the parameters, for w, I also set "weight decay"
w -= learning_rate * dw + l2 * w
b -= learning_rate * db
cost = np.squeeze(0.5/m * np.dot(dZ, dZ.T))
costs.append(cost)
if Print == True and epoch % 25 == 0:
print("Cost after " + str(epoch) + " iterations " + ": " + str(cost))
# plot the costs
if Print == True:
plt.plot(costs)
plt.show()
def pred(x):
assert type(x) is np.ndarray
m = x.shape[0]
# convert the x_train matrix to a design matrix
X = np.zeros((n, m))
for i in range(n):
X[i, :] = x ** (i + 1)
# to predict
Y = b + np.dot(w.T, X)
return Y.T
# return Y.squeeze()
return pred
if __name__ == '__main__':
train_file = 'train.txt'
test_file = 'test.txt'
# load data
x_train, y_train = load_data(train_file)
x_test, y_test = load_data(test_file)
print(x_train.shape)
print(x_test.shape)
# use a trained linear-regression model
f = linear_regression(x_train, y_train, n=2, epochs=10000, Print=False, learning_rate=1e-8, l2=5e-2)
# compute the predictions
y_test_pred = f(x_test)
# use the test set to evaluate the model
std = evaluate(y_test, y_test_pred)
print('the standard deviation:{:.1f}'.format(std))
# show the result
plt.plot(x_train, y_train, 'ro', markersize=3)
plt.plot(x_test, y_test, 'k')
plt.plot(x_test, y_test_pred)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Linear Regression')
plt.legend(['train', 'test', 'pred'])
plt.show()
Related
I want to implement gradient descent with numpy for linear regression but I have some error in this code:
import numpy as np
# Code Example
rng = np.random.RandomState(10)
X = 10*rng.rand(1000, 5) # feature matrix
y = 0.9 + np.dot(X, [2.2, 4, -4, 1, 2]) # target vector
# GD implementation for linear regression
def GD(X, y, eta=0.1, n_iter=20):
theta = np.zeros((X.shape[0], X.shape[1]))
for i in range(n_iter):
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
theta = theta - eta * grad
return theta
# SGD implementation for linear regression
def SGD(X, y, eta=0.1, n_iter=20):
theta = np.zeros(1, X.shape[1])
for i in range(n_iter):
for j in range(X.shape[0]):
grad = 2 * np.mean((np.dot(theta.T, X[j,:]) - y[j]) * X[j,:])
theta = theta - eta * grad
return theta
# MSE loss for linear regression with numpy
def MSE(X, y, theta):
return np.mean((X.dot(theta.T) - y)**2)
# linear regression with GD and MSE with numpy
theta_gd = GD(X, y)
theta_sgd = SGD(X, y)
print('MSE with GD: ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
The error is
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
ValueError: operands could not be broadcast together with shapes (5,5) (1000,)
and I can't solve it.
Minor changes in your code that resolve dimensionality issues during matrix multiplication make the code run successfully. In particular, note that a linear regression on a design matrix X of dimension Nxk has a parameter vector theta of size k.
In addition, I'd suggest some changes in SGD() that make it a proper stochastic gradient descent. Namely, evaluating the gradient over random subsets of the data realized as realized by randomly partitioning the index set of the train data with np.random.shuffle() and looping through it. The batch_size determines the size of each subset after which the parameter estimate is updated. The argument seed ensures reproducibility.
# GD implementation for linear regression
def GD(X, y, eta=0.001, n_iter=100):
theta = np.zeros(X.shape[1])
for i in range(n_iter):
for j in range(X.shape[0]):
grad = (2 * np.mean(X[j,:] # theta - y[j]) * X[j,:]) # changed line
theta -= eta * grad
return theta
# SGD implementation for linear regression
def SGD(X, y, eta=0.001, n_iter=1000, batch_size=25, seed=7678):
theta = np.zeros(X.shape[1])
indexSet = list(range(len(X)))
np.random.seed(seed)
for i in range(n_iter):
np.random.shuffle(indexSet) # random shuffle of index set
for j in range(round(len(X) / batch_size)+1):
X_sub = X[indexSet[j*batch_size:(j+1)*batch_size],:]
y_sub = y[indexSet[j*batch_size:(j+1)*batch_size]]
if(len(X_sub) > 0):
grad = (2 * np.mean(X_sub # theta - y_sub) * X_sub) # changed line
theta -= eta * np.mean(grad, axis=0)
return theta
Running the code, I get
print('MSE with GD : ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
> MSE with GD : 0.07602
MSE with SGD: 0.05762
Each observation has 5 features, and X contains 1000 observations:
X = rng.rand(1000, 5) * 10 # X.shape == (1000, 5)
Create y which is perfectly linearly correlated with X (with no distortions):
real_weights = np.array([2.2, 4, -4, 1, 2]).reshape(-1, 1)
real_bias = 0.9
y = X # real_weights + real_bias # y.shape == (1000, 1)
G.D. implementation for linear regression:
Note:
w (weights) is your theta variable.
I have also added the calculation of b (bias).
def GD(X, y, eta=0.1, n_iter=20):
# Initialize weights and a bias (all zeros):
w = np.zeros((X.shape[1], 1)) # w.shape == (5, 1)
b = 0
# Gradient descent
for i in range(n_iter):
errors = X # w + b - y # errors.shape == (1000, 1)
dw = 2 * np.mean(errors * X, axis=0).reshape(5, 1)
db = 2 * np.mean(errors)
w -= eta * dw
b -= eta * db
return w, b
Testing:
w, b = GD(X, y, eta=0.003, n_iter=5000)
print(w, b)
[[ 2.20464905]
[ 4.00510139]
[-3.99569374]
[ 1.00444026]
[ 2.00407476]] 0.7805448262466914
Notes:
Your function SGD also contains some error..
I'm using the # operator because it's just my preference over np.dot.
I have a class Linear Regression and want to check how does it work with dataset load_boston. I calculated the Mean absolute percentage error (MAPE) and the result is nan.
import numpy as np
import warnings
from sklearn.base import BaseEstimator
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd
warnings.filterwarnings('ignore')
class LinearRegressionSGD(BaseEstimator):
def __init__(self, epsilon=1e-4, max_steps=1000, w0=None, alpha=1e-2):
'''
epsilon: difference for the rate of change of weights
max_steps: maximum number of steps in gradient descent
w0: np.array (d,) - initial weights
alpha: learning step
'''
self.epsilon = epsilon
self.max_steps = max_steps
self.w0 = w0
self.alpha = alpha
self.w = None
self.w_history = []
def fit(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: self
"""
l, d = X.shape
if self.w0 is None:
self.w0 = np.zeros(d)
self.w = self.w0
for step in range(self.max_steps):
self.w_history.append(self.w)
w_new = self.w - self.alpha * self.calc_gradient(X, y)
if (np.linalg.norm(w_new - self.w) < self.epsilon):
break
self.w = w_new
return self
def predict(self, X):
"""
X: np.array (l, d)
---
output: np.array (l)
"""
if self.w is None:
raise Exception('Not trained yet')
l, d = X.shape
y_pred = []
for i in range(l):
y_pred.append(np.dot(X[i], self.w))
return np.array(y_pred)
def calc_gradient(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: np.array (d)
"""
l, d = X.shape
gradient = []
for j in range(d):
dQ = 0
for i in range(l):
dQ += (2 / l) * X[i][j] * (np.dot(X[i], self.w) - y[i])
gradient.append(dQ)
return np.array(gradient)
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, test_size=0.3, random_state=10)
def MAPE(y_true, y_pred):
"""
y_true: np.array (l)
y_pred: np.array (l)
---
output: float [0, +inf)
"""
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Task 2
sgd = LinearRegressionSGD()
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)
print(MAPE(y_test, y_pred_sgd))
# Task 3
a, b = X_test.shape
w_0 = np.random.uniform(-2, 2, (b))
lr = LinearRegressionSGD(w0=w_0)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(MAPE(y_test, y_pred_lr))
But when I create X, y like below, the code works properly and MAPE gives float value
n_features = 2
n_objects = 300
num_steps = 100
np.random.seed(1)
w_true = np.random.normal(0, 0.1, size=(n_features, ))
w_0 = np.random.uniform(-2, 2, (n_features))
X = np.random.uniform(-5, 5, (n_objects, n_features))
y = np.dot(X, w_true) + np.random.normal(0, 1, (n_objects))
What is the problem with my code? and how to fix it to get the float value?
(Sorry for my bad English, its not my native language)
I'm doing a hands-on for learning and have created a model in python using numpy that's being trained on breast cancer dataSet from sklearn library. Model is running without any error and giving me Train and Test accuracy as 92.48826291079813% and 90.9090909090909% respectively. However somehow I'm not able to complete the hands-on since (probably) my result is different than expected. I don't know where the problem is because I don't know the right answer, also don't see any error.
Would request someone to help me with this. Code is given below.
#Import numpy as np and pandas as pd
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
**Define method initialiseNetwork() initilise weights with zeros of shape(num_features, 1) and also bias b to zero
parameters: num_features(number of input features)
returns : dictionary of weight vector and bias**
def initialiseNetwork(num_features):
W = np.zeros((num_features,1))
b = 0
parameters = {"W": W, "b": b}
return parameters
** define function sigmoid for the input z.
parameters: z
returns: $1/(1+e^{(-z)})$ **
def sigmoid(z):
a = 1/(1 + np.exp(-z))
return a
** Define method forwardPropagation() which implements forward propagtion defined as Z = (W.T dot_product X) + b, A = sigmoid(Z)
parameters: X, parameters
returns: A **
def forwardPropagation(X, parameters):
W = parameters["W"]
b = parameters["b"]
Z = np.dot(W.T,X) + b
A = sigmoid(Z)
return A
** Define function cost() which calculate the cost given by −(sum(Y\*log(A)+(1−Y)\*log(1−A)))/num_samples, here * is elementwise product
parameters: A,Y,num_samples(number of samples)
returns: cost **
def cost(A, Y, num_samples):
cost = -1/num_samples * np.sum(Y*np.log(A) + (1-Y)*(np.log(1-A)))
#cost = Y*np.log(A) + (1-Y)*(np.log(1-A))
return cost
** Define method backPropgation() to get the derivatives of weigths and bias
parameters: X,Y,A,num_samples
returns: dW,db **
def backPropagration(X, Y, A, num_samples):
dZ = A - Y
dW = (np.dot(X,dZ.T))/num_samples #(X dot_product dZ.T)/num_samples
db = np.sum(dZ)/num_samples #sum(dZ)/num_samples
return dW, db
** Define function updateParameters() to update current parameters with its derivatives
w = w - learning_rate \* dw
b = b - learning_rate \* db
parameters: parameters,dW,db, learning_rate
returns: dictionary of updated parameters **
def updateParameters(parameters, dW, db, learning_rate):
W = parameters["W"] - (learning_rate * dW)
b = parameters["b"] - (learning_rate * db)
return {"W": W, "b": b}
** Define the model for forward propagation
parameters: X,Y, num_iter(number of iterations), learning_rate
returns: parameters(dictionary of updated weights and bias) **
def model(X, Y, num_iter, learning_rate):
num_features = X.shape[0]
num_samples = X.shape[1]
parameters = initialiseNetwork(num_features) #call initialiseNetwork()
for i in range(num_iter):
#A = forwardPropagation(X, Y, parameters) # calculate final output A from forwardPropagation()
A = forwardPropagation(X, parameters)
if(i%100 == 0):
print("cost after {} iteration: {}".format(i, cost(A, Y, num_samples)))
dW, db = backPropagration(X, Y, A, num_samples) # calculate derivatives from backpropagation
parameters = updateParameters(parameters, dW, db, learning_rate) # update parameters
return parameters
** Run the below cell to define the function to predict the output.It takes updated parameters and input data as function parameters and returns the predicted output **
def predict(X, parameters):
W = parameters["W"]
b = parameters["b"]
b = b.reshape(b.shape[0],1)
Z = np.dot(W.T,X) + b
Y = np.array([1 if y > 0.5 else 0 for y in sigmoid(Z[0])]).reshape(1,len(Z[0]))
return Y
** The code in the below cell loads the breast cancer data set from sklearn.
The input variable(X_cancer) is about the dimensions of tumor cell and targrt variable(y_cancer) classifies tumor as malignant(0) or benign(1) **
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
** Split the data into train and test set using train_test_split(). Set the random state to 25. Refer the code snippet in topic 4 **
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
random_state = 25)
** Since the dimensions of tumor is not uniform you need to normalize the data before feeding to the network
The below function is used to normalize the input data. **
def normalize(data):
col_max = np.max(data, axis = 0)
col_min = np.min(data, axis = 0)
return np.divide(data - col_min, col_max - col_min)
** Normalize X_train and X_test and assign it to X_train_n and X_test_n respectively **
X_train_n = normalize(X_train)
X_test_n = normalize(X_test)
** Transpose X_train_n and X_test_n so that rows represents features and column represents the samples
Reshape Y_train and y_test into row vector whose length is equal to number of samples.Use np.reshape() **
X_trainT = X_train_n.T
#print(X_trainT.shape)
X_testT = X_test_n.T
#print(X_testT.shape)
y_trainT = y_train.reshape(1,X_trainT.shape[1])
y_testT = y_test.reshape(1,X_testT.shape[1])
** Train the network using X_trainT,y_trainT with number of iterations 4000 and learning rate 0.75 **
parameters = model(X_trainT, y_trainT, 4000, 0.75) #call the model() function with parametrs mentioned in the above cell
** Predict the output of test and train data using X_trainT and X_testT using predict() method> Use the parametes returned from the trained model **
yPredTrain = predict(X_trainT, parameters) # pass weigths and bias from parameters dictionary and X_trainT as input to the function
yPredTest = predict(X_testT, parameters) # pass the same parameters but X_testT as input data
** Run the below cell print the accuracy of model on train and test data. ***
accuracy_train = 100 - np.mean(np.abs(yPredTrain - y_trainT)) * 100
accuracy_test = 100 - np.mean(np.abs(yPredTest - y_testT)) * 100
print("train accuracy: {} %".format(accuracy_train))
print("test accuracy: {} %".format(accuracy_test))
My Output:
train accuracy: 92.48826291079813 %
test accuracy: 90.9090909090909 %
I figured out where the problem was. It was the third line in predict function where I was reshaping bias which was not at all necessary.
def predict(X, parameters):
W = parameters["W"]
b = parameters["b"]
**b = b.reshape(b.shape[0],1)**
Z = np.dot(W.T,X) + b
Y = np.array([1 if y > 0.5 else 0 for y in sigmoid(Z[0])]).reshape(1,len(Z[0]))
return Y
and third line in back-propagation function needed to be corrected as np.sum(dZ)/num_samples.
def backPropagration(X, Y, A, num_samples):
dZ = A - Y
dW = (np.dot(X,dZ.T))/num_samples
** db = sum(dZ)/num_samples **
return dW, db
After I corrected both functions, the model gave me train accuracy as 98.59154929577464% and test accuracy as 93.00699300699301%.
I tried to implement multivariate linear regression from scratch and it works pretty well actually. When I was testing it with a toy dataset, I run into sometimes the predictions were 'NaN'. I know what are the possible NaN reasons though, I couldn't understand which one causes it in my script.
Note: with 0.0001 learning rate and 1.000.000 iterations, I got a really good line for the dataset though, when learning rate is 0.001 and the number of iterations is 1.000.000, the predictions were NaN.
Here is the script:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np
class MultivariateLinearRegression():
#constructor
def __init__(self, learning_rate, learning_algorithm, epoch_num):
self.learning_rate = learning_rate
self.learning_algoritm = learning_algorithm
self.epoch_num = epoch_num
self.theta = 0
self.training_sample = 0
def train(self, X, Y):
Y = Y.reshape((Y.size, 1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
bias = np.ones([X.shape[0], 1])
X = np.concatenate((X, bias), 1)
self.theta = np.zeros([X.shape[1], 1])
self.training_sample = X.shape[0]
cost_history = []
for i in range (self.epoch_num):
hypothesis = X.dot(self.theta)
cost_func = np.transpose(X).dot(np.subtract(hypothesis, Y))
gradient = (self.learning_rate / self.training_sample) * cost_func
self.theta = np.subtract(self.theta, gradient)
cost_history.append(self.theta)
return cost_history
def predict(self, X):
X = np.array(X)
bias = np.ones([1]).reshape((1,1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
X = np.concatenate((X, bias))
return np.transpose(X).dot(self.theta)[0] # [63,1]
datas = pd.read_csv('pattern_recognition_data.txt').to_numpy()
X = datas[0:25,0]
Y = datas[0:25:,1]
X_test = datas[25:29, 0]
Y_test = datas[25:29, 1]
mlr = MultivariateLinearRegression(0.001, 'gradient descent', 1000000) # 0.0001 ve 1.000.000
mlr.train(X, Y)
Y_pred = []
for x in X_test:
print('X : ', x)
Y_pred.append(mlr.predict([x]))
plt.plot(X, Y, 'bs')
plt.plot(X_test, Y_pred, 'r')
Thanks in advance
The dataset:
39,144
47,220
45,138
47,145
65,162
46,142
67,170
42,124
67,158
56,154
64,162
56,150
59,140
34,110
42,128
48,130
45,135
17,114
20,116
19,124
36,136
50,142
39,120
21,120
44,160
53,158
63,144
29,130
25,125
69,175
I am following a tutorial on rnn's in TensorFlow but I have a question concerning the input formats.
They are taking raw_x (one hot vector) and basically first cutting that up in pieces of length 200 (batch_size) to form data_x. That is good.
Then they further cut up data_x in pieces of length 5 (num_step, or graph width) with:
for i in range(epoch_size):
x = data_x[:, i * num_steps:(i + 1) * num_steps]
y = data_y[:, i * num_steps:(i + 1) * num_steps]
yield (x, y)
However, if I look in the data, the slices of x do not match data_x. The first one does, but then they diverge.
Am I misunderstanding the above code? I would like to understand how x is being created or what it is supposed to look like.
I had expected the second item to be 0 1 0 1 0.
Also, I thought an epoch is when you go through the data completely, from this it seems that they split up the data in 1000 parts (epoch size)?
If it helps, this is my full code. I am trying to figure out what is going on in x. at line 48:
import numpy as np
import tensorflow as tf
# %matplotlib inline
import matplotlib.pyplot as plt
# Global config variables
num_steps = 5 # number of truncated backprop steps ('n' in the discussion above)
batch_size = 200
num_classes = 2
state_size = 4
learning_rate = 0.1
def gen_data(size=1000000):
print('generating data');
X = np.array(np.random.choice(2, size=(size,)))
Y = []
for i in range(size):
threshold = 0.5
if X[i-3] == 1:
threshold += 0.5
if X[i-8] == 1:
threshold -= 0.25
if np.random.rand() > threshold:
Y.append(0)
else:
Y.append(1)
return X, np.array(Y)
# adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/rnn/ptb/reader.py
def gen_batch(raw_data, batch_size, num_steps):
print('generating batches');
raw_x, raw_y = raw_data
data_length = len(raw_x)
# partition raw data into batches and stack them vertically in a data matrix
batch_partition_length = data_length // batch_size
data_x = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
data_y = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
for i in range(batch_size):
data_x[i] = raw_x[batch_partition_length * i:batch_partition_length * (i + 1)]
data_y[i] = raw_y[batch_partition_length * i:batch_partition_length * (i + 1)]
# further divide batch partitions into num_steps for truncated backprop
epoch_size = batch_partition_length // num_steps
for i in range(epoch_size):
x = data_x[:, i * num_steps:(i + 1) * num_steps]
y = data_y[:, i * num_steps:(i + 1) * num_steps]
yield (x, y)
def gen_epochs(n, num_steps):
for i in range(n):
yield gen_batch(gen_data(), batch_size, num_steps)
"""
Placeholders
"""
x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')
init_state = tf.zeros([batch_size, state_size])
"""
RNN Inputs
"""
# Turn our x placeholder into a list of one-hot tensors:
# rnn_inputs is a list of num_steps tensors with shape [batch_size, num_classes]
x_one_hot = tf.one_hot(x, num_classes)
rnn_inputs = tf.unstack(x_one_hot, axis=1)
"""
Definition of rnn_cell
This is very similar to the __call__ method on Tensorflow's BasicRNNCell. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py
"""
with tf.variable_scope('rnn_cell'):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
def rnn_cell(rnn_input, state):
with tf.variable_scope('rnn_cell', reuse=True):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
return tf.tanh(tf.matmul(tf.concat(axis=1, values=[rnn_input, state]), W) + b)
"""
Adding rnn_cells to graph
This is a simplified version of the "rnn" function from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn.py
"""
state = init_state
rnn_outputs = []
for rnn_input in rnn_inputs:
state = rnn_cell(rnn_input, state)
rnn_outputs.append(state)
final_state = rnn_outputs[-1]
"""
Predictions, loss, training step
Losses and total_loss are simlar to the "sequence_loss_by_example" and "sequence_loss"
functions, respectively, from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/seq2seq.py
"""
#logits and predictions
with tf.variable_scope('softmax'):
W = tf.get_variable('W', [state_size, num_classes])
b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]
predictions = [tf.nn.softmax(logit) for logit in logits]
# Turn our y placeholder into a list labels
y_as_list = [tf.squeeze(i, axis=[1]) for i in tf.split(axis=1, num_or_size_splits=num_steps, value=y)]
#losses and train_step
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logit,labels=label) for \
logit, label in zip(logits, y_as_list)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)
"""
Function to train the network
"""
def train_network(num_epochs, num_steps, state_size=4, verbose=True):
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
training_losses = []
for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
training_loss = 0
training_state = np.zeros((batch_size, state_size))
if verbose:
print("\nEPOCH", idx)
for step, (X, Y) in enumerate(epoch):
tr_losses, training_loss_, training_state, _ = \
sess.run([losses,
total_loss,
final_state,
train_step],
feed_dict={x:X, y:Y, init_state:training_state})
training_loss += training_loss_
if step % 100 == 0 and step > 0:
if verbose:
print("Average loss at step", step,
"for last 250 steps:", training_loss/100)
training_losses.append(training_loss/100)
training_loss = 0
return training_losses
training_losses = train_network(1,num_steps)
plt.plot(training_losses)
Seems like the batches are actually transposed.
So the first elements of the x-matrix (200 x 5) will fit the first 5 elements of x_raw.
Then only in the next iteration, the next 5-10 elements of x_raw will be in the first elements (again) of x.