I started learning about neural networks and followed the Google code lab on convolutional neural networks, except that I used the CIFAR-10 dataset for image classification. I'm getting very low accuracy and high cross-entropy.
After training, the accuracy is around 0.1 (never more than 0.2) and the cross-entropy never drops below 230 (the code scales the mean cross-entropy by 100, so that is roughly 2.3 nats per example, about ln(10), i.e. chance level for 10 classes). I know I didn't use batch normalization or dropout, but I would still expect better accuracy than this.
My code:
import tensorflow as tf
import numpy as np
import matplotlib as mpt
import math
# Just disables the warning, doesn't enable AVX/FMA
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def makeMiniBatch(dictionary, start, number):
    matrix = np.zeros([number, 3072], dtype=np.int)
    labels = np.zeros([number], dtype=np.int)
    for i in range(0, number):
        matrix[i] = dictionary[b'data'][i + start]
        labels[i] = dictionary[b'labels'][i + start]
    return matrix, labels

def formatLabels(labele):
    lab = np.zeros([100, 10])
    for i in range(0, 100):
        lab[i][labele[i]] = 1
    return lab

def formatData(values):
    temp = np.zeros([100, 32, 32, 3])
    for i in range(0, 100):
        im_r = values[i][0:1024].reshape(32, 32)
        im_g = values[i][1024:2048].reshape(32, 32)
        im_b = values[i][2048:].reshape(32, 32)
        temp[i] = np.dstack((im_r, im_g, im_b))
    return temp
batch='D:/cifar-10-python/cifar-10-batches-py/data_batch_1'
data=unpickle(batch)
tf.set_random_seed(0)
K = 8
L = 16
M = 32
N = 200
X_=tf.placeholder(tf.float32,[None,32,32,3])
Y_=tf.placeholder(tf.float32,[None,10])
lr = tf.placeholder(tf.float32)
W1 = tf.Variable(tf.truncated_normal([5, 5, 3, K], stddev=0.1))
B1 = tf.Variable(tf.ones([K])/10)
W2 = tf.Variable(tf.truncated_normal([5, 5, K, L], stddev=0.1))
B2 = tf.Variable(tf.ones([L])/10)
W3 = tf.Variable(tf.truncated_normal([4, 4, L, M], stddev=0.1))
B3 = tf.Variable(tf.ones([M])/10)
W4 = tf.Variable(tf.truncated_normal([8 * 8 * M, N], stddev=0.1))
B4 = tf.Variable(tf.ones([N])/10)
W5 = tf.Variable(tf.truncated_normal([N, 10], stddev=0.1))
B5 = tf.Variable(tf.ones([10])/10)
stride = 1
Y1_ = tf.nn.conv2d(X_, W1, strides=[1, stride, stride, 1], padding='SAME') + B1
Y1_max=tf.nn.max_pool(Y1_,ksize=[1,2,2,1],strides=[1,1,1,1],padding='SAME')
Y1 = tf.nn.relu(Y1_max)
Y2_ = tf.nn.conv2d(Y1, W2, strides=[1, stride, stride, 1], padding='SAME') + B2
Y2_max=tf.nn.max_pool(Y2_,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
Y2 = tf.nn.relu(Y2_max)
Y3_ = tf.nn.conv2d(Y2, W3, strides=[1, stride, stride, 1], padding='SAME') + B3
Y3_max=tf.nn.max_pool(Y3_,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
Y3 = tf.nn.relu(Y3_max)
YY = tf.reshape(Y3, shape=[-1, 8 * 8 * M])
Y4 = tf.nn.relu(tf.matmul(YY, W4) + B4)
Ylogits = tf.matmul(Y4, W5) + B5
Y = tf.nn.softmax(Ylogits)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
cross_entropy = tf.reduce_mean(cross_entropy)*100
correct_prediction=tf.equal(tf.argmax(Y,1),tf.argmax(Y_,1))
accuracy=tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
init=tf.global_variables_initializer()
sess=tf.Session()
sess.run(init)
def training_step(i):
    global data
    val, lab = makeMiniBatch(data, i * 100, 100)
    Y_labels = formatLabels(lab)
    X_data = formatData(val)
    max_learning_rate = 0.003
    min_learning_rate = 0.0001
    decay_speed = 2000.0
    learning_rate = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-i / decay_speed)
    _, a, c = sess.run([train_step, accuracy, cross_entropy], feed_dict={X_: X_data, Y_: Y_labels, lr: learning_rate})
    print("Accuracy: ", a)
    print("Cross-Entropy", c)

for i in range(0, 100):
    training_step(i % 100)
Thanks Maxim, the normalization worked: after 30 seconds of training the network achieved an accuracy of 40%.
The changes I made to my code are the following:
def formatDatanew2(values):
    ret = values.reshape(100, 3, 32, 32).transpose(0, 2, 3, 1).astype("float32")
    ret /= 255
    return ret
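With that helper in place, the corresponding line inside training_step simply switches to the new function (a sketch; the rest of the step is unchanged):

    X_data = formatDatanew2(val)  # pixel values are now floats in [0, 1]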
I have a neural network in Python, but it gives almost exactly the same prediction for each data point, and I can't work out why. I have tried altering the features I use to make the predictions, but I get the same issue. Thanks for any help.
I have a data file which looks like this:
Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
from kaggle.
My neural network code is this:
import numpy as np
import pandas as pd
data = pd.read_csv("diabetes.csv", header=0)
print(data.head())
training_examples = data[["BloodPressure", "Glucose", "Outcome"]]
X = training_examples[["BloodPressure", "Glucose"]].to_numpy()
y = training_examples[["Outcome"]].to_numpy()
DIMENSIONS = 2
HIDDEN_LAYER = 20
# Set up the training data
# X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
# y = np.array([[0], [1], [1], [0]])
# Set the number of epochs and the learning rate
num_epochs = 10
learning_rate = 0.1
# Initialize the weights and biases
w1 = np.random.randn(DIMENSIONS, HIDDEN_LAYER)
b1 = np.zeros((1, HIDDEN_LAYER))
w2 = np.random.randn(HIDDEN_LAYER, 1)
b2 = np.zeros((1, 1))
# Define the sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Define the derivative of the sigmoid function
# (note: this expects the sigmoid output, so x * (1 - x) == sigmoid'(z) when x == sigmoid(z))
def sigmoid_derivative(x):
    return x * (1 - x)
# Train the network
for epoch in range(num_epochs):
    # Forward pass
    z1 = np.dot(X, w1) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(a1, w2) + b2
    a2 = sigmoid(z2)
    # Calculate the loss
    loss = np.mean((a2 - y)**2)
    # Print the loss every 100 epochs
    if epoch % 100 == 0:
        print(f'Epoch {epoch}: loss = {loss}')
    # Backpropagation
    dz2 = a2 - y
    dw2 = np.dot(a1.T, dz2)
    db2 = np.sum(dz2, axis=0)
    da1 = np.dot(dz2, w2.T)
    dz1 = da1 * sigmoid_derivative(a1)
    dw1 = np.dot(X.T, dz1)
    db1 = np.sum(dz1, axis=0)
    # Update the weights and biases
    w1 -= learning_rate * dw1
    b1 -= learning_rate * db1
    w2 -= learning_rate * dw2
    b2 -= learning_rate * db2
# Make predictions on the test data
predictions = a2
# Print the predictions
print(predictions)
So I made a simple neural network for MNIST (784 input neurons, 30 hidden neurons, and 10 output neurons), but the cost function (MSE) always increases to 4.5 and never decreases, and the output neurons eventually all just output 1. Here's the code:
np.set_printoptions(suppress=True)
epochs = 50
batch = 60000
learning_rate = 3
B1 = np.random.randn(30, 1)
B2 = np.random.randn(10, 1)
W1 = np.random.randn(784, 30)
W2 = np.random.randn(30, 10)
for i in range(epochs):
    X, Y = shuffle(X, Y)
    c_B1 = np.zeros(B1.shape)
    c_B2 = np.zeros(B2.shape)
    c_W1 = np.zeros(W1.shape)
    c_W2 = np.zeros(W2.shape)
    for b in range(0, np.size(X, 0), batch):
        inputs = X[b:b+batch]
        outputs = Y[b:b+batch]
        Z1 = nn_forward(inputs, W1.T, B1)
        A1 = sigmoid(Z1)
        Z2 = nn_forward(A1, W2.T, B2)
        A2 = sigmoid(Z2)
        e_L = (outputs - A2) * d_sig(Z2)
        e_1 = np.multiply(np.dot(e_L, W2.T), d_sig(Z1))
        d_B2 = np.sum(e_L, axis=0)
        d_B1 = np.sum(e_1, axis=0)
        d_W2 = np.dot(A1.T, e_L)
        d_W1 = np.dot(inputs.T, e_1)
        d_B2 = d_B2.reshape((np.size(B2, 0), 1))
        d_B1 = d_B1.reshape((np.size(B1, 0), 1))
        c_B1 = np.add(c_B1, d_B1)
        c_B2 = np.add(c_B2, d_B2)
        c_W1 = np.add(c_W1, d_W1)
        c_W2 = np.add(c_W2, d_W2)
    B1 = np.subtract(B1, (learning_rate/batch) * c_B1)
    B2 = np.subtract(B2, (learning_rate/batch) * c_B2)
    W1 = np.subtract(W1, (learning_rate/batch) * c_W1)
    W2 = np.subtract(W2, (learning_rate/batch) * c_W2)
    print(i, cost(outputs, A2))
What am I doing wrong?
Two things I notice right away:
1. Why do you use MSE as the loss function for a classification problem? MSE is usually used for regression problems; try cross-entropy instead.
2. You have sigmoid as the output activation, which maps each output to the interval (0, 1). For classification you should look at the argmax of your output vector and use that as the predicted class label.
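For illustration only (this is not the asker's code; the names Z2, A2 and outputs are borrowed from the question, the rest is an assumption), a minimal numpy sketch of a softmax output with cross-entropy loss and an argmax prediction:

import numpy as np

def softmax(z):
    # subtract the row-wise max for numerical stability
    e = np.exp(z - np.max(z, axis=1, keepdims=True))
    return e / np.sum(e, axis=1, keepdims=True)

def cross_entropy(probs, targets_one_hot):
    # mean negative log-likelihood over the batch; the small epsilon avoids log(0)
    return -np.mean(np.sum(targets_one_hot * np.log(probs + 1e-12), axis=1))

# With Z2 as the raw output-layer pre-activations and outputs as the one-hot labels:
#   A2 = softmax(Z2)
#   loss = cross_entropy(A2, outputs)
#   predicted_labels = np.argmax(A2, axis=1)

A convenient side effect of pairing softmax with cross-entropy is that the output-layer error term simplifies to A2 - outputs, with no separate d_sig factor.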
I had a curious experience with Keras.
Info: input dataset shapes
16 features, 5000 observations
target variable: 1 dimension
Problem: Regression
While writing code for students, I developed a toy network using TensorFlow with the following code (I know it's not a complete example, but I hope it gives you enough information):
n1 = 15 # Number of neurons in layer 1
n2 = 15 # Number of neurons in layer 2
n3 = 15
nx = number_of_x_points
n_dim = nx
n4 = 1
stddev_f = 0.1
tf.set_random_seed(5)
X = tf.placeholder(tf.float32, [n_dim, None])
Y = tf.placeholder(tf.float32, [10, None])
W1 = tf.Variable(tf.random_normal([n1, n_dim], stddev=stddev_f))
b1 = tf.Variable(tf.constant(0.0, shape = [n1,1]) )
W2 = tf.Variable(tf.random_normal([n2, n1], stddev=stddev_f))
b2 = tf.Variable(tf.constant(0.0, shape = [n2,1]))
W3 = tf.Variable(tf.random_normal([n3,n2], stddev = stddev_f))
b3 = tf.Variable(tf.constant(0.0, shape = [n3,1]))
W4 = tf.Variable(tf.random_normal([n4,n3], stddev = stddev_f))
b4 = tf.Variable(tf.constant(0.0, shape = [n4,1]))
X = tf.placeholder(tf.float32, [nx, None]) # Inputs
Y = tf.placeholder(tf.float32, [1, None]) # Labels
Z1 = tf.nn.sigmoid(tf.matmul(W1, X) + b1) # n1 x n_dim * n_dim x n_obs = n1 x n_obs
Z2 = tf.nn.sigmoid(tf.matmul(W2, Z1) + b2) # n2 x n1 * n1 * n_obs = n2 x n_obs
Z3 = tf.nn.sigmoid(tf.matmul(W3, Z2) + b3)
Z4 = tf.matmul(W4, Z3) + b4
y_ = tf.sigmoid(Z4)
cost = tf.reduce_mean(tf.square(y_-Y))
learning_rate = 0.005
training_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
training_epochs = 1000
cost_history = np.empty(shape=[1], dtype = float)
cost_meas_history = np.empty(shape=[1], dtype = float)
train_x = np.transpose(data)
train_y = np.transpose(targets)
cost_history = []
for epoch in range(training_epochs+1):
    for i in range(0, train_x.shape[0], batch_size):
        x_batch = train_x[i:i + batch_size, :]
        y_batch = train_y[i:i + batch_size, :]
        sess.run(training_step, feed_dict={X: x_batch, Y: y_batch})
    cost_ = sess.run(cost, feed_dict={X: train_x, Y: train_y})
    cost_history = np.append(cost_history, cost_)
    if (epoch % 5000 == 0):
        print("Reached epoch", epoch, "cost J =", cost_)
This code works quite well and takes about 5 seconds for 1000 epochs on my laptop. I then built the same network with Keras using the following code:
model = tf.keras.Sequential()
model.add(layers.Dense(15, input_dim=16, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=tf.train.AdamOptimizer(0.005),
              loss='mse',
              metrics=['mae'])
# Training Phase
model.fit(train_x.transpose(), train_y.transpose()/100.0, epochs=1000, batch_size=100,verbose = 0)
This code takes 43 seconds. Does anyone have any idea why this is the case? I expected Keras to be slower, but not by that much. What am I missing?
Thanks, Umberto
Ok, I found the reason... it was my mistake. Owing to a series of mistakes made while programming at night, after midnight (...), I was in fact comparing batch GD and mini-batch GD. My apologies to everyone, and thanks to today, who noticed my mistake...
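For reference, the difference between the two in a rough sketch (update_parameters is only a placeholder here, not a function from the code above):

# Batch gradient descent: one parameter update per epoch, using the whole dataset
for epoch in range(training_epochs):
    update_parameters(train_x, train_y)

# Mini-batch gradient descent: one update per mini-batch, so many updates per epoch
for epoch in range(training_epochs):
    for i in range(0, train_x.shape[0], batch_size):
        update_parameters(train_x[i:i + batch_size], train_y[i:i + batch_size])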
If someone thinks this should be deleted, that's fine with me.
Now Keras and plain TF are taking exactly the same time. Thanks everyone for reading.
Best, Umberto
I'm trying to create a neural network for binary classification on the breast cancer dataset:
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
My neural network consists of 3 layers (not including the input layer):
first layer: 6 neurons with tanh activation.
second layer: 6 neurons with tanh activation.
final layer: 1 neuron with sigmoid activation.
Unfortunately, I'm only getting around 44% accuracy in the training examples and around 23% accuracy in the test examples.
Here is my python code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("data.csv")
data = data.drop(['id'], axis = 1)
data = data.drop(data.columns[31], axis = 1)
data = data.replace({'M': 1, 'B': 0})
X = data
X = X.drop(['diagnosis'], axis = 1)
X = np.array(X)
X_mean = np.mean(X, axis = 1, keepdims = True)
X_std = np.std(X, axis = 1, keepdims = True)
X_n = (X - X_mean) / X_std
y = np.array(data['diagnosis'])
y = y.reshape(569, 1)
m = 378
y_train = y[:m, :]
y_test = y[m:, :]
X_train = X_n[:m, :]
X_test = X_n[m:, :]
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def dsigmoid(z):
    return np.multiply(z, (1 - z))

def tanh(z):
    return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z))

def dtanh(z):
    return 1 - np.square(tanh(z))

def cost(A, Y):
    m = Y.shape[0]
    return -(1.0/m) * np.sum(np.dot(Y.T, np.log(A)) + np.dot((1 - Y).T, np.log(1 - A)))
def train(X, y, model, epocs, a):
    W1 = model['W1']
    W2 = model['W2']
    W3 = model['W3']
    b1 = model['b1']
    b2 = model['b2']
    b3 = model['b3']
    costs = []
    for i in range(epocs):
        # forward propagation
        z1 = np.dot(X, W1) + b1
        a1 = tanh(z1)
        z2 = np.dot(a1, W2) + b2
        a2 = tanh(z2)
        z3 = np.dot(a2, W3) + b3
        a3 = sigmoid(z3)
        costs.append(cost(a3, y))
        # back propagation
        dz3 = z3 - y
        d3 = np.multiply(dz3, dsigmoid(z3))
        dW3 = np.dot(a2.T, d3)
        db3 = np.sum(d3, axis=0, keepdims=True)
        d2 = np.multiply(np.dot(d3, W3.T), dtanh(z2))
        dW2 = np.dot(a1.T, d2)
        db2 = np.sum(d2, axis=0, keepdims=True)
        d1 = np.multiply(np.dot(d2, W2.T), dtanh(z1))
        dW1 = np.dot(X.T, d1)
        db1 = np.sum(d1, axis=0, keepdims=True)
        W1 -= (a / m) * dW1
        W2 -= (a / m) * dW2
        W3 -= (a / m) * dW3
        b1 -= (a / m) * db1
        b2 -= (a / m) * db2
        b3 -= (a / m) * db3
    cache = {'W1': W1, 'W2': W2, 'W3': W3, 'b1': b1, 'b2': b2, 'b3': b3}
    return cache, costs
np.random.seed(0)
model = {'W1': np.random.rand(30, 6) * 0.01, 'W2': np.random.rand(6, 6) * 0.01, 'W3': np.random.rand(6, 1) * 0.01, 'b1': np.random.rand(1, 6), 'b2': np.random.rand(1, 6), 'b3': np.random.rand(1, 1)}
model, costss = train(X_train, y_train, model, 1000, 0.1)
plt.plot([i for i in range(1000)], costss)
print(costss[999])
plt.show()
def predict(X, y, model):
    W1 = model['W1']
    W2 = model['W2']
    W3 = model['W3']
    b1 = model['b1']
    b2 = model['b2']
    b3 = model['b3']
    z1 = np.dot(X, W1) + b1
    a1 = tanh(z1)
    z2 = np.dot(a1, W2) + b2
    a2 = tanh(z2)
    z3 = np.dot(a2, W3) + b3
    a3 = sigmoid(z3)
    m = a3.shape[0]
    y_predict = np.zeros((m, 1))
    for i in range(m):
        y_predict = 1 if a3[i, 0] > 0.5 else 0
    return y_predict
Thanks for helping :)
I think there is a problem with your backpropagation (I made a quick test and tried your model on Tensorflow and it achieves around 92% accuracy on both train and test data).
I've made the following modification to your code:
dz3 = a3 - y
d3 = np.multiply(dz3, dsigmoid(a3))
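In context, the top of the back-propagation block inside train then reads (a sketch that keeps the question's variable names and only swaps in the two lines above):

# back propagation
dz3 = a3 - y
d3 = np.multiply(dz3, dsigmoid(a3))
dW3 = np.dot(a2.T, d3)
db3 = np.sum(d3, axis=0, keepdims=True)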
Also, your predict function returns only a single number, whereas it should return as many numbers as there are examples. Therefore, instead of
y_predict = np.zeros((m, 1))
for i in range(m):
    y_predict = 1 if a3[i, 0] > 0.5 else 0
return y_predict
I changed this part to
y_predict[a3[:,0] > 0.5] = 1
return y_predict
I ran the training for 2000 epochs and increased the learning rate to 1 (a=1).
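With that change applied, the corrected predict function ends up looking roughly like this (a sketch reusing the question's helpers and names):

def predict(X, y, model):
    W1, W2, W3 = model['W1'], model['W2'], model['W3']
    b1, b2, b3 = model['b1'], model['b2'], model['b3']
    a1 = tanh(np.dot(X, W1) + b1)
    a2 = tanh(np.dot(a1, W2) + b2)
    a3 = sigmoid(np.dot(a2, W3) + b3)
    y_predict = np.zeros((a3.shape[0], 1))
    y_predict[a3[:, 0] > 0.5] = 1  # one 0/1 prediction per example
    return y_predict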
I'm using TensorFlow to implement a neural network with 4 sigmoid-activated hidden layers; the following is the code.
import tensorflow as tf
import numpy as np
from sklearn import preprocessing
import pandas as pd
#importing and preprocessing data
def get_data():
    df = pd.read_csv('date_train.csv')
    date_train = df.as_matrix()
    X1 = date_train[:, :-1]
    Y_float = date_train[:, -1]
    Y = Y_float.astype(int)
    # standardization
    X = preprocessing.scale(X1)
    N, D = X.shape
    K = len(set(Y))  # length of possible y values
    df_test = pd.read_csv('date_test.csv')
    X_test = df_test.as_matrix()
    df_ans = pd.read_csv('date_test_ans.csv')
    Y_ans = df_ans.as_matrix()
    Y_test = Y_ans[:, -1]
    return X, Y, N, D, K, X_test, Y_test
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))
#model
def forward(tfX, W1, b1, W2, b2, W3, b3, W4, b4, W5, b5):
    l1 = tf.add(tf.matmul(tfX, W1), b1)
    l1 = tf.nn.sigmoid(l1)
    l2 = tf.add(tf.matmul(l1, W2), b2)
    l2 = tf.nn.sigmoid(l2)
    l3 = tf.add(tf.matmul(l2, W3), b3)
    l3 = tf.nn.sigmoid(l3)
    l4 = tf.add(tf.matmul(l3, W4), b4)
    l4 = tf.nn.sigmoid(l4)
    output = tf.matmul(l4, W5) + b5
    return output
X, Y, N, D, K, X_test, Y_test = get_data()
T = np.zeros((N,K))
for i in range(N):
    T[i, Y[i]] = 1
# params
h1 = 18
h2 = 18
h3 = 18
h4 = 18
learning_rate = 0.01
epochs = 100
tfX = tf.placeholder(tf.float32, [None, D])
tfY = tf.placeholder(tf.float32, [None, K])
W1 = init_weights([D,h1])
b1 = init_weights([h1])
W2 = init_weights([h1,h2])
b2 = init_weights([h2])
W3 = init_weights([h2,h3])
b3 = init_weights([h3])
W4 = init_weights([h3,h4])
b4 = init_weights([h4])
W5 = init_weights([h4,K])
b5 = init_weights([K])
#train
logits = forward(tfX, W1, b1, W2, b2, W3, b3, W4, b4, W5, b5)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY, logits=logits))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost)
predict_op = tf.argmax(logits, 1)
correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(tfY,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(epochs):
    sess.run(train_op, feed_dict={tfX: X, tfY: T})
    pred = sess.run(predict_op, feed_dict={tfX: X, tfY: T})
    if i % 100 == 0:
        print("Accuracy Rate", np.mean(Y == pred))
print(accuracy.eval(predict_op, feed_dict={tfX: X_test, tfY: Y_test}))
The training part of the code runs fine, but when I try to evaluate the model it produces this error (note this is my first time using eval outside of the tutorials):
TypeError: eval() got multiple values for argument 'feed_dict'
I'm also running this on tensorflow-gpu, if that matters.
Since this is not an interactive session, you should use eval within a session.
print(accuracy.eval(feed_dict={tfX: X_test, tfY: Y_test}, session=sess))
Or
with tf.Session() as sess:
    print(accuracy.eval(feed_dict={tfX: X_test, tfY: Y_test}))
Or, declare sess as an interactive session, like so:
sess = tf.InteractiveSession()
And use eval as you are in your present code. Note that eval only accepts feed_dict and session as arguments, so predict_op should not be passed to it; that extra positional argument is what triggers the "multiple values for argument 'feed_dict'" error.
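Alternatively, you can skip eval entirely and run the accuracy tensor through the session you already created, assuming X_test and Y_test are shaped the way the placeholders expect:

print(sess.run(accuracy, feed_dict={tfX: X_test, tfY: Y_test}))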