Something wrong with Sigmoid curve for Logistic Regression - python

I'm trying to use logistic regression on the popularity of hit songs on Spotify from 2010-2019, based on their durations and durability, with the data collected from a .csv file. Since the popularity values of each song are numerical, I have converted each of them to a binary value, "0" or "1": if the popularity value of a song is less than or equal to 70 I replace it with 0, and with 1 if it is greater than 70. Even though the rest of my code for creating a sigmoid function is pretty standard, for some reason the end result is a straight line instead of a sigmoid curve.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('top10s [SubtitleTools.com] (2).csv')
BPM = df.bpm
BPM = np.array(BPM)
Energy = df.nrgy
Energy = np.array(Energy)
Dance = df.dnce
Dance = np.array(Dance)
dB = df.dB
dB = np.array(dB)
Live = df.live
Live = np.array(Live)
Valence = df.val
Valence = np.array(Valence)
Acous = df.acous
Acous = np.array(Acous)
Speech = df.spch
Speech = np.array(Speech)
df.loc[df['popu'] <= 70, 'popu'] = 0
df.loc[df['popu'] > 70, 'popu'] = 1
def Logistic_Regression(X, y, iterations, alpha):
    ones = np.ones((X.shape[0], ))
    X = np.vstack((ones, X))
    X = X.T
    b = np.zeros(X.shape[1])
    for i in range(iterations):
        z = np.dot(X, b)
        p_hat = sigmoid(z)
        gradient = np.dot(X.T, (y - p_hat))
        b = b + alpha * gradient
        if (i % 1000 == 0):
            print('LL, i ', log_likelihood(X, y, b), i)
    return b

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def log_likelihood(X, y, b):
    z = np.dot(X, b)
    LL = np.sum(y*z - np.log(1 + np.exp(z)))
    return LL

def LR1():
    Dur = df.dur
    Dur = np.array(Dur)
    Pop = df.popu
    Pop = [int(i) for i in Pop]; Pop = np.array(Pop)
    plt.figure(figsize=(10,8))
    colormap = np.array(['r', 'b'])
    plt.scatter(Dur, Pop, c = colormap[Pop], alpha = .4)
    b = Logistic_Regression(Dur, Pop, iterations = 8000, alpha = 0.00005)
    print('Done')
    p_hat = sigmoid(np.dot(Dur, b[1]) + b[0])
    idxDur = np.argsort(Dur)
    plt.plot(Dur[idxDur], p_hat[idxDur])
    plt.show()

LR1()
df

Your logistic regression parameters aren't coming out correctly, so something is going wrong in your gradient descent.
If I do
from sklearn.linear_model import LogisticRegression
df = pd.DataFrame({'popu': [0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
                   'dur':  [217, 283, 200, 295, 221, 176, 206, 260, 217, 213]})
Dur = df['dur'].to_numpy()
Pop = df['popu'].to_numpy()
logreg = LogisticRegression()
logreg.fit(Dur.reshape([10, 1]), Pop)
print(logreg.coef_)
print(logreg.intercept_)
I get [0.86473507, -189.79655798]
whereas your params (b) come out [0.012136874150412973 -0.2430389407767768] for this data.
Plot of your fit vs. the scikit-learn fit here.
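To visualise what the scikit-learn fit looks like on this toy data, a minimal sketch (reusing the 10-row frame above) evaluates the fitted sigmoid over the duration range and overlays it on the scatter; it should come out S-shaped rather than a straight line:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

df = pd.DataFrame({'popu': [0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
                   'dur':  [217, 283, 200, 295, 221, 176, 206, 260, 217, 213]})
Dur = df['dur'].to_numpy()
Pop = df['popu'].to_numpy()

logreg = LogisticRegression()
logreg.fit(Dur.reshape(-1, 1), Pop)

xs = np.linspace(Dur.min(), Dur.max(), 200)
p_hat = 1 / (1 + np.exp(-(logreg.intercept_[0] + logreg.coef_[0, 0] * xs)))  # fitted sigmoid

plt.scatter(Dur, Pop, alpha=0.4)
plt.plot(xs, p_hat)
plt.xlabel('duration'); plt.ylabel('P(popular)')
plt.show()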


How can I implement interface conditions using DeepXDE for solving differential equations?

I have been using DeepXDE (a framework for solving differential equations). I am particularly interested in implementing interface conditions, for example, to represent perfect thermal contact and heat flux continuity at an interface between two different solids.
Problem:
So far, I've considered a simple heat transfer problem, as if it were a rod composed of two different materials, with Dirichlet conditions at x=0 and x=L:
from x=0 to x=L/2, we have conductivity coefficient a_1 and temperature T_1(x,t);
from x=L/2 to x=L, we have coefficient a_2 and temperature T_2(x,t);
at the interface, we have to satisfy both T_1 - T_2 = 0 and a_1*dT_1/dx + a_2*dT_2/dx = 0 at x=L/2 for t>0 (a quick numeric check of these conditions is sketched just below).
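As a quick sanity check of these two conditions (plain NumPy, nothing DeepXDE-specific; the piecewise-linear steady state below is a construction for the parameter values used later in the code):

import numpy as np

a1, a2, L, T0, TL = 1.14, 0.01, 1.0, 0.0, 1.0
# Steady state: linear in each half, with continuous flux a1*dT1/dx = a2*dT2/dx.
s1 = 2 * (TL - T0) * a2 / (L * (a1 + a2))   # slope of T1 on [0, L/2]
s2 = 2 * (TL - T0) * a1 / (L * (a1 + a2))   # slope of T2 on [L/2, L]
T1 = lambda x: T0 + s1 * x
T2 = lambda x: TL - s2 * (L - x)

print(T1(L/2) - T2(L/2))   # temperature continuity at the interface: ~0
print(a1 * s1 - a2 * s2)   # flux continuity: ~0 (with the outward normals of the two subdomains, the fluxes add up to zero)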
Although I did not find a concise solution, I tried to implement this problem. But, I have some questions:
I found a way to enforce the heat flux continuity using geom.boundary_normal( ). But, the respective loss is not decreasing (in fact, it is constant). Is it correct to use geom.boundary_normal( )? Is there an alternative way?
I am struggling to come up with a way to enforce T_1 - T_2 = 0. How could I get the values of T_1 and T_2 at x=L/2 during training?
My code is as follows:
# Libraries to import
import deepxde as dde
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Parameters and geometries
a1 = 1.14   # conductivity coefficient 1
a2 = 0.01   # conductivity coefficient 2, fairly different from a1
L = 1.0     # total length
T0 = 0.0    # temperature specified at x=0
TL = 1.0    # temperature specified at x=L
tend = 1.0  # final time for the simulation
geom1 = dde.geometry.Interval(0, L/2)   # first solid
geom2 = dde.geometry.Interval(L/2, L)   # second solid
timedomain = dde.geometry.TimeDomain(0, tend)
geomtime = dde.geometry.GeometryXTime(geom1 | geom2, timedomain)
geomtime1 = dde.geometry.GeometryXTime(geom1, timedomain)   # space-time domain of the first solid
geomtime2 = dde.geometry.GeometryXTime(geom2, timedomain)   # space-time domain of the second solid

# Models and the respective domains
def pde_T1(x, y, _):
    dy_t = dde.grad.jacobian(y, x, i=0, j=1)
    dy_xx = dde.grad.hessian(y, x, i=0, j=0)
    return dy_t - a1 * dy_xx

def pde_T2(x, y, _):
    dy_t = dde.grad.jacobian(y, x, i=0, j=1)
    dy_xx = dde.grad.hessian(y, x, i=0, j=0)
    return dy_t - a2 * dy_xx

def on_domain1(x, on_domain):
    return geom1.inside(x)[0]

def on_domain2(x, on_domain):
    return geom2.inside(x)[0]

# Boundary and initial conditions
def on_boundary1(x, on_boundary):
    return on_boundary and np.isclose(x[0], 0)

def on_boundary2(x, on_boundary):
    return on_boundary and np.isclose(x[0], L)

def boundary_initial(x, on_initial):
    return on_initial and np.isclose(x[1], 0)

# Interface conditions
def on_interf(x, on_boundary):
    return on_boundary and np.isclose(x[0], L/2)

def flux_int(x, y, X):
    # I need help here.
    return (a1*geom1.boundary_normal(X) + a2*geom2.boundary_normal(X)).reshape(-1, 1)

def Temp_int(x, y, X):
    # I need help here.
    # T1_int: how to get from geom1 at x=L/2?
    # T2_int: how to get from geom2 at x=L/2?
    pass

# Setting the IC
def init_func(X):
    x = X[:, 0:1]
    y = X[:, 1:2]
    t = np.zeros((len(X), 1))
    for count, x_ in enumerate(x):
        if x_ < L/2:
            t[count] = T0
        else:
            t[count] = T0 + 2*(TL - T0) * (x_ - L/2)
    return t

ic = dde.IC(geomtime, init_func, boundary_initial)

# Setting the BCs
pde1 = dde.OperatorBC(geomtime1, pde_T1, on_boundary=on_domain1)
pde2 = dde.OperatorBC(geomtime2, pde_T2, on_boundary=on_domain2)
bc1 = dde.icbc.DirichletBC(geomtime1, lambda x: T0*np.ones((len(x), 1)), on_boundary1)
bc2 = dde.icbc.DirichletBC(geomtime2, lambda x: TL*np.ones((len(x), 1)), on_boundary2)  # not used in loss

# Setting the BC at the interface with 500 points
X = np.hstack((np.full((500), L/2).reshape(-1, 1), timedomain.random_points(500))).reshape(-1, 2)
FluxInterf = dde.icbc.PointSetOperatorBC(X,
                                         np.zeros((X.shape[0], 1)),  # fluxes must add up to zero at x=L/2
                                         lambda x, y, X: flux_int(x, y, X[:, 0]))

# Setting the problem
loss = [pde1, pde2, bc1, ic, FluxInterf]
data = dde.data.TimePDE(
    geomtime,
    None,
    loss,
    num_domain=1000,
    num_boundary=500,
    num_initial=500,
    num_test=500)
loss_weights = [10, 10, 0.1, 0.1, 100]
net = dde.nn.FNN([2] + 4 * [50] + [1], "tanh", "Glorot normal")

# Enforcing the BC at x=L
def output_transform(x, y):
    xx, t = x[:, 0:1], x[:, 1:2]
    return (L - xx)*y + TL

net.apply_output_transform(output_transform)

model = dde.Model(data, net)
model.compile("adam", lr=1.0e-3, loss_weights=loss_weights)
losshistory, train_state = model.train(iterations=25000)
model.compile("L-BFGS")
losshistory, train_state = model.train()
dde.saveplot(losshistory, train_state, issave=True, isplot=True)
Thank you for your time and consideration.
Best regards.

Fit quadratic function to data using MSE

So my idea (borrowed from neural network folks) is that if I have a data set D, I can fit a quadratic curve to it by first calculating the derivative of the error with respect to the parameters (a, b, and c), and then doing a small update that minimizes the error. My problem is that the following code doesn't actually manage to fit the curve. For linear models a similar approach works, but quadratic seems to fail for some reason. Can you see what I have done wrong (either in my assumptions or just an implementation error)?
EDIT: The question was not specific enough. The following code will not deal with bias in the data very well. For some reason it updates the a and b parameters in such a way that c gets left behind. This method is similar to robotics (finding a path using Jacobians) and neural networks (finding parameters based on error), so it is not an unreasonable algorithm; the question is why this specific implementation does not produce the results I would expect.
In the following Python code, I import math as m, and MSE is a function that calculates the Mean Squared Error between two arrays. Other than that, the code is self-contained.
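For reference, the per-parameter gradients used below are dE/da = (2/n)*sum(x^2*(a*x^2 + b*x + c - y)), dE/db = (2/n)*sum(x*(a*x^2 + b*x + c - y)), and dE/dc = (2/n)*sum(a*x^2 + b*x + c - y). A small NumPy sketch with made-up data that checks the dE/da formula against a central finite difference (dE/db and dE/dc can be checked the same way):

import numpy as np

x = np.arange(10.0)
data = 0.5 * x**2 - 2.0 * x + 3.0          # made-up sample data (no noise, to keep the check exact)
a, b, c = 1.0, 1.0, 1.0

def mse(a, b, c):
    return np.mean((a * x**2 + b * x + c - data) ** 2)

resid = a * x**2 + b * x + c - data
deda = np.mean(2 * x**2 * resid)            # analytic dE/da, same formula as in the code below
eps = 1e-6
deda_fd = (mse(a + eps, b, c) - mse(a - eps, b, c)) / (2 * eps)   # central finite difference
print(deda, deda_fd)                        # the two values should agree closely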
Code:
def quadraticRegression(data, dErr):
    a = 1  # Starting values
    b = 1
    c = 1
    a_momentum = 0  # Momentum to counter steady state error
    b_momentum = 0
    c_momentum = 0
    estimate = [a*x**2 + b*x + c for x in range(len(data))]  # Estimate curve
    error = MSE(data, estimate)  # Get errors 'n stuff
    errorOld = 0
    lr = 0.0000000001  # learning rate
    while abs(error - errorOld) > dErr:
        # Fit a (dE/da)
        deda = sum([ 2*x**2 * (a*x**2 + b*x + c - data[x]) for x in range(len(data)) ])/len(data)
        correction = deda*lr
        a_momentum = (a_momentum)*0.99 + correction*0.1  # 0.99 is to slow down momentum when correction speed changes
        a = a - correction - a_momentum
        # Fit b (dE/db)
        dedb = sum([ 2*x*(a*x**2 + b*x + c - data[x]) for x in range(len(data))])/len(data)
        correction = dedb*lr
        b_momentum = (b_momentum)*0.99 + correction*0.1
        b = b - correction - b_momentum
        # Fit c (dE/dc)
        dedc = sum([ 2*(a*x**2 + b*x + c - data[x]) for x in range(len(data))])/len(data)
        correction = dedc*lr
        c_momentum = (c_momentum)*0.99 + correction*0.1
        c = c - correction - c_momentum
        # Update model and find errors
        estimate = [a*x**2 + b*x + c for x in range(len(data))]
        errorOld = error
        print(error)
        error = MSE(data, estimate)
    return a, b, c, error
To me it looks like your code works correctly! At least the algorithm is correct. I've changed your code to use NumPy for fast computation instead of pure Python, tuned some parameters a bit (the momentum and the learning rate), and implemented MSE.
Then I used matplotlib to draw an animated plot. The animation shows that your regression does try to fit the curve to the data. At the last iteration of fitting sin(x) for x in [0, 2*pi] it looks like a linear approximation, but it is still as close to the data points as a quadratic curve can be. For sin(x) for x in [0, pi] it looks like an ideal approximation (it starts fitting well from around the 12th iteration).
The i-th frame of the animation just runs the regression with dErr = 0.7 ** (i + 15).
The animation is a bit slow if you just run the script, but if you add the save argument, like python script.py save, it will render and save the animation to line.gif. If you run the script without arguments, it will animate the plotting/fitting on your screen in real time.
The full code comes after the graphics; it requires installing a couple of Python modules once with python -m pip install numpy matplotlib.
Next is sin(x) for x in (0, pi):
Next is sin(x) for x in (0, 2 * pi):
Next is abs(x) for x in (-1, 1):
# Needs: python -m pip install numpy matplotlib
import math, sys
import numpy as np, matplotlib.pyplot as plt, matplotlib.animation as animation
from matplotlib.animation import FuncAnimation

x_range = (0., math.pi, 0.1)  # (xmin, xmax, xstep)
y_range = (-0.2, 1.2)         # (ymin, ymax)
num_iterations = 50

def f(x):
    return np.sin(x)

def derr(iteration):
    return 0.7 ** (iteration + 15)

def MSE(a, b):
    return (np.abs(np.array(a) - np.array(b)) ** 2).mean()

def quadraticRegression(*, x, data, dErr):
    x, data = np.array(x), np.array(data)
    assert x.size == data.size, (x.size, data.size)
    a = 1  # Starting values
    b = 1
    c = 1
    a_momentum = 0.1  # Momentum to counter steady state error
    b_momentum = 0.1
    c_momentum = 0.1
    estimate = a*x**2 + b*x + c  # Estimate curve
    error = MSE(data, estimate)  # Get errors 'n stuff
    errorOld = 0.
    lr = 10. ** -4  # learning rate
    while abs(error - errorOld) > dErr:
        # Fit a (dE/da)
        deda = np.sum(2*x**2 * (a*x**2 + b*x + c - data))/len(data)
        correction = deda*lr
        a_momentum = (a_momentum)*0.99 + correction*0.1  # 0.99 is to slow down momentum when correction speed changes
        a = a - correction - a_momentum
        # Fit b (dE/db)
        dedb = np.sum(2*x*(a*x**2 + b*x + c - data))/len(data)
        correction = dedb*lr
        b_momentum = (b_momentum)*0.99 + correction*0.1
        b = b - correction - b_momentum
        # Fit c (dE/dc)
        dedc = np.sum(2*(a*x**2 + b*x + c - data))/len(data)
        correction = dedc*lr
        c_momentum = (c_momentum)*0.99 + correction*0.1
        c = c - correction - c_momentum
        # Update model and find errors
        estimate = a*x**2 + b*x + c
        errorOld = error
        #print(error)
        error = MSE(data, estimate)
    return a, b, c, error

fig, ax = plt.subplots()
fig.set_tight_layout(True)
x = np.arange(x_range[0], x_range[1], x_range[2])
#ax.scatter(x, x + np.random.normal(0, 3.0, len(x)))
line0, line1 = None, None
do_save = len(sys.argv) > 1 and sys.argv[1] == 'save'

def g(x, derr):
    a, b, c, error = quadraticRegression(x = x, data = f(x), dErr = derr)
    return a * x ** 2 + b * x + c

def dummy(x):
    return np.ones_like(x, dtype = np.float64) * 100.

def update(i):
    global line0, line1
    de = derr(i)
    if line0 is None:
        assert line1 is None
        line0, = ax.plot(x, f(x), 'r-', linewidth=2)
        line1, = ax.plot(x, g(x, de), 'r-', linewidth=2, color = 'blue')
        ax.set_ylim(y_range[0], y_range[1])
    if do_save:
        sys.stdout.write(str(i) + ' ')
        sys.stdout.flush()
    label = 'iter {0} derr {1}'.format(i, round(de, math.ceil(-math.log(de) / math.log(10)) + 2))
    line1.set_ydata(g(x, de))
    ax.set_xlabel(label)
    return line1, ax

if __name__ == '__main__':
    anim = FuncAnimation(fig, update, frames = np.arange(0, num_iterations), interval = 200)
    if do_save:
        anim.save('line.gif', dpi = 200, writer = 'imagemagick')
    else:
        plt.show()

Use loop for np.random or reshape array to matrix?

I am new to programming in general. For a project, I am trying to randomly choose outcomes according to the probability of each outcome for lotteries that I have generated, and I would like to use a loop to get random numbers each time.
This is my code:
import numpy as np
p = np.arange(0.01, 1, 0.001, dtype = float)
alpha = 0.5
alpha = float(alpha)
alpha = np.zeros((1, len(p))) + alpha
def w(alpha, p):
    return np.exp(-(-np.log(p))**alpha)

w = w(alpha, p)

def P(w):
    return np.exp(np.log2(w))

prob_win = P(w)
prob_lose = 1 - prob_win
E = 10
E = float(E)
E = np.zeros((1, len(p))) + E
b = 0
b = float(b)
b = np.zeros((1, len(p))) + b

def A(E, b, prob_win):
    return (E - b * (1 - prob_win)) / prob_win

a = A(E, b, prob_win)
a = a.squeeze()
prob_array = (prob_win, prob_lose)
prob_matrix = np.vstack(prob_array).T.squeeze()
outcomes_array = (a, b)
outcomes_matrix = np.vstack(outcomes_array).T
outcome_pairs = np.vsplit(outcomes_matrix, len(p))
outcome_pairs = np.array(outcome_pairs).astype(float)
prob_pairs = np.vsplit(prob_matrix, len(p))
prob_pairs = np.array(prob_pairs)
nominalized_prob_pairs = [outcome_pairs / np.sum(outcome_pairs) for
                          outcome_pairs in np.vsplit(prob_pairs, len(p))]
The code works fine, but I would like to use a loop or something similar for the next line of code, because I want to get 5 realizations for each row / pair of probabilities. When I use size=5 I just get a really long list and no longer know which values belong to which pair, whereas with size=1 I do:
realisations = np.concatenate([np.random.choice(outcome_pairs[i].ravel(), size=1,
                                                p=nominalized_prob_pairs[i].ravel())
                               for i in range(len(outcome_pairs))])
Or, if I use size=5 as below, how can I match the realizations to the initial probabilities? Do I need to cut the array after every 5th element and then store the values in a matrix with 5 columns and a new row for every 5th element of the initial array? If yes, how could I do this?
realisations = np.concatenate([np.random.choice(outcome_pairs[i].ravel(), size=5,
                                                p=nominalized_prob_pairs[i].ravel())
                               for i in range(len(outcome_pairs))])
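One way to keep the size=5 draws aligned with their pairs (a sketch reusing the outcome_pairs and nominalized_prob_pairs arrays built above) is to stack the per-pair draws row by row, so that row i holds the 5 realizations for probability pair i:

realisations = np.stack([np.random.choice(outcome_pairs[i].ravel(), size=5,
                                          p=nominalized_prob_pairs[i].ravel())
                         for i in range(len(outcome_pairs))])
print(realisations.shape)   # (len(p), 5): one row of 5 draws per probability pair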
What are you trying to produce exactly? Be more concise.
Here is some clean starter code you can use to produce linear data.
import numpy as np

def generate_data(n_samples, variance):
    # generate 2D data
    X = np.random.random((n_samples, 1))
    # adding a vector of ones to ease calculus
    X = np.concatenate((np.ones((n_samples, 1)), X), axis=1)
    # generate two random coefficients
    W = np.random.random((2, 1))
    # construct targets with our data and weights
    y = X @ W
    # add some noise to our data
    y += np.random.normal(0, variance, (n_samples, 1))
    return X, y, W

if __name__ == "__main__":
    X, Y, W = generate_data(10, 0.5)
    # check random value of x for example
    for x in X:
        print(x, end=' --> ')
        if x[1] <= 0.4:
            print('prob <= 0.4')
        else:
            print('prob > 0.4')

How to calculate logistic regression accuracy

I am a complete beginner in machine learning and coding in Python, and I have been tasked with coding logistic regression from scratch to understand what happens under the hood. So far I have coded the hypothesis function, cost function and gradient descent, and then coded the logistic regression itself. However, the accuracy I print out is low (0.69) and doesn't change with increasing iterations or a different learning rate. My question is: is there a problem with my accuracy code below? Any help pointing me in the right direction would be appreciated.
X = data[['radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst']]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = data["diagnosis"].map({'M':1,'B':0})
Y = np.array(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25)
X = data["diagnosis"].map(lambda x: float(x))
def Sigmoid(z):
    if z < 0:
        return 1 - 1/(1 + math.exp(z))
    else:
        return 1/(1 + math.exp(-z))

def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i]*theta[i]
    return Sigmoid(z)

def Cost_Function(X,Y,theta,m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta,xi)
        error = Y[i] * math.log(hi if hi >0 else 1)
        if Y[i] == 1:
            error = Y[i] * math.log(hi if hi >0 else 1)
        elif Y[i] == 0:
            error = (1-Y[i]) * math.log(1-hi if 1-hi >0 else 1)
        sumOfErrors += error
    constant = -1/m
    J = constant * sumOfErrors
    #print ('cost is: ', J )
    return J

def Cost_Function_Derivative(X,Y,theta,j,m,alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta,X[i])
        error = (hi - Y[i])*xij
        sumErrors += error
    m = len(Y)
    constant = float(alpha)/float(m)
    J = constant * sumErrors
    return J

def Gradient_Descent(X,Y,theta,m,alpha):
    new_theta = []
    constant = alpha/m
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X,Y,theta,j,m,alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta

def Accuracy(theta):
    correct = 0
    length = len(X_test, Hypothesis(X,theta))
    for i in range(length):
        prediction = round(Hypothesis(X[i],theta))
        answer = Y[i]
        if prediction == answer.all():
            correct += 1
    my_accuracy = (correct / length)*100
    print ('LR Accuracy %: ', my_accuracy)

def Logistic_Regression(X,Y,alpha,theta,num_iters):
    theta = np.zeros(X.shape[1])
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X,Y,theta,m,alpha)
        theta = new_theta
        if x % 100 == 0:
            Cost_Function(X,Y,theta,m)
            print ('theta: ', theta)
            print ('cost: ', Cost_Function(X,Y,theta,m))
    Accuracy(theta)

initial_theta = [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
alpha = 0.0001
iterations = 1000
Logistic_Regression(X,Y,alpha,initial_theta,iterations)
This uses data from the Wisconsin breast cancer dataset (https://www.kaggle.com/uciml/breast-cancer-wisconsin-data), where I am using 30 features - although changing the features to ones which are known to correlate also doesn't change my accuracy.
Python gives us the scikit-learn library, which makes our work easier; this worked for me:
from sklearn.metrics import accuracy_score
y_pred = log.predict(x_test)
score = accuracy_score(y_test, y_pred)
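For reference, on binary labels accuracy_score is just the fraction of matching predictions, so the same number can be reproduced with plain NumPy:

import numpy as np
manual_score = np.mean(np.asarray(y_test) == np.asarray(y_pred))   # fraction of correct predictions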
Accuracy is one of the most intuitive performance measures: it is simply the ratio of correctly predicted observations to the total number of observations. Higher accuracy means the model is performing better.
Accuracy = (TP + TN) / (TP + TN + FP + FN)
TP = True positives
TN = True negatives
FP = False positives
FN = False negatives
Accuracy is only a good measure when your false positives and false negatives have similar cost. Otherwise, a better metric is the F1-score, which is given by
F1-score = 2*(Precision*Recall)/(Precision + Recall), where
Precision = TP/(TP + FP)
Recall = TP/(TP + FN)
Read more here
https://en.wikipedia.org/wiki/Precision_and_recall
The beauty of machine learning in Python is that important modules like scikit-learn are open source, so you can always look at the actual code.
The link below points to the scikit-learn metrics source code, which will give you an idea of how scikit-learn calculates the accuracy score when you do:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)
https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/metrics
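As a concrete companion to the formulas above, the same quantities can be computed directly with scikit-learn (a short sketch on made-up labels):

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

y_true = [1, 0, 1, 1, 0, 0, 1, 0]   # made-up labels
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]   # made-up predictions
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print((tp + tn) / (tp + tn + fp + fn), accuracy_score(y_true, y_pred))   # same value twice
prec, rec = precision_score(y_true, y_pred), recall_score(y_true, y_pred)
print(2 * prec * rec / (prec + rec), f1_score(y_true, y_pred))            # same value twice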
I'm not sure how you arrived at a value of 0.0001 for alpha, but I think it's too low. Using your code with the cancer data shows that cost is decreasing with each iteration -- it's just going glacially.
When I raise it to 0.5, I still get a decreasing cost, but at a more reasonable rate. After 1000 iterations it reports:
cost: 0.23668000993020666
And after fixing the Accuracy function I'm getting 92% on the test segment of the data.
You have Numpy installed, as shown by X = np.array(X). You should really consider using it for your operations. It will be orders of magnitude faster for jobs like this. Here is a vectorized version that gives results instantly rather than waiting:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("cancerdata.csv")
X = df.values[:,2:-1].astype('float64')
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

## Add a bias column to the data
X = np.hstack([np.ones((X.shape[0], 1)), X])
X = MinMaxScaler().fit_transform(X)
Y = df["diagnosis"].map({'M':1,'B':0})
Y = np.array(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25)

def Sigmoid(z):
    return 1/(1 + np.exp(-z))

def Hypothesis(theta, x):
    return Sigmoid(x @ theta)

def Cost_Function(X,Y,theta,m):
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
    return J

def Cost_Function_Derivative(X,Y,theta,m,alpha):
    hi = Hypothesis(theta,X)
    _y = Y.reshape(-1, 1)
    J = alpha/float(m) * X.T @ (hi - _y)
    return J

def Gradient_Descent(X,Y,theta,m,alpha):
    new_theta = theta - Cost_Function_Derivative(X,Y,theta,m,alpha)
    return new_theta

def Accuracy(theta):
    correct = 0
    length = len(X_test)
    prediction = (Hypothesis(theta, X_test) > 0.5)
    _y = Y_test.reshape(-1, 1)
    correct = prediction == _y
    my_accuracy = (np.sum(correct) / length)*100
    print ('LR Accuracy %: ', my_accuracy)

def Logistic_Regression(X,Y,alpha,theta,num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X,Y,theta,m,alpha)
        theta = new_theta
        if x % 100 == 0:
            #print ('theta: ', theta)
            print ('cost: ', Cost_Function(X,Y,theta,m))
    Accuracy(theta)

ep = .012
initial_theta = np.random.rand(X_train.shape[1], 1) * 2 * ep - ep
alpha = 0.5
iterations = 2000
Logistic_Regression(X_train,Y_train,alpha,initial_theta,iterations)
I think I might have a different version of scikit-learn, because I had to change the MinMaxScaler line to make it work. The result is that I can run 10K iterations in the blink of an eye, and applying the model to the test set gives about 97% accuracy.
This also works, using vectorization to calculate the accuracy. But accuracy is not the recommended metric, as the answer above noted (if the data is not well balanced you should not use accuracy; use the F1-score instead):
import numpy as np
import sklearn.linear_model

clf = sklearn.linear_model.LogisticRegressionCV()
clf.fit(X.T, Y.T)
LR_predictions = clf.predict(X.T)
print('Accuracy of logistic regression: %d ' % float((np.dot(Y, LR_predictions) + np.dot(1-Y, 1-LR_predictions))/float(Y.size)*100) +
      '% ' + "(percentage of correctly labelled datapoints)")

Questions on Logistic Regression

I'm now using the training set from OpenClassroom (http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=DeepLearning&doc=exercises/ex4/ex4.html) to try out Logistic Regression, and I only use LR, unlike that page, which uses both LR and Newton's method.
Below is my code:
from numpy import *
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append(float(lineArr2[0]))
    return dataMat,labelMat

def sigmoid(inX):
    return 1.0/(1+exp(-inX))

# def autoNorm(dataSet):
#     # newValue = (oldValue-min)/(max-min)
#     minVals = min(dataSet)
#     maxVals = max(dataSet)
#     ranges = list(map(lambda x: x[0]-x[1], zip(maxVals, minVals)))
#     normDataSet = zeros(shape(dataSet))
#     m,n = shape(dataSet)
#     normDataSet = list(map(lambda x: x[0]-x[1], zip(dataSet,tile(minVals, (m,1)))))
#     normDataSet = normDataSet/tile(ranges, (m,1))
#     return normDataSet, ranges, minVals

def gradDescent(dataMatIn, classLabels):
    x = mat(dataMatIn)
    y = mat(classLabels).transpose()
    m,n = shape(x)
    alpha = 0.001
    maxCycles = 100000
    theta = ones((n,1))
    for k in range(maxCycles):
        h = sigmoid(x*theta)
        error = h - y
        cost = -1*dot(log(h).T,y)-dot((1-y).T,log(1-h))
        print("Iteration %d | Cost: %f" % (k, cost))
        theta = theta - alpha * (x.transpose() * error /m)
    return theta

def plotBestFit(weights):
    dataMat,labelMat=loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i])== 1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    min_x = min(mat(dataMat)[:, 1])
    max_x = max(mat(dataMat)[:, 1])
    x = arange(min_x, max_x, 1)
    y = (-weights[0]-weights[1]*x)/weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2');
    plt.show()

dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel)
print weights
plotBestFit(weights.getA())
Here are my questions:
1. I trained it for 100,000 iterations, with the error printed on each iteration, but I didn't see it converge; well, actually I'm not sure here.
2. I'm not sure how to plot the classifier correctly with matplotlib. When maxCycles is 200,000 I can get a somewhat reasonable classifier, but when maxCycles is 100,000 the plot does not seem reasonable at all.
(plot for maxCycles = 100,000)
UPDATE CODE:
count = 0
for i in range(80):
    result = sigmoid(dataMat[i] * weights)
    if result > 0.5:
        a = 1
    else:
        a = 0
    if float(a) != classLabel[i][0]:
        count += 1
errorRate = (float(count)/80)
print "error count is: %f, error rate is: %f" %(count,errorRate)
Your code is actually fine! Here are some remarks:
You initialized the thetas with all ones. I would not do so in this example. The first call of the sigmoid function will return values close to 1, because the product of theta and x gives very large numbers. The computation of log(1 - h) can then result in an error, because log is not defined at 0. I prefer to initialize the thetas with 0's (a short numerical illustration of this follows after these remarks).
When calculating the cost function you missed the division by m. It does not matter for the algorithm, but it's better to follow the theory.
It's a good idea to plot the cost function, and not just print its values. The correct trend can be seen very clearly.
In order to converge, this particular example needs many more iterations. I reached a good result at 500,000 iterations.
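A small numerical illustration of the first remark (the feature values here are made up, roughly on the scale of the exam scores in the exercise data):

import numpy as np

x = np.array([1.0, 55.0, 70.0])                 # bias term plus two made-up exam scores
theta_ones = np.ones(3)
h = 1.0 / (1.0 + np.exp(-(x @ theta_ones)))     # z = 126, so h rounds to exactly 1.0
print(np.log(1 - h))                            # -inf (with a RuntimeWarning): the cost blows up
theta_zeros = np.zeros(3)
h0 = 1.0 / (1.0 + np.exp(-(x @ theta_zeros)))   # 0.5, safely inside (0, 1)
print(np.log(1 - h0))                           # finite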
The post has been updated, see the UPDATE below
Here are my plots:
As you can see the resulting separation line matches the plot shown in your tutorial very well.
Here is my code. It differs a little bit from yours, but they are very similar.
import numpy as np
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append([float(lineArr2[0])])
    return dataMat,labelMat

def sigmoid(inX):
    return 1.0/(1+np.exp(-inX))

def gradDescent(dataMatIn, classLabels, alpha, maxCycles):
    x = np.mat(dataMatIn)
    y = np.mat(classLabels)
    m,n = np.shape(x)
    n = n - 1  # usually n is the number of features (without the 1's)
    theta = np.zeros((n+1,1))
    cost_history = []  # list to accumulate the cost values
    for k in range(maxCycles):
        h = sigmoid(x*theta)
        cost = ((-np.multiply(y, np.log(h)) - np.multiply(1-y, np.log(1-h))).sum(axis=0)/m)[0, 0]
        if ((k % 1000) == 0):
            cost_history.append(cost)  # on each 1000th iteration the cost is saved to a list
        grad = (x.transpose() * (h - y))/m
        theta = theta - alpha*grad
    plot_cost = 1
    if (plot_cost == 1):
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()
    return theta

def plotBestFit(dataMat, classLabel, weights):
    arrY = np.asarray(classLabel)
    arrX = np.asarray(dataMat)
    ind1 = np.where(arrY == 1)[0]
    ind0 = np.where(arrY == 0)[0]
    min_x1 = min(np.mat(dataMat)[:, 1])
    max_x1 = max(np.mat(dataMat)[:, 1])
    x1_val = np.arange(min_x1, max_x1, 1)
    x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()

dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel, 0.0014, 500000)
print(weights)
plotBestFit(dataMat, classLabel, weights)
UPDATE
After reading your questions in the comments on the first edition of this post, I tried to optimize the code to achieve convergence of the cost function with a much smaller number of iterations.
Indeed, feature standardization works miracles :)
An even better result was achieved after only 30 iterations!
Here are the new plots:
Because of the standardization, you need to scale each new test example in the same way before classifying it (see the short sketch after the code).
Here is the new code. I changed some data types to avoid unnecessary data type conversions.
import numpy as np
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append([float(lineArr2[0])])
    return np.asarray(dataMat), np.asarray(labelMat)

def sigmoid(inX):
    return 1.0/(1+np.exp(-inX))

def gradDescent(x, y, alpha, maxCycles):
    m,n = np.shape(x)
    n = n - 1  # usually n is the number of features (without the 1's)
    theta = np.zeros((n+1,1))
    cost_history = []  # list to accumulate the cost values
    cost_iter = []
    for k in range(maxCycles):
        h = sigmoid(np.dot(x, theta))
        cost = np.sum(-np.multiply(y, np.log(h)) - np.multiply(1-y, np.log(1-h)))/m
        cost_history.append(cost)  # the cost is saved on every iteration
        cost_iter.append(k)
        grad = np.dot(x.transpose(), (h - y))/m
        theta = theta - alpha*grad
    plot_cost = 1
    if (plot_cost == 1):
        plt.plot(cost_iter, cost_history)
        plt.title("Cost")
        plt.show()
    return theta

def plotBestFit(arrX, arrY, weights):
    ind1 = np.where(arrY == 1)[0]
    ind0 = np.where(arrY == 0)[0]
    min_x1 = min(arrX[:, 1:2])
    max_x1 = max(arrX[:, 1:2])
    x1_val = np.arange(min_x1, max_x1, 0.1)
    x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()

dataMat, classLabel = loadDataSet()
m = np.shape(dataMat)[0]

# standardization
dataMatMean = np.mean(dataMat, axis=0)
dataMatStd = np.std(dataMat, axis=0)
dataMatMean_m = np.tile(dataMatMean, (m, 1))
dataMatStd_m = np.tile(dataMatStd, (m, 1))
dataMatStand = np.copy(dataMat)
dataMatStand[:, 1:3] = np.divide((dataMatStand[:, 1:3] - dataMatMean_m[:, 1:3]), dataMatStd_m[:, 1:3])

weights = gradDescent(dataMatStand, classLabel, 1.0, 30)
print(weights)
plotBestFit(dataMatStand, classLabel, weights)
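As a small follow-up to the remark about scaling new examples, a sketch that reuses the variables from the script above (the raw scores are made up):

x_new = np.array([1.0, 20.0, 80.0])                              # bias term plus two made-up raw exam scores
x_new[1:3] = (x_new[1:3] - dataMatMean[1:3]) / dataMatStd[1:3]   # standardize with the training mean/std
p = sigmoid(x_new @ weights)                                     # weights has shape (3, 1)
print('admitted' if p[0] > 0.5 else 'not admitted')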
