Polynomial Regression without scikit-learn - Python

I tried implementing polynomial regression from scratch. However, for any value of n other than 3, the error increases significantly and the x vs y_hat plot actually starts sloping downwards.
Logs of the data have been taken to get rid of the outliers.
import random
import numpy as np
import matplotlib.pyplot as plt
import math

x = np.array([math.log10(1), math.log10(9), math.log10(22), math.log10(24), math.log10(25), math.log10(26), math.log10(27), math.log10(28), math.log10(29), math.log10(30), math.log10(31), math.log10(32), math.log10(33), math.log10(34), math.log10(35)])
y = np.array([math.log10(8), math.log10(9), math.log10(51), math.log10(115), math.log10(164), math.log10(209), math.log10(278), math.log10(321), math.log10(382), math.log10(456), math.log10(596), math.log10(798), math.log10(1140), math.log10(1174), math.log10(1543)])

c = random.random()
plt.scatter(x, y)
n = 3
m = []
x_real = []
alpha = 0.0001
y_hat = []

# Build one column per power of x, with a random starting coefficient each
for i in range(1, n+1):
    x_real.append(x**i)
    m.append(random.random())

x_real = np.array(x_real)
m = np.array(m)
x_real = np.transpose(x_real)
y_hat = np.matmul(x_real, m) + c
error = 0.5*(np.sum((y - y_hat)**2))
print(error)

sum = np.sum(y_hat - y)
for epochs in range(101):
    for items in range(n):
        m[items] = m[items] - (alpha*(sum*x[items]))
    c = c - (alpha*sum)
    y_hat = (np.matmul(x_real, m)) + c
    error = 0.5*(np.sum((y - y_hat)**2))
    print(error)
plt.plot(x, y_hat)

You need to update the value of sum on each epoch:
prev = 0
for epochs in range(101):
    sum = np.sum(y_hat - y)
    for items in range(n):
        m[items] = m[items] - (alpha*(sum*x[items]))
    c = c - (alpha*sum)
    y_hat = (np.matmul(x_real, m)) + c
    error = 0.5*(np.sum((y - y_hat)**2))
    if error == prev:
        break
    prev = error  # remember the last error so equal successive errors stop the loop
    print(error)
plt.plot(x, y_hat)
Just a small oversight, I assume!
Also, you can break out of the epoch loop once the errors are close enough, or in your case when they are equal on successive epochs; that is what the prev = error line above is for.
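One more thing worth flagging: even with sum updated, sum*x[items] is not the gradient of the squared-error loss, because x[items] indexes the first n data points rather than the n power columns of x_real. A minimal vectorized sketch of the full update, assuming the same 0.5*sum((y - y_hat)**2) loss and the variable names from the question (this replaces the whole epoch loop):

for epoch in range(101):
    y_hat = np.matmul(x_real, m) + c
    residual = y_hat - y                    # shape (n_samples,)
    grad_m = np.matmul(x_real.T, residual)  # one entry per power of x
    grad_c = np.sum(residual)
    m = m - alpha * grad_m
    c = c - alpha * grad_c
    print(0.5 * np.sum((y - y_hat) ** 2))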

Related

Jupyter script suddenly does not recognize class anymore (NameError)

I am trying to run a Jupyter script in PyCharm. The script takes about 2 hours to run. When I ran it for the first time, my machine ran out of memory and I exited PyCharm. I deleted some files from my PC and launched PyCharm again. When I opened my script and ran the code again, it gave me an error: suddenly it does not recognize my class 'ProgressBar' anymore, while it did recognize it the first time I ran the script. I did not have this error before. Does anyone know what is going on here?
This is the script:
import sys
import collections
import itertools
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
from scipy.spatial.distance import squareform
import pandas as p
plt.style.use('bmh')
%matplotlib inline

try:
    from IPython.display import clear_output
    have_ipython = True
except ImportError:
    have_ipython = False

class KnnDtw(object):
    """K-nearest neighbor classifier using dynamic time warping
    as the distance measure between pairs of time series arrays

    Arguments
    ---------
    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for KNN
    max_warping_window : int, optional (default = infinity)
        Maximum warping window allowed by the DTW dynamic
        programming function
    subsample_step : int, optional (default = 1)
        Step size for the timeseries array. By setting subsample_step = 2,
        the timeseries length will be reduced by 50% because every second
        item is skipped. Implemented by x[:, ::subsample_step]
    """
    def __init__(self, n_neighbors=5, max_warping_window=10000, subsample_step=1):
        self.n_neighbors = n_neighbors
        self.max_warping_window = max_warping_window
        self.subsample_step = subsample_step

    def fit(self, x, l):
        """Fit the model using x as training data and l as class labels

        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
            Training data set for input into KNN classifier
        l : array of shape [n_samples]
            Training labels for input into KNN classifier
        """
        self.x = x
        self.l = l

    def _dtw_distance(self, ts_a, ts_b, d=lambda x, y: abs(x - y)):
        """Returns the DTW similarity distance between two 2-D
        timeseries numpy arrays.

        Arguments
        ---------
        ts_a, ts_b : array of shape [n_samples, n_timepoints]
            Two arrays containing n_samples of timeseries data
            whose DTW distance between each sample of A and B
            will be compared
        d : DistanceMetric object (default = abs(x-y))
            the distance measure used for A_i - B_j in the
            DTW dynamic programming function

        Returns
        -------
        DTW distance between A and B
        """
        # Create cost matrix via broadcasting with large int
        ts_a, ts_b = np.array(ts_a), np.array(ts_b)
        M, N = len(ts_a), len(ts_b)
        cost = sys.maxsize * np.ones((M, N))
        # Initialize the first row and column
        cost[0, 0] = d(ts_a[0], ts_b[0])
        for i in range(1, M):
            cost[i, 0] = cost[i-1, 0] + d(ts_a[i], ts_b[0])
        for j in range(1, N):
            cost[0, j] = cost[0, j-1] + d(ts_a[0], ts_b[j])
        # Populate rest of cost matrix within window
        for i in range(1, M):
            for j in range(max(1, i - self.max_warping_window),
                           min(N, i + self.max_warping_window)):
                choices = cost[i-1, j-1], cost[i, j-1], cost[i-1, j]
                cost[i, j] = min(choices) + d(ts_a[i], ts_b[j])
        # Return DTW distance given window
        return cost[-1, -1]

    def _dist_matrix(self, x, y):
        """Computes the M x N distance matrix between the training
        dataset and testing dataset (y) using the DTW distance measure

        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
        y : array of shape [n_samples, n_timepoints]

        Returns
        -------
        Distance matrix between each item of x and y with
        shape [training_n_samples, testing_n_samples]
        """
        # Compute the distance matrix
        dm_count = 0
        # Compute condensed distance matrix (upper triangle) of pairwise dtw distances
        # when x and y are the same array
        if np.array_equal(x, y):
            x_s = np.shape(x)
            dm = np.zeros((x_s[0] * (x_s[0] - 1)) // 2, dtype=np.double)
            p = ProgressBar(np.shape(dm)[0])
            for i in range(0, x_s[0] - 1):
                for j in range(i + 1, x_s[0]):
                    dm[dm_count] = self._dtw_distance(x[i, ::self.subsample_step],
                                                      y[j, ::self.subsample_step])
                    dm_count += 1
                    p.animate(dm_count)
            # Convert to squareform
            dm = squareform(dm)
            return dm
        # Compute full distance matrix of dtw distances between x and y
        else:
            x_s = np.shape(x)
            y_s = np.shape(y)
            dm = np.zeros((x_s[0], y_s[0]))
            dm_size = x_s[0]*y_s[0]
            p = ProgressBar(dm_size)
            for i in range(0, x_s[0]):
                for j in range(0, y_s[0]):
                    dm[i, j] = self._dtw_distance(x[i, ::self.subsample_step],
                                                  y[j, ::self.subsample_step])
                    # Update progress bar
                    dm_count += 1
                    p.animate(dm_count)
            return dm

    def predict(self, x):
        """Predict the class labels or probability estimates for
        the provided data

        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
            Array containing the testing data set to be classified

        Returns
        -------
        2 arrays representing:
        (1) the predicted class labels
        (2) the knn label count probability
        """
        dm = self._dist_matrix(x, self.x)
        # Identify the k nearest neighbors
        knn_idx = dm.argsort()[:, :self.n_neighbors]
        # Identify k nearest labels
        knn_labels = self.l[knn_idx]
        # Model Label
        mode_data = mode(knn_labels, axis=1)
        mode_label = mode_data[0]
        mode_proba = mode_data[1]/self.n_neighbors
        return mode_label.ravel(), mode_proba.ravel()

class ProgressBar:
    """This progress bar was taken from PYMC"""
    def __init__(self, iterations):
        self.iterations = iterations
        self.prog_bar = '[]'
        self.fill_char = '*'
        self.width = 40
        self.__update_amount(0)
        if have_ipython:
            self.animate = self.animate_ipython
        else:
            self.animate = self.animate_noipython

    def animate_ipython(self, iter):
        print('\r', self, sys.stdout.flush())
        self.update_iteration(iter + 1)

    def update_iteration(self, elapsed_iter):
        self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
        self.prog_bar += ' %d of %s complete' % (elapsed_iter, self.iterations)

    def __update_amount(self, new_amount):
        percent_done = int(round((new_amount / 100.0) * 100.0))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
        self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
        pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
        pct_string = '%d%%' % percent_done
        self.prog_bar = self.prog_bar[0:pct_place] + \
            (pct_string + self.prog_bar[pct_place + len(pct_string):])

    def __str__(self):
        return str(self.prog_bar)

time = np.linspace(0, 20, 1000)
amplitude_a = 5*np.sin(time)
amplitude_b = 3*np.sin(time + 1)

m = KnnDtw()
distance = m._dtw_distance(amplitude_a, amplitude_b)

fig = plt.figure(figsize=(12, 4))
_ = plt.plot(time, amplitude_a, label='A')
_ = plt.plot(time, amplitude_b, label='B')
_ = plt.title('DTW distance between A and B is %.2f' % distance)
_ = plt.ylabel('Amplitude')
_ = plt.xlabel('Time')
_ = plt.legend()

#m._dist_matrix(np.random.random((4,50)), np.random.random((4,50)))

# Import the HAR dataset
x_train_file = open('UCI HAR Dataset/train/X_train.txt', 'r')
y_train_file = open('UCI HAR Dataset/train/y_train.txt', 'r')
x_test_file = open('UCI HAR Dataset/test/X_test.txt', 'r')
y_test_file = open('UCI HAR Dataset/test/y_test.txt', 'r')

# Create empty lists
x_train = []
y_train = []
x_test = []
y_test = []

# Mapping table for classes
labels = {1: 'WALKING', 2: 'WALKING UPSTAIRS', 3: 'WALKING DOWNSTAIRS',
          4: 'SITTING', 5: 'STANDING', 6: 'LAYING'}

# Loop through datasets
for x in x_train_file:
    x_train.append([float(ts) for ts in x.split()])
for y in y_train_file:
    y_train.append(int(y.rstrip('\n')))
for x in x_test_file:
    x_test.append([float(ts) for ts in x.split()])
for y in y_test_file:
    y_test.append(int(y.rstrip('\n')))

# Convert to numpy for efficiency
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

m = KnnDtw(n_neighbors=1, max_warping_window=10)
m.fit(x_train[::10], y_train[::10])
label, proba = m.predict(x_test[::10])

from sklearn.metrics import classification_report, confusion_matrix
# print(classification_report(label, y_test[::10], target_names=[l for l in labels.values()]))

conf_mat = confusion_matrix(label, y_test[::10])

fig = plt.figure(figsize=(6, 6))
width = np.shape(conf_mat)[1]
height = np.shape(conf_mat)[0]

res = plt.imshow(np.array(conf_mat), cmap=plt.cm.summer, interpolation='nearest')
for i, row in enumerate(conf_mat):
    for j, c in enumerate(row):
        if c > 0:
            plt.text(j-.2, i+.1, c, fontsize=16)

cb = fig.colorbar(res)
plt.title('Confusion Matrix')
_ = plt.xticks(range(6), [l for l in labels.values()], rotation=90)
_ = plt.yticks(range(6), [l for l in labels.values()])

import time
time_taken = []
windows = [1, 2, 5, 10, 50, 100, 500, 1000, 5000]

for w in windows:
    begin = time.time()
    t = KnnDtw(n_neighbors=1, max_warping_window=w)
    t.fit(x_train[:20], y_train[:20])
    label, proba = t.predict(x_test[:20])
    end = time.time()
    time_taken.append(end - begin)

fig = plt.figure(figsize=(12, 5))
_ = plt.plot(windows, [t/400. for t in time_taken], lw=4)
plt.title('DTW Execution Time with \nvarying Max Warping Window')
plt.ylabel('Execution Time (seconds)')
plt.xlabel('Max Warping Window')
plt.xscale('log')
Error code:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-3-4c142a5156b6> in <module>
188 return mode_label.ravel(), mode_proba.ravel()
189
--> 190 class ProgressBar:
191 """This progress bar was taken from PYMC
192 """
<ipython-input-3-4c142a5156b6> in ProgressBar()
279 m = KnnDtw(n_neighbors=1, max_warping_window=10)
280 m.fit(x_train[::10], y_train[::10])
--> 281 label, proba = m.predict(x_test[::10])
282
283 from sklearn.metrics import classification_report, confusion_matrix
<ipython-input-3-4c142a5156b6> in predict(self, x)
173 """
174
--> 175 dm = self._dist_matrix(x, self.x)
176
177 # Identify the k nearest neighbors
<ipython-input-3-4c142a5156b6> in _dist_matrix(self, x, y)
145 dm_size = x_s[0]*y_s[0]
146
--> 147 p = ProgressBar(dm_size)
148
149 for i in range(0, x_s[0]):
NameError: name 'ProgressBar' is not defined
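For what it's worth, the traceback itself hints at what happened: frames like <ipython-input-3-4c142a5156b6> in ProgressBar() show the module-level lines (m = KnnDtw(...), m.predict(...)) executing inside the body of class ProgressBar:, which suggests everything below that class statement got indented into the class body when the notebook was reopened. A class name is only bound after its body has finished executing, so code inside the body cannot refer to the class yet. A minimal sketch of the same failure, with a made-up class for illustration:

class Demo:
    def __init__(self, n):
        self.n = n
    # This line belongs at module level; accidentally indented into the
    # class body, it runs while the class is still being defined, so the
    # name 'Demo' does not exist yet.
    d = Demo(10)   # NameError: name 'Demo' is not defined

Re-checking the indentation of everything below class ProgressBar: should resolve the NameError.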

Use loop for np.random or reshape array to matrix?

I am new to programming in general. However, for a project I am trying to randomly choose some outcomes, depending on the probability of each outcome happening, for lotteries that I have generated, and I would like to use a loop to get random numbers each time.
This is my code:
import numpy as np

p = np.arange(0.01, 1, 0.001, dtype=float)

alpha = 0.5
alpha = float(alpha)
alpha = np.zeros((1, len(p))) + alpha

def w(alpha, p):
    return np.exp(-(-np.log(p))**alpha)

w = w(alpha, p)

def P(w):
    return np.exp(np.log2(w))

prob_win = P(w)
prob_lose = 1 - prob_win

E = 10
E = float(E)
E = np.zeros((1, len(p))) + E

b = 0
b = float(b)
b = np.zeros((1, len(p))) + b

def A(E, b, prob_win):
    return (E - b * (1 - prob_win)) / prob_win

a = A(E, b, prob_win)
a = a.squeeze()

prob_array = (prob_win, prob_lose)
prob_matrix = np.vstack(prob_array).T.squeeze()
outcomes_array = (a, b)
outcomes_matrix = np.vstack(outcomes_array).T
outcome_pairs = np.vsplit(outcomes_matrix, len(p))
outcome_pairs = np.array(outcome_pairs).astype(float)
prob_pairs = np.vsplit(prob_matrix, len(p))
prob_pairs = np.array(prob_pairs)
nominalized_prob_pairs = [outcome_pairs / np.sum(outcome_pairs)
                          for outcome_pairs in np.vsplit(prob_pairs, len(p))]
The code works fine, but I would like to use a loop or something similar for the next line of code, because for each row / pair of probabilities I want to get 5 realisations. When I use size=5 I just get a really long list and I no longer know which values belong to which pair, as I do when size=1:
realisations = np.concatenate([np.random.choice(outcome_pairs[i].ravel(),
                                                size=1, p=nominalized_prob_pairs[i].ravel())
                               for i in range(len(outcome_pairs))])
Or, if I use size=5 as below, how can I match the realisations to the initial probabilities? Do I need to cut the array after every 5th element and then store the values in a matrix with 5 columns and a new row for every 5th element of the initial array? If yes, how could I do this?
realisations = np.concatenate([np.random.choice(outcome_pairs[i].ravel(),
                                                size=5, p=nominalized_prob_pairs[i].ravel())
                               for i in range(len(outcome_pairs))])
What exactly are you trying to produce? Please be more concise.
Here is some clean starter code with which you can produce linear data:
import numpy as np

def generate_data(n_samples, variance):
    # generate 2D data
    X = np.random.random((n_samples, 1))
    # adding a vector of ones to ease calculus
    X = np.concatenate((np.ones((n_samples, 1)), X), axis=1)
    # generate two random coefficients
    W = np.random.random((2, 1))
    # construct targets with our data and weights
    y = X @ W
    # add some noise to our data
    y += np.random.normal(0, variance, (n_samples, 1))
    return X, y, W

if __name__ == "__main__":
    X, Y, W = generate_data(10, 0.5)
    # check a random value of x, for example
    for x in X:
        print(x, end=' --> ')
        if x[1] <= 0.4:
            print('prob <= 0.4')
        else:
            print('prob > 0.4')
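To get back to keeping five realisations per pair together: you can stack the draws row-wise instead of concatenating them. A small sketch, reusing outcome_pairs and nominalized_prob_pairs as defined in the question:

import numpy as np

# Draw 5 realisations per outcome pair and keep them row-aligned:
# row i of 'realisations' holds the 5 draws for probability pair i.
realisations = np.stack([
    np.random.choice(outcome_pairs[i].ravel(),
                     size=5,
                     p=nominalized_prob_pairs[i].ravel())
    for i in range(len(outcome_pairs))
])
# realisations.shape == (len(outcome_pairs), 5)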

Something wrong with Sigmoid curve for Logistic Regression

I'm trying to use logistic regression on the popularity of hit songs on Spotify from 2010-2019, based on their duration and durability, with data collected from a .csv file. Basically, since the popularity values of each song are numerical, I have converted each of them to the binary values "0" or "1": if the popularity value of a hit song is 70 or less, I replace its value with 0, and with 1 if it is more than 70. For some reason, while the rest of my code is pretty standard for creating a sigmoid function, the end result is a straight line instead of a sigmoid curve.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('top10s [SubtitleTools.com] (2).csv')

BPM = np.array(df.bpm)
Energy = np.array(df.nrgy)
Dance = np.array(df.dnce)
dB = np.array(df.dB)
Live = np.array(df.live)
Valence = np.array(df.val)
Acous = np.array(df.acous)
Speech = np.array(df.spch)

df.loc[df['popu'] <= 70, 'popu'] = 0
df.loc[df['popu'] > 70, 'popu'] = 1

def Logistic_Regression(X, y, iterations, alpha):
    ones = np.ones((X.shape[0], ))
    X = np.vstack((ones, X))
    X = X.T
    b = np.zeros(X.shape[1])
    for i in range(iterations):
        z = np.dot(X, b)
        p_hat = sigmoid(z)
        gradient = np.dot(X.T, (y - p_hat))
        b = b + alpha * gradient
        if (i % 1000 == 0):
            print('LL, i ', log_likelihood(X, y, b), i)
    return b

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def log_likelihood(X, y, b):
    z = np.dot(X, b)
    LL = np.sum(y*z - np.log(1 + np.exp(z)))
    return LL

def LR1():
    Dur = np.array(df.dur)
    Pop = np.array([int(i) for i in df.popu])
    plt.figure(figsize=(10, 8))
    colormap = np.array(['r', 'b'])
    plt.scatter(Dur, Pop, c=colormap[Pop], alpha=.4)
    b = Logistic_Regression(Dur, Pop, iterations=8000, alpha=0.00005)
    print('Done')
    p_hat = sigmoid(np.dot(Dur, b[1]) + b[0])
    idxDur = np.argsort(Dur)
    plt.plot(Dur[idxDur], p_hat[idxDur])
    plt.show()

LR1()
df
Your logreg parameters aren't coming out correctly, so something is wrong in your gradient descent.
If I do

from sklearn.linear_model import LogisticRegression
import pandas as pd

df = pd.DataFrame({'popu': [0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
                   'dur': [217, 283, 200, 295, 221, 176, 206, 260, 217, 213]})
Dur = df['dur'].values
Pop = df['popu'].values

logreg = LogisticRegression()
logreg.fit(Dur.reshape([10, 1]), Pop)
print(logreg.coef_)
print(logreg.intercept_)

I get [0.86473507, -189.79655798] (coefficient, intercept), whereas your params (b) come out as [0.012136874150412973, -0.2430389407767768] for this data.
(Plot comparing your logreg with scikit-learn's here.)
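One likely culprit, given how small those fitted params are, is the scale of Dur (hundreds of seconds) combined with a tiny fixed learning rate, which makes gradient ascent crawl and leaves a nearly flat curve. A minimal sketch, reusing the question's Logistic_Regression and sigmoid, with the feature standardized first (the alpha value here is an assumption you would need to tune):

Dur = np.array(df.dur, dtype=float)
Pop = np.array(df.popu, dtype=int)

# Standardize the feature so a fixed learning rate behaves sensibly
Dur_std = (Dur - Dur.mean()) / Dur.std()
b = Logistic_Regression(Dur_std, Pop, iterations=8000, alpha=0.001)

# Evaluate the fitted curve on the standardized scale, plot on the raw scale
p_hat = sigmoid(b[0] + b[1] * Dur_std)
idx = np.argsort(Dur)
plt.plot(Dur[idx], p_hat[idx])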

How to calculate logistic regression accuracy

I am a complete beginner in machine learning and coding in Python, and I have been tasked with coding logistic regression from scratch to understand what happens under the hood. So far I have coded the hypothesis function, cost function and gradient descent, and then coded the logistic regression. However, when printing the accuracy I get a low output (0.69) which doesn't change with increasing iterations or changing the learning rate. My question is: is there a problem with my accuracy code below? Any help pointing me in the right direction would be appreciated.
X = data[['radius_mean', 'texture_mean', 'perimeter_mean',
          'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
          'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
          'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
          'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
          'fractal_dimension_se', 'radius_worst', 'texture_worst',
          'perimeter_worst', 'area_worst', 'smoothness_worst',
          'compactness_worst', 'concavity_worst', 'concave points_worst',
          'symmetry_worst', 'fractal_dimension_worst']]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = data["diagnosis"].map({'M': 1, 'B': 0})
Y = np.array(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

X = data["diagnosis"].map(lambda x: float(x))

def Sigmoid(z):
    if z < 0:
        return 1 - 1/(1 + math.exp(z))
    else:
        return 1/(1 + math.exp(-z))

def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i]*theta[i]
    return Sigmoid(z)

def Cost_Function(X, Y, theta, m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta, xi)
        error = Y[i] * math.log(hi if hi > 0 else 1)
        if Y[i] == 1:
            error = Y[i] * math.log(hi if hi > 0 else 1)
        elif Y[i] == 0:
            error = (1-Y[i]) * math.log(1-hi if 1-hi > 0 else 1)
        sumOfErrors += error
    constant = -1/m
    J = constant * sumOfErrors
    #print ('cost is: ', J )
    return J

def Cost_Function_Derivative(X, Y, theta, j, m, alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta, X[i])
        error = (hi - Y[i])*xij
        sumErrors += error
    m = len(Y)
    constant = float(alpha)/float(m)
    J = constant * sumErrors
    return J

def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = []
    constant = alpha/m
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, m, alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta

def Accuracy(theta):
    correct = 0
    length = len(X_test, Hypothesis(X, theta))
    for i in range(length):
        prediction = round(Hypothesis(X[i], theta))
        answer = Y[i]
        if prediction == answer.all():
            correct += 1
    my_accuracy = (correct / length)*100
    print('LR Accuracy %: ', my_accuracy)

def Logistic_Regression(X, Y, alpha, theta, num_iters):
    theta = np.zeros(X.shape[1])
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X, Y, theta, m, alpha)
        theta = new_theta
        if x % 100 == 0:
            Cost_Function(X, Y, theta, m)
            print('theta: ', theta)
            print('cost: ', Cost_Function(X, Y, theta, m))
    Accuracy(theta)

initial_theta = [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
alpha = 0.0001
iterations = 1000
Logistic_Regression(X, Y, alpha, initial_theta, iterations)
This uses data from the Wisconsin breast cancer dataset (https://www.kaggle.com/uciml/breast-cancer-wisconsin-data), where I am using 30 features - although changing the features to ones which are known to correlate also doesn't change my accuracy.
Python gives us the scikit-learn library to make our work easier; this worked for me:

from sklearn.metrics import accuracy_score

# 'log' here is an already-fitted classifier, e.g. a LogisticRegression
y_pred = log.predict(x_test)
score = accuracy_score(y_test, y_pred)
Accuracy is one of the most intuitive performance measures: it is simply the ratio of correctly predicted observations to the total observations. Higher accuracy means the model is performing better.

Accuracy = (TP + TN) / (TP + FP + FN + TN)

TP = true positives
TN = true negatives
FP = false positives
FN = false negatives

Accuracy is only appropriate when your false positives and false negatives have similar cost. A better metric is the F1-score, which is given by

F1-score = 2 * (Recall * Precision) / (Recall + Precision), where

Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
Read more here
https://en.wikipedia.org/wiki/Precision_and_recall
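As a quick illustration of these formulas, here is a small sketch computing them from raw counts (the counts themselves are made up for the example):

# Hypothetical confusion-matrix counts, just to illustrate the formulas
TP, TN, FP, FN = 90, 80, 10, 20

accuracy = (TP + TN) / (TP + TN + FP + FN)               # 0.85
precision = TP / (TP + FP)                               # 0.90
recall = TP / (TP + FN)                                  # ~0.82
f1_score = 2 * (recall * precision) / (recall + precision)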
The beauty of machine learning in Python is that important modules like scikit-learn are open source, so you can always look at the actual code.
Please use the link below to the scikit-learn metrics source code, which will give you an idea of how scikit-learn calculates the accuracy score when you do

from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/metrics
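For plain, unweighted label arrays, what accuracy_score computes boils down to the fraction of matching entries, as in this sketch:

import numpy as np

y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0])
accuracy = np.mean(y_true == y_pred)   # 0.8: 4 of 5 labels match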
I'm not sure how you arrived at a value of 0.0001 for alpha, but I think it's too low. Using your code with the cancer data shows that the cost is decreasing with each iteration -- it's just decreasing glacially.
When I raise alpha to 0.5, I still get decreasing costs, but at a more reasonable rate. After 1000 iterations it reports:
cost: 0.23668000993020666
And after fixing the Accuracy function I'm getting 92% on the test segment of the data.
You have NumPy installed, as shown by X = np.array(X). You should really consider using it for your operations; it will be orders of magnitude faster for jobs like this. Here is a vectorized version that gives results instantly rather than after a long wait:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("cancerdata.csv")
X = df.values[:, 2:-1].astype('float64')
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

## Add a bias column to the data
X = np.hstack([np.ones((X.shape[0], 1)), X])
X = MinMaxScaler().fit_transform(X)
Y = df["diagnosis"].map({'M': 1, 'B': 0})
Y = np.array(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

def Sigmoid(z):
    return 1/(1 + np.exp(-z))

def Hypothesis(theta, x):
    return Sigmoid(x @ theta)

def Cost_Function(X, Y, theta, m):
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
    return J

def Cost_Function_Derivative(X, Y, theta, m, alpha):
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    J = alpha/float(m) * X.T @ (hi - _y)
    return J

def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = theta - Cost_Function_Derivative(X, Y, theta, m, alpha)
    return new_theta

def Accuracy(theta):
    length = len(X_test)
    prediction = (Hypothesis(theta, X_test) > 0.5)
    _y = Y_test.reshape(-1, 1)
    correct = prediction == _y
    my_accuracy = (np.sum(correct) / length)*100
    print('LR Accuracy %: ', my_accuracy)

def Logistic_Regression(X, Y, alpha, theta, num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X, Y, theta, m, alpha)
        theta = new_theta
        if x % 100 == 0:
            #print ('theta: ', theta)
            print('cost: ', Cost_Function(X, Y, theta, m))
    Accuracy(theta)

ep = .012
initial_theta = np.random.rand(X_train.shape[1], 1) * 2 * ep - ep
alpha = 0.5
iterations = 2000
Logistic_Regression(X_train, Y_train, alpha, initial_theta, iterations)
I think I might have a different version of scikit-learn, because I had to change the MinMaxScaler line to make it work. The result is that I can run 10K iterations in the blink of an eye, and applying the model to the test set gives about 97% accuracy.
This also works, using vectorization to calculate the accuracy. But accuracy is not a recommended metric, as the answer above noted: if the data are not well balanced, you should use the F1-score instead of accuracy.

import sklearn.linear_model
import numpy as np

clf = sklearn.linear_model.LogisticRegressionCV()
clf.fit(X.T, Y.T)
LR_predictions = clf.predict(X.T)
print('Accuracy of logistic regression: %d ' % float(
    (np.dot(Y, LR_predictions) + np.dot(1 - Y, 1 - LR_predictions)) / float(Y.size) * 100)
    + '% (percentage of correctly labelled datapoints)')

(The two dot products count the true positives and the true negatives, respectively.)

Questions on Logistic Regression

I'm now using the training set from OpenClassroom (http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=DeepLearning&doc=exercises/ex4/ex4.html) to try out logistic regression. Unlike that page, which uses both LR and Newton's method, I only use LR.
Below is my code:
from numpy import *
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append(float(lineArr2[0]))
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0/(1+exp(-inX))

# def autoNorm(dataSet):
#     # newValue = (oldValue-min)/(max-min)
#     minVals = min(dataSet)
#     maxVals = max(dataSet)
#     ranges = list(map(lambda x: x[0]-x[1], zip(maxVals, minVals)))
#     normDataSet = zeros(shape(dataSet))
#     m,n = shape(dataSet)
#     normDataSet = list(map(lambda x: x[0]-x[1], zip(dataSet, tile(minVals, (m,1)))))
#     normDataSet = normDataSet/tile(ranges, (m,1))
#     return normDataSet, ranges, minVals

def gradDescent(dataMatIn, classLabels):
    x = mat(dataMatIn)
    y = mat(classLabels).transpose()
    m, n = shape(x)
    alpha = 0.001
    maxCycles = 100000
    theta = ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(x*theta)
        error = h - y
        cost = -1*dot(log(h).T, y) - dot((1-y).T, log(1-h))
        print("Iteration %d | Cost: %f" % (k, cost))
        theta = theta - alpha * (x.transpose() * error / m)
    return theta

def plotBestFit(weights):
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    min_x = min(mat(dataMat)[:, 1])
    max_x = max(mat(dataMat)[:, 1])
    x = arange(min_x, max_x, 1)
    y = (-weights[0]-weights[1]*x)/weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()

dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel)
print(weights)
plotBestFit(weights.getA())
Here are my questions:
1. I trained it for 100,000 iterations, with the error printed on each iteration, but I didn't see it converge; actually, I'm not sure how to tell.
2. I'm not sure how to plot the classifier correctly with matplotlib. When maxCycles is 200,000 I can get a somewhat reasonable classifier, but when maxCycles is 100,000 the plot doesn't seem reasonable at all.
(plot for maxCycles = 100,000)
UPDATE CODE:

count = 0
for i in range(80):
    result = sigmoid(dataMat[i] * weights)
    if result > 0.5:
        a = 1
    else:
        a = 0
    if float(a) != classLabel[i][0]:
        count += 1
errorRate = (float(count)/80)
print("error count is: %f, error rate is: %f" % (count, errorRate))
Your code is actually fine! Here are some remarks:

You initialized the thetas with all ones. I would not do so in this example: the first call of the sigmoid function will return values close to 1, because the product of theta and x gives very large numbers. The computation of log(1 - h) can then fail, because log is not defined at 0. I prefer to initialize the thetas with 0's.

When calculating the cost function you missed the division by m. It does not matter for the algorithm, but it's better to follow the theory.

It's a good idea to plot the cost function, and not just print its values. The correct trend can be seen very clearly.

In order to converge, this particular example needs many more iterations. I reached a good result at 500,000 iterations.

The post has been updated, see the UPDATE below.

Here are my plots:

As you can see, the resulting separation line matches the plot shown in your tutorial very well.

Here is my code. It differs a little bit from yours, but they are very similar.
import numpy as np
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append([float(lineArr2[0])])
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0/(1+np.exp(-inX))

def gradDescent(dataMatIn, classLabels, alpha, maxCycles):
    x = np.mat(dataMatIn)
    y = np.mat(classLabels)
    m, n = np.shape(x)
    n = n - 1  # usually n is the number of features (without the 1's)
    theta = np.zeros((n+1, 1))
    cost_history = []  # list to accumulate the cost values
    for k in range(maxCycles):
        h = sigmoid(x*theta)
        cost = ((-np.multiply(y, np.log(h)) - np.multiply(1-y, np.log(1-h))).sum(axis=0)/m)[0, 0]
        if ((k % 1000) == 0):
            cost_history.append(cost)  # on each 1000th iteration the cost is saved to a list
        grad = (x.transpose() * (h - y))/m
        theta = theta - alpha*grad
    plot_cost = 1
    if (plot_cost == 1):
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()
    return theta

def plotBestFit(dataMat, classLabel, weights):
    arrY = np.asarray(classLabel)
    arrX = np.asarray(dataMat)
    ind1 = np.where(arrY == 1)[0]
    ind0 = np.where(arrY == 0)[0]
    min_x1 = min(np.mat(dataMat)[:, 1])
    max_x1 = max(np.mat(dataMat)[:, 1])
    x1_val = np.arange(min_x1, max_x1, 1)
    x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()

dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel, 0.0014, 500000)
print(weights)
plotBestFit(dataMat, classLabel, weights)
UPDATE

After reading your questions in the comments to the first edition of the post, I tried to optimize the code to achieve convergence of the cost function with a much smaller number of iterations. Indeed, feature standardization works miracles :) An even better result was achieved after only 30 iterations!

Here are the new plots:

Because of the standardization, you need to scale each new test example the same way in order to classify it (see the sketch after the code below).

Here is the new code. I changed some data types to avoid unnecessary data type conversions.
import numpy as np
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []; labelMat = []
    frX = open('../ex4x.dat')
    frY = open('../ex4y.dat')
    for line1 in frX.readlines():
        lineArr1 = line1.strip().split()
        dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    for line2 in frY.readlines():
        lineArr2 = line2.strip().split()
        labelMat.append([float(lineArr2[0])])
    return np.asarray(dataMat), np.asarray(labelMat)

def sigmoid(inX):
    return 1.0/(1+np.exp(-inX))

def gradDescent(x, y, alpha, maxCycles):
    m, n = np.shape(x)
    n = n - 1  # usually n is the number of features (without the 1's)
    theta = np.zeros((n+1, 1))
    cost_history = []  # list to accumulate the cost values
    cost_iter = []
    for k in range(maxCycles):
        h = sigmoid(np.dot(x, theta))
        cost = np.sum(-np.multiply(y, np.log(h)) - np.multiply(1-y, np.log(1-h)))/m
        cost_history.append(cost)  # here the cost is saved on every iteration
        cost_iter.append(k)
        grad = np.dot(x.transpose(), (h - y))/m
        theta = theta - alpha*grad
    plot_cost = 1
    if (plot_cost == 1):
        plt.plot(cost_iter, cost_history)
        plt.title("Cost")
        plt.show()
    return theta

def plotBestFit(arrX, arrY, weights):
    ind1 = np.where(arrY == 1)[0]
    ind0 = np.where(arrY == 0)[0]
    min_x1 = min(arrX[:, 1:2])
    max_x1 = max(arrX[:, 1:2])
    x1_val = np.arange(min_x1, max_x1, 0.1)
    x2_val = (-weights[0, 0]-weights[1, 0]*x1_val)/weights[2, 0]
    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()

dataMat, classLabel = loadDataSet()
m = np.shape(dataMat)[0]

# standardization
dataMatMean = np.mean(dataMat, axis=0)
dataMatStd = np.std(dataMat, axis=0)
dataMatMean_m = np.tile(dataMatMean, (m, 1))
dataMatStd_m = np.tile(dataMatStd, (m, 1))
dataMatStand = np.copy(dataMat)
dataMatStand[:, 1:3] = np.divide((dataMatStand[:, 1:3] - dataMatMean_m[:, 1:3]), dataMatStd_m[:, 1:3])

weights = gradDescent(dataMatStand, classLabel, 1.0, 30)
print(weights)
plotBestFit(dataMatStand, classLabel, weights)
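A small sketch of that scaling step for a new example, reusing dataMatMean, dataMatStd, sigmoid and weights from the script above (the raw feature values here are made up):

# Classify a new (x1, x2) example: scale it with the same mean/std
# used for training, then apply the learned weights.
new_example = np.array([1.0, 60.0, 80.0])   # bias term + two made-up raw feature values
new_example[1:3] = (new_example[1:3] - dataMatMean[1:3]) / dataMatStd[1:3]
prob = sigmoid(np.dot(new_example, weights))   # probability of class 1
label = int(prob[0] > 0.5)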
