Python: Logistic regression - inputing my data into my algorithm

Python: Logistic regression - inputing my data into my algorithm - python

I'm trying to implement a logistic regression algorithm in python, but i'm not used to using python.
I followed a tutorial to create my algorithm:
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
sns.set(style='ticks', palette='Set2')
import pandas as pd
import math
from numpy import *
def logistic_func(theta, X):
return float(1) / (1 + math.e**(-X.dot(theta))) #for x in x_values]
def log_gradient(theta, X, Y):
first_calc = logistic_func(theta, X) - np.squeeze(Y) #by attribute gives Beta(i)
final_calc = first_calc.T.dot(X)
return final_calc
def cost_func(theta, X, Y):
log_func_v = logistic_func(theta,X)
Y = np.squeeze(Y)
step1 = Y * np.log(log_func_v)
step2 = (1.5-Y) * np.log(1.5 - log_func_v)
step3 = (1-Y) * np.log(1 - log_func_v)
final = -step1 - step2 - step3
return np.mean(final)
def grad_desc(theta_values, X, Y, lr=.001, converge_change=.001):
#normalize
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
#setup cost iter
cost_iter = []
cost = cost_func(theta_values, X, Y)
cost_iter.append([0, cost])
change_cost = 1
i = 1
while(change_cost > converge_change):
old_cost = cost
theta_values = theta_values - (lr * log_gradient(theta_values, X, Y))
cost = cost_func(theta_values, X, X)
cost_iter.append([i, cost])
change_cost = old_cost - cost
i+=1
return theta_values, np.array(cost_iter)
def pred_values(theta, X, hard=True):
#normalize
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
pred_prob = logistic_func(theta, X)
p red_value = np.where(pred_prob >= .5, 1, 0)
if hard:
return pred_value
return pred_prob
the algorithm is supposed to predict 3 classifiers.
I can read in the data:
data = pd.read_csv('filepath')
data.loc[data["type"] == "type1", "type"] = 0
data.loc[data["type"] == "type2", "type"] = 1
data.loc[data["type"] == "type2", "type"] = 2
att1= [];
att2=[];
att3= [];
att4= [];
type=[];
for d in data["attribute1"]:
att1.append(d)
for d in data["attribute2"]:
att2.append(d)
for d in data["attribute3"]:
att3.append(d)
for d in data["attribute4"]:
att4.append(d)
for d in data["type"]:
type.append(d)
combinedClassArray = np.array([att1,att2,att3,att4])
X = combinedClassArray.T
y = type
#totalCount = type.count()
type1= data.loc[data["type"] == 0, "type"].count()
type2= data.loc[data["type"] == 1, "type"].count()
type3= data.loc[data["type"] == 1, "type"].count()
totalCount = type1+type2+type3
p = type1+type2
What i'm sure about is how i can insert my data to the algorithm.
Am I very far off?

You need a main function:
def main():
# your code here would be the calls to the algorithm with the parameters (your data)
if __name__ == "__main__":
main()

Related

XOR classification using multilayer perceptron

I want to implement a multi-layer perceptron.
I found some code on GitHub that classifies MNIST quite well (96%). However, for some reason, it does not cope with the XOR task.
I want to understand why.
Here is the code:
perceptron.py
import random
import numpy as np
class Perceptron:
def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
self.layer_sizes = layer_sizes
if len(self.layer_sizes) - 1 != len(activation_functions):
raise ValueError("...")
self.activation_functions = activation_functions
self.cost_function_deriv = cost_function_deriv
self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]
def train(self, training_data, test_data, epochs, mini_batch_size, lr):
test_data_len = len(test_data)
for epoch in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[x: x + mini_batch_size]
for x in range(0, len(training_data), mini_batch_size)]
for mini_batch in mini_batches:
mb_len = len(mini_batch)
gradient_weights = [np.zeros(w.shape) for w in self.weights]
gradient_biases = [np.zeros(b.shape) for b in self.biases]
for x, y in mini_batch:
delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
self.weights = [w - (lr / mb_len) * grad for w, grad in zip(self.weights, gradient_weights)]
self.biases = [b - (lr / mb_len) * grad for b, grad in zip(self.biases, gradient_biases)]
correct_answers = self.how_many_correct_answers(test_data)
print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")
def backpropagation(self, x, y):
gradient_b = [np.zeros(b.shape) for b in self.biases]
gradient_w = [np.zeros(w.shape) for w in self.weights]
activations = [x]
prev_activation = x
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
activations.append(current_activation)
prev_activation = current_activation
delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
gradient_b[-1] = delta
gradient_w[-1] = np.dot(delta, activations[-2].T)
for i in range(2, len(self.layer_sizes)):
z = activations[-i]
act_der = self.activation_functions[-i + 1].deriv(z)
delta = np.dot(self.weights[-i + 1].T, delta) * act_der
gradient_b[-i] = delta
gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
# Normal indexing variant:
# for i in range(len(self.layers) - 1, 0, -1):
# z = activations[i]
# act_der = self.activation_functions[i].deriv(z)
# delta = np.dot(self.weights[i].T, delta) * act_der
# gradient_b[i - 1] = delta
# gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)
return gradient_b, gradient_w
def feedforward(self, a):
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
a = self.activation_functions[i](np.dot(w, a) + b)
return a
def how_many_correct_answers(self, test_data):
k = 0
for x, y in test_data:
y_predict = np.argmax(self.feedforward(x))
print(y_predict, y)
k += int(y_predict == y)
return k
main.py
from copy import deepcopy
import numpy as np
from perceptron import Perceptron
class Sigmoid:
out_min_max = [0, 1]
def __call__(self, x):
return 1. / (1. + np.exp(-x))
def deriv(self, y):
# t = self(x)
# return t * (1. - t)
return y * (1. - y)
def cost_function_derivative(y_predict, y_true_label):
label_vector = np.zeros(y_predict.shape)
label_vector[y_true_label] = 1.0
return y_predict - label_vector
def main():
training_data = np.asarray([[[[0], [0]], 0],
[[[0], [1]], 1],
[[[1], [0]], 1],
[[[1], [1]], 0]])
layer_sizes = [2, 8, 2]
model = Perceptron(layer_sizes=layer_sizes,
activation_functions=[Sigmoid(), Sigmoid()],
cost_function_deriv=cost_function_derivative)
model.train(deepcopy(training_data),
deepcopy(training_data),
epochs=10000,
mini_batch_size=4,
lr=0.01)
if __name__ == '__main__':
main()
The final output in format 'y_predict y_true' (after each epoch):
0 0
0 1
0 1
0 0
If remove random.shuffle(training_data) then:
1 0
0 1
1 1
0 0
But not 0 1 1 0

I figured it out. It requires the following.
mini_batch_size=1
# random.shuffle(training_data) -- comment
epochs=10000
And it's better to do this:
lr=0.1
The result in most cases is obtained after ~1000 epochs:
0 0
1 1
1 1
0 0

Object values not being reset in python function

Here is my code. In the calculateOptimalLambda() function, I am attempting to declare a copy of n and
store it as m, remove one point from m, and make some calculations and a graph. Then, the loop should
restart, make a fresh copy of m, remove the next point, and so on.
However, when in the next iteration
of the loop, a point has been removed. Eventually, I run out of points to remove, and I get an error.
How do I declare a fresh copy of m so I can remove the next point?
import numpy as np
from matplotlib import pyplot as plt
class Data:
def __init__(self, points, sigma, lamda):
self.points = points
self.sigma = sigma
self.sample = np.random.uniform(-1,1, (points, 2))
self.transformedData = np.ones((points, 5))
self.weight = np.zeros((5,1))
self.lamda = lamda
def changeLamda(self,x):
self.lamda = x
def removePoint(self, x):
self.points = self.points - 1
self.sample = np.delete(self.sample, x, 0)
self.transformedData = np.delete(self.transformedData, x, 0)
def transformedFunction(self, x):
transformedData = np.ones((1, 5))
transformedData[0,1] = x
transformedData[0,2] = 0.5 * (3*x**2 -1)
transformedData[0,3]= 0.5 * (5*x**3 - 3*x)
transformedData[0,4] = 0.125 * (35*x**4 -30*x**2 + 3)
return np.dot(transformedData, self.weight)
def setY(self):
for i in range(len(self.sample[0:,0])):
self.sample[i,1] = np.random.normal(0, self.sigma) + self.sample[i,0]**2
def transform(self):
for i in range(len(self.sample[0:,0])):
self.transformedData[i,1] = self.sample[i,0]
self.transformedData[i,2] = 0.5 * (3*self.sample[i,0]**2 -1)
self.transformedData[i,3]= 0.5 * (5*self.sample[i,0]**3 - 3*self.sample[i,0])
self.transformedData[i,4] = 0.125 * (35*self.sample[i,0]**4 -30*self.sample[i,0]**2 + 3)
def calculateWeight(self):
z = n.transformedData
zProd = np.linalg.inv(np.matmul(np.transpose(z), z) + np.identity(5)*self.lamda)
next1 = np.matmul(zProd,np.transpose(z))
a = self.sample[0:,1]
a = a.reshape((-1, 1))
print(a)
self.weight = np.matmul(next1,a)
def calculateError(self):
error= (np.matmul(self.transformedData, self.weight) - self.sample[1,0:])
return error/self.points
def calculateOptimalLambda(n, L):
a = 0
for i in range(len(L)):
n.changeLamda(L[i])
for x in range(n.getPoints()):
a+=1
plt.subplot(4,5,a)
m = n
m.removePoint(x)
m.calculateWeight()
weight = m.getWeight()
error = m.calculateError()
twoD_plot(m)
print(error)
def twoD_plot(n):
t = np.linspace(-1, 1, 400)
x = np.square(t)
plt.plot(t,x,'b')
error = 0
y = x
for i in range(len(t)):
y[i] = n.transformedFunction(t[i])
error += (y[i] - t[i]**2)**2
"""print(error/len(t))"""
plt.plot(t,y,'r')
plt.scatter(n.getSample()[0:,0],n.getSample()[0:,1], c = 'g', marker = 'o')
n = Data(5,0.1,0)
n.setY()
n.transform()
n.calculateWeight()
L = [1, 0.01, 0.00001, 0]
calculateOptimalLambda(n, L)
plt.show()

Logistic regression - strange behaviour of the decision boundary when additional parameters are added

I am trying to build a logistic regression model for a dataset consisting of two parameters
x1 and x2, but instead of analyzing just the two of them, I have added their squares as well - x12, x22 and x1· x2.
At the first glance everything looks fine and the error function is decreasing, but whilist drawing the plot of the decision boundary I have noticed, that after circa 500 iterations something strange happens to it.
Here is an animation of the error function as a function of iterations and a respective plot of the decision boundary:
Now,I interpret the decision boundary as a quadratic function
x2=f(x1), where
the relation between both parameters is given like this:
0.5 = θ0 + θ1x1 + θ2x2 + θ3x12 + θ4x1x2
+ θ5x22
Here is the python code I use to do everything:
#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt
from math import log
from matplotlib.animation import FuncAnimation
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
def loadData(filepath):
source=""
try:
f = open(filepath, "r")
source = f.read()
f.close()
except IOError:
print("Error while reading file (" + filepath + ")")
return ""
raw_data = source.split("\n")
raw_data = [x.split(",") for x in raw_data if x !=""]
raw_data = np.matrix(raw_data).astype(float)
return (raw_data[:,:np.size(raw_data,1)-1], raw_data[:,np.size(raw_data, 1)-1:])
def standardize(dataset, skipfirst=True):
means = np.amin(dataset, 0)
deviation = np.std(dataset, 0)
if skipfirst:
dataset[:,1:] -= means[:,1:]
dataset[:,1:] /= deviation[:,1:]
return dataset
else:
dataset -= means
dataset /= deviation
return dataset
def error(X, Y, Theta):
"Calculates error values"
v_sigm = np.vectorize(sigmoid)
h_x = X # Theta
sigmo = v_sigm(h_x)
partial_vect = (Y-1).T # np.log(1-sigmo) - Y.T # np.log(sigmo)
return 1/(2*np.size(Y, axis=0))*np.sum(partial_vect)
def gradientStep(X, Y, Theta, LR):
"Returns new theta Values"
v_sigm = np.vectorize(sigmoid)
h_x = X # Theta
modif = -1*LR/np.size(Y, 0)*(h_x-Y)
sums = np.sum(modif.T # X, axis = 0)
return Theta + sums.T
X, Y = loadData("ex2data1.txt")
#add bias to X
X = np.append(np.ones((np.size(X, 0), 1)), X, axis=1)
added_params = [[x[1]**2, x[1]*x[2], x[2]**2] for x in np.array(X)]
X = np.append(X, np.matrix(added_params), axis=1)
#standardize X
X = standardize(X)
#create vector of parameters
Theta=np.zeros((np.size(X, 1), 1))
iterations = 3000
Theta_vals = []
Error_vals = []
for i in range(0, iterations):
Theta_vals.append(np.asarray(Theta).flatten())
Error_vals.append(error(X, Y, Theta))
Theta = gradientStep(X, Y, Theta, 0.07)
#CALCULATING FINISHES HERE
#plot data:
fig = plt.figure()
def_ax = fig.add_subplot(211)
def_ax.set_xlim(np.amin(X[:,1:2]), np.amax(X[:,1:2]))
def_ax.set_ylim(np.amin(X[:,2:3]), np.amax(X[:,2:3]))
err_ax = fig.add_subplot(212)
err_ax.set_ylim(0, error(X, Y, Theta))
err_ax.set_xlim(0, iterations)
positive_X1 = []
positive_X2 = []
negative_X1 = []
negative_X2 = []
for i in range(0, np.size(Y, 0)):
if(Y[i, 0] == 1):
positive_X1.append(X[i, 1])
positive_X2.append(X[i, 2])
else:
negative_X1.append(X[i, 1])
negative_X2.append(X[i, 2])
err_ax.set_ylim(np.amin(Error_vals), np.amax(Error_vals))
def animation(frame):
global Theta_vals, Error_vals, def_ax, err_ax, positive_X1, positive_X2, negative_X1, negative_X2
def_limX = def_ax.get_xlim()
def_limY = def_ax.get_ylim()
err_limX = err_ax.get_xlim()
err_limY = err_ax.get_ylim()
def_ax.clear()
err_ax.clear()
def_ax.set_xlim(def_limX)
def_ax.set_ylim(def_limY)
err_ax.set_xlim(err_limX)
err_ax.set_ylim(err_limY)
def_ax.scatter(positive_X1, positive_X2, marker="^")
def_ax.scatter(negative_X1, negative_X2, marker="o")
Theta = Theta_vals[frame]
res_x = np.linspace(*def_ax.get_xlim(), num=5)
delta_x = [(Theta[4]*x+Theta[2])**2-4*Theta[5]*(Theta[3]*x**2+Theta[1]*x+Theta[0]-0.5) for x in res_x]
delta_x = [np.sqrt(x) if x >= 0 else 0 for x in delta_x]
minb = [-(Theta[4]*x+Theta[2]) for x in res_x]
res_1 = []
res_2 = []
for i in range(0, len(res_x)):
if Theta[5] == 0:
res_1.append(0)
res_2.append(0)
else:
res_1.append((minb[i]+delta_x[i])/(2*Theta[5]))
res_2.append((minb[i]-+delta_x[i])/(2*Theta[5]))
def_ax.plot(res_x, res_1)
def_ax.plot(res_x, res_2)
err_x = np.linspace(0, frame, frame)
err_y = Error_vals[0:frame]
err_ax.plot(err_x, err_y)
anim = FuncAnimation(fig, animation, frames=iterations, interval=3, repeat_delay=2000)
print(error(X, Y, Theta))
anim.save("anim.mp4")
What could be the reason of such a strange behaviour?

FCBF Python feature selection technique

I want to use FCBF technique from github https://github.com/shiralkarprashant/FCBF
The problem i faced is that i am working on Python 3 ad the module is implemented for python 2 users . I got the following error that describes that name 'xrange' is not defined because i work on python3
i think to solve the issue just by changing range by xrange
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from sklearn.grid_search import GridSearchCV
classifiers = [('DecisionTree', DecisionTreeClassifier(), {'max_depth' : [5, 10, 15]}),
('LogisticRegression', LogisticRegression(), {'C' : [0.1, 1, 10]})]
n_features = dataCAD.shape[1]
npieces = get_i(n_features)
The module code contains just one xrange occurence i tried to change it by range but it does not solve the problem:
# -*- coding: utf-8 -*-
import numpy as np
def count_vals(x):
vals = np.unique(x)
occ = np.zeros(shape = vals.shape)
for i in range(vals.size):
occ[i] = np.sum(x == vals[i])
return occ
def entropy(x):
n = float(x.shape[0])
ocurrence = count_vals(x)
px = ocurrence / n
return -1* np.sum(px*np.log2(px))
def symmetricalUncertain(x,y):
n = float(y.shape[0])
vals = np.unique(y)
# Computing Entropy for the feature x.
Hx = entropy(x)
# Computing Entropy for the feature y.
Hy = entropy(y)
#Computing Joint entropy between x and y.
partial = np.zeros(shape = (vals.shape[0]))
for i in range(vals.shape[0]):
partial[i] = entropy(x[y == vals[i]])
partial[np.isnan(partial)==1] = 0
py = count_vals(y).astype(dtype = 'float64') / n
Hxy = np.sum(py[py > 0]*partial)
IG = Hx-Hxy
return 2*IG/(Hx+Hy)
def suGroup(x, n):
m = x.shape[0]
x = np.reshape(x, (n,m/n)).T
m = x.shape[1]
SU_matrix = np.zeros(shape = (m,m))
for j in range(m-1):
x2 = x[:,j+1::]
y = x[:,j]
temp = np.apply_along_axis(symmetricalUncertain, 0, x2, y)
for k in range(temp.shape[0]):
SU_matrix[j,j+1::] = temp
SU_matrix[j+1::,j] = temp
return 1/float(m-1)*np.sum(SU_matrix, axis = 1)
def isprime(a):
return all(a % i for i in xrange(2, a))
"""
get
"""
def get_i(a):
if isprime(a):
a -= 1
return filter(lambda x: a % x == 0, range(2,a))
"""
FCBF - Fast Correlation Based Filter
L. Yu and H. Liu. Feature Selection for High‐Dimensional Data: A Fast Correlation‐Based Filter Solution.
In Proceedings of The Twentieth International Conference on Machine Leaning (ICML‐03), 856‐863.
Washington, D.C., August 21‐24, 2003.
"""
class FCBF:
idx_sel = []
def __init__(self, th = 0.01):
'''
Parameters
---------------
th = The initial threshold
'''
self.th = th
def fit(self, x, y):
'''
This function executes FCBF algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.idx_sel = []
"""
First Stage: Computing the SU for each feature with the response.
"""
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
SU_list = SU_vec[SU_vec > self.th]
SU_list[::-1].sort()
m = x[:,SU_vec > self.th].shape
x_sorted = np.zeros(shape = m)
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = 0
x_sorted[:,i] = x[:,ind].copy()
self.idx_sel.append(ind)
"""
Second Stage: Identify relationships between feature to remove redundancy.
"""
j = 0
while True:
"""
Stopping Criteria:The search finishes
"""
if j >= x_sorted.shape[1]: break
y = x_sorted[:,j].copy()
x_list = x_sorted[:,j+1:].copy()
if x_list.shape[1] == 0: break
SU_list_2 = SU_list[j+1:]
SU_x = np.apply_along_axis(symmetricalUncertain, 0,
x_list, y)
comp_SU = SU_x >= SU_list_2
to_remove = np.where(comp_SU)[0] + j + 1
if to_remove.size > 0:
x_sorted = np.delete(x_sorted, to_remove, axis = 1)
SU_list = np.delete(SU_list, to_remove, axis = 0)
to_remove.sort()
for r in reversed(to_remove):
self.idx_sel.remove(self.idx_sel[r])
j = j + 1
def fit_transform(self, x, y):
'''
This function fits the feature selection
algorithm and returns the resulting subset.
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.fit(x, y)
return x[:,self.idx_sel]
def transform(self, x):
'''
This function applies the selection
to the vector x.
Parameters
---------------
x = dataset [NxM]
'''
return x[:, self.idx_sel]
"""
FCBF# - Fast Correlation Based Filter
B. Senliol, G. Gulgezen, et al. Fast Correlation Based Filter (FCBF) with a Different Search Strategy.
In Computer and Information Sciences (ISCIS ‘08) 23rd International Symposium on, pages 1‐4.
Istanbul, October 27‐29, 2008.
"""
class FCBFK(FCBF):
idx_sel = []
def __init__(self, k = 10):
'''
Parameters
---------------
k = Number of features to include in the
subset.
'''
self.k = k
def fit(self, x, y):
'''
This function executes FCBFK algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.idx_sel = []
"""
First Stage: Computing the SU for each feature with the response.
"""
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
SU_list = SU_vec[SU_vec > 0]
SU_list[::-1].sort()
m = x[:,SU_vec > 0].shape
x_sorted = np.zeros(shape = m)
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = 0
x_sorted[:,i] = x[:,ind].copy()
self.idx_sel.append(ind)
"""
Second Stage: Identify relationships between features to remove redundancy with stopping
criteria (features in x_best == k).
"""
j = 0
while True:
y = x_sorted[:,j].copy()
SU_list_2 = SU_list[j+1:]
x_list = x_sorted[:,j+1:].copy()
"""
Stopping Criteria:The search finishes
"""
if x_list.shape[1] == 0: break
SU_x = np.apply_along_axis(symmetricalUncertain, 0,
x_list, y)
comp_SU = SU_x >= SU_list_2
to_remove = np.where(comp_SU)[0] + j + 1
if to_remove.size > 0 and x.shape[1] > self.k:
for i in reversed(to_remove):
x_sorted = np.delete(x_sorted, i, axis = 1)
SU_list = np.delete(SU_list, i, axis = 0)
self.idx_sel.remove(self.idx_sel[i])
if x_sorted.shape[1] == self.k: break
if x_list.shape[1] == 1 or x_sorted.shape[1] == self.k:
break
j = j + 1
if len(self.idx_sel) > self.k:
self.idx_sel = self.idx_sel[:self.k]
"""
FCBFiP - Fast Correlation Based Filter in Pieces
"""
class FCBFiP(FCBF):
idx_sel = []
def __init__(self, k = 10, npieces = 2):
'''
Parameters
---------------
k = Number of features to include in the
subset.
npieces = Number of pieces to divide the
feature space.
'''
self.k = k
self.npieces = npieces
def fit(self, x, y):
'''
This function executes FCBF algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
"""
First Stage: Computing the SU for each feature with the response. We sort the
features. When we have a prime number of features we remove the last one from the
sorted features list.
"""
m = x.shape
nfeaturesPieces = int(m[1] / float(self.npieces))
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
x_sorted = np.zeros(shape = m, dtype = 'float64')
idx_sorted = np.zeros(shape = m[1], dtype = 'int64')
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = -1
idx_sorted[i]= ind
x_sorted[:,i] = x[:,ind].copy()
if isprime(m[1]):
x_sorted = np.delete(x_sorted, m[1]-1, axis = 1 )
ind_prime = idx_sorted[m[1]-1]
idx_sorted = np.delete(idx_sorted, m[1]-1)
#m = x_sorted.shape
"""
Second Stage: Identify relationships between features into its vecinity
to remove redundancy with stopping criteria (features in x_best == k).
"""
x_2d = np.reshape(x_sorted.T, (self.npieces, nfeaturesPieces*m[0])).T
SU_x = np.apply_along_axis(suGroup, 0, x_2d, nfeaturesPieces)
SU_x = np.reshape(SU_x.T, (self.npieces*nfeaturesPieces,))
idx_sorted2 = np.zeros(shape = idx_sorted.shape, dtype = 'int64')
SU_x[np.isnan(SU_x)] = 1
for i in range(idx_sorted.shape[0]):
ind = np.argmin(SU_x)
idx_sorted2[i] = idx_sorted[ind]
SU_x[ind] = 10
"""
Scoring step
"""
self.scores = np.zeros(shape = m[1], dtype = 'int64')
for i in range(m[1]):
if i in idx_sorted:
self.scores[i] = np.argwhere(i == idx_sorted) + np.argwhere(i == idx_sorted2)
if isprime(m[1]):
self.scores[ind_prime] = 2*m[1]
self.set_k(self.k)
def set_k(self, k):
self.k = k
scores_temp = -1*self.scores
self.idx_sel = np.zeros(shape = self.k, dtype = 'int64')
for i in range(self.k):
ind = np.argmax(scores_temp)
scores_temp[ind] = -100000000
self.idx_sel[i] = ind

Try using 2to3 package for python to automatically convert the files. Worked for me!
https://docs.python.org/2/library/2to3.html

How to get kfold splits for cross validation from scratch in python?

I think I've split my training data in 5 kold, is there a way for me to label/identify each of the 5 splits so I can then send each into my algorithm to calculate their own accuracies?
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
splits=kf.get_n_splits(X_train)
print(splits)
Separately, I have also tried splitting my data to then run in my logistic regression but this outputs nan % accuracy:
X_train1 = X[0:84]
Y_train1 = Y[0:84]
X_train2 = X[85:170]
Y_train2 = Y[85:170]
X_train3 = X[171:255]
Y_train3 = Y[171:255]
X_train4 = X[256:340]
Y_train4 = Y[256:340]
X_train5 = X[341:426]
Y_train5 = Y[341:426]
def Sigmoid(z):
return 1/(1 + np.exp(-z))
def Hypothesis(theta, x):
return Sigmoid(x # theta)
def Cost_Function(X,Y,theta,m):
hi = Hypothesis(theta, x)
_y = Y.reshape(-1, 1)
J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
return J
def Cost_Function_Regularisation(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Cost_Function_Regularisation(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Gradient_Descent(X,Y,theta,m,alpha):
new_theta = theta - Cost_Function_Regularisation(X,Y,theta,m,alpha)
return new_theta
def Accuracy(theta):
correct = 0
length = len(X_test)
prediction = (Hypothesis(theta, X_test) > 0.5)
_y = Y_test.reshape(-1, 1)
correct = prediction == _y
my_accuracy = (np.sum(correct) / length)*100
print ('LR Accuracy CV: ', my_accuracy, "%")
def Logistic_Regression(X,Y,alpha,theta,num_iters):
m = len(Y)
for x in range(num_iters):
new_theta = Gradient_Descent(X,Y,theta,m,alpha)
theta = new_theta
if x % 100 == 0:
print #('theta: ', theta)
print #('cost: ', Cost_Function(X,Y,theta,m))
Accuracy(theta)
ep = .012
initial_theta = np.random.rand(X_train.shape[1],1) * 2 * ep - ep
alpha = 0.5
iterations = 10000
Logistic_Regression(X_train1,Y_train1,alpha,initial_theta,iterations)
Logistic_Regression(X_train2,Y_train2,alpha,initial_theta,iterations)
Logistic_Regression(X_train3,Y_train3,alpha,initial_theta,iterations)
Logistic_Regression(X_train4,Y_train4,alpha,initial_theta,iterations)
Logistic_Regression(X_train5,Y_train5,alpha,initial_theta,iterations

get_n_splits returns the "number of splits" you configured for skf.
Look at the documentation here for an example : http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: Logistic regression - inputing my data into my algorithm - python

You need a main function: def main(): # your code here would be the calls to the algorithm with the parameters (your data) if name == "main": main()

Related

XOR classification using multilayer perceptron

Object values not being reset in python function

Logistic regression - strange behaviour of the decision boundary when additional parameters are added

FCBF Python feature selection technique

How to get kfold splits for cross validation from scratch in python?

Categories

Resources

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: Logistic regression - inputing my data into my algorithm - python

You need a main function: def main(): # your code here would be the calls to the algorithm with the parameters (your data) if __name__ == "__main__": main()

Related

XOR classification using multilayer perceptron

Object values not being reset in python function

Logistic regression - strange behaviour of the decision boundary when additional parameters are added

FCBF Python feature selection technique

How to get kfold splits for cross validation from scratch in python?

Categories

Resources

You need a main function: def main(): # your code here would be the calls to the algorithm with the parameters (your data) if name == "main": main()