Python: Logistic regression - inputing my data into my algorithm - python

I'm trying to implement a logistic regression algorithm in python, but i'm not used to using python.
I followed a tutorial to create my algorithm:
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
sns.set(style='ticks', palette='Set2')
import pandas as pd
import math
from numpy import *
def logistic_func(theta, X):
return float(1) / (1 + math.e**(-X.dot(theta))) #for x in x_values]
def log_gradient(theta, X, Y):
first_calc = logistic_func(theta, X) - np.squeeze(Y) #by attribute gives Beta(i)
final_calc = first_calc.T.dot(X)
return final_calc
def cost_func(theta, X, Y):
log_func_v = logistic_func(theta,X)
Y = np.squeeze(Y)
step1 = Y * np.log(log_func_v)
step2 = (1.5-Y) * np.log(1.5 - log_func_v)
step3 = (1-Y) * np.log(1 - log_func_v)
final = -step1 - step2 - step3
return np.mean(final)
def grad_desc(theta_values, X, Y, lr=.001, converge_change=.001):
#normalize
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
#setup cost iter
cost_iter = []
cost = cost_func(theta_values, X, Y)
cost_iter.append([0, cost])
change_cost = 1
i = 1
while(change_cost > converge_change):
old_cost = cost
theta_values = theta_values - (lr * log_gradient(theta_values, X, Y))
cost = cost_func(theta_values, X, X)
cost_iter.append([i, cost])
change_cost = old_cost - cost
i+=1
return theta_values, np.array(cost_iter)
def pred_values(theta, X, hard=True):
#normalize
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
pred_prob = logistic_func(theta, X)
p red_value = np.where(pred_prob >= .5, 1, 0)
if hard:
return pred_value
return pred_prob
the algorithm is supposed to predict 3 classifiers.
I can read in the data:
data = pd.read_csv('filepath')
data.loc[data["type"] == "type1", "type"] = 0
data.loc[data["type"] == "type2", "type"] = 1
data.loc[data["type"] == "type2", "type"] = 2
att1= [];
att2=[];
att3= [];
att4= [];
type=[];
for d in data["attribute1"]:
att1.append(d)
for d in data["attribute2"]:
att2.append(d)
for d in data["attribute3"]:
att3.append(d)
for d in data["attribute4"]:
att4.append(d)
for d in data["type"]:
type.append(d)
combinedClassArray = np.array([att1,att2,att3,att4])
X = combinedClassArray.T
y = type
#totalCount = type.count()
type1= data.loc[data["type"] == 0, "type"].count()
type2= data.loc[data["type"] == 1, "type"].count()
type3= data.loc[data["type"] == 1, "type"].count()
totalCount = type1+type2+type3
p = type1+type2
What i'm sure about is how i can insert my data to the algorithm.
Am I very far off?

You need a main function:
def main():
# your code here would be the calls to the algorithm with the parameters (your data)
if __name__ == "__main__":
main()

Related

XOR classification using multilayer perceptron

I want to implement a multi-layer perceptron.
I found some code on GitHub that classifies MNIST quite well (96%). However, for some reason, it does not cope with the XOR task.
I want to understand why.
Here is the code:
perceptron.py
import random
import numpy as np
class Perceptron:
def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
self.layer_sizes = layer_sizes
if len(self.layer_sizes) - 1 != len(activation_functions):
raise ValueError("...")
self.activation_functions = activation_functions
self.cost_function_deriv = cost_function_deriv
self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]
def train(self, training_data, test_data, epochs, mini_batch_size, lr):
test_data_len = len(test_data)
for epoch in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[x: x + mini_batch_size]
for x in range(0, len(training_data), mini_batch_size)]
for mini_batch in mini_batches:
mb_len = len(mini_batch)
gradient_weights = [np.zeros(w.shape) for w in self.weights]
gradient_biases = [np.zeros(b.shape) for b in self.biases]
for x, y in mini_batch:
delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
self.weights = [w - (lr / mb_len) * grad for w, grad in zip(self.weights, gradient_weights)]
self.biases = [b - (lr / mb_len) * grad for b, grad in zip(self.biases, gradient_biases)]
correct_answers = self.how_many_correct_answers(test_data)
print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")
def backpropagation(self, x, y):
gradient_b = [np.zeros(b.shape) for b in self.biases]
gradient_w = [np.zeros(w.shape) for w in self.weights]
activations = [x]
prev_activation = x
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
activations.append(current_activation)
prev_activation = current_activation
delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
gradient_b[-1] = delta
gradient_w[-1] = np.dot(delta, activations[-2].T)
for i in range(2, len(self.layer_sizes)):
z = activations[-i]
act_der = self.activation_functions[-i + 1].deriv(z)
delta = np.dot(self.weights[-i + 1].T, delta) * act_der
gradient_b[-i] = delta
gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
# Normal indexing variant:
# for i in range(len(self.layers) - 1, 0, -1):
# z = activations[i]
# act_der = self.activation_functions[i].deriv(z)
# delta = np.dot(self.weights[i].T, delta) * act_der
# gradient_b[i - 1] = delta
# gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)
return gradient_b, gradient_w
def feedforward(self, a):
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
a = self.activation_functions[i](np.dot(w, a) + b)
return a
def how_many_correct_answers(self, test_data):
k = 0
for x, y in test_data:
y_predict = np.argmax(self.feedforward(x))
print(y_predict, y)
k += int(y_predict == y)
return k
main.py
from copy import deepcopy
import numpy as np
from perceptron import Perceptron
class Sigmoid:
out_min_max = [0, 1]
def __call__(self, x):
return 1. / (1. + np.exp(-x))
def deriv(self, y):
# t = self(x)
# return t * (1. - t)
return y * (1. - y)
def cost_function_derivative(y_predict, y_true_label):
label_vector = np.zeros(y_predict.shape)
label_vector[y_true_label] = 1.0
return y_predict - label_vector
def main():
training_data = np.asarray([[[[0], [0]], 0],
[[[0], [1]], 1],
[[[1], [0]], 1],
[[[1], [1]], 0]])
layer_sizes = [2, 8, 2]
model = Perceptron(layer_sizes=layer_sizes,
activation_functions=[Sigmoid(), Sigmoid()],
cost_function_deriv=cost_function_derivative)
model.train(deepcopy(training_data),
deepcopy(training_data),
epochs=10000,
mini_batch_size=4,
lr=0.01)
if __name__ == '__main__':
main()
The final output in format 'y_predict y_true' (after each epoch):
0 0
0 1
0 1
0 0
If remove random.shuffle(training_data) then:
1 0
0 1
1 1
0 0
But not 0 1 1 0
I figured it out. It requires the following.
mini_batch_size=1
# random.shuffle(training_data) -- comment
epochs=10000
And it's better to do this:
lr=0.1
The result in most cases is obtained after ~1000 epochs:
0 0
1 1
1 1
0 0

Object values not being reset in python function

Here is my code. In the calculateOptimalLambda() function, I am attempting to declare a copy of n and
store it as m, remove one point from m, and make some calculations and a graph. Then, the loop should
restart, make a fresh copy of m, remove the next point, and so on.
However, when in the next iteration
of the loop, a point has been removed. Eventually, I run out of points to remove, and I get an error.
How do I declare a fresh copy of m so I can remove the next point?
import numpy as np
from matplotlib import pyplot as plt
class Data:
def __init__(self, points, sigma, lamda):
self.points = points
self.sigma = sigma
self.sample = np.random.uniform(-1,1, (points, 2))
self.transformedData = np.ones((points, 5))
self.weight = np.zeros((5,1))
self.lamda = lamda
def changeLamda(self,x):
self.lamda = x
def removePoint(self, x):
self.points = self.points - 1
self.sample = np.delete(self.sample, x, 0)
self.transformedData = np.delete(self.transformedData, x, 0)
def transformedFunction(self, x):
transformedData = np.ones((1, 5))
transformedData[0,1] = x
transformedData[0,2] = 0.5 * (3*x**2 -1)
transformedData[0,3]= 0.5 * (5*x**3 - 3*x)
transformedData[0,4] = 0.125 * (35*x**4 -30*x**2 + 3)
return np.dot(transformedData, self.weight)
def setY(self):
for i in range(len(self.sample[0:,0])):
self.sample[i,1] = np.random.normal(0, self.sigma) + self.sample[i,0]**2
def transform(self):
for i in range(len(self.sample[0:,0])):
self.transformedData[i,1] = self.sample[i,0]
self.transformedData[i,2] = 0.5 * (3*self.sample[i,0]**2 -1)
self.transformedData[i,3]= 0.5 * (5*self.sample[i,0]**3 - 3*self.sample[i,0])
self.transformedData[i,4] = 0.125 * (35*self.sample[i,0]**4 -30*self.sample[i,0]**2 + 3)
def calculateWeight(self):
z = n.transformedData
zProd = np.linalg.inv(np.matmul(np.transpose(z), z) + np.identity(5)*self.lamda)
next1 = np.matmul(zProd,np.transpose(z))
a = self.sample[0:,1]
a = a.reshape((-1, 1))
print(a)
self.weight = np.matmul(next1,a)
def calculateError(self):
error= (np.matmul(self.transformedData, self.weight) - self.sample[1,0:])
return error/self.points
def calculateOptimalLambda(n, L):
a = 0
for i in range(len(L)):
n.changeLamda(L[i])
for x in range(n.getPoints()):
a+=1
plt.subplot(4,5,a)
m = n
m.removePoint(x)
m.calculateWeight()
weight = m.getWeight()
error = m.calculateError()
twoD_plot(m)
print(error)
def twoD_plot(n):
t = np.linspace(-1, 1, 400)
x = np.square(t)
plt.plot(t,x,'b')
error = 0
y = x
for i in range(len(t)):
y[i] = n.transformedFunction(t[i])
error += (y[i] - t[i]**2)**2
"""print(error/len(t))"""
plt.plot(t,y,'r')
plt.scatter(n.getSample()[0:,0],n.getSample()[0:,1], c = 'g', marker = 'o')
n = Data(5,0.1,0)
n.setY()
n.transform()
n.calculateWeight()
L = [1, 0.01, 0.00001, 0]
calculateOptimalLambda(n, L)
plt.show()

Logistic regression - strange behaviour of the decision boundary when additional parameters are added

I am trying to build a logistic regression model for a dataset consisting of two parameters
x1 and x2, but instead of analyzing just the two of them, I have added their squares as well - x12, x22 and x1· x2.
At the first glance everything looks fine and the error function is decreasing, but whilist drawing the plot of the decision boundary I have noticed, that after circa 500 iterations something strange happens to it.
Here is an animation of the error function as a function of iterations and a respective plot of the decision boundary:
Now,I interpret the decision boundary as a quadratic function
x2=f(x1), where
the relation between both parameters is given like this:
0.5 = θ0 + θ1x1 + θ2x2 + θ3x12 + θ4x1x2
+ θ5x22
Here is the python code I use to do everything:
#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt
from math import log
from matplotlib.animation import FuncAnimation
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
def loadData(filepath):
source=""
try:
f = open(filepath, "r")
source = f.read()
f.close()
except IOError:
print("Error while reading file (" + filepath + ")")
return ""
raw_data = source.split("\n")
raw_data = [x.split(",") for x in raw_data if x !=""]
raw_data = np.matrix(raw_data).astype(float)
return (raw_data[:,:np.size(raw_data,1)-1], raw_data[:,np.size(raw_data, 1)-1:])
def standardize(dataset, skipfirst=True):
means = np.amin(dataset, 0)
deviation = np.std(dataset, 0)
if skipfirst:
dataset[:,1:] -= means[:,1:]
dataset[:,1:] /= deviation[:,1:]
return dataset
else:
dataset -= means
dataset /= deviation
return dataset
def error(X, Y, Theta):
"Calculates error values"
v_sigm = np.vectorize(sigmoid)
h_x = X # Theta
sigmo = v_sigm(h_x)
partial_vect = (Y-1).T # np.log(1-sigmo) - Y.T # np.log(sigmo)
return 1/(2*np.size(Y, axis=0))*np.sum(partial_vect)
def gradientStep(X, Y, Theta, LR):
"Returns new theta Values"
v_sigm = np.vectorize(sigmoid)
h_x = X # Theta
modif = -1*LR/np.size(Y, 0)*(h_x-Y)
sums = np.sum(modif.T # X, axis = 0)
return Theta + sums.T
X, Y = loadData("ex2data1.txt")
#add bias to X
X = np.append(np.ones((np.size(X, 0), 1)), X, axis=1)
added_params = [[x[1]**2, x[1]*x[2], x[2]**2] for x in np.array(X)]
X = np.append(X, np.matrix(added_params), axis=1)
#standardize X
X = standardize(X)
#create vector of parameters
Theta=np.zeros((np.size(X, 1), 1))
iterations = 3000
Theta_vals = []
Error_vals = []
for i in range(0, iterations):
Theta_vals.append(np.asarray(Theta).flatten())
Error_vals.append(error(X, Y, Theta))
Theta = gradientStep(X, Y, Theta, 0.07)
#CALCULATING FINISHES HERE
#plot data:
fig = plt.figure()
def_ax = fig.add_subplot(211)
def_ax.set_xlim(np.amin(X[:,1:2]), np.amax(X[:,1:2]))
def_ax.set_ylim(np.amin(X[:,2:3]), np.amax(X[:,2:3]))
err_ax = fig.add_subplot(212)
err_ax.set_ylim(0, error(X, Y, Theta))
err_ax.set_xlim(0, iterations)
positive_X1 = []
positive_X2 = []
negative_X1 = []
negative_X2 = []
for i in range(0, np.size(Y, 0)):
if(Y[i, 0] == 1):
positive_X1.append(X[i, 1])
positive_X2.append(X[i, 2])
else:
negative_X1.append(X[i, 1])
negative_X2.append(X[i, 2])
err_ax.set_ylim(np.amin(Error_vals), np.amax(Error_vals))
def animation(frame):
global Theta_vals, Error_vals, def_ax, err_ax, positive_X1, positive_X2, negative_X1, negative_X2
def_limX = def_ax.get_xlim()
def_limY = def_ax.get_ylim()
err_limX = err_ax.get_xlim()
err_limY = err_ax.get_ylim()
def_ax.clear()
err_ax.clear()
def_ax.set_xlim(def_limX)
def_ax.set_ylim(def_limY)
err_ax.set_xlim(err_limX)
err_ax.set_ylim(err_limY)
def_ax.scatter(positive_X1, positive_X2, marker="^")
def_ax.scatter(negative_X1, negative_X2, marker="o")
Theta = Theta_vals[frame]
res_x = np.linspace(*def_ax.get_xlim(), num=5)
delta_x = [(Theta[4]*x+Theta[2])**2-4*Theta[5]*(Theta[3]*x**2+Theta[1]*x+Theta[0]-0.5) for x in res_x]
delta_x = [np.sqrt(x) if x >= 0 else 0 for x in delta_x]
minb = [-(Theta[4]*x+Theta[2]) for x in res_x]
res_1 = []
res_2 = []
for i in range(0, len(res_x)):
if Theta[5] == 0:
res_1.append(0)
res_2.append(0)
else:
res_1.append((minb[i]+delta_x[i])/(2*Theta[5]))
res_2.append((minb[i]-+delta_x[i])/(2*Theta[5]))
def_ax.plot(res_x, res_1)
def_ax.plot(res_x, res_2)
err_x = np.linspace(0, frame, frame)
err_y = Error_vals[0:frame]
err_ax.plot(err_x, err_y)
anim = FuncAnimation(fig, animation, frames=iterations, interval=3, repeat_delay=2000)
print(error(X, Y, Theta))
anim.save("anim.mp4")
What could be the reason of such a strange behaviour?

FCBF Python feature selection technique

I want to use FCBF technique from github https://github.com/shiralkarprashant/FCBF
The problem i faced is that i am working on Python 3 ad the module is implemented for python 2 users . I got the following error that describes that name 'xrange' is not defined because i work on python3
i think to solve the issue just by changing range by xrange
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from sklearn.grid_search import GridSearchCV
classifiers = [('DecisionTree', DecisionTreeClassifier(), {'max_depth' : [5, 10, 15]}),
('LogisticRegression', LogisticRegression(), {'C' : [0.1, 1, 10]})]
n_features = dataCAD.shape[1]
npieces = get_i(n_features)
The module code contains just one xrange occurence i tried to change it by range but it does not solve the problem:
# -*- coding: utf-8 -*-
import numpy as np
def count_vals(x):
vals = np.unique(x)
occ = np.zeros(shape = vals.shape)
for i in range(vals.size):
occ[i] = np.sum(x == vals[i])
return occ
def entropy(x):
n = float(x.shape[0])
ocurrence = count_vals(x)
px = ocurrence / n
return -1* np.sum(px*np.log2(px))
def symmetricalUncertain(x,y):
n = float(y.shape[0])
vals = np.unique(y)
# Computing Entropy for the feature x.
Hx = entropy(x)
# Computing Entropy for the feature y.
Hy = entropy(y)
#Computing Joint entropy between x and y.
partial = np.zeros(shape = (vals.shape[0]))
for i in range(vals.shape[0]):
partial[i] = entropy(x[y == vals[i]])
partial[np.isnan(partial)==1] = 0
py = count_vals(y).astype(dtype = 'float64') / n
Hxy = np.sum(py[py > 0]*partial)
IG = Hx-Hxy
return 2*IG/(Hx+Hy)
def suGroup(x, n):
m = x.shape[0]
x = np.reshape(x, (n,m/n)).T
m = x.shape[1]
SU_matrix = np.zeros(shape = (m,m))
for j in range(m-1):
x2 = x[:,j+1::]
y = x[:,j]
temp = np.apply_along_axis(symmetricalUncertain, 0, x2, y)
for k in range(temp.shape[0]):
SU_matrix[j,j+1::] = temp
SU_matrix[j+1::,j] = temp
return 1/float(m-1)*np.sum(SU_matrix, axis = 1)
def isprime(a):
return all(a % i for i in xrange(2, a))
"""
get
"""
def get_i(a):
if isprime(a):
a -= 1
return filter(lambda x: a % x == 0, range(2,a))
"""
FCBF - Fast Correlation Based Filter
L. Yu and H. Liu. Feature Selection for High‐Dimensional Data: A Fast Correlation‐Based Filter Solution.
In Proceedings of The Twentieth International Conference on Machine Leaning (ICML‐03), 856‐863.
Washington, D.C., August 21‐24, 2003.
"""
class FCBF:
idx_sel = []
def __init__(self, th = 0.01):
'''
Parameters
---------------
th = The initial threshold
'''
self.th = th
def fit(self, x, y):
'''
This function executes FCBF algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.idx_sel = []
"""
First Stage: Computing the SU for each feature with the response.
"""
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
SU_list = SU_vec[SU_vec > self.th]
SU_list[::-1].sort()
m = x[:,SU_vec > self.th].shape
x_sorted = np.zeros(shape = m)
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = 0
x_sorted[:,i] = x[:,ind].copy()
self.idx_sel.append(ind)
"""
Second Stage: Identify relationships between feature to remove redundancy.
"""
j = 0
while True:
"""
Stopping Criteria:The search finishes
"""
if j >= x_sorted.shape[1]: break
y = x_sorted[:,j].copy()
x_list = x_sorted[:,j+1:].copy()
if x_list.shape[1] == 0: break
SU_list_2 = SU_list[j+1:]
SU_x = np.apply_along_axis(symmetricalUncertain, 0,
x_list, y)
comp_SU = SU_x >= SU_list_2
to_remove = np.where(comp_SU)[0] + j + 1
if to_remove.size > 0:
x_sorted = np.delete(x_sorted, to_remove, axis = 1)
SU_list = np.delete(SU_list, to_remove, axis = 0)
to_remove.sort()
for r in reversed(to_remove):
self.idx_sel.remove(self.idx_sel[r])
j = j + 1
def fit_transform(self, x, y):
'''
This function fits the feature selection
algorithm and returns the resulting subset.
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.fit(x, y)
return x[:,self.idx_sel]
def transform(self, x):
'''
This function applies the selection
to the vector x.
Parameters
---------------
x = dataset [NxM]
'''
return x[:, self.idx_sel]
"""
FCBF# - Fast Correlation Based Filter
B. Senliol, G. Gulgezen, et al. Fast Correlation Based Filter (FCBF) with a Different Search Strategy.
In Computer and Information Sciences (ISCIS ‘08) 23rd International Symposium on, pages 1‐4.
Istanbul, October 27‐29, 2008.
"""
class FCBFK(FCBF):
idx_sel = []
def __init__(self, k = 10):
'''
Parameters
---------------
k = Number of features to include in the
subset.
'''
self.k = k
def fit(self, x, y):
'''
This function executes FCBFK algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
self.idx_sel = []
"""
First Stage: Computing the SU for each feature with the response.
"""
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
SU_list = SU_vec[SU_vec > 0]
SU_list[::-1].sort()
m = x[:,SU_vec > 0].shape
x_sorted = np.zeros(shape = m)
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = 0
x_sorted[:,i] = x[:,ind].copy()
self.idx_sel.append(ind)
"""
Second Stage: Identify relationships between features to remove redundancy with stopping
criteria (features in x_best == k).
"""
j = 0
while True:
y = x_sorted[:,j].copy()
SU_list_2 = SU_list[j+1:]
x_list = x_sorted[:,j+1:].copy()
"""
Stopping Criteria:The search finishes
"""
if x_list.shape[1] == 0: break
SU_x = np.apply_along_axis(symmetricalUncertain, 0,
x_list, y)
comp_SU = SU_x >= SU_list_2
to_remove = np.where(comp_SU)[0] + j + 1
if to_remove.size > 0 and x.shape[1] > self.k:
for i in reversed(to_remove):
x_sorted = np.delete(x_sorted, i, axis = 1)
SU_list = np.delete(SU_list, i, axis = 0)
self.idx_sel.remove(self.idx_sel[i])
if x_sorted.shape[1] == self.k: break
if x_list.shape[1] == 1 or x_sorted.shape[1] == self.k:
break
j = j + 1
if len(self.idx_sel) > self.k:
self.idx_sel = self.idx_sel[:self.k]
"""
FCBFiP - Fast Correlation Based Filter in Pieces
"""
class FCBFiP(FCBF):
idx_sel = []
def __init__(self, k = 10, npieces = 2):
'''
Parameters
---------------
k = Number of features to include in the
subset.
npieces = Number of pieces to divide the
feature space.
'''
self.k = k
self.npieces = npieces
def fit(self, x, y):
'''
This function executes FCBF algorithm and saves indexes
of selected features in self.idx_sel
Parameters
---------------
x = dataset [NxM]
y = label [Nx1]
'''
"""
First Stage: Computing the SU for each feature with the response. We sort the
features. When we have a prime number of features we remove the last one from the
sorted features list.
"""
m = x.shape
nfeaturesPieces = int(m[1] / float(self.npieces))
SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
x_sorted = np.zeros(shape = m, dtype = 'float64')
idx_sorted = np.zeros(shape = m[1], dtype = 'int64')
for i in range(m[1]):
ind = np.argmax(SU_vec)
SU_vec[ind] = -1
idx_sorted[i]= ind
x_sorted[:,i] = x[:,ind].copy()
if isprime(m[1]):
x_sorted = np.delete(x_sorted, m[1]-1, axis = 1 )
ind_prime = idx_sorted[m[1]-1]
idx_sorted = np.delete(idx_sorted, m[1]-1)
#m = x_sorted.shape
"""
Second Stage: Identify relationships between features into its vecinity
to remove redundancy with stopping criteria (features in x_best == k).
"""
x_2d = np.reshape(x_sorted.T, (self.npieces, nfeaturesPieces*m[0])).T
SU_x = np.apply_along_axis(suGroup, 0, x_2d, nfeaturesPieces)
SU_x = np.reshape(SU_x.T, (self.npieces*nfeaturesPieces,))
idx_sorted2 = np.zeros(shape = idx_sorted.shape, dtype = 'int64')
SU_x[np.isnan(SU_x)] = 1
for i in range(idx_sorted.shape[0]):
ind = np.argmin(SU_x)
idx_sorted2[i] = idx_sorted[ind]
SU_x[ind] = 10
"""
Scoring step
"""
self.scores = np.zeros(shape = m[1], dtype = 'int64')
for i in range(m[1]):
if i in idx_sorted:
self.scores[i] = np.argwhere(i == idx_sorted) + np.argwhere(i == idx_sorted2)
if isprime(m[1]):
self.scores[ind_prime] = 2*m[1]
self.set_k(self.k)
def set_k(self, k):
self.k = k
scores_temp = -1*self.scores
self.idx_sel = np.zeros(shape = self.k, dtype = 'int64')
for i in range(self.k):
ind = np.argmax(scores_temp)
scores_temp[ind] = -100000000
self.idx_sel[i] = ind
Try using 2to3 package for python to automatically convert the files. Worked for me!
https://docs.python.org/2/library/2to3.html

How to get kfold splits for cross validation from scratch in python?

I think I've split my training data in 5 kold, is there a way for me to label/identify each of the 5 splits so I can then send each into my algorithm to calculate their own accuracies?
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
splits=kf.get_n_splits(X_train)
print(splits)
Separately, I have also tried splitting my data to then run in my logistic regression but this outputs nan % accuracy:
X_train1 = X[0:84]
Y_train1 = Y[0:84]
X_train2 = X[85:170]
Y_train2 = Y[85:170]
X_train3 = X[171:255]
Y_train3 = Y[171:255]
X_train4 = X[256:340]
Y_train4 = Y[256:340]
X_train5 = X[341:426]
Y_train5 = Y[341:426]
def Sigmoid(z):
return 1/(1 + np.exp(-z))
def Hypothesis(theta, x):
return Sigmoid(x # theta)
def Cost_Function(X,Y,theta,m):
hi = Hypothesis(theta, x)
_y = Y.reshape(-1, 1)
J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
return J
def Cost_Function_Regularisation(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Cost_Function_Regularisation(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Gradient_Descent(X,Y,theta,m,alpha):
new_theta = theta - Cost_Function_Regularisation(X,Y,theta,m,alpha)
return new_theta
def Accuracy(theta):
correct = 0
length = len(X_test)
prediction = (Hypothesis(theta, X_test) > 0.5)
_y = Y_test.reshape(-1, 1)
correct = prediction == _y
my_accuracy = (np.sum(correct) / length)*100
print ('LR Accuracy CV: ', my_accuracy, "%")
def Logistic_Regression(X,Y,alpha,theta,num_iters):
m = len(Y)
for x in range(num_iters):
new_theta = Gradient_Descent(X,Y,theta,m,alpha)
theta = new_theta
if x % 100 == 0:
print #('theta: ', theta)
print #('cost: ', Cost_Function(X,Y,theta,m))
Accuracy(theta)
ep = .012
initial_theta = np.random.rand(X_train.shape[1],1) * 2 * ep - ep
alpha = 0.5
iterations = 10000
Logistic_Regression(X_train1,Y_train1,alpha,initial_theta,iterations)
Logistic_Regression(X_train2,Y_train2,alpha,initial_theta,iterations)
Logistic_Regression(X_train3,Y_train3,alpha,initial_theta,iterations)
Logistic_Regression(X_train4,Y_train4,alpha,initial_theta,iterations)
Logistic_Regression(X_train5,Y_train5,alpha,initial_theta,iterations
get_n_splits returns the "number of splits" you configured for skf.
Look at the documentation here for an example : http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

Categories

Resources