Recently I found an interesting article about a regression clustering algorithm that can handle both tasks, regression and clustering:
http://ncss.wpengine.netdna-cdn.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Regression_Clustering.pdf
I'm just curious: are there any techniques (libraries) to do this in Python? Thanks!
The algorithm of Spath is not implemented in Python, as far as I know.
But you could replicate its results using Gaussian mixture models in scikit-learn:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
# generate random data
np.random.seed(1)
n = 10
x1 = np.random.uniform(0, 20, size=n)
x2 = np.random.uniform(0, 20, size=n)
y1 = x1 + np.random.normal(size=n)
y2 = 15 - x2 + np.random.normal(size=n)
x = np.concatenate([x1, x2])
y = np.concatenate([y1, y2])
data = np.vstack([x, y]).T
model = GaussianMixture(n_components=2).fit(data)
plt.scatter(x, y, c=model.predict(data))
plt.show()
This code produces a picture similar to the one in the paper.
The GMM is different from the Spath algorithm, because the former tries to maximize the prediction accuracy of ALL the data (X and y), while the latter maximizes only the R^2 of y. In my opinion, for most practical problems you would prefer the GMM.
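If you want the regression lines implied by the fitted mixture, they can be read off the component parameters. This is a minimal sketch, assuming the default covariance_type='full' and reusing the model fitted above; within component k, the conditional mean of y given x has slope cov_xy / cov_xx:
# recover per-cluster regression lines from the fitted GMM components
for k in range(model.n_components):
    mean = model.means_[k]         # [mean_x, mean_y]
    cov = model.covariances_[k]    # 2x2 covariance of (x, y)
    slope = cov[0, 1] / cov[0, 0]  # slope of E[y | x] within this component
    intercept = mean[1] - slope * mean[0]
    print(f"cluster {k}: y = {slope:.2f} * x + {intercept:.2f}")
With the synthetic data above, the two recovered slopes should come out close to 1 and -1.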
If you still want the Spath algorithm, it can be done with a class like this, implementing a version of the EM algorithm:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.base import RegressorMixin, BaseEstimator, clone
class ClusteredRegressor(RegressorMixin, BaseEstimator):
    def __init__(self, n_components=2, base=Ridge(), random_state=1, max_iter=100, tol=1e-10, verbose=False):
        self.n_components = n_components
        self.base = base
        self.random_state = random_state
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def fit(self, X, y):
        np.random.seed(self.random_state)
        self.estimators_ = [clone(self.base) for i in range(self.n_components)]
        # initialize cluster responsibilities randomly
        self.resp_ = np.random.uniform(size=(X.shape[0], self.n_components))
        self.resp_ /= self.resp_.sum(axis=1, keepdims=True)
        for it in range(self.max_iter):
            old_resp = self.resp_.copy()
            # estimate sample-weighted regressions
            errors = np.empty(shape=self.resp_.shape)
            for i, est in enumerate(self.estimators_):
                est.fit(X, y, sample_weight=self.resp_[:, i])
                errors[:, i] = y - est.predict(X)
            self.mse_ = np.sum(self.resp_ * errors**2) / X.shape[0]
            if self.verbose:
                print(self.mse_)
            # recalculate responsibilities
            self.resp_ = np.exp(-errors**2 / self.mse_)
            self.resp_ /= self.resp_.sum(axis=1, keepdims=True)
            # stop if the change in responsibilities is small
            delta = np.abs(self.resp_ - old_resp).mean()
            if delta < self.tol:
                break
        self.n_iter_ = it
        return self

    def predict(self, X):
        """ Calculate a matrix of conditional predictions """
        return np.vstack([est.predict(X) for est in self.estimators_]).T

    def predict_proba(self, X, y):
        """ Estimate cluster probabilities of labeled data """
        # shape follows the new X, not the training data
        errors = np.empty(shape=(X.shape[0], self.n_components))
        for i, est in enumerate(self.estimators_):
            errors[:, i] = y - est.predict(X)
        resp_ = np.exp(-errors**2 / self.mse_)
        resp_ /= resp_.sum(axis=1, keepdims=True)
        return resp_
This code is similar to the Spath algorithm, with the only difference that it uses soft "responsibilities" of each cluster for each observation instead of hard cluster assignments (this makes the optimization easier); a hard-assignment sketch is shown after the plot below. You can see that the resulting cluster assignment is similar to the GMM:
model = ClusteredRegressor()
model.fit(x[:, np.newaxis], y)
labels = np.argmax(model.resp_, axis=1)
plt.scatter(x, y, c=labels)
plt.show()
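For comparison, here is a rough hard-assignment sketch, closer to Spath's original alternation (reusing x, y and Ridge from above; not part of the class): fit one regression per cluster, reassign each point to the cluster with the smallest squared residual, and repeat.
# hard-assignment ("k-plane regression") alternation
X = x[:, np.newaxis]
rng = np.random.RandomState(1)
labels = rng.randint(0, 2, size=len(y))   # random initial assignment
regs = [Ridge() for _ in range(2)]
for _ in range(20):
    for k, reg in enumerate(regs):
        if np.any(labels == k):
            reg.fit(X[labels == k], y[labels == k])
    residuals = np.column_stack([(y - reg.predict(X))**2 for reg in regs])
    new_labels = residuals.argmin(axis=1)
    if np.array_equal(new_labels, labels):
        break
    labels = new_labels
plt.scatter(x, y, c=labels)
plt.show()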
Unfortunately, this model cannot be applied to predict test data, because its output depends on data labels (y). However, if you further modify my code, you could predict cluster probability conditional on X. In this case, the model would be useful for prediction.
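One simple way to make that modification (a sketch, not part of the class above): fit any classifier for p(cluster | X) on the hard cluster labels and combine the per-cluster regressions, mixture-of-experts style. With this toy data the clusters overlap heavily in x, so the classifier will be uncertain, but the mechanics carry over to better-separated problems.
# predict on unlabeled data: p(cluster | X) times the per-cluster predictions
from sklearn.linear_model import LogisticRegression

X = x[:, np.newaxis]
hard_labels = np.argmax(model.resp_, axis=1)
cluster_clf = LogisticRegression().fit(X, hard_labels)

X_test = np.linspace(0, 20, 5)[:, np.newaxis]
proba = cluster_clf.predict_proba(X_test)      # shape (n_test, n_components)
per_cluster_preds = model.predict(X_test)      # shape (n_test, n_components)
y_test_pred = np.sum(proba * per_cluster_preds, axis=1)
print(y_test_pred)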
I am trying to implement the RBF kernel function for my kernel k-means algorithm. Here is my formula.
I then implemented it with NumPy, but there is a two-layer for loop, and I'm thinking about how to turn it into a matrix operation, because matrix operations would be much faster for processing my 784-dimensional data. Or maybe my implementation is not correct? Can someone help me?
import numpy as np
def get_gamma(X, Y):
    gamma = 0
    length = len(X)  # note: 'length' was undefined in the original post; assuming the number of samples
    for x in X:
        for y in Y:
            tmp = x - y
            gamma += tmp**2
    gamma = gamma / (length**2)
    return gamma

def kernel(X, Y, gamma):
    up = np.sum(np.power(X - Y, 2))
    res = np.exp(-up / gamma)
    return res

def kernel_distance(X, Y):
    gamma = get_gamma(X, Y)
    a = kernel(X, X, gamma)
    b = kernel(Y, Y, gamma)
    c = kernel(X, Y, gamma)
    return np.sqrt(a + b - 2*c)
That's odd; if I run your code it gives me a number for k. But shouldn't it be an array? Also, shouldn't X and Y be 2D, since those are basically lists of your points? Anyway, if I take my own X and Y
from scipy.spatial.distance import cdist
import numpy as np
n = 10
X = np.random.random((n,3))
Y = np.random.random((n,3))
I can solve your problem like this
norms_sq = cdist(X,Y,'sqeuclidean')
two_sigma_sq = 1/n**2*np.sum(norms_sq)
k = np.exp(-norms_sq/two_sigma_sq)
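If you want to convince yourself that this matches the double loop (a small sanity check, not part of the solution itself):
# sanity check: compare the vectorized kernel matrix with an explicit double loop
k_loop = np.empty((n, n))
for i in range(n):
    for j in range(n):
        d2 = np.sum((X[i] - Y[j])**2)
        k_loop[i, j] = np.exp(-d2 / two_sigma_sq)
assert np.allclose(k, k_loop)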
I have the following code:
import numpy as np
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from functools import partial
import pandas as pd
def tanimotoKernel(xs, ys):
    a = 0
    b = 0
    for x, y in zip(xs, ys):
        a += min(x, y)
        b += max(x, y)
    return a / b

# gammaExp = 1/(np.exp(gamma) - 1), calculated outside the kernel
def tanimotoLambdaKernel(xs, ys, gamma, gammaExp):
    return np.exp(gamma * tanimotoKernel(xs, ys) - 1) * gammaExp

class GramBuilder:
    def __init__(self, Kernel):
        self._Kernel = Kernel

    def generateMatrixBuilder(self, X1, X2):
        gram_matrix = np.zeros((X1.shape[0], X2.shape[0]))
        for i, x1 in enumerate(X1):
            for j, x2 in enumerate(X2):
                gram_matrix[i, j] = self._Kernel(x1, x2)
        return gram_matrix
gammaList = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
CList = [0.001, 0.01, 0.1, 1, 10, 100]
X, y = datasets.load_digits(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y)
svc_list = [
    (svm.SVC(
        kernel=GramBuilder(
            partial(tanimotoLambdaKernel, gamma=x, gammaExp=1/(np.exp(x) - 1)))
        .generateMatrixBuilder),
     x)
    for x in gammaList
]
gammas = []
Cs = []
accuracy = []
for svc, gamma in svc_list:
    print("Training gamma ", gamma)
    clf = GridSearchCV(svc, {'C': CList}, verbose=1, n_jobs=-1)
    clf.fit(x_train, y_train)
    gammas.append(gamma)
    Cs.append(clf.best_params_['C'])
    accuracy.append(clf.best_score_)
For this toy dataset, I have to wait 50 minutes approx to perform all the cross validations in the loop.
The first improvement I made was to calculate gammaExp outside the function, so I can save millions of exponentials. Also, since multiplication is faster than division, I calculated the inverse of the exponential minus one to try to save more time.
With those modifications I improved the training time a lot; however, I need it to be faster, so I would appreciate any ideas. Thanks.
You can use Numpy to speed up the min/max operations. Then you can use Numba's JIT to speed up the code even more by inlining calls.
import numba as nb

@nb.njit
def tanimotoKernel(xs, ys):
    a = np.minimum(xs, ys).sum()
    b = np.maximum(xs, ys).sum()
    return a / b

@nb.njit
def tanimotoLambdaKernel(xs, ys, gamma, gammaExp):
    return np.exp(gamma * tanimotoKernel(xs, ys) - 1) * gammaExp
# [...]
The above code should be correct and is more than 20 times faster on my machine. It actually took only a few minutes to complete.
I think you can speed things up even more by removing the partial call and using Numba for the GramBuilder class too (look at the Numba documentation on JIT classes; partial functions are probably not supported, but you can store the values in the class and do part of the job yourself). Moreover, note that many operations seem to be performed multiple times in the kernel. It is probably possible to compute them once (the kernel is called with the same x2 multiple times and recomputes the max again and again).
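As a rough sketch of that suggestion (not the answer's exact code; buildTanimotoGram is a made-up name), one could JIT the whole Gram-matrix loop and precompute the matrix once per gamma, then feed it to SVC as a precomputed kernel:
@nb.njit(parallel=True)
def buildTanimotoGram(X1, X2, gamma, gammaExp):
    # whole Gram matrix in one JIT-compiled, parallel loop
    gram = np.empty((X1.shape[0], X2.shape[0]))
    for i in nb.prange(X1.shape[0]):
        for j in range(X2.shape[0]):
            gram[i, j] = np.exp(gamma * tanimotoKernel(X1[i], X2[j]) - 1) * gammaExp
    return gram

# usage sketch: precompute once per gamma, which also avoids recomputing
# the kernel for repeated rows
gamma = 0.01
gram_train = buildTanimotoGram(x_train, x_train, gamma, 1 / (np.exp(gamma) - 1))
svc = svm.SVC(kernel='precomputed')
svc.fit(gram_train, y_train)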
I am trying to implement logistic regression for a binary classification problem from scratch in Python. My results do not match those provided by the implementation of sklearn, as you can see in this example. Note that the lines look "similar", but they are clearly not the same.
I took care of what is mentioned in this answer: both sklearn and I (i) fit the intercept term, and (ii) do not apply regularization (penalty='none'). Also, while sklearn applies 100 iterations to train the algorithm (by default), I am applying 10000 with a rather small learning rate of 0.01. I tried different combinations of values, but the problem does not seem to depend on this.
At the same time, I do notice that, even before comparing the results with sklearn, the ones I obtain with my implementation seem to be wrong: the decision regions are clearly off in some cases. You can see an example in this image.
The last point seems to indicate that the problem is all my own fault. Here is my code (it actually generates new datasets at each run and plots the results):
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
def create_training_set():
    X0, y = make_blobs(n_samples=[100, 100],
                       centers=None,
                       n_features=2,
                       cluster_std=1)
    y = y.reshape(-1, 1)  # make y a column vector
    return np.hstack([np.ones((X0.shape[0], 1)), X0]), X0, y

def create_test_set(X0):
    xx, yy = np.meshgrid(np.arange(X0[:, 0].min() - 1, X0[:, 0].max() + 1, 0.1),
                         np.arange(X0[:, 1].min() - 1, X0[:, 1].max() + 1, 0.1))
    X_test = np.c_[xx.ravel(), yy.ravel()]
    X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])
    return xx, yy, X_test

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def apply_gradient_descent(theta, X, y, max_iter=1000, alpha=0.1):
    m = X.shape[0]
    cost_iter = []
    for _ in range(max_iter):
        p_hat = sigmoid(np.dot(X, theta))
        cost_J = -1/float(m) * (np.dot(y.T, np.log(p_hat)) + np.dot((1 - y).T, np.log(1 - p_hat)))
        grad_J = 1/float(m) * np.dot(X.T, p_hat - y)
        theta -= alpha * grad_J
        cost_iter.append(float(cost_J))
    return theta, cost_iter
fig, ax = plt.subplots(10, 2, figsize = (10, 30))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
max_iter = 10000
alpha = 0.1
all_cost_history = []
for n_fil in range(10):
    X_train, X0, y = create_training_set()
    xx, yy, X_test = create_test_set(X0)
    theta, cost_evolution = apply_gradient_descent(np.zeros((X_train.shape[1], 1)), X_train, y, max_iter, alpha)
    all_cost_history.append(cost_evolution)
    y_pred = np.where(sigmoid(np.dot(X_test, theta)) > 0.5, 1, 0)
    y_pred = y_pred.reshape(xx.shape)
    ax[n_fil, 0].pcolormesh(xx, yy, y_pred, cmap=cmap_light)
    ax[n_fil, 0].scatter(X0[:, 0], X0[:, 1], c=y.ravel(), cmap=cmap_bold, alpha=1, edgecolor="black")
    y = y.reshape(X_train.shape[0], )
    clf = LogisticRegression().fit(X0, y)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax[n_fil, 1].pcolormesh(xx, yy, Z, cmap=cmap_light)
    ax[n_fil, 1].scatter(X0[:, 0], X0[:, 1], c=y, cmap=cmap_bold, alpha=1, edgecolor="black")
plt.show()
There is actually a difference between your implementation and sklearn's: you are not using the same optimization algorithm (also called a solver in sklearn), and I think the difference you observe comes from there. You are using gradient descent, while sklearn's implementation uses the "liblinear" solver by default, which is different.
Indeed, different optimization algorithms can yield different results based on, for example:
The convergence speed: as we are limiting the number of iterations, an algorithm that converges more slowly would stop at a different minimum and thus yield different decision regions.
Whether an algorithm is deterministic or not: non-deterministic algorithms (such as stochastic gradient descent) can converge to different local minima given the same dataset; with non-deterministic algorithms you could observe different results with the exact same dataset and algorithm.
Hyperparameters: changing a hyperparameter (for example, the learning rate of a gradient descent algorithm) changes the behavior of the optimization algorithm too, thus leading to different results.
In your case, there are good reasons for not always getting the same results: the gradient descent algorithm you use can get stuck in a local minimum (because of an insufficient number of iterations, a non-optimal learning rate, ...) which can be different from the local minimum reached by the liblinear solver.
You can observe the same kind of discrepancies if you compare sklearn's implementation with different solvers (reusing your code):
fig, ax = plt.subplots(10, 2, figsize=(10, 30))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
max_iter = 10000
alpha = 0.1
solver_algo_1 = 'liblinear'
solver_algo_2 = 'sag'
for n_fil in range(10):
    X_train, X0, y = create_training_set()
    xx, yy, X_test = create_test_set(X0)
    y = y.reshape(X_train.shape[0], )
    clf = LogisticRegression(solver=solver_algo_1, max_iter=max_iter).fit(X0, y)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax[n_fil, 0].pcolormesh(xx, yy, Z, cmap=cmap_light)
    ax[n_fil, 0].scatter(X0[:, 0], X0[:, 1], c=y, cmap=cmap_bold, alpha=1, edgecolor="black")
    clf = LogisticRegression(solver=solver_algo_2, max_iter=max_iter).fit(X0, y)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax[n_fil, 1].pcolormesh(xx, yy, Z, cmap=cmap_light)
    ax[n_fil, 1].scatter(X0[:, 0], X0[:, 1], c=y, cmap=cmap_bold, alpha=1, edgecolor="black")
plt.show()
As an example, with "liblinear" (left) and "newton-cg" (right), you can get noticeably different decision regions.
Though the logistic regression implementation is the same, the difference in the optimization algorithms leads to different results. So, in a few words, the difference between your implementation and scikit-learn's is the optimization algorithm.
Now, if the quality of the decision boundary you get is not satisfying, you can try tuning the hyperparameters of your gradient descent algorithm or changing the optimization algorithm!
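For instance, here is a minimal sketch (reusing the helper functions from the question's code; penalty='none', solver='lbfgs' and the large max_iter are assumptions meant to make the two setups comparable) to check whether the learned parameters actually agree once regularization is switched off:
# compare the custom gradient descent parameters with sklearn's, regularization off
X_train, X0, y = create_training_set()
theta, _ = apply_gradient_descent(np.zeros((X_train.shape[1], 1)), X_train, y,
                                  max_iter=100000, alpha=0.1)

clf = LogisticRegression(penalty='none', solver='lbfgs', max_iter=100000)
clf.fit(X0, y.ravel())

print("custom GD:", theta.ravel())                      # [intercept, w1, w2]
print("sklearn:  ", clf.intercept_, clf.coef_.ravel())  # should be close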
I'm attempting to create a simple linear model with Python using no libraries (other than numpy). Here's what I have
import numpy as np
import pandas
np.random.seed(1)
alpha = 0.1
def h(x, w):
    return np.dot(w.T, x)

def cost(X, W, Y):
    totalCost = 0
    for i in range(47):
        diff = h(X[i], W) - Y[i]
        squared = diff * diff
        totalCost += squared
    return totalCost / 2
housing_data = np.loadtxt('Housing.csv', delimiter=',')
x1 = housing_data[:,0]
x2 = housing_data[:,1]
y = housing_data[:,2]
avgX1 = np.mean(x1)
stdX1 = np.std(x1)
normX1 = (x1 - avgX1) / stdX1
print('avgX1', avgX1)
print('stdX1', stdX1)
avgX2 = np.mean(x2)
stdX2 = np.std(x2)
normX2 = (x2 - avgX2) / stdX2
print('avgX2', avgX2)
print('stdX2', stdX2)
normalizedX = np.ones((47, 3))
normalizedX[:,1] = normX1
normalizedX[:,2] = normX2
np.savetxt('normalizedX.csv', normalizedX)
weights = np.ones((3,))
for boom in range(100):
    currentCost = cost(normalizedX, weights, y)
    if boom % 1 == 0:
        print(boom, 'iteration', weights[0], weights[1], weights[2])
        print('Cost', currentCost)
    for i in range(47):
        errorDiff = h(normalizedX[i], weights) - y[i]
        weights[0] = weights[0] - alpha * (errorDiff) * normalizedX[i][0]
        weights[1] = weights[1] - alpha * (errorDiff) * normalizedX[i][1]
        weights[2] = weights[2] - alpha * (errorDiff) * normalizedX[i][2]
print(weights)
predictedX = [1, (2100 - avgX1) / stdX1, (3 - avgX2) / stdX2]
firstPrediction = np.array(predictedX)
print('firstPrediction', firstPrediction)
firstPrediction = h(firstPrediction, weights)
print(firstPrediction)
First, it converges VERY quickly, after only 14 iterations. Second, it gives me a different result than a linear regression with sklearn. For reference, my sklearn code is:
import numpy
import matplotlib.pyplot as plot
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
dataset = pandas.read_csv('Housing.csv', header=None)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 2].values
linearRegressor = LinearRegression()
xnorm = sklearn.preprocessing.scale(x)
scaleCoef = sklearn.preprocessing.StandardScaler().fit(x)
mean = scaleCoef.mean_
std = numpy.sqrt(scaleCoef.var_)
print('stf')
print(std)
stuff = linearRegressor.fit(xnorm, y)
predictedX = [[(2100 - mean[0]) / std[0], (3 - mean[1]) / std[1]]]
yPrediction = linearRegressor.predict(predictedX)
print('predictedX', predictedX)
print('predict', yPrediction)
print(stuff.coef_, stuff.intercept_)
My custom model predicts 337,000 for the value of y and sklearn predicts 355,000. My data is 47 rows that look like
2104,3,3.999e+05
1600,3,3.299e+05
2400,3,3.69e+05
1416,2,2.32e+05
3000,4,5.399e+05
1985,4,2.999e+05
1534,3,3.149e+05
Complete data available at https://github.com/shamoons/linear-logistic-regression/blob/master/Housing.csv
I assume either (a) my regression with gradient descent is somehow wrong or (b) I'm not using sklearn properly.
Any other reasons why the 2 wouldn't predict the same output for a given input?
I think you are missing the 1/m term (where m is the size of y) in the gradient descent. After including the 1/m term, I seem to get a predicted value similar to your sklearn code.
see below
....
weights = np.ones((3,))
m = y.size
for boom in range(100):
    currentCost = cost(normalizedX, weights, y)
    if boom % 1 == 0:
        print(boom, 'iteration', weights[0], weights[1], weights[2])
        print('Cost', currentCost)
    for i in range(47):
        errorDiff = h(normalizedX[i], weights) - y[i]
        weights[0] = weights[0] - alpha * (1/m) * (errorDiff) * normalizedX[i][0]
        weights[1] = weights[1] - alpha * (1/m) * (errorDiff) * normalizedX[i][1]
        weights[2] = weights[2] - alpha * (1/m) * (errorDiff) * normalizedX[i][2]
...
This gives firstPrediction as 355242.
This agrees well with the linear regression model, even though that model does not do gradient descent.
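As a quick cross-check (a sketch reusing normalizedX, y and predictedX from the question's code), the closed-form least-squares solution gives essentially the same prediction without any iterative optimization:
# closed-form (normal equations) solution as a sanity check
w_closed = np.linalg.lstsq(normalizedX, y, rcond=None)[0]
print('closed-form weights', w_closed)
print('closed-form prediction', np.dot(w_closed, predictedX))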
I also tried SGDRegressor (which uses stochastic gradient descent) in sklearn, and it too seems to get a value close to the linear regression model and your model. See the code below:
import numpy
import matplotlib.pyplot as plot
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
dataset = pandas.read_csv('Housing.csv', header=None)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 2].values
sgdRegressor = SGDRegressor(penalty='none', learning_rate='constant', eta0=0.1, max_iter=1000, tol = 1E-6)
xnorm = sklearn.preprocessing.scale(x)
scaleCoef = sklearn.preprocessing.StandardScaler().fit(x)
mean = scaleCoef.mean_
std = numpy.sqrt(scaleCoef.var_)
print('stf')
print(std)
yPrediction = []
predictedX = [[(2100 - mean[0]) / std[0], (3 - mean[1]) / std[1]]]
print('predictedX', predictedX)
for trials in range(10):
    stuff = sgdRegressor.fit(xnorm, y)
    yPrediction.extend(sgdRegressor.predict(predictedX))
print('predict', numpy.mean(yPrediction))
results in
predict 355533.10119985335
I am using the code below for logistic regression with regularization in Python. It's giving me 80% accuracy on the training set itself.
I am using the minimize method 'TNC'. With BFGS the results are around 50%.
What is the ideal method (equivalent to fminunc in Octave) to use for gradient descent?
How can I increase or decrease the number of iterations? (See the sketch after the code below.)
What is the default number of iterations?
Any other suggestions/approaches to improve performance?
The same algo in Octave with fminunc gives 83% accuracy on the training set.
import numpy as np
import scipy.optimize as op
from sklearn import preprocessing
import matplotlib.pyplot as plt
from matplotlib import style
from pylab import scatter, show, legend, xlabel, ylabel
from numpy import loadtxt, where
from sklearn.preprocessing import PolynomialFeatures
def sigmoid(z):
    return 1/(1 + np.exp(-z))

def Gradient(theta, X, y, l):
    m, n = X.shape
    #print("theta shape")
    #print(theta.shape)
    theta = theta.reshape((n, 1))
    thetaR = theta[1:n, :]
    y = y.reshape((m, 1))
    h = sigmoid(X.dot(theta))
    nonRegGrad = ((np.sum(((h - y)*X), axis=0))/m).reshape(n, 1)
    reg = np.insert((l/m)*thetaR, 0, 0, axis=0)
    grad = nonRegGrad + reg
    return grad.flatten()

def CostFunc(theta, X, y, l):
    h = sigmoid(X.dot(theta))
    m, n = X.shape
    #print("theta shape")
    #print(theta.shape)
    theta = theta.reshape((n, 1))
    thetaR = theta[1:n, :]
    cost = np.sum((np.multiply(-y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))))/m
    reg = (l/(2*m)) * np.sum(np.square(thetaR))
    J = cost + reg
    return J

def predict(theta, X):
    m, n = X.shape
    return np.round(sigmoid(X.dot(theta.reshape(n, 1))))
data = np.loadtxt(open("ex2data2.txt","rb"),delimiter=",",skiprows=1)
nr,nc = data.shape
X=data[:,0:nc - 1]
#X=preprocessing.scale(X)
#X=np.insert(X,0,1,axis=1)
y= data[:,[nc - 1]]
pos = where(y == 1)
neg = where(y == 0)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
xlabel('Microchip Test 1')
ylabel('Microchip Test 2')
legend(['Passed', 'Failed'])
show()
storeX=X
poly = PolynomialFeatures(6)
X=poly.fit_transform(X)
#print(X.shape)
m , n = X.shape;
initial_theta = np.zeros((n,1));
#initial_theta = zeros(shape=(it.shape[1], 1))
l = 1
# Compute and display initial cost and gradient for regularized logistic
# regression
#cost, grad = cost_function_reg(initial_theta, X, y, l)
#def decorated_cost(theta):
# return cost_function_reg(theta, X, y, l)
#print fmin_bfgs(decorated_cost, initial_theta, maxfun=400)
print("Calling optimization")
Result = op.minimize(fun=CostFunc,
                     x0=initial_theta,
                     args=(X, y, l),
                     method='TNC',
                     jac=Gradient)
optimal_theta = Result.x
print(Result.x.shape)
print("optimal theta")
print(optimal_theta)
p=predict(optimal_theta,X)
accuracy = np.mean(np.double(p==y))
print("accuracy")
print(accuracy)
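Regarding the questions about iterations above: scipy.optimize.minimize takes an options dict; 'maxiter' is accepted by most methods, while 'TNC' limits function evaluations via 'maxfun' in recent SciPy versions, and the returned OptimizeResult reports what happened. A minimal sketch, reusing the objects defined above:
# controlling and inspecting the optimizer's iteration budget
Result = op.minimize(fun=CostFunc,
                     x0=initial_theta,
                     args=(X, y, l),
                     method='TNC',
                     jac=Gradient,
                     options={'maxfun': 400})       # evaluation budget for TNC
print(Result.success, Result.status, Result.message)
print(getattr(Result, 'nit', None))                 # iteration count, when reported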