I am using the below code for logistic regression with regularization in python. Its giving me 80% accuracy on the training set itself.
I am using minimize method 'TNC'. With BFG the results are of 50%.
What is the ideal method(equivalent to fminunc in Octave) to use for gradient descent?
How can I increase or decrease iteration?
What is the default iteration?
Any other suggestion/approach to improve performance?
The same algo in Octave with fminunc gives 83% accuracy on the training set.
import numpy as np
import scipy.optimize as op
from sklearn import preprocessing
import matplotlib.pyplot as plt
from matplotlib import style
from pylab import scatter, show, legend, xlabel, ylabel
from numpy import loadtxt, where
from sklearn.preprocessing import PolynomialFeatures
def sigmoid(z):
return 1/(1 + np.exp(-z));
def Gradient(theta,X,y,l):
m,n = X.shape
#print("theta shape")
#print(theta.shape)
theta = theta.reshape((n,1))
thetaR = theta[1:n,:]
y = y.reshape((m,1))
h = sigmoid(X.dot(theta))
nonRegGrad = ((np.sum(((h-y)*X),axis=0))/m).reshape(n,1)
reg = np.insert((l/m)*thetaR,0,0,axis=0)
grad = nonRegGrad + reg
return grad.flatten();
def CostFunc(theta,X,y,l):
h = sigmoid(X.dot(theta))
m,n=X.shape;
#print("theta shape")
#print(theta.shape)
theta = theta.reshape((n,1))
thetaR = theta[1:n,:]
cost=np.sum((np.multiply(-y,np.log(h))-np.multiply((1-y),np.log(1-h))))/m
reg=(l/(2*m))* np.sum(np.square(thetaR))
J=cost+reg
return J;
def predict(theta,X):
m,n=X.shape;
return np.round(sigmoid(X.dot(theta.reshape(n,1))));
data = np.loadtxt(open("ex2data2.txt","rb"),delimiter=",",skiprows=1)
nr,nc = data.shape
X=data[:,0:nc - 1]
#X=preprocessing.scale(X)
#X=np.insert(X,0,1,axis=1)
y= data[:,[nc - 1]]
pos = where(y == 1)
neg = where(y == 0)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
xlabel('Microchip Test 1')
ylabel('Microchip Test 2')
legend(['Passed', 'Failed'])
show()
storeX=X
poly = PolynomialFeatures(6)
X=poly.fit_transform(X)
#print(X.shape)
m , n = X.shape;
initial_theta = np.zeros((n,1));
#initial_theta = zeros(shape=(it.shape[1], 1))
l = 1
# Compute and display initial cost and gradient for regularized logistic
# regression
#cost, grad = cost_function_reg(initial_theta, X, y, l)
#def decorated_cost(theta):
# return cost_function_reg(theta, X, y, l)
#print fmin_bfgs(decorated_cost, initial_theta, maxfun=400)
print("Calling optimization")
Result = op.minimize(fun = CostFunc,
x0 = initial_theta,
args = (X, y,l),
method = 'TNC',
jac = Gradient);
optimal_theta = Result.x;
print(Result.x.shape)
print("optimal theta")
print(optimal_theta)
p=predict(optimal_theta,X)
accuracy = np.mean(np.double(p==y))
print("accuracy")
print(accuracy)
enter code here
Related
I am trying to use curve fitting to find coefficients for an equation using multiple datasets. The equation itself is piecewise, it is defined as :
In this equation, we don't know the break point Po. The variable
I have tried using scipy curve_fit and lmfit. Curve_fit succefully fitted the data for some datasets but failed miserably in others. Here is the code for lmfit inspired by this answer and Curve_fit inspired by this answer:
import pandas as pd
import matplotlib
from scipy.signal import savgol_filter
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.transforms as mtrans
from sklearn import linear_model
import csv
from scipy import stats
from sklearn import preprocessing
from scipy.special import erf,erfc
from lmfit import minimize, Parameters, Model
from sklearn.linear_model import LinearRegression
power_level_for_prediction = [45,50,60,69,71,88]
group_by_column = "mem_pow"
critical_device_power_name = "core_pow"
files = pd.read_csv("file_path")
def residual(params,x,y = None):
param1 = params['a']
param2 = params['b']
param3 = params['x0']
param4 = params['c']
param5 = params['d']
dx = (max(x) - min(x))/(len(x) -1)
xhi = (erf((x-param3)/dx) + 1)/2.0
xlo = (erfc((x-param3)/dx) + 1)/2.0
# p = xlo*param4*np.exp(param5*x) + xhi*(param1*x+param2)
p = xlo*(param1*x + param2) + xhi*(param4*x + param5)
# p = param1*x + param2
# p[np.where(param2 < x)] = param3*x + param2
if y is None:
return p
return p - y
def linear_lmfit(x,y):
params = Parameters()
params.add('a', value = 0.1)
params.add('b', value = 0.2)
params.add('c', value = 0.3)
params.add('d', value = 0.4,min = -5, max =5)
params.add('x0', value = 120)
out = minimize(residual,params,args = (x,y))
fit = residual(out.params,x)
return fit
def piecewise_linear(x, x0, y0, a, c):
# Represntation of above equation. here b and d from above equation, would remain same.
return np.piecewise(x, [x< x0],[lambda x: a*x + y0-a*x0, lambda x: c*x + y0-c*x0])
def linear(files):
files_grouped = files.groupby(group_by_column)
rows, columns = (2,3)
fig, ax = plt.subplots(rows,columns,figsize = (20,10))
k = 0
for name, group in files_grouped:
x = group[critical_device_power_name].to_numpy().astype(float)
y = group['elapsed_time'].to_numpy().astype(float)
if name in power_level_for_prediction:
i = math.floor( k / columns)
j = k % columns
p ,e = curve_fit(piecewise_linear,x,y)
#pred = piecewise_linear(x,*p)
pred = linear_lmfit(x,y)
ax[i][j].plot(x,y,label = "Actual Elapsed Time")
ax[i][j].plot(x,pred, label = "Predicted Elapsed Time")
ax[i][j].grid()
ax[i][j].set_title(f"Prediction Result for {name}W {group_by_column}")
ax[i][j].set_ylabel(r"$T_c$ (sec)")
ax[i][j].set_xlabel(f"{critical_device_power_name}")
ax[i][j].legend(title = f'{group_by_column}')
k = k+1
fig.suptitle(f"{experiment_name}")
fig.tight_layout()
plt.show()
Result using LMFIT:
I have no clue, why LMFIT is showing this type of result. Do you think is it because of the intial value.
and here is the result for the curve_fit:
As seen in the graph, for some mem_pow values the graph is somewhat good but for other it is quite bad. I am unable to understand the reason behind this. In my opinion, the curve fitting is failling for mem_pow level because the second piecwise function is quite flat and the function fails to fit that part.
Here is the csv file :
https://gist.github.com/kulnaman/8952e9c14ec5e8dcf2bbbd40f2dccdaa
I want to find the parameters of a Weibull distribution by minimizing the parameters using Kullbak-Leibler method. I found a code here which did the same thing. I replaced the Normal distributions in the original code by the Weibull distributions. I do not know why I get “Nan” parameters and “Nan” Kullback-Leibler divergence value. Can anyone please help?
import numpy as np
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import seaborn as sns
sns.set()
from scipy.stats import weibull_min
learning_rate = 0.001
epochs = 100
x = np.arange(0, 2000,0.001)
p_pdf=weibull_min.pdf(x, 1.055,0, 468).reshape(1, -1)
p = tf.placeholder(tf.float64, shape=p_pdf.shape)
alpha = tf.Variable(np.zeros(1))
beta = tf.Variable(np.eye(1))
weibull=(beta / alpha) * ((x / alpha)**(beta - 1)) * tf.exp(-((x / alpha)**beta))
q = weibull
kl_divergence = tf.reduce_sum(tf.where(p == 0, tf.zeros(p_pdf.shape, tf.float64), p * tf.log(p / q)))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(kl_divergence)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
history = []
alphas = []
betas = []
for i in range(epochs):
sess.run(optimizer, { p: p_pdf })
if i % 10 == 0:
history.append(sess.run(kl_divergence, { p: p_pdf }))
alphas.append(sess.run(alpha)[0])
betas.append(sess.run(beta)[0][0])
for a, b in zip(alphas, betas):
q_pdf =weibull_min.pdf(x, b,0,a)
plt.plot(x, q_pdf.reshape(-1, 1), c='red')
plt.title('KL(P||Q) = %1.3f' % history[-1])
plt.plot(x, p_pdf.reshape(-1, 1), linewidth=3)
plt.show()
plt.plot(history)
plt.show()
sess.close()
Try initialising your alphas to not be 0. Perhaps initialise to np.ones(1) instead.
If you use an alpha of zero you will get a nan with scipy.
from scipy.stats import weibull_min
weibull_min.pdf(100, 0, 0, 2.), weibull_min.pdf(100, 1, 0, 2.)
(nan, 9.643749239819589e-23)
I have some troubles with plotting a polynomial with confidence bounds. Here's my code:
import matplotlib.pyplot as plt
import numpy as np
X = np.array([-5.965215369881319, -40.41538208207736, -15.584956679988448, -6.073510488327594, -11.784890508714675, -7.754674907524617, -17.482364474520395, 2.4347468437246667, -16.133111795228572, -15.815302670890363, 5.9730059659614305, -19.249139823392717, 4.044936045002517, -7.102200416593474, 5.035187269390729, -23.543269648523623, -12.593308808761405, -21.08859785268947, -24.712871361819676, 2.4347468437246667, -21.028901691001877, -15.815302670890363, 7.208054914877421, -29.6589088548177])
Y = np.array([-2.6822693448184607, -23.168555618313547, -3.6166894384329806, -3.5137320916685866, -3.770179381108618, -12.788411352407874, -15.698803377485447, 1.9978332067376703, -11.838042662997829, -8.377671546754629, 8.109573809406804, -14.749849913813343, 2.8160696371542833, -3.3810722874645083, 5.560322978176329, -16.710386872172883, -6.795050134412731, -9.855604995547115, -25.386715163603533, 1.9978332067376703, -11.828949808296766, -8.402106796338003, 7.631911984593458, -18.155638519731614])
#
plt.plot(X, Y, '.')
poly_degree = 5
sd_cutoff = 1 # 2 keeps everything
coeffs = np.polyfit(X, Y, poly_degree)
poly_eqn = np.poly1d(coeffs)
Y_hat = poly_eqn(X)
delta = Y - Y_hat
sd_p = np.std(delta)
ok = abs(delta) < sd_p * sd_cutoff
plt.scatter(X, Y, color=np.where(ok, 'k', 'r'))
plt.fill_between(
X,
Y_hat - sd_p * sd_cutoff,
Y_hat + sd_p * sd_cutoff,
color='#00000020')
plt.plot(X, Y_hat)
Why my polynomial seems so strange?
https://imgur.com/a/hf4gY3P
Since the linked question does not provide a sorting solution, here the code that will sort X-Y pairs:
...
ind = np.argsort(X)
X = X[ind]
Y = Y[ind]
...
Output:
I'm trying to implement a Gaussian fitting using TensorFlow-probability's Nelder-Mead optimizer: tfp.optimizer.nelder_mead_minimize(). It does not converge, while scipy.optimizer.minimize() provide good result in less than 1 second of computation time. I am probably doing something wrong but i can't figure what ? Can someone help me on this ?
I am using :
python 3.7.3
tensorflow-probability 0.8
tensorflow 2.0
Here's my code :
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import matplotlib as plt
# Define the gaussian model : y = 1/(sigma * sqrt(2 pi)) * exp(- (x-mu)²/(2*sigma²))
pi = np.pi
def model(x, theta):
y = 1/(theta[1]*tf.sqrt(2*pi)) * tf.exp(-(x-theta[0])**2 /(2*theta[1]**2))
return y
# Define the loss (least mean square)
def loss_function(theta, y, x, callback=False, n_iterations=1):
global n_epochs_cb
loss = tf.losses.mean_squared_error(y, model(x, theta))
if callback:
if n_epochs_cb % (n_iterations/10.) == 0:
print('{0:4d} {1:} {2: 3.6f}'.format(n_epochs_cb, theta, loss))
n_epochs_cb = n_epochs_cb + 1
return loss
# Generate some data
theta_true = [3, 2]
X = np.arange(-10, 10, 0.5)
Y = model(X, theta_true)
# fig, ax = plt.subplots(1, 1, figsize = [20, 10])
# ax.scatter(X, Y, label='data', alpha=0.5)
# initialize parameters
theta = tf.constant(tf.random.uniform([2], 0, 10), dtype=tf.float32, name='theta')
n_iterations = 100
n_epochs_cb = 1
# minimization
print('{0:4s} {1:9s} {2:9s}'.format('Iter', 'theta', 'loss'))
optim_results = tfp.optimizer.nelder_mead_minimize(lambda theta: loss_function(theta, X, Y, True, n_iterations),
initial_vertex=theta,
func_tolerance=1e-8,
position_tolerance=1e-8,
max_iterations=n_iterations)
print("theta_true", theta_true)
print("theta_est", optim_results.position.numpy())
print("convergenced:", optim_results.converged.numpy())
print("number of function evaluation", optim_results.num_objective_evaluations.numpy())
print("number of iterations", optim_results.num_iterations.numpy())
print("objective value", optim_results.objective_value.numpy())
Optimization stop around 50 iterations and return :
Iter theta loss
10 [0.1448533 6.7525005] 33.408031
20 [-0.2385819 28.76061 ] 33.382130
30 [ -4.1879644 260.84622 ] 33.375771
40 [ -34.722183 2053.5083 ] 33.375099
50 [ -418.6432 24589.836 ] 33.375008
theta_true [3, 2]
theta_est [ -488.44122 28687.352 ]
convergenced: True
number of function evaluation 55
number of iterations 35
objective value 33.375008
I run the same problem using minimize() from scipy.optimize with 'Nelder-Mead' method and it gives :
Iter theta loss
10 [4.61612335 4.40795762] 0.007583
20 [3.19502416 2.09290338] 0.001023
30 [3.01845636 1.99504269] 0.000091
40 [2.99843397 2.00065615] 0.000010
Optimization terminated successfully.
Current function value: 0.000010
Iterations: 44
Function evaluations: 96
computation time 0.046 seconds
I would expect to have the same performance using tensorflow-probability's Nelder-Mead optimizer and Scipy.optimize's Nelder-mead optimizer.
What am I doing wrong ?
Edit :
Found the mistake in the definition of the loss function. the following code is now converging :
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import matplotlib as plt
# Define the gaussian model : y = 1/(sigma * sqrt(2 pi)) * exp(- (x-mu)²/(2*sigma²))
pi = np.pi
def model(x, theta):
y = 1/(theta[1]*tf.sqrt(2*pi)) * tf.exp(-(x-theta[0])**2 /(2*theta[1]**2))
return y
# Define the loss (least mean square)
def loss_function(theta, y, x, callback=False, n_iterations=1):
global n_epochs_cb
loss = tf.losses.mean_squared_error(y, model(x, theta))
if callback:
if n_epochs_cb % (n_iterations/10.) == 0:
print('{0:4d} {1:} {2: 3.6f}'.format(n_epochs_cb, theta, loss))
n_epochs_cb = n_epochs_cb + 1
return loss
# Generate some data
theta_true = [3, 2]
X = np.arange(-10, 10, 0.5, dtype=np.float32)
Y = model(X, theta_true)
# fig, ax = plt.subplots(1, 1, figsize = [20, 10])
# ax.scatter(X, Y, label='data', alpha=0.5)
# initialize parameters
theta = tf.constant(tf.random.uniform([2], 0, 10), dtype=tf.float32, name='theta')
print("theta_true", theta_true)
print("theta_init", theta.numpy())
n_iterations = 100
n_epochs_cb = 1
# minimization
print('{0:4s} {1:9s} {2:9s}'.format('Iter', 'theta', 'loss'))
optim_results = tfp.optimizer.nelder_mead_minimize(lambda theta: loss_function(theta, Y, X, True, n_iterations),
initial_vertex=theta,
func_tolerance=1e-8,
position_tolerance=1e-8,
max_iterations=n_iterations)
print("theta_est", optim_results.position.numpy())
print("convergenced:", optim_results.converged.numpy())
print("number of function evaluation", optim_results.num_objective_evaluations.numpy())
print("number of iterations", optim_results.num_iterations.numpy())
print("objective value", optim_results.objective_value.numpy())
Recently I've found interesting article about regression clustering algorithm which can deal both tasks of regression and clustering:
http://ncss.wpengine.netdna-cdn.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Regression_Clustering.pdf
I'm just curios-is there some technics (libraries) to do it via Python? Thanks!
The algorithm of Spath is not implemented in Python, as far as I know.
But you could replicate its results using Gaussian mixture models in scikit-learn:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
# generate random data
np.random.seed(1)
n = 10
x1 = np.random.uniform(0, 20, size=n)
x2 = np.random.uniform(0, 20, size=n)
y1 = x1 + np.random.normal(size=n)
y2 = 15 - x2 + np.random.normal(size=n)
x = np.concatenate([x1, x2])
y = np.concatenate([y1, y2])
data = np.vstack([x, y]).T
model = GaussianMixture (n_components=2).fit(data)
plt.scatter(x, y, c=model.predict(data))
plt.show()
This code produces the picture, similar to one in the paper:
The GMM is different from Spath algorithm, because the former tries to maximize prediction accuracy of ALL data (X and y), and the latter maximizes only R^2 of y. In my opinion, for most practical problems you would prefer the GMM.
If you still want the Spath algorithm, it could be done with a class like this, implementing a version of EM algorithm:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.base import RegressorMixin, BaseEstimator, clone
class ClusteredRegressor(RegressorMixin, BaseEstimator):
def __init__(self, n_components=2, base=Ridge(), random_state=1, max_iter=100, tol=1e-10, verbose=False):
self.n_components = n_components
self.base = base
self.random_state = random_state
self.max_iter = max_iter
self.tol = tol
self.verbose = verbose
def fit(self, X, y):
np.random.seed(self.random_state)
self.estimators_ = [clone(self.base) for i in range(self.n_components)]
# initialize cluster responsibilities randomly
self.resp_ = np.random.uniform(size=(X.shape[0], self.n_components))
self.resp_ /= self.resp_.sum(axis=1, keepdims=True)
for it in range(self.max_iter):
old_resp = self.resp_.copy()
# Estimate sample-weithted regressions
errors = np.empty(shape=self.resp_.shape)
for i, est in enumerate(self.estimators_):
est.fit(X, y, sample_weight=self.resp_[:, i])
errors[:, i] = y - est.predict(X)
self.mse_ = np.sum(self.resp_ * errors**2) / X.shape[0]
if self.verbose:
print(self.mse_)
# Recalculate responsibilities
self.resp_ = np.exp(-errors**2 / self.mse_)
self.resp_ /= self.resp_.sum(axis=1, keepdims=True)
# stop if change in responsibilites is small
delta = np.abs(self.resp_ - old_resp).mean()
if delta < self.tol:
break
self.n_iter_ = it
return self
def predict(self, X):
""" Calculate a matrix of conditional predictions """
return np.vstack([est.predict(X) for est in self.estimators_]).T
def predict_proba(self, X, y):
""" Estimate cluster probabilities of labeled data """
predictions = self.predict(X)
errors = np.empty(shape=self.resp_.shape)
for i, est in enumerate(self.estimators_):
errors[:, i] = y - est.predict(X)
resp_ = np.exp(-errors**2 / self.mse_)
resp_ /= resp_.sum(axis=1, keepdims=True)
return resp_
This code is similar to Spath algorithm, with the only difference that it uses soft "responsibilities" of each cluster for each observation, instead of hard cluster assignment (this way, it is easier for optimization). You can see that the resulting cluster assignment is similar to GMM:
model = ClusteredRegressor()
model.fit(x[:, np.newaxis], y)
labels = np.argmax(model.resp_, axis=1)
plt.scatter(x, y, c=labels)
plt.show()
Unfortunately, this model cannot be applied to predict test data, because its output depends on data labels (y). However, if you further modify my code, you could predict cluster probability conditional on X. In this case, the model would be useful for prediction.