I am learning a little bit about using classes to make my code easier to modify. I was working on defining an ml_setup class that calls a Pearson/Spearman correlation calculation from a separate function.
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split


class ml_setup:

    def __init__(self, df, dropcols, ycol, **kwargs):
        self.ycol = ycol
        self.df = df
        if 'stratify' in kwargs:
            self.stratify = kwargs['stratify']
        else:
            self.stratify = None
        self.train_Y = df[ycol]
        self.train_X = df.drop(columns=dropcols)
        if 'seed' in kwargs:
            self.seed = kwargs['seed']
        else:
            self.seed = self.seed_gen()
        if 'test' in kwargs:
            self.test = kwargs['test']
        else:
            self.test = 0.3
        if 'final_model' in kwargs:
            self.final_model = kwargs['final_model']
        else:
            self.final_model = None

    def seed_gen(self):
        seed = np.random.randint(0, 2**32 - 1)
        return seed

    def linear_reg(self, positive=False):
        self.regr = linear_model.LinearRegression(positive=positive)
        if self.final_model is None:
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                self.train_X, self.train_Y, test_size=self.test,
                random_state=self.seed, shuffle=True, stratify=self.stratify)
            # for test/train
            self.regr.fit(self.X_train, self.y_train)
            self.predictions = self.regr.predict(self.X_test)
            #print(y_test)
            #print(self.predictions)
            #print(self.y_test[self.ycol])
            self.p, self.s = pearson_stat(self.y_test[self.ycol], self.predictions, print_out='no')
            self.r2 = r_squared(self.y_test, self.predictions)  # r_squared is a separate helper, not shown here
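For context, here is roughly how the class ends up being used later in the question. This is a hypothetical sketch: the DataFrame, column names, and the ml_mods dict are made up to mirror the later snippets, and note that ycol is passed as a list:

import numpy as np
import pandas as pd

# made-up data just to exercise the constructor
df = pd.DataFrame({'exp_val': np.random.rand(50),
                   'feat_1': np.random.rand(50),
                   'feat_2': np.random.rand(50)})

ml_mods = {}
seed = 42
ml_mods[seed] = ml_setup(df, dropcols=['exp_val'], ycol=['exp_val'], seed=seed)
print(type(ml_mods[seed].train_Y))   # DataFrame, because ycol is a list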
The pearson_stat function that the class calls looks like the following:
def pearson_stat(x_data, y_data, print_out='no'):
    import scipy.stats as ss
    p = ss.pearsonr(x_data, y_data)
    s = ss.spearmanr(x_data, y_data)
    if print_out == 'yes':
        print('Pearson rho = {:.4f}, P = {:.4g}'.format(*p))
        print('Spearman r = {:.4f}, P = {:.4g}'.format(*s))
    return p[0], s[0]
This code works perfectly if I pass x and y to the pearson_stat function in the following way:
a, s = pearson_stat(ml_mods[seed].y_test['exp_val'],ml_mods[seed].y_test['exp_val'])
But if I now select the column through the class attribute ycol, as below, it doesn't work and gives me the following error:
a, s = pearson_stat(ml_mods[seed].y_test[ml_mods[seed].ycol],ml_mods[seed].y_test[ml_mods[seed].ycol])
xmean = x.mean(dtype=dtype)
File "../anaconda3/envs/py3/lib/python3.6/site-packages/numpy/core/_methods.py", line 160, in _mean
ret = umr_sum(arr, axis, dtype, out, keepdims)
TypeError: No loop matching the specified signature and casting was found for ufunc add
Can you help me understand this?
So, after the pointer from @hpaulj, I realized that the way I selected the column returned a DataFrame rather than a Series. For the Pearson calculation, I needed the data as a Series (i.e., array-like).
type(ml_mods[seed].y_test[ml_mods[seed].ycol])
<class 'pandas.core.frame.DataFrame'>
type(ml_mods[seed].y_test['exp_val'])
<class 'pandas.core.series.Series'>
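For anyone hitting the same thing, the difference comes from indexing with a list versus a plain string. A minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({'exp_val': [1.0, 2.0, 3.0]})

print(type(df['exp_val']))       # string key    -> Series
print(type(df[['exp_val']]))     # list-like key -> DataFrame

ycol = ['exp_val']               # as in the class above, ycol is a list here
print(type(df[ycol]))            # DataFrame
print(type(df[ycol].squeeze()))  # back to a Series
print(type(df[ycol[0]]))         # or index with the string itself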
I'm trying to add two optional arguments to a function that trains a GLM using the statsmodels package. I used this question to guide the development of the function: How do I create a Python function with optional arguments?
Basically, I want to give the user the ability to use OR not use weights and offsets.
This is the function:
def model_train(df, formula, *args, **kwargs):
    '''
    run non discrete model
    df = model set
    formula = model formula
    weight = column used for weights
    offset = column used for offsets
    '''
    weight = kwargs.get(df[weight], None)
    print(f"Weights initialized....Starting to intialize offsets")
    offset_factor = kwargs.get(df[offset], None)
    #print(f"Offset initialized....starting matrix development")
    y, x = patsy.dmatrices(formula, df, return_type='dataframe')
    print(f"Matrix done...starting to instantiate model")
    glm = sm.GLM(y, x, family=sm.families.Poisson(), var_weights=weight, offset=offset_factor)
    print(f"Model instantiated....starting to fit")
    glm_results = glm.fit()
    print("Model fit. If you are reading this, you're done. Run 'model_object'[0].summary() to get summary statistics")
    return glm_results, x, y
This is the error it throws:
---------------------------------------------------------------------------
UnboundLocalError Traceback (most recent call last)
<ipython-input-34-0ce97f02e15e> in <module>
----> 1 model_80150 = model_train(df = train_model1, formula=formula_80150, weight = 'eunit', offset = None)
~\Documents\GitHub\Edit\run_model.py in model_train(df, formula, *args, **kwargs)
7 offset = column used for offsets
8 '''
----> 9 weight = kwargs.get(df[weight], None)
10 print(f"Weights initialized....Starting to intialize offsets")
11
UnboundLocalError: local variable 'weight' referenced before assignment
EDIT UPDATE:
I've tried the following, which raises TypeError: unsupported operand type(s) for &: 'NoneType' and 'str':
def model_train(df, formula, *args, **kwargs):
    '''
    run non discrete model
    df = model set
    formula = model formula
    weight = column used for weights
    offset = column used for offsets
    '''
    weight_value = kwargs.get('weight', None)
    print(f"Weights initialized....Starting to intialize offsets")
    offset_factor = kwargs.get('offset', None)
    print(f"Offset initialized....starting matrix development")
    y, x = patsy.dmatrices(formula, df, return_type='dataframe')
    print(f"Matrix done...starting to instantiate model")
    if weight_value == None:
        glm = sm.GLM(y, x, family=sm.families.Poisson())
    elif weight_value == None & offset_factor != None:
        glm = sm.GLM(y, x, family=sm.families.Poisson(), offset=df[offset_factor])
    elif weight_value != None and offset_factor == None:
        glm = sm.GLM(y, x, family=sm.families.Poisson(), var_weights=df[weight_value])
    else:
        glm = sm.GLM(y, x, family=sm.families.Poisson(), var_weights=df[weight_value], offset=df[offset_factor])
    print(f"Model instantiated....starting to fit")
    glm_results = glm.fit()
    print("Model fit. If you are reading this, you're done. Run 'model_object'[0].summary() to get summary statistics")
    return glm_results, x, y
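For what it's worth, the TypeError in the edit comes from & binding more tightly than the comparison operators, so Python evaluates None & offset_factor before anything else. Below is a sketch of one way the optional arguments could be handled, using plain keyword defaults, is None checks, and a conditionally built keyword dict (names kept from the question; not necessarily the interface you want):

import patsy
import statsmodels.api as sm

def model_train(df, formula, weight=None, offset=None):
    """Fit a Poisson GLM, optionally using weight/offset columns of df."""
    y, x = patsy.dmatrices(formula, df, return_type='dataframe')

    glm_kwargs = {}
    if weight is not None:    # use `is None` / `is not None`, and plain `and`, not `&`
        glm_kwargs['var_weights'] = df[weight]
    if offset is not None:
        glm_kwargs['offset'] = df[offset]

    glm = sm.GLM(y, x, family=sm.families.Poisson(), **glm_kwargs)
    glm_results = glm.fit()
    return glm_results, x, y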
I am trying to sample the posterior of an ODE's parameters using a likelihood whose mean is the logarithm of the ODE solution for a given choice of parameters and initial value. This is based on the tutorial found here. I can replicate the tutorial, but I can't make my model work. My model's ODE is:
dQ(t)/dt = (1/K)*(R(t) - Q(t))
where R(t) is based on rainfall data that I input.
I am assigning priors to the noise standard deviation \sigma, the initial value Q(0), and the parameter K.
Any help on how to overcome the error would be much appreciated :)
This is my code:
from scipy.integrate import odeint
from scipy.interpolate import interp1d
import numpy as np
import pandas as pd
import theano
from theano import *
import pymc3 as pm
import theano.tensor as tt

THEANO_FLAGS = 'optimizer=fast_compile'
theano.config.exception_verbosity = 'high'
theano.config.floatX = 'float64'

n_states = 1
n_odeparams = 1
n_ivs = 1


class LinearReservoirModel(object):

    def __init__(self, n_states, n_odeparams, n_ivs, net_rainfall_data, y0=None):
        self._n_states = n_states
        self._n_odeparams = n_odeparams
        self._n_ivs = n_ivs
        self._y0 = y0
        self._nr = net_rainfall_data

    def simulate(self, parameters, times):
        return self._simulate(parameters, times, self._nr, False)

    def simulate_with_sensitivities(self, parameters, times):
        return self._simulate(parameters, times, self._nr, True)

    def _simulate(self, parameters, times, net_rainfall_data, sensitivities):
        k, q0 = [x for x in parameters]

        # Interpolate net_rainfall
        nr_int = interp1d(times, net_rainfall_data, fill_value="extrapolate", kind='slinear')

        def r(q, time, k, nrint):
            return (nrint(time) - q) * (1. / k)

        if sensitivities:
            def jac(k):
                ret = np.zeros((self._n_states, self._n_states))
                ret[0, 0] = (-1. / k)
                return ret

            def dfdp(x, t, k, nrint):
                ret = np.zeros((self._n_states,
                                self._n_odeparams + self._n_ivs))
                ret[0, 0] = (-1. / (k**2)) * (nrint(t) - x)
                return ret

            def rhs(q_and_dqdp, t, k, nrint):
                q = q_and_dqdp[0:self._n_states]
                dqdp = q_and_dqdp[self._n_states:].reshape((self._n_states,
                                                            self._n_odeparams + self._n_ivs))
                dqdt = r(q, t, k, nrint)
                # print('jacobian', jac(q))
                # print('dqdp', dqdp)
                # print('dfdp', dfdp(q, t, nrint))
                d_dqdp_dt = jac(k) * dqdp + dfdp(q, t, k, nrint)  # CHANGED CODE HERE np.matmul(jac(q), dqdp) + dfdp(q,t,nrint)
                return np.concatenate((dqdt, d_dqdp_dt.reshape(-1)))

            y0 = np.zeros((n_states * (n_odeparams + n_ivs)) + n_states)  # CHANGED CODE HERE 2*
            y0[2] = 1.  # \frac{\partial [X]}{\partial Xt0} at t==0, and same below for Y
            y0[0:n_states] = q0
            result = odeint(rhs, y0, times, (k, nr_int), rtol=1e-6, atol=1e-5)
            values = result[:, 0:self._n_states]
            dvalues_dp = result[:, self._n_states:].reshape((len(times),
                                                             self._n_states,
                                                             self._n_odeparams + self._n_ivs))
            return values, dvalues_dp
        else:
            q = odeint(r, q0, times, args=(k, nr_int), rtol=1e-6, atol=1e-5)
            q_flat = [item for sublist in q for item in sublist]
            return q_flat


q = [0.01, 0.084788051, 0.289827287, 0.487426902, 0.623592162, 0.855202214, 0.901709887, 0.87936577, 0.857067839, 0.775516564, 0.701725939, 0.675138958, 0.68101658, 0.64644605, 0.701305112, 0.747128907, 0.676039744, 0.668502137, 0.731464651, 0.766588801]
nr = [1.618666063, 0.0001, 4.405308823, 0.394073731, 3.392555321, 2.733285785, 0.0001, 1.31186209, 0.0001, 0.0001, 0.0001, 0.83074128, 0.646141131, 0.0001, 2.405660466, 0.0001, 0.0001, 1.174002978, 1.481146447, 0.73244669]

ode_model = LinearReservoirModel(n_states, n_odeparams, n_ivs, nr)


class ODEGradop(theano.Op):

    def __init__(self, numpy_vsp):
        self._numpy_vsp = numpy_vsp

    def make_node(self, x, g):
        x = theano.tensor.as_tensor_variable(x)
        g = theano.tensor.as_tensor_variable(g)
        node = theano.Apply(self, [x, g], [g.type()])
        return node

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]
        g = inputs_storage[1]
        out = output_storage[0]
        out[0] = self._numpy_vsp(x, g)  # get the numerical VSP


class ODEop(theano.Op):

    def __init__(self, state, numpy_vsp):
        self._state = state
        self._numpy_vsp = numpy_vsp

    def make_node(self, x):
        x = theano.tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs_storage, output_storage):
        x = inputs_storage[0]
        out = output_storage[0]
        out[0] = self._state(x)  # get the numerical solution of ODE states

    def grad(self, inputs, output_grads):
        x = inputs[0]
        g = output_grads[0]
        grad_op = ODEGradop(self._numpy_vsp)  # pass the VSP when asked for gradient
        grad_op_apply = grad_op(x, g)
        return [grad_op_apply]


class solveCached(object):

    def __init__(self, times, n_params, n_outputs):
        self._times = times
        self._n_params = n_params
        self._n_outputs = n_outputs
        self._cachedParam = np.zeros(n_params)
        self._cachedSens = np.zeros((len(times), n_outputs, n_params))
        self._cachedState = np.zeros((len(times), n_outputs))

    def __call__(self, x):
        if np.all(x == self._cachedParam):
            state, sens = self._cachedState, self._cachedSens
        else:
            state, sens = ode_model.simulate_with_sensitivities(x, times)
        return state, sens


times = np.arange(0, len(q))  # number of measurement points (see below)
cached_solver = solveCached(times, n_odeparams + n_ivs, n_states)


def state(x):
    State, Sens = cached_solver(np.array(x, dtype=np.float64))
    cached_solver._cachedState, cached_solver._cachedSens, cached_solver._cachedParam = State, Sens, x
    return State.reshape((len(State),))


def numpy_vsp(x, g):
    numpy_sens = cached_solver(np.array(x, dtype=np.float64))[1].reshape((n_states * len(times), len(x)))
    return numpy_sens.T.dot(g)


# Define the data matrix
Q = np.vstack((q))

# Now instantiate the theano custom ODE op
my_ODEop = ODEop(state, numpy_vsp)

# The probabilistic model
with pm.Model() as LR_model:

    # Priors for unknown model parameters
    k = pm.Uniform('k', lower=0.01, upper=10)

    # Priors for initial conditions and noise level
    q0 = pm.Lognormal('q0', mu=np.log(1.2), sd=1)
    sigma = pm.Lognormal('sigma', mu=-1, sd=1, shape=1)

    # Forward model
    all_params = pm.math.stack([k, q0], axis=0)
    ode_sol = my_ODEop(all_params)
    forward = ode_sol.reshape(Q.shape)
    # log_forward = pm.math.log(forward)
    # log_forward_print = tt.printing.Print('log_forward')(log_forward.shape)
    # tt.printing.Print('sigma')(sigma.shape)

    # Likelihood
    Q_obs = pm.Lognormal('Q_obs', mu=pm.math.log(forward), sd=sigma, observed=Q)

    print(LR_model.check_test_point())
    # Y_obs_print = tt.printing.Print('Y_obs')(Y_obs)

    trace = pm.sample(n_init=1500, tune=1000, chains=1, init='adapt_diag')

trace['diverging'].sum()
If you run the code above you should be able to reproduce the following error:
Traceback (most recent call last):
File "examples/myexample.py", line 195, in <module>
trace = pm.sample(1500, tune=1000, chains=1, init='adapt_diag')
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/pymc3/sampling.py", line 457, in sample
trace = _sample_many(**sample_args)
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/pymc3/sampling.py", line 503, in _sample_many
step=step, random_seed=random_seed[i], **kwargs)
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/pymc3/sampling.py", line 544, in _sample
for it, strace in enumerate(sampling):
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/tqdm/std.py", line 1091, in __iter__
for obj in iterable:
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/pymc3/sampling.py", line 633, in _iter_sample
point, states = step.step(point)
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/pymc3/step_methods/arraystep.py", line 247, in step
apoint, stats = self.astep(array)
File "/Users/Yannis/.pyenv/versions/mini-project/lib/python3.6/site-packages/pymc3/step_methods/hmc/base_hmc.py", line 144, in astep
raise SamplingError("Bad initial energy")
pymc3.exceptions.SamplingError: Bad initial energy
PyMC3 Version: 3.7
Theano Version: 1.0.4
Python Version: 3.6.5
Operating system: macOS Catalina (v10.15.1)
How did you install PyMC3: pip (managed in a pyenv virtualenv)
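A quick sanity check worth running before sampling, alongside the check_test_point() call already in the listing: solve the ODE once with plain scipy at values near the prior medians and confirm the solution stays strictly positive, since mu = pm.math.log(forward) becomes nan or -inf for non-positive values, and that is one way to end up with "Bad initial energy". A minimal sketch reusing ode_model and times from the code above (the parameter values are just illustrative guesses):

# Hypothetical spot check: is the forward solution safe to take log() of?
test_params = [5.0, 1.2]   # [k, q0], roughly mid-prior; made-up values
q_test = np.array(ode_model.simulate(test_params, times))
print("min of forward solution:", q_test.min())
print("any non-positive values:", np.any(q_test <= 0))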
In the function read_train_sets(), an empty class called DataSets is created. It has no methods or variables. An object called data_sets is then created from it.
My question is: is data_sets.train an object of the class DataSet()?
Or are you creating a method called train() and setting it equal to an object of the DataSet() class?
Note that there are two classes called DataSet and DataSets in the code.
import cv2
import os
import glob
from sklearn.utils import shuffle
import numpy as np


def load_train(train_path, image_size, classes):
    images = []
    labels = []
    img_names = []
    cls = []
    print('Going to read training images')
    for fields in classes:
        index = classes.index(fields)
        print('Now going to read {} files (Index: {})'.format(fields, index))
        path = os.path.join(train_path, fields, '*g')
        files = glob.glob(path)
        for fl in files:
            image = cv2.imread(fl)
            image = cv2.resize(image, (image_size, image_size), 0, 0, cv2.INTER_LINEAR)
            image = image.astype(np.float32)
            image = np.multiply(image, 1.0 / 255.0)
            images.append(image)
            label = np.zeros(len(classes))
            label[index] = 1.0
            labels.append(label)
            flbase = os.path.basename(fl)
            img_names.append(flbase)
            cls.append(fields)
    images = np.array(images)
    labels = np.array(labels)
    img_names = np.array(img_names)
    cls = np.array(cls)
    return images, labels, img_names, cls
class DataSet(object):

    def __init__(self, images, labels, img_names, cls):
        self._num_examples = images.shape[0]
        self._images = images
        self._labels = labels
        self._img_names = img_names
        self._cls = cls
        self._epochs_done = 0
        self._index_in_epoch = 0

    @property
    def images(self):
        return self._images

    @property
    def labels(self):
        return self._labels

    @property
    def img_names(self):
        return self._img_names

    @property
    def cls(self):
        return self._cls

    @property
    def num_examples(self):
        return self._num_examples

    @property
    def epochs_done(self):
        return self._epochs_done

    def next_batch(self, batch_size):
        """Return the next `batch_size` examples from this data set."""
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # After each epoch we update this
            self._epochs_done += 1
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self._images[start:end], self._labels[start:end], self._img_names[start:end], self._cls[start:end]
def read_train_sets(train_path, image_size, classes, validation_size):
    class DataSets(object):
        pass
    data_sets = DataSets()

    images, labels, img_names, cls = load_train(train_path, image_size, classes)
    images, labels, img_names, cls = shuffle(images, labels, img_names, cls)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    validation_img_names = img_names[:validation_size]
    validation_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_img_names = img_names[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
    data_sets.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)

    return data_sets
You can dynamically assign attributes to your objects in Python. Try calling hasattr(data_sets, 'train') after the assignment (it asks whether data_sets has an attribute named train) and see what you get. You can also call type(data_sets.train) and convince yourself that it is indeed of type DataSet.
data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
This line assigns an instance of the DataSet class to data_sets.train.
With respect to the data_sets object, train and valid are simply two attributes attached to it. Hope this helps.
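A tiny self-contained illustration of this dynamic attribute assignment (the attached values here are made up; in the question they are DataSet instances):

class DataSets(object):
    pass

data_sets = DataSets()                 # empty instance, no attributes yet
print(hasattr(data_sets, 'train'))     # False

data_sets.train = [1, 2, 3]            # attach attributes at runtime
data_sets.valid = [4, 5]

print(hasattr(data_sets, 'train'))     # True
print(type(data_sets.train))           # <class 'list'> here; DataSet in the question's code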
I am using Python 3.6.4 and built a custom class in which I do a computation and change one of the internal class variables. I noticed that it always runs correctly the first time I run the algorithm (e.g. instantiating the class, etc.) and always fails the second time, even if it's the same line of code repeated twice in a row. I've been able to reproduce the error in its simplest form:
Why is the lr_space that is being changed in the first object propagating to the 2nd instantiated object?
from collections import OrderedDict
import numpy as np


class testing(object):
    def __init__(self,
                 n_iter=5,
                 n_space=10,
                 model_type="logistic",
                 lr_space={
                     "C": (1e-6, 1.0),
                     "penalty": ["l1", "l2"],
                 },
                 lr_kws=dict(max_iter=10000, solver="liblinear"),
                 ):
        self.n_iter = n_iter
        self.n_space = n_space
        # Logistic Regression
        self.lr_space = lr_space
        self.lr_kws = lr_kws
        print("", self, self.lr_space, "", sep="\n\t")
        self.model_type = model_type.lower()
        self.models = self._test_function()

    def _test_function(self):
        """
        Internal: Label models
        Need to extend this for using different hyperparameters
        """
        models = list()
        self.param_index = OrderedDict()
        # Indexing for hyperparameters and models
        a = np.ones(self.n_iter*2)
        b = np.arange(a.size)
        if self.model_type == "logistic":
            self.lr_space["C"] = np.linspace(*self.lr_space["C"], self.n_space)
        return models


print("=====Instantiating and running `instance_1`=====")
instance_1 = testing()
print("=====Instantiating and running `instance_2`=====")
instance_2 = testing()
Output:
=====Instantiating and running `instance_1`=====
<__main__.testing object at 0x136154400>
{'C': (1e-06, 1.0), 'penalty': ['l1', 'l2']}
=====Instantiating and running `instance_2`=====
<__main__.testing object at 0x127649390>
{'C': array([ 1.00000000e-06, 1.11112000e-01, 2.22223000e-01,
3.33334000e-01, 4.44445000e-01, 5.55556000e-01,
6.66667000e-01, 7.77778000e-01, 8.88889000e-01,
1.00000000e+00]), 'penalty': ['l1', 'l2']}
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in ()
38 instance_1 = testing()
39 print("=====Instantiating and running instance_2=====")
---> 40 instance_2 = testing()
<ipython-input-342-24f241984973> in __init__(self, n_iter, n_space, model_type, lr_space, lr_kws)
17 print("", self, self.lr_space,"", sep="\n\t")
18 self.model_type = model_type.lower()
---> 19 self.models = self._test_function()
20
21 def _test_function(self):
<ipython-input-342-24f241984973> in _test_function(self)
31 b = np.arange(a.size)
32 if self.model_type == "logistic":
---> 33 self.lr_space["C"] = np.linspace(*self.lr_space["C"], self.n_space)
34
35
TypeError: linspace() takes from 2 to 6 positional arguments but 11 were given
Solution:
If you instead assign the default value of lr_space inside the __init__ function, it works:
from collections import OrderedDict
import numpy as np


class testing(object):
    def __init__(self,
                 n_iter=5,
                 n_space=10,
                 model_type="logistic",
                 lr_space=None,
                 lr_kws=dict(max_iter=10000, solver="liblinear"),
                 ):
        if lr_space is None:
            lr_space = {
                "C": (1e-6, 1.0),
                "penalty": ["l1", "l2"],
            }
        self.n_iter = n_iter
        self.n_space = n_space
        # Logistic Regression
        self.lr_space = lr_space
        self.lr_kws = lr_kws
        print("", self, self.lr_space, "", sep="\n\t")
        self.model_type = model_type.lower()
        self.models = self._test_function()

    def _test_function(self):
        """
        Internal: Label models
        Need to extend this for using different hyperparameters
        """
        models = list()
        self.param_index = OrderedDict()
        # Indexing for hyperparameters and models
        a = np.ones(self.n_iter*2)
        b = np.arange(a.size)
        if self.model_type == "logistic":
            self.lr_space["C"] = np.linspace(*self.lr_space["C"], self.n_space)
        return models


print("=====Instantiating and running `instance_1`=====")
instance_1 = testing()
print("=====Instantiating and running `instance_2`=====")
instance_2 = testing()
Why:
When you give arguments default values in def __init__(...), those defaults are evaluated once, when the function is defined, not each time a new instance is created. This doesn't matter for simple immutable values such as 5 or "logistic", but if you use a dict, you are creating one object outside the instance and then binding every instance to that same object in the __init__ call.
This is a dangerous anti-pattern that you should avoid. You can read more about it here: Using a mutable default value as an argument
When you create new instances, the reference is assigned again, but it's still referencing the same object. Your code above is equivalent to:
default_dict = {
    "C": (1e-6, 1.0),
    "penalty": ["l1", "l2"],
}


class testing(object):
    def __init__(self,
                 n_iter=5,
                 n_space=10,
                 model_type="logistic",
                 lr_space=default_dict,
                 lr_kws=dict(max_iter=10000, solver="liblinear"),
                 ):
        # ... rest of __init__ unchanged ...
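The effect is easy to demonstrate in isolation, with a minimal sketch that has nothing to do with the class above:

def append_item(item, bucket=[]):   # the default list is created once, at function definition time
    bucket.append(item)
    return bucket

print(append_item(1))   # [1]
print(append_item(2))   # [1, 2] -- the same list object is shared across calls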
What I ended up doing was using deepcopy from the copy module in the standard library:
import copy
from collections import OrderedDict
import numpy as np


class testing(object):
    def __init__(self,
                 n_iter=5,
                 n_space=10,
                 model_type="logistic",
                 lr_space={
                     "C": (1e-6, 1.0),
                     "penalty": ["l1", "l2"],
                 },
                 lr_kws=dict(max_iter=10000, solver="liblinear"),
                 ):
        self.n_iter = n_iter
        self.n_space = n_space
        # Logistic Regression
        self.lr_space = copy.deepcopy(lr_space)   # work on a private copy of the shared default dict
        self.lr_kws = lr_kws
        print("", self, self.lr_space, "", sep="\n\t")
        self.model_type = model_type.lower()
        self.models = self._test_function()

    def _test_function(self):
        """
        Internal: Label models
        Need to extend this for using different hyperparameters
        """
        models = list()
        self.param_index = OrderedDict()
        # Indexing for hyperparameters and models
        a = np.ones(self.n_iter*2)
        b = np.arange(a.size)
        if self.model_type == "logistic":
            self.lr_space["C"] = np.linspace(*self.lr_space["C"], self.n_space)
        return models
I'm trying to use a one-vs-one composition of decision trees for multiclass classification. The problem is that when I pass different object (per-sample) weights to the classifier, the result stays the same.
Do I misunderstand something with weights, or do they just work incorrectly?
Thanks for your replies!
Here is my code:
from math import log, exp


class AdaLearner(object):
    def __init__(self, in_base_type, in_multi_type):
        self.base_type = in_base_type
        self.multi_type = in_multi_type

    def train(self, in_features, in_labels):
        model = AdaBoost(self.base_type, self.multi_type)
        model.learn(in_features, in_labels)
        return model


class AdaBoost(object):
    CLASSIFIERS_NUM = 100

    def __init__(self, in_base_type, in_multi_type):
        self.base_type = in_base_type
        self.multi_type = in_multi_type
        self.classifiers = []
        self.weights = []

    def learn(self, in_features, in_labels):
        labels_number = len(set(in_labels))
        self.weights = self.get_initial_weights(in_labels)
        for iteration in xrange(AdaBoost.CLASSIFIERS_NUM):
            classifier = self.multi_type(self.base_type())
            self.classifiers.append(classifier.train(in_features,
                                                     in_labels,
                                                     weights=self.weights))
            answers = []
            for obj in in_features:
                answers.append(self.classifiers[-1].apply(obj))
            err = self.compute_weighted_error(in_labels, answers)
            print err
            if abs(err - 0.) < 1e-6:
                break
            alpha = 0.5 * log((1 - err) / err)
            self.update_weights(in_labels, answers, alpha)
            self.normalize_weights()

    def apply(self, in_features):
        answers = {}
        for classifier in self.classifiers:
            answer = classifier.apply(in_features)
            if answer in answers:
                answers[answer] += 1
            else:
                answers[answer] = 1
        ranked_answers = sorted(answers.iteritems(),
                                key=lambda (k, v): (v, k),
                                reverse=True)
        return ranked_answers[0][0]

    def compute_weighted_error(self, in_labels, in_answers):
        error = 0.
        w_sum = sum(self.weights)
        for ind in xrange(len(in_labels)):
            error += (in_answers[ind] != in_labels[ind]) * self.weights[ind] / w_sum
        return error

    def update_weights(self, in_labels, in_answers, in_alpha):
        for ind in xrange(len(in_labels)):
            self.weights[ind] *= exp(in_alpha * (in_answers[ind] != in_labels[ind]))

    def normalize_weights(self):
        w_sum = sum(self.weights)
        for ind in xrange(len(self.weights)):
            self.weights[ind] /= w_sum

    def get_initial_weights(self, in_labels):
        weight = 1 / float(len(in_labels))
        result = []
        for i in xrange(len(in_labels)):
            result.append(weight)
        return result
As you can see, it is just a simple AdaBoost (I instantiated it with in_base_type = tree_learner, in_multi_type = one_against_one) and it worked the same way no matter how many base classifiers were engaged. It just acted as one multiclass decision tree.
Then I made a hack: on each iteration I drew a random sample of objects according to their weights and trained the classifier on that subset without any weights. And that worked as it was supposed to.
The default tree criterion, namely information gain, does not take the weights into account. If you know of a formula which would do it, I'll implement it.
In the meantime, using neg_z1_loss will do it correctly. By the way, there was a slight bug in that implementation, so you will need to use the most current GitHub master.
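For reference, the weighted-resampling workaround described in the question can be sketched with plain numpy (the function name and interface here are made up for illustration):

import numpy as np

def weighted_resample(features, labels, weights, rng=None):
    """Draw a bootstrap sample of the training set with probability
    proportional to the current boosting weights."""
    rng = np.random.RandomState() if rng is None else rng
    weights = np.asarray(weights, dtype=float)
    idx = rng.choice(len(labels), size=len(labels), replace=True,
                     p=weights / weights.sum())
    return [features[i] for i in idx], [labels[i] for i in idx]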