I am trying to implement the KMeans algorithm as a class. After writing all the code I tested it by creating an object and training it on the dataset, but I get the following error:
self.distances = np.zeros((self.N, self.n_clusters))
TypeError: only integer scalar arrays can be converted to a scalar index
from abc import ABC, abstractmethod
import numpy as np

class KMeansInterface(ABC):
    @abstractmethod
    def fit(self, X):
        pass

    @abstractmethod
    def predict(self, X):
        pass

class Kmeans(KMeansInterface):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.labels = None
        self.distances = None
        self.N = X.shape[0]
        self.data_points = X
        self.prev_labels = None

    def choose_clusters(self, k, X):
        self.n_clusters = k
        size = X.shape[0]
        numbers = np.random.choice(size, k, replace=False)
        return X[numbers]

    def calculate_sum_squared_distance(self, data_points, clusters):
        self.distances = np.zeros((self.N, self.n_clusters))
        for i, centroid in enumerate(clusters):
            distance = np.sum(np.square(data_points - centroid), axis=1)
            self.distances[:, i] = distance
        return self.distances

    def calculate_closest_clusters(self, distances):
        self.closest_clusters = np.argmin(self.distances, axis=1)
        return self.closest_clusters

    def update_clusters(self, labels, data_points):
        new_clusters = []
        for i in range(self.n_clusters):
            points_in_cluster = data_points[labels == i]
            values = np.mean(points_in_cluster, axis=0)
            new_clusters.append(values)
        return np.array(new_clusters)

    def fit(self, X):
        self.n_clusters = self.choose_clusters(self.n_clusters, X)
        for i in range(20000):
            self.distances = self.calculate_sum_squared_distance(X, self.n_clusters)
            self.labels = self.calculate_closest_clusters(self.distances)
            self.new_cluster = self.update_clusters(self.labels, X)
            if self.prev_labels is not None:
                if np.all(self.prev_labels == self.labels):
                    break
            self.prev_labels = self.labels
            fitted = True
        return self.labels

    def predict(self, X, clusters):
        self.distances = self.calculate_sum_squared_distance(X, clusters)
        self.labels = self.calculate_closest_clusters(self.distances)
        return self.labels

kmean_object = Kmeans(2)
kmean_object.fit(X)
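The likely cause: in fit, the line self.n_clusters = self.choose_clusters(self.n_clusters, X) overwrites the integer cluster count with the array of initial centroids, so the next call to calculate_sum_squared_distance runs np.zeros((self.N, self.n_clusters)) with an array where an integer dimension is expected, which raises exactly this TypeError. The constructor also reads X, which is not one of its parameters. A minimal sketch of a fix that keeps the centroids in a separate attribute (self.centroids is my own name; the remaining methods can stay as in the question):

class Kmeans(KMeansInterface):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters          # stays an integer
        self.centroids = None                 # set inside fit
        self.labels = None
        self.prev_labels = None

    def fit(self, X):
        self.N = X.shape[0]                   # take X from fit, not from __init__
        self.centroids = self.choose_clusters(self.n_clusters, X)
        for _ in range(20000):
            distances = self.calculate_sum_squared_distance(X, self.centroids)
            self.labels = self.calculate_closest_clusters(distances)
            self.centroids = self.update_clusters(self.labels, X)
            if self.prev_labels is not None and np.all(self.prev_labels == self.labels):
                break
            self.prev_labels = self.labels
        return self.labels

With n_clusters left as an integer, np.zeros((self.N, self.n_clusters)) allocates the intended (N, k) distance matrix.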
I wanted to put all of the custom transformations I apply to my data into a pipeline. I thought I could use it as pipe.fit_transform(X) to transform X before feeding it to a model, and that I could also append the model itself to the pipeline and use everything as one estimator via pipe.steps.append(('model', self.model)).
Unfortunately, after everything was built I noticed that I get different results when I transform the data first and then use it directly in a model, versus doing everything in one pipeline. Has anyone experienced anything like this?
Adding code:
# Base pipeline to be used
BASE_PIPE = Pipeline([
    ('dim_increase_num', data_num_mix()),
    ('dim_increase_cat', data_cat_mix()),
    ('start', data_get_dummies()),
    ('dm_correlation', data_x_corr_()),
    ('scaler', DFStandardScaler()),
    ('column_ectraction', ColumnExtractor(columns_catboost)),
])
class base_model_class:
    def fit_predict(self, X_train: pd.DataFrame = X_train, y_train: pd.Series = y_train, X_test: pd.DataFrame = X_test):
        return self.fit(X_train, y_train).predict(X_test)

    def evaluate(self, X: pd.DataFrame = X, y: pd.Series = y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
        y_pred = self.fit(X_train, y_train).predict(X_test)
        result = r2_score(y_test, y_pred)
        return result

class model_linear_regression(base_model_class):
    def __init__(self, pipe=None, inverse=False):
        self.name = 'Linear_Regression'
        self.model = LinearRegression()

        if pipe is None:
            self.pipe = Pipeline([('model', self.model)])
        else:
            self.pipe = deepcopy(pipe)
            self.pipe.steps.append(('model', self.model))

        if inverse:
            self.pipe = TransformedTargetRegressor(regressor=self.pipe,
                                                   func=np.log1p,
                                                   inverse_func=np.expm1)

    def fit(self, X: pd.DataFrame = X_train, y: pd.Series = y_train):
        self.pipe.fit(X, y)
        return self

    def predict(self, X: pd.DataFrame = X_test):
        y_pred = self.pipe.predict(X)
        return y_pred
And then, using everything together gives different R2 scores:
Xx=BASE_PIPE.fit_transform(X)
model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974
model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e+22
EDIT:
providing all the steps used in the pipeline:
class data_num_mix(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list = NUMERIC_FEATURES):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        self.frames = [X_]
        for col in self.columns:
            A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x > 0 else -np.sqrt(-x)))
            A = A.rename(columns={col: col+'^s'})
            self.frames += [A]
            B = pd.DataFrame(X_[col] * X_[col])
            B = B.rename(columns={col: col+'^2'})
            self.frames += [B]
        return pd.concat(self.frames, axis=1)
class data_cat_mix(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list = CATEGORICAL_FEATURES):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None) -> pd.DataFrame:
        X_ = X.copy()
        for col in self.columns:
            df_col_count = X_[col].value_counts().to_frame().reset_index()
            df_col_count.columns = ["var_name", "var_count"]
            df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()
            X_['C_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
            X_['F_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
        return X_
class data_get_dummies(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list = CATEGORICAL_FEATURES):
        self.columns = columns
        self.encoder = make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),
            remainder='passthrough'
        )

    def fit(self, X, y=None):
        self.encoder.fit(X)
        return self

    def transform(self, X, y=None) -> pd.DataFrame:
        X_ = X.copy()
        encoder_columns = self.encoder.get_feature_names_out()
        fixed_columns = [x.replace('onehotencoder__', '').replace('remainder__', '') for x in encoder_columns]
        df_temp = pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
        return df_temp
class data_x_corr(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list = NUMERIC_FEATURES_, corr_val: float = 0.95):
        self.columns = columns
        self.corr_val = corr_val

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # prepare numeric df
        X_ = X.copy()
        x = X_[self.columns]
        corr_matrix = x.corr(method='spearman')
        iters = range(len(corr_matrix.columns) - 1)
        drop_cols = []
        # Iterates through Correlation Matrix Table to find correlated columns
        for i in iters:
            for j in range(i):
                item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
                col = item.columns
                row = item.index
                val = item.values
                if val >= self.corr_val:
                    drop_cols.append(i)
        drops = sorted(set(drop_cols))[::-1]
        # Drops the correlated columns
        for i in drops:
            col = x.iloc[:, (i+1):(i+2)].columns.values
            X_ = X_.drop(col, axis=1)
        return X_
class DFStandardScaler(TransformerMixin):
    # StandardScaler but for pandas DataFrames
    def __init__(self):
        self.ss = None
        self.mean_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X)
        self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
        self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
        return self

    def transform(self, X) -> pd.DataFrame:
        # assumes X is a DataFrame
        Xss = self.ss.transform(X)
        Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return Xscaled

    def __str__(self):
        return "DF_StandardScaler"

    def __repr__(self):
        return "DF_StandardScaler"
class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = X[self.cols]
        return Xcols
The one transformer that stands out to me is data_cat_mix, specifically the count-of-level columns. When the counts are computed on train+test together they are consistent (but leak test information); when they are computed separately, the values in the training set will generally be much higher (simply because it is about three times larger), so the model doesn't really learn how to treat them in the test set.
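One way to make the two setups behave the same (and to avoid the leakage) is to learn the count/frequency mapping in fit and only apply it in transform, so train and test rows are encoded with the same numbers. A minimal sketch of such a variant of data_cat_mix; the class name and the choice of 0 for unseen categories are mine:

class data_cat_mix_fitted(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list = CATEGORICAL_FEATURES):
        self.columns = columns

    def fit(self, X, y=None):
        # learn the count/frequency maps on the training data only
        self.count_maps_ = {}
        self.freq_maps_ = {}
        for col in self.columns:
            counts = X[col].value_counts()
            self.count_maps_[col] = counts
            self.freq_maps_[col] = counts / counts.sum()
        return self

    def transform(self, X, y=None) -> pd.DataFrame:
        X_ = X.copy()
        for col in self.columns:
            # unseen categories get 0 rather than NaN
            X_['C_' + col] = X_[col].map(self.count_maps_[col]).fillna(0)
            X_['F_' + col] = X_[col].map(self.freq_maps_[col]).fillna(0)
        return X_

With the mapping learned once in fit, the train and test rows inside the pipeline are encoded consistently, which removes the train/test inconsistency described above.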
I followed the pseudocode for k-means clustering to write this code. It gives different answers when the cluster centroids are initialized with different values, and none of those answers are correct. Can you help me, please?
I tested with 15 nodes, tolerance = 0.00001 and iterations = 100000.
Thanks in advance.
import math
import numpy as np

class kMeans:
    def __init__(self, coordinates, tolerance, iter, nof):
        self.grid = coordinates
        self.N = coordinates.shape[0]
        self.t = tolerance
        self.nof = nof
        self.f = None

    def kMeans(self, nof):
        assign = [0]*self.N
        self.fac = np.empty([nof, 2])
        for i in range(nof):
            for j in range(2):
                self.fac[i, j] = self.grid[i+10, j]
        for itr in range(iter):
            for n in range(self.N):
                distance = [0]*nof
                for f in range(nof):
                    distance[f] = math.sqrt((self.grid[n, 0]-self.fac[f, 0])**2 + (self.grid[n, 1]-self.fac[f, 1])**2)
                assign[n] = np.argmin(distance)
            for fa in range(nof):
                l = []
                x, y = 0, 0
                for asg in range(self.N):
                    if fa == assign[asg]:
                        l.append(asg)
                x = np.mean(self.grid[l, 0])
                y = np.mean(self.grid[l, 1])
                if abs(x-self.fac[fa, 0]) >= self.t:
                    self.fac[fa, 0] = x
                if abs(y-self.fac[fa, 1]) >= self.t:
                    self.fac[fa, 1] = y
                    continue
            print('dist:', distance)
            print('assign:', assign)
            print('fac:', self.fac)
            print('locate:', self.grid[l, 1])
        self.f = self.fac
        return self.fac
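For comparison, here is a compact NumPy sketch of the textbook k-means loop the pseudocode describes: assign every point to its nearest centroid, recompute each centroid as the mean of its assigned points, and stop once no centroid moves by more than the tolerance (the function and variable names are mine, not from the question):

import numpy as np

def kmeans(points, k, tol=1e-5, max_iter=100000, seed=0):
    rng = np.random.default_rng(seed)
    centroids = points[rng.choice(len(points), k, replace=False)]
    for _ in range(max_iter):
        # distance of every point to every centroid, shape (N, k)
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        new_centroids = np.array([
            points[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        if np.all(np.abs(new_centroids - centroids) < tol):
            return new_centroids, labels
        centroids = new_centroids
    return centroids, labels

Comparing against a reference like this (or sklearn.cluster.KMeans) helps separate genuine sensitivity to initialization, which k-means does have, from bugs in the update loop.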
I was following this tutorial and after creating the classes I still cannot import the desired module. The code I used:
import numpy as np
import sys

class ForwardEuler:
    def __init__(self, f):
        # test that f is a function
        if not callable(f):
            raise TypeError('f is %s, not a function' % type(f))
        self.f = f

    def set_initial_condition(self, U0):
        self.U0 = float(U0)

    def solve(self, time_points):
        """Compute u for t values in time_points list."""
        self.t = np.asarray(time_points)
        self.u = np.zeros(len(time_points))
        self.u[0] = self.U0
        for k in range(len(self.t)-1):
            self.k = k
            self.u[k+1] = self.advance()
        return self.u, self.t

    def advance(self):
        """Advance the solution one time step."""
        u, f, k, t = self.u, self.f, self.k, self.t
        dt = t[k+1] - t[k]
        unew = u[k] + dt*f(u[k], t[k])
        return unew

class ODESolver:
    def __init__(self, f):
        self.f = f

    def set_initial_condition(self, U0):
        self.U0 = float(U0)

    def solve(self, time_points):
        self.t = np.asarray(time_points)
        self.u = np.zeros(len(self.t))
        # Assume that self.t[0] corresponds to self.U0
        self.u[0] = self.U0
        # Time loop
        for k in range(len(self.t)-1):
            self.k = k
            self.u[k+1] = self.advance()
        return self.u, self.t

    def advance(self):
        """Advance solution one time step."""
        raise NotImplementedError  # to be implemented in subclasses

class ForwardEuler(ODESolver):
    def advance(self):
        u, f, k, t = self.u, self.f, self.k, self.t
        dt = t[k+1] - t[k]
        unew = u[k] + dt*f(u[k], t[k])
        return unew
Now I want to do from ODESolver import ForwardEuler, but there is no module named ODESolver. How do I create it? I supposed it had something to do with if __name__ == '__main__': and the classes underneath, but that didn't work either.
The name of the .py file where you are writing this code is the name of the module, so you must name it ODESolver.py. Then you can do:
from ODESolver import ForwardEuler
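In other words, a module is just a .py file, and the import looks up a file with that name on the path. A minimal sketch of the layout, assuming the second script sits in the same directory (the file name main.py and the test ODE are arbitrary):

# ODESolver.py -- contains the ODESolver and ForwardEuler classes from above

# main.py -- any other script in the same directory
from ODESolver import ForwardEuler

solver = ForwardEuler(lambda u, t: -u)   # solve du/dt = -u
solver.set_initial_condition(1.0)
u, t = solver.solve([0.0, 0.1, 0.2, 0.3])
print(u)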
Below is a simple Pyomo script using the decorator syntax. I would like to understand how to use this syntax within a class, in this case inside Model.
Non-class version:
from pyomo.environ import *
import random

random.seed(1000)

model = AbstractModel()
model.N = Param(within=PositiveIntegers)
model.P = Param(within=RangeSet(1, model.N))
model.M = Param(within=PositiveIntegers)
model.Locations = RangeSet(1, model.N)
model.Customers = RangeSet(1, model.M)
model.d = Param(
    model.Locations,
    model.Customers,
    initialize=lambda n, m, model: random.uniform(1.0, 2.0),
    within=Reals,
)
model.x = Var(model.Locations, model.Customers, bounds=(0.0, 1.0))
model.y = Var(model.Locations, within=Binary)

@model.Objective()
def obj(model):
    return sum(
        model.d[n, m] * model.x[n, m] for n in model.Locations for m in model.Customers
    )

@model.Constraint(model.Customers)
def single_x(model, m):
    return (sum(model.x[n, m] for n in model.Locations), 1.0)

@model.Constraint(model.Locations, model.Customers)
def bound_y(model, n, m):
    return model.x[n, m] - model.y[n] <= 0.0

@model.Constraint()
def num_facilities(model):
    return sum(model.y[n] for n in model.Locations) == model.P
Decorator version within a class that doesn't work:
from pyomo.environ import *
import random

random.seed(1000)

class Model:
    def __init__(self):
        self.model = AbstractModel()
        self.model.N = Param(within=PositiveIntegers)
        self.model.P = Param(within=RangeSet(1, self.model.N))
        self.model.M = Param(within=PositiveIntegers)
        self.model.Locations = RangeSet(1, self.model.N)
        self.model.Customers = RangeSet(1, self.model.M)
        self.model.d = Param(
            self.model.Locations,
            self.model.Customers,
            initialize=lambda n, m, model: random.uniform(1.0, 2.0),
            within=Reals,
        )
        self.model.x = Var(
            self.model.Locations, self.model.Customers, bounds=(0.0, 1.0)
        )
        self.model.y = Var(self.model.Locations, within=Binary)

    @model.Objective()
    def obj(model):
        return sum(
            model.d[n, m] * model.x[n, m]
            for n in model.Locations
            for m in model.Customers
        )

    @model.Constraint(model.Customers)
    def single_x(model, m):
        return (sum(model.x[n, m] for n in model.Locations), 1.0)

    @model.Constraint(model.Locations, model.Customers)
    def bound_y(model, n, m):
        return model.x[n, m] - model.y[n] <= 0.0

    @model.Constraint()
    def num_facilities(model):
        return sum(model.y[n] for n in model.Locations) == model.P
I'm not able to help you on this, I just have a few questions:
Do you know if the use of @model.Objective() (same for Constraint etc.) is documented somewhere? I didn't know it existed, and it's awesome.
Why do you want your "function rules" to be methods of the class? Couldn't you define them as functions within the __init__ method?
I guess what I'm missing is the benefit of using a class in the first place.
If you are just trying to wrap the model construction somehow, then a better approach is using a function:
def create_model():
    model = AbstractModel()
    ...

    @model.Constraint()
    def some_rule_function(model):
        ...

    ...
    return model
EDIT: if you really want to wrap everything into a class:

class Model:
    def __init__(self, model):
        self.model = model
    # alternative constructor:
    # def __init__(self):
    #     self.model = create_model()

    def construct(self, data):
        # get concrete model
        self.model = self.model.create_instance(data)

    def run(self, solver, **kwargs):
        with pe.SolverFactory(solver) as solver:
            solver.solve(self.model, **kwargs)

    def construct_and_run(self, data, solver, **kwargs):
        self.construct(data)
        self.run(solver, **kwargs)

    # other behavior you want to add to the class

example usage:

model = Model(create_model())
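Continuing that example with a hypothetical data file and solver (both names are placeholders for whatever you actually have installed):

model.construct("facility.dat")   # builds the concrete instance from the data file
model.run("glpk", tee=True)       # solves it with an installed solver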
Trying to answer your direct question, here's something that seems to work for me. My interpretation is that since your model is called self.model, the decorators should also match that.
Note that I used s as the first argument in the constraint method definitions just to see if it worked, but it could also be model or whatever you want to call it.
import random
import pyomo.environ as pyo

class Model:
    def __init__(self):
        self.model = pyo.AbstractModel()
        self.model.N = pyo.Param(initialize=5, within=pyo.PositiveIntegers)
        self.model.P = pyo.Param(initialize=3, within=pyo.RangeSet(1, self.model.N))
        self.model.M = pyo.Param(initialize=3, within=pyo.PositiveIntegers)
        self.model.Locations = pyo.RangeSet(1, self.model.N)
        self.model.Customers = pyo.RangeSet(1, self.model.M)
        self.model.d = pyo.Param(
            self.model.Locations,
            self.model.Customers,
            initialize=lambda n, m, model: random.uniform(1.0, 2.0),
            within=pyo.Reals,
        )
        self.model.x = pyo.Var(
            self.model.Locations, self.model.Customers, bounds=(0.0, 1.0)
        )
        self.model.y = pyo.Var(self.model.Locations, within=pyo.Binary)

        @self.model.Objective()
        def obj(s):
            return sum(
                s.d[n, m] * s.x[n, m]
                for n in s.Locations
                for m in s.Customers
            )

        @self.model.Constraint(self.model.Customers)
        def single_x(s, m):
            return (sum(s.x[n, m] for n in s.Locations), 1.0)

        @self.model.Constraint(self.model.Locations, self.model.Customers)
        def bound_y(s, n, m):
            return s.x[n, m] - s.y[n] <= 0.0

        @self.model.Constraint()
        def num_facilities(s):
            return sum(s.y[n] for n in s.Locations) == s.P
You would then be able to instantiate the model with model = Model(), though annoyingly (at least to me), all your Pyomo model components will be within the attribute model.model (e.g., model.model.P).
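For example, a quick check of that nesting could look like this; create_instance() needs no data file here because every Param has an initialize value:

m = Model()
instance = m.model.create_instance()   # the Pyomo model sits under the .model attribute
print(pyo.value(instance.P))           # -> 3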
What I've done before to make the naming cleaner is to inherit from AbstractModel (though the other answer suggests that may not be good practice):
import random
import pyomo.environ as pyo
from pyomo.core.base.PyomoModel import AbstractModel

class Model(AbstractModel):
    def __init__(self):
        AbstractModel.__init__(self)
        self.N = pyo.Param(initialize=5, within=pyo.PositiveIntegers)
        self.P = pyo.Param(initialize=3, within=pyo.RangeSet(1, self.N))
        self.M = pyo.Param(initialize=3, within=pyo.PositiveIntegers)
        self.Locations = pyo.RangeSet(1, self.N)
        self.Customers = pyo.RangeSet(1, self.M)
        self.d = pyo.Param(
            self.Locations,
            self.Customers,
            initialize=lambda n, m, model: random.uniform(1.0, 2.0),
            within=pyo.Reals,
        )
        self.x = pyo.Var(
            self.Locations, self.Customers, bounds=(0.0, 1.0)
        )
        self.y = pyo.Var(self.Locations, within=pyo.Binary)

        @self.Objective()
        def obj(s):
            return sum(
                s.d[n, m] * s.x[n, m]
                for n in s.Locations
                for m in s.Customers
            )

        @self.Constraint(self.Customers)
        def single_x(s, m):
            return (sum(s.x[n, m] for n in s.Locations), 1.0)

        @self.Constraint(self.Locations, self.Customers)
        def bound_y(s, n, m):
            return s.x[n, m] - s.y[n] <= 0.0

        @self.Constraint()
        def num_facilities(s):
            return sum(s.y[n] for n in s.Locations) == s.P
I'm trying to set a class property of a cell contained in the multidimensional array Board.grid. I get the following error message:
File "newbs1.py", line 133, in get_and_set_board_item
self.grid[x, y].set_cell(value, secondary)
File "newbs1.py", line 129, in __getitem__
return self.grid[x][y]
File "newbs1.py", line 128, in __getitem__
x, y = tup
TypeError: 'int' object is not iterable
The idea of passing x and y the way I do came from a post by someone else; it fixed their problem, but it hasn't worked for me.
class Board(object):
    def __init__(self):
        self.grid = []
        self.width = 10
        self.height = 10

    def __getitem__(self, tup):
        x, y = tup
        return self.grid[x][y]

    def get_and_set_board_item(self, x, y, value, secondary):
        print(x, y, value, secondary)
        self.grid[(x, y)].set_cell(value, secondary)

class Cell():
    def __init__(self):
        self.is_ship = False
        self.is_hidden = False
        self.ship_symbol = ""

    def set_cell(self, value, secondary):
        if secondary is None:
            self.is_hidden = value
        else:
            self.is_ship = value
            self.ship_symbol = secondary
I'm not sure how the rest of your code looks, but on line 133, self.grid[(x, y)].set_cell(value, secondary), it doesn't look like the element at (x, y) is a Cell.
Maybe try:
class Board(object):
    def __init__(self):
        self.grid = []
        self.width = 10
        self.height = 10

    def __getitem__(self, tup):
        x, y = tup
        return self.grid[x][y]

    def get_and_set_board_item(self, x, y, value, secondary):
        print(x, y, value, secondary)
        # add this line #
        self.grid[x][y] = Cell()  # set this to a cell type
        # ************* #
        self.grid[x][y].set_cell(value, secondary)

class Cell():
    def __init__(self):
        self.is_ship = False
        self.is_hidden = False
        self.ship_symbol = ""

    def set_cell(self, value, secondary):
        if secondary is None:
            self.is_hidden = value
        else:
            self.is_ship = value
            self.ship_symbol = secondary
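One more thing worth checking, based only on the snippet shown: self.grid starts as an empty list and is never filled, so any indexing into it will raise before set_cell is ever reached. A minimal sketch of an __init__ that pre-populates the 10x10 grid with Cell objects, so that both self.grid[x][y] and the tuple-based Board[x, y] access work:

class Board(object):
    def __init__(self):
        self.width = 10
        self.height = 10
        # build a width x height grid of empty cells up front
        self.grid = [[Cell() for _ in range(self.height)] for _ in range(self.width)]

    def __getitem__(self, tup):
        x, y = tup
        return self.grid[x][y]

    def get_and_set_board_item(self, x, y, value, secondary):
        self[x, y].set_cell(value, secondary)   # routed through __getitem__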