I am trying to make an app using Streamlit. Inside the script there is a preprocessing step using scikit-learn's MinMaxScaler, but after the transformation it returns all values as zero. What's wrong with my code?
Here is part of the script:
import streamlit as st
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# FC, SL, FA and area are collected from widgets earlier in the script
contract = ['Proyek dibawah 100M', 'Proyek 100M-150M', 'Proyek 150M-500M', 'Proyek diatas 500M']
project_contract = st.selectbox("Select your project contract", contract)
input_spec = pd.DataFrame(columns=['FC', 'SL', 'FA'], data=[[FC, SL, FA]])
input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                   'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'])
if area == 'Jakarta':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[1, 0, 0, 0, 0, 0]])
elif area == 'Jawa':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[0, 1, 0, 0, 0, 0]])
elif area == 'Kalimantan':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[0, 0, 1, 0, 0, 0]])
elif area == 'Papua':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[0, 0, 0, 1, 0, 0]])
elif area == 'Sulawesi':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[0, 0, 0, 0, 1, 0]])
elif area == 'Sumatera':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[0, 0, 0, 0, 0, 1]])
elif area == 'Bali & Nusa Tenggara':
    input_area = pd.DataFrame(columns=['area_JAKARTA', 'area_JAWA', 'area_KALIMANTAN',
                                       'area_PAPUA', 'area_SULAWESI', 'area_SUMATERA'], data=[[0, 0, 0, 0, 0, 0]])
if project_contract == 'Proyek dibawah 100M':
    input_project = pd.DataFrame(columns=['project_contract_150M-500M', 'project_contract_above 500M',
                                          'project_contract_below 100M'], data=[[0, 0, 1]])
elif project_contract == 'Proyek 150M-500M':
    input_project = pd.DataFrame(columns=['project_contract_150M-500M', 'project_contract_above 500M',
                                          'project_contract_below 100M'], data=[[1, 0, 0]])
elif project_contract == 'Proyek diatas 500M':
    input_project = pd.DataFrame(columns=['project_contract_150M-500M', 'project_contract_above 500M',
                                          'project_contract_below 100M'], data=[[0, 1, 0]])
elif project_contract == 'Proyek 100M-150M':
    input_project = pd.DataFrame(columns=['project_contract_150M-500M', 'project_contract_above 500M',
                                          'project_contract_below 100M'], data=[[0, 0, 0]])
for i in input_area.columns:
    input_area[i] = input_area[i].astype('float')
for j in input_project.columns:
    input_project[j] = input_project[j].astype('float')
input_submit = pd.concat([input_spec, input_area, input_project], axis=1)
st.dataframe(input_submit)
scaler = MinMaxScaler()
input_submit_scaled = pd.DataFrame(scaler.fit_transform(input_submit.values), columns=input_submit.columns)
st.dataframe(input_submit_scaled)
[Screenshot: the input_submit dataframe]
[Screenshot: the input_submit_scaled dataframe, where every value is 0]
Your issue has nothing to do with Streamlit; it is about the scaler. When you instantiate the scaler with:
scaler = MinMaxScaler()
you should fit it on the training data once. When you later have a test sample, use the same fitted scaler to transform it, but do not fit again. Your code calls fit_transform on input_submit, a single-row DataFrame, so every column's minimum equals its maximum and each value is scaled to 0.
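A minimal sketch of that failure mode (standalone, using only NumPy and scikit-learn):
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# With a single row, each column's min equals its max, so the
# min-max formula (x - min) / (max - min) degenerates to 0.
single_row = np.array([[25.0, 12.0, 15.0]])
print(MinMaxScaler().fit_transform(single_row))  # -> [[0. 0. 0.]]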
Here is a demo.
Code
from sklearn.preprocessing import MinMaxScaler

def demo():
    train_data = [[5.0, 16.0, 7.0, 4.0, 1.0, 0.0, 0.0, 0.0],
                  [8.0, 7.0, 8.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                  [5.0, 9.0, 9.0, 0.0, 1.0, 0.0, 1.0, 1.0]]
    test_data = [[25.0, 12.0, 15.0, 0.0, 1.0, 0.0, 0.0, 0.0]]  # input_submit.values

    # Fit the scaler on the train data and transform it.
    min_max_scaler = MinMaxScaler()
    train_data_minmax = min_max_scaler.fit_transform(train_data)

    # Save the scaler to disk (import pickle first).
    # scaler_fn = 'scaler_project.pkl'
    # with open(scaler_fn, 'wb') as handle:
    #     pickle.dump(min_max_scaler, handle)

    # Scale new data using the scaler loaded from disk.
    # with open(scaler_fn, 'rb') as handle:
    #     loaded_scaler = pickle.load(handle)
    # test_data_loaded_minmax = loaded_scaler.transform(test_data)

    # Scale the single test or input sample: transform only, no fitting.
    test_data_minmax = min_max_scaler.transform(test_data)

    print(f'train_data:\n{train_data}')
    print(f'train_data_minmax:\n{train_data_minmax}\n')
    print(f'test_data:\n{test_data}')
    print(f'test_data_minmax:\n{test_data_minmax}')

demo()
Output
train_data:
[[5.0, 16.0, 7.0, 4.0, 1.0, 0.0, 0.0, 0.0], [8.0, 7.0, 8.0, 1.0, 1.0, 0.0, 0.0, 0.0], [5.0, 9.0, 9.0, 0.0, 1.0, 0.0, 1.0, 1.0]]
train_data_minmax:
[[0.         1.         0.         1.         0.         0.         0.         0.        ]
 [1.         0.         0.5        0.25       0.         0.         0.         0.        ]
 [0.         0.22222222 1.         0.         0.         0.         1.         1.        ]]
test_data:
[[25.0, 12.0, 15.0, 0.0, 1.0, 0.0, 0.0, 0.0]]
test_data_minmax:
[[6.66666667 0.55555556 4.         0.         0.         0.         0.         0.        ]]
Not all scaled input data values are zero.
Reference
https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range
I need to pass an offset pointer to glVertexAttribPointer in Python. Currently this is what comes out of Python's console window when an instance of the Box class is created:
C:\Users\phill\PycharmProjects\texturebox\venv\Scripts\python.exe C:\Users\phill\PycharmProjects\texturebox\main.py
Traceback (most recent call last):
File "C:\Users\phill\PycharmProjects\texturebox\main.py", line 29, in <module>
main()
File "C:\Users\phill\PycharmProjects\texturebox\main.py", line 18, in main
m_box = Box()
^^^^^
File "C:\Users\phill\PycharmProjects\texturebox\Box.py", line 60, in __init__
array_colors_offset(*vertices_list)
IndexError: invalid index
Process finished with exit code 1
Does anyone know how an offset pointer might be passed in Python?
The corresponding C code for the (void *) looks like this:
// color attribute
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 8 * sizeof(float), (void*)(3 * sizeof(float)));
glEnableVertexAttribArray(1);
Box.py:
from OpenGL import GL as GL
import ctypes

class Box:
    def __init__(self):
        # 3 floats for vertices x, y, z; 3 floats for color R, G, B; 2 floats for tex co-ords
        vertices_list = [
            0.5, 0.5, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,
            0.5, -0.5, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            -0.5, -0.5, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
            -0.5, 0.5, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0
        ]
        indices_list = [
            0, 1, 3,
            1, 2, 3
        ]
        self.EBO = None
        self.VBO = None
        self.VAO = None
        self.VAO = GL.glGenVertexArrays(1)
        self.VBO = GL.glGenBuffers(1)
        self.EBO = GL.glGenBuffers(1)
        GL.glBindVertexArray(self.VAO)
        GL.glBindBuffer(GL.GL_ARRAY_BUFFER, self.VBO)
        array_type_vertices = (GL.GLfloat * len(vertices_list))
        GL.glBufferData(GL.GL_ARRAY_BUFFER, len(vertices_list) * ctypes.sizeof(ctypes.c_float),
                        array_type_vertices(*vertices_list), GL.GL_STATIC_DRAW)
        GL.glBindBuffer(GL.GL_ELEMENT_ARRAY_BUFFER, self.EBO)
        array_type_indices = (GL.GLint * len(indices_list))
        GL.glBufferData(GL.GL_ELEMENT_ARRAY_BUFFER, len(indices_list) * ctypes.sizeof(ctypes.c_ulong),
                        array_type_indices(*indices_list), GL.GL_STATIC_DRAW)
        GL.glVertexAttribPointer(
            0,
            3,
            GL.GL_FLOAT,
            False,
            8 * ctypes.sizeof(ctypes.c_float),
            None
        )
        GL.glEnableVertexAttribArray(0)
        # attempting to pass the pointer
        array_colors_offset = (GL.GLfloat * 3)
        GL.glVertexAttribPointer(1,
                                 3,
                                 GL.GL_FLOAT,
                                 False,
                                 8 * ctypes.sizeof(ctypes.c_float),
                                 array_colors_offset(*vertices_list)
                                 )
        GL.glEnableVertexAttribArray(1)
        array_tex_offset = (GL.GLfloat * 6)
        GL.glVertexAttribPointer(2,
                                 2,
                                 GL.GL_FLOAT,
                                 GL.GL_FALSE,
                                 8 * ctypes.sizeof(ctypes.c_float),
                                 array_tex_offset(*vertices_list)
                                 )
        print("Box()")
I started with code from GitLab's OpenGL Python examples. The basic triangle example does not show how to pass an offset pointer to glVertexAttribPointer, yet there is an offset to pass as a (void *) according to the C code I am porting to Python.
(GL.GLfloat * 6)(*vertices_list) does not do what you expect: it constructs a new 6-element float array, not an offset into your buffer.
See pyopengl, glVertexAttribPointer. The last argument must be a void pointer. You can use ctypes.c_void_p to create one, e.g.:
GL.glVertexAttribPointer(1,
                         3,
                         GL.GL_FLOAT,
                         False,
                         8 * ctypes.sizeof(ctypes.c_float),
                         ctypes.c_void_p(3 * ctypes.sizeof(ctypes.c_float))
                         )
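For completeness, the texture-coordinate attribute at location 2 would get the analogous treatment; a sketch assuming the same interleaved 8-float layout (3 position + 3 color + 2 texture floats) from the question:
GL.glVertexAttribPointer(2,
                         2,
                         GL.GL_FLOAT,
                         False,
                         8 * ctypes.sizeof(ctypes.c_float),
                         ctypes.c_void_p(6 * ctypes.sizeof(ctypes.c_float))  # skip 3 position + 3 color floats
                         )
GL.glEnableVertexAttribArray(2)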
class Clinic:
    def __init__(self, medicationList):
        self._medicationList = {}

    def addMedication(self, medication):
        self._medicationList.append(medication)

def main():
    m1 = ('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    m2 = ('DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2)
    m3 = ('LH03', 'Lyso-X Hydrochloride', 1.00, 10.0, 1)
    cl = Clinic()
    cl.addMedication('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    print(c1)

main()
I am trying to append the medicine into _medicationList. How do I go about doing it? I want the dictionary to end up something like:
{m1: ['CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3], m2: ['DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2]}
It looks like you want to be using a list instead of a dictionary. Maybe something like this?
from typing import List, Tuple

Medication = Tuple[str, str, float, float, int]

class Clinic:
    def __init__(self):
        self._medications: List[Medication] = []

    def add_medication(self, medication: Medication) -> None:
        self._medications.append(medication)

    def __str__(self) -> str:
        return str(self._medications)

def main() -> None:
    m1 = ('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    m2 = ('DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2)
    m3 = ('LH03', 'Lyso-X Hydrochloride', 1.00, 10.0, 1)
    cl = Clinic()
    for m in m1, m2, m3:
        cl.add_medication(m)
    print(cl)

if __name__ == '__main__':
    main()
If you want it to be a dict instead of a list, maybe something more like this?
from typing import Dict, Tuple

Medication = Tuple[str, str, float, float, int]

class Clinic:
    def __init__(self):
        self._medications: Dict[str, Medication] = {}

    def add_medication(self, name: str, medication: Medication) -> None:
        self._medications[name] = medication

    def __str__(self) -> str:
        return str(self._medications)

def main() -> None:
    m1 = ('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    m2 = ('DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2)
    m3 = ('LH03', 'Lyso-X Hydrochloride', 1.00, 10.0, 1)
    cl = Clinic()
    cl.add_medication("m1", m1)
    cl.add_medication("m2", m2)
    cl.add_medication("m3", m3)
    print(cl)

if __name__ == '__main__':
    main()
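For reference, printing cl here would show something like:
{'m1': ('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3), 'm2': ('DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2), 'm3': ('LH03', 'Lyso-X Hydrochloride', 1.0, 10.0, 1)}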
class Clinic:
    def __init__(self):
        self._medication_list = {}

    def add_medication(self, medication):
        self._medication_list[medication[0]] = medication[1:]

    def get_medication_list(self):
        return self._medication_list

def main():
    m1 = ('m1', 'CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    m2 = ('m2', 'DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2)
    m3 = ('m3', 'LH03', 'Lyso-X Hydrochloride', 1.00, 10.0, 1)
    lst = [m1, m2, m3]
    cl = Clinic()
    for m in lst:
        cl.add_medication(m)
    print(cl.get_medication_list())

main()
Is this what you wanted?
1. Your self._medicationList is not a list to begin with (you assigned an empty dictionary to it instead), so .append() fails.
2. Your addMedication() function takes only one parameter, but you've provided five; you could use '*' to collect them.
Here is code that works; try running it:
class Clinic:
    def __init__(self):
        self.medicationList = dict()

    def addMedication(self, name, *medication):
        self.medicationList.update({name: list(medication)})

def main():
    m1 = ('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    m2 = ('DM01', 'Dex-2 trimethorphan-0', 0.25, 15.0, 2)
    m3 = ('LH03', 'Lyso-X Hydrochloride', 1.00, 10.0, 1)
    cl = Clinic()
    cl.addMedication('CP12', 'Chloro-6 pheniramine-X', 0.08, 4.0, 3)
    print(cl.medicationList)

main()
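Running it prints:
{'CP12': ['Chloro-6 pheniramine-X', 0.08, 4.0, 3]}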
I have the code below that converts an implied volatility into a piecewise constant volatility. In it I get an error:
for j, _vol in enumerate(_boot_vol,2):
TypeError: 'numpy.float64' object is not iterable
But neither _vol nor _boot_vol is a numpy array. I need your wisdom to resolve this, please.
Code:
import numpy as np

termstruct = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])
forwardcurve = np.array([0.0112, 0.0118, 0.0123, 0.0127, 0.0132, 0.0137, 0.0145, 0.0154, 0.0163, 0.0174])
capletvols = np.array([0.2366, 0.2487, 0.2573, 0.2564, 0.2476, 0.2376, 0.2252, 0.2246, 0.2223])
num_times = len(termstruct)
tau = np.diff(termstruct)

class computevol:
    def _caliberatevol():
        global termstruct
        global forwardcurve
        global tau
        global capletvols
        _vols = np.zeros((len(forwardcurve), len(termstruct)))
        _boot_vol = []
        for i, _capvol in enumerate(capletvols, 2):
            _boot_vol = _capvol**2 * termstruct[i-1]
            for j, _vol in enumerate(_boot_vol, 2):
                _boot_vol -= _vol**2 * tau[j-1]
            _boot_vol.append(_boot_vol, np.sqrt(_boot_vol/tau(0)))
        _vols[1:, 1] = _boot_vol
        for i in range(2, len(termstruct)):
            _vols[i:, i] = _boot_vol[:-i+1]
        return _vols
I needed to use a temporary variable in between. The first loop rebinds _boot_vol from a list to a numpy scalar (_capvol**2 * termstruct[i-1]), which is why enumerate(_boot_vol, 2) raises TypeError: 'numpy.float64' object is not iterable. Keeping that intermediate value in _temp leaves _boot_vol as a list:
class computevol:
    def _caliberatevol():
        global termstruct
        global forwardcurve
        global tau
        global capletvols
        _vols = np.zeros((len(forwardcurve), len(termstruct)))
        _boot_vol = []
        for i, _capvol in enumerate(capletvols, 2):
            _temp = _capvol**2 * termstruct[i-1]
            for j, _vol in enumerate(_boot_vol, 2):
                _temp -= _vol**2 * tau[j-1]
            _boot_vol.append(np.sqrt(_temp/tau[0]))
        _vols[1:, 1] = _boot_vol
        for i in range(2, len(termstruct)):
            _vols[i:, i] = _boot_vol[:-i+1]
        return _vols
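For clarity, a minimal standalone sketch of the rebinding that caused the original TypeError (values are illustrative):
import numpy as np

_boot_vol = []                            # starts out as a list
_boot_vol = np.float64(0.2366)**2 * 0.5   # rebinds the name to a numpy scalar
# enumerate(_boot_vol, 2) would now raise:
# TypeError: 'numpy.float64' object is not iterable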
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import RandomForest

df = pd.read_csv(r'main.csv', header=0)
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("myapp") \
    .getOrCreate()
s_df = spark.createDataFrame(df)
transformed_df = s_df.rdd.map(lambda row: LabeledPoint(row[0], Vectors.dense(row[1:])))
splits = [0.7, 0.3]
training_data, test_data = transformed_df.randomSplit(splits, 100)
model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
predictions = model.predict(test_data.map(lambda x: x.features))
When I print test_data.map(lambda x: x.features), the result is:
[DenseVector([1431500000.0, 9.3347, 79.8337, 44.6364, 194.0, 853.0, 196.9998]),
DenseVector([1431553600.0, 9.5484, 80.7409, 39.5968, 78.0, 923.0, 196.9994])....]
The numbers inside DenseVector([...]) are correct for prediction, but the result of the prediction is all zeros:
[0.0, 0.0, 0.0, 0.0, 0.0...]
I am trying to implement the code on this website to estimate what value of K I should use for my K-means clustering.
https://datasciencelab.wordpress.com/2014/01/21/selection-of-k-in-k-means-clustering-reloaded/
However, I am not having any success; in particular, I am trying to get the f(k) versus number-of-clusters-k graph, from which I can read off the ideal value of k.
My data format is as follows:
Each coordinate has 5 dimensions/variables, i.e. the data points live in a five-dimensional space.
The lists of coordinates are below; for example, the first data point has coordinates (35.38361202590826, -24.022420305129415, 0.9608968122051765, -11.700331772145386, -9.4393980963685).
Variable1 = [35.38361202590826, 3.0, 10.0, 10.04987562112089, 5.385164807134505, 24.35159132377184, 10.77032961426901, 10.816653826391967, 18.384776310850235, 14.317821063276353, 24.18677324489565, 3.0, 24.33105012119288, 8.94427190999916, 2.82842712474619, 4.123105625617661, 4.47213595499958, 13.453624047073712, 12.529964086141668, 19.4164878389476, 5.385164807134505, 5.0, 24.041630560342618, 30.083217912982647, 15.132745950421555, 1.414213562373095, 21.470910553583888, 12.649110640673516, 9.0, 9.055385138137416, 16.124515496597102, 18.027756377319946, 7.615773105863908, 4.47213595499958, 5.0, 16.124515496597102, 8.246211251235321, 3.0, 23.02172886644268, 2.23606797749979, 10.0, 13.416407864998737, 14.7648230602334, 12.649110640673516, 2.82842712474619, 9.899494936611665, 12.806248474865697, 13.0, 10.19803902718557, 10.440306508910549]
Variable2 = [-24.022420305129415, -40.0, -21.0, -36.020346285601605, -14.298541039632994, -10.225204451297113, -7.242118188905023, -10.816653826391967, -16.263455967290593, -0.9079593845004517, -5.70559779110359, -1.0, -17.426292654367874, -0.4472135954999579, -12.727922061357855, -38.32062875574061, -15.205262246998569, -13.89960053482201, -6.943355894868313, -18.43793805396085, -14.298541039632994, -8.0, -9.899494936611665, -10.537436550735357, -9.251460406371256, -1.414213562373095, -0.23287321641631115, -4.743416490252569, -10.0, -25.951408627588936, -5.457528321925173, -11.648704120729812, -15.231546211727816, -9.838699100999074, -2.2, 4.713319914389921, -3.395498750508662, -32.0, -16.59301967354925, -4.47213595499958, -3.4, -13.416407864998737, 4.944183868793753, -3.478505426185217, -21.213203435596423, -18.384776310850235, -6.871645523098667, -21.0, -5.491251783869154, -8.620436566990362]
Variable3 = [0.9608968122051765, 22.0, 21.0, 18.507691737905798, 15.412713068695306, -8.08982038917884, -0.7427813527082074, -7.211102550927978, -14.849242404917499, -0.4190581774617469, -10.170848236315095, -7.0, 1.150792911137501, -5.366563145999495, -12.727922061357855, 4.85071250072666, 9.838699100999074, -8.473553267217696, 6.065460321953928, -10.249021432229634, 4.642383454426297, -9.0, 9.899494936611665, 4.354587344310195, -8.854969246098202, -8.48528137423857, -10.292996165600954, -11.067971810589327, -30.0, -10.932721081409808, -14.6360986815266, -22.188007849009164, 0.0, -7.155417527999327, -5.4, -12.279438724331637, 19.40285000290664, -7.0, 18.938629784469825, 8.94427190999916, 3.8, -8.94427190999916, -43.549455173073746, -8.538149682454623, -11.31370849898476, 1.4142135623730951, -10.619815808425212, 12.0, 7.060180864974626, -7.854175538813441]
Variable4 = [-11.700331772145386, -8.0, -5.0, -2.9851115706299676, -10.398938937914904, -8.459406092237773, -7.242118188905023, -10.539303728279352, -21.920310216782973, -8.03194840135015, -10.791021909261136, -10.0, -9.69954025101608, -2.6832815729997477, -23.33452377915607, -7.761140001162655, -17.44133022449836, -4.980070779856015, -2.7134954071899156, -6.48933015307002, -12.441587657862476, -5.2, -18.384776310850235, -10.603918800266811, -14.604091070057484, -4.949747468305833, -1.3506646552146047, -7.905694150420948, -14.0, -29.706080514133717, -2.4806946917841692, -23.574758339572238, -3.2826608214930637, -5.813776741499453, -13.4, -4.9613893835683385, -11.884245626780316, -19.0, -5.473090258814675, -2.23606797749979, -2.0, -2.6832815729997477, -6.163297699455227, -12.01665510863984, -12.727922061357855, -12.020815280171307, -8.589556903873333, -18.53846153846154, -5.491251783869154, -4.789131426105757]
Variable5 = [-9.4393980963685, -4.0, -2.0, -0.29851115706299675, -9.84185292338375, 6.118696639531204, -6.127946159842712, -2.218800784900916, 10.606601717798213, 0.6984302957695782, 0.7442084075352507, -0.0, 3.452378733412503, 1.3416407864998738, -6.363961030678928, 6.305926250944657, -5.813776741499453, -0.4459764877482998, -0.7980868844676221, 7.673890419106611, -1.4855627054164149, 1.4, -2.8284271247461903, -2.925218979383948, 3.9649116027305387, 0.7071067811865475, 0.4191717895493601, 1.5811388300841895, -4.0, 4.748555621218401, 4.341215710622296, 4.714951667914447, -5.120950881529179, 4.919349550499537, 6.2, 0.6201736729460423, -6.305926250944657, -9.0, -6.168085847235585, 0.0, -1.0, 1.3416407864998738, 3.3186987612451224, 4.427188724235731, 4.242640687119285, 4.949747468305833, 5.9346029517670305, 2.3076923076923075, -3.1378581622109447, 1.436739427831727]
I am able to use scikit-learn to create clusters from these coordinates, but I am interested in finding the optimal value of k; scikit-learn does not offer a way to estimate it with this technique (or any technique, as far as I am aware).
You can try the code in the last comment by Monte Shaffer.
Here's a simplified version:
import numpy as np
import random
from numpy import zeros

class KMeansFK():
    def __init__(self, K, X):
        self.K = K
        self.X = X
        self.N = len(X)
        self.mu = None
        self.clusters = None
        self.method = None

    def _cluster_points(self):
        mu = self.mu
        clusters = {}
        for x in self.X:
            bestmukey = min([(i[0], np.linalg.norm(x - mu[i[0]]))
                             for i in enumerate(mu)], key=lambda t: t[1])[0]
            try:
                clusters[bestmukey].append(x)
            except KeyError:
                clusters[bestmukey] = [x]
        self.clusters = clusters

    def _reevaluate_centers(self):
        clusters = self.clusters
        newmu = []
        keys = sorted(self.clusters.keys())
        for k in keys:
            newmu.append(np.mean(clusters[k], axis=0))
        self.mu = newmu

    def _has_converged(self):
        K = len(self.oldmu)
        return (set([tuple(a) for a in self.mu]) ==
                set([tuple(a) for a in self.oldmu])
                and len(set([tuple(a) for a in self.mu])) == K)

    def find_centers(self, K, method='random'):
        self.method = method
        X = self.X
        K = self.K
        # https://stackoverflow.com/questions/44372231/population-must-be-a-sequence-or-set-for-dicts-use-listd
        self.oldmu = random.sample(list(X), K)
        if method != '++':
            # Initialize to K random centers
            self.mu = random.sample(list(X), K)
        while not self._has_converged():
            self.oldmu = self.mu
            # Assign all points in X to clusters
            self._cluster_points()
            # Reevaluate centers
            self._reevaluate_centers()

    def _dist_from_centers(self):
        cent = self.mu
        X = self.X
        D2 = np.array([min([np.linalg.norm(x - c)**2 for c in cent]) for x in X])
        self.D2 = D2

    def _choose_next_center(self):
        self.probs = self.D2 / self.D2.sum()
        self.cumprobs = self.probs.cumsum()
        r = random.random()
        ind = np.where(self.cumprobs >= r)[0][0]
        return self.X[ind]

    def init_centers(self, K):
        self.K = K
        # self.mu = random.sample(self.X, 1)
        self.mu = random.sample(list(self.X), 1)
        while len(self.mu) < self.K:
            self._dist_from_centers()
            self.mu.append(self._choose_next_center())

    def get_ak(self, k, Nd):
        if k == 2:
            return 1 - 3.0 / (4.0 * Nd)
        else:
            previous_a = self.get_ak(k - 1, Nd)
            return previous_a + (1.0 - previous_a) / 6.0

    def fK(self, thisk, Skm1=0):
        X = self.X
        Nd = len(X[0])
        self.find_centers(thisk, method='++')
        mu, clusters = self.mu, self.clusters
        Sk = sum([np.linalg.norm(mu[i] - c)**2
                  for i in range(thisk) for c in clusters[i]])
        if thisk == 1:
            fs = 1
        elif Skm1 == 0:
            fs = 1
        else:
            fs = Sk / (self.get_ak(thisk, Nd) * Skm1)
        return fs, Sk

    def run(self, maxk):
        ks = range(1, maxk)
        fs = zeros(len(ks))
        Wks, Wkbs, sks = zeros(len(ks) + 1), zeros(len(ks) + 1), zeros(len(ks) + 1)
        # Special case K=1
        self.init_centers(1)
        fs[0], Sk = self.fK(1)
        # Rest of Ks
        for k in ks[1:]:
            self.init_centers(k)
            fs[k - 1], Sk = self.fK(k, Skm1=Sk)
        self.fs = fs
And then run it on your data:
X = np.array([Variable1, Variable2, Variable3, Variable4, Variable5]).T  # transpose so each row is one 5-dimensional point
km = KMeansFK(2, X)
km.run(5)
Now km.clusters has the resulting clusters and km.fs the f(k) values.
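To get the f(k)-versus-k graph the question asks for, km.fs can be plotted directly; a minimal sketch, assuming matplotlib is available (km.run(5) fills km.fs for k = 1 to 4):
import matplotlib.pyplot as plt

ks = range(1, 5)  # matches km.run(5): fK is evaluated for k = 1..4
plt.plot(ks, km.fs, 'o-')
plt.xlabel('Number of clusters k')
plt.ylabel('f(k)')
plt.show()
In this method, the k with the smallest f(k) is usually taken as the suggested number of clusters.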