Classification Techniques in Python

I have found two pieces of code online for classification techniques. One implements Naive Bayes and the other kNN. I have used two datasets: iris.data and pima-indians-diabetes.data.
The pima-indians dataset works properly with the Naive Bayes algorithm, and iris.data works correctly with the kNN algorithm. But I want to compare the two algorithms, which is only possible when the same dataset runs through both of them.
I am attaching the Naive Bayes and kNN code with both datasets, and the respective tracebacks.
Naive Bayes with iris.data
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math

def loadCsv(filename):
    lines = csv.reader(open(filename, "rt"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0

def main():
    filename = 'E:\iris.data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print(('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print(('Accuracy: {0}%').format(accuracy))

main()
And the traceback for this is:
runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
Traceback (most recent call last):
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
    execfile(filename, namespace)
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py", line 63
    print 'Train set: ' + repr(len(trainingSet))
        ^
SyntaxError: invalid syntax
kNN with pima-indians-diabetes.data
# Example of kNN implemented from Scratch in Python
import csv
import random
import math
import operator

def loadDataset(filename, split, trainingSet=[], testSet=[]):
    with open(filename, 'rt') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        for x in range(len(dataset)-1):
            for y in range(4):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])

def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def main():
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset('E:\pima-indians-diabetes.data.csv', split, trainingSet, testSet)
    print 'Train set: ' + repr(len(trainingSet))
    print 'Test set: ' + repr(len(testSet))
    # generate predictions
    predictions = []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

main()
And the traceback is:
runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
Traceback (most recent call last):
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
    execfile(filename, namespace)
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py", line 63
    print 'Train set: ' + repr(len(trainingSet))
        ^
SyntaxError: invalid syntax
What is the problem with these two bits of code?
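For reference, here is a minimal sketch (not from the original post) of a shared loader that keeps the class label as a string instead of forcing every field through float(), so the same CSV could in principle be fed to either script. The filename and the label-in-last-column layout are assumptions:
import csv

def load_labelled_csv(filename):
    # read all non-empty rows from the CSV file
    with open(filename, 'rt') as f:
        rows = [row for row in csv.reader(f) if row]
    dataset = []
    for row in rows:
        # convert only the feature columns; keep the last column (the class label) as-is
        dataset.append([float(x) for x in row[:-1]] + [row[-1]])
    return dataset

if __name__ == '__main__':
    data = load_labelled_csv('iris.data.csv')  # assumed path
    print(len(data), 'rows loaded; first row:', data[0])
Independently of the datasets, both tracebacks point at print 'Train set: ' + ..., which is Python 2 syntax; under Python 3 the kNN script also needs print(...) calls and classVotes.items() instead of classVotes.iteritems().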

Related

Getting TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class '...Categorical'>

I am trying to run a ResNet model, which I found in a research paper, through skorch for classification. I am still learning the ways of Torch and skorch, and I'm unable to find what to fix to get this to work.
ResNet class:
class ResNet(nn.Module):
    def __init__(
        self,
        *,
        d_numerical: int,
        categories: ty.Optional[ty.List[int]],
        d_embedding: int,
        d: int,
        d_hidden_factor: float,
        n_layers: int,
        activation: str,
        normalization: str,
        hidden_dropout: float,
        residual_dropout: float,
        d_out: int,
        regression: bool,
        categorical_indicator
    ) -> None:
        super().__init__()
        # categories = None #TODO

        def make_normalization():
            return {'batchnorm': nn.BatchNorm1d, 'layernorm': nn.LayerNorm}[
                normalization[0]
            ](d)

        self.categorical_indicator = categorical_indicator  # Added
        self.regression = regression
        self.main_activation = deep.get_activation_fn(activation)
        self.last_activation = deep.get_nonglu_activation_fn(activation)
        self.residual_dropout = residual_dropout
        self.hidden_dropout = hidden_dropout

        d_in = d_numerical
        d_hidden = int(d * d_hidden_factor)
        if categories is not None:
            d_in += len(categories) * d_embedding
            category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
            self.register_buffer('category_offsets', category_offsets)
            self.category_embeddings = nn.Embedding(int(sum(categories)), d_embedding)
            nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5))
            print(f'{self.category_embeddings.weight.shape}')

        self.first_layer = nn.Linear(d_in, d)  # 1, 256
        self.layers = nn.ModuleList(
            [
                nn.ModuleDict(
                    {
                        'norm': make_normalization(),
                        'linear0': nn.Linear(
                            d, d_hidden * (2 if activation.endswith('glu') else 1)
                        ),
                        'linear1': nn.Linear(d_hidden, d),
                    }
                )
                for _ in range(n_layers)
            ]
        )
        self.last_normalization = make_normalization()
        self.head = nn.Linear(d, d_out)  # 256, 1

    def forward(self, x) -> Tensor:
        if not self.categorical_indicator is None:
            x_num = x[:, ~self.categorical_indicator].float()
            x_cat = x[:, self.categorical_indicator].long()  # TODO
        else:
            x_num = x
            x_cat = None
        x = []
        if x_num is not None:
            x.append(x_num)
        if x_cat is not None:
            x.append(
                self.category_embeddings(x_cat + self.category_offsets[None]).view(
                    x_cat.size(0), -1
                )
            )
        x = torch.cat(x, dim=-1)

        x = self.first_layer(x)
        for layer in self.layers:
            layer = ty.cast(ty.Dict[str, nn.Module], layer)
            z = x
            z = layer['norm'](z)
            z = layer['linear0'](z)
            z = self.main_activation(z)
            if self.hidden_dropout:
                z = F.dropout(z, self.hidden_dropout, self.training)
            z = layer['linear1'](z)
            if self.residual_dropout:
                z = F.dropout(z, self.residual_dropout, self.training)
            x = x + z
        x = self.last_normalization(x)
        x = self.last_activation(x)
        x = self.head(x)
        if not self.regression:
            x = x.squeeze(-1)
        return x


class InputShapeSetterResnet(skorch.callbacks.Callback):
    def __init__(self, regression=False, batch_size=None,
                 categorical_indicator=None):
        self.categorical_indicator = categorical_indicator
        self.regression = regression
        self.batch_size = batch_size

    def on_train_begin(self, net, X, y):
        print("categorical_indicator", self.categorical_indicator)
        if self.categorical_indicator is None:
            d_numerical = X.shape[1]
            categories = None
        else:
            d_numerical = X.shape[1] - sum(self.categorical_indicator)
            # categories = list((X[:, self.categorical_indicator].max(0) + 1).astype(int))
            categories = [sum(self.categorical_indicator)]
        net.set_params(module__d_numerical=d_numerical,
                       module__categories=categories,  # FIXME #lib.get_categories(X_cat),
                       module__d_out=2 if self.regression == False else 1)  # FIXME #D.info['n_classes'] if D.is_multiclass else 1,
        print("Numerical features: {}".format(d_numerical))
        print("Categories {}".format(categories))
Skorch Wrapper:
def create_resnet_skorch(id, wandb_run=None, use_checkpoints=True,
                         categorical_indicator=None, **kwargs):
    print(kwargs)
    if "verbose" not in kwargs:
        verbose = 0
    else:
        verbose = kwargs.pop("verbose")
    if "lr_scheduler" not in kwargs:
        lr_scheduler = False
    else:
        lr_scheduler = kwargs.pop("lr_scheduler")
    if "es_patience" not in kwargs.keys():
        es_patience = 40
    else:
        es_patience = kwargs.pop('es_patience')
    if "lr_patience" not in kwargs.keys():
        lr_patience = 30
    else:
        lr_patience = kwargs.pop('lr_patience')
    optimizer = kwargs.pop('optimizer')
    if optimizer == "adam":
        optimizer = Adam
    elif optimizer == "adamw":
        optimizer = AdamW
    elif optimizer == "sgd":
        optimizer = SGD
    device = kwargs.pop('device')
    if device == "cuda":  # ! only for CPU training, is cuda by default
        device = "cpu"
    batch_size = kwargs.pop('batch_size')
    callbacks = [InputShapeSetterResnet(categorical_indicator=categorical_indicator),
                 EarlyStopping(monitor="valid_loss",
                               patience=es_patience)]
    callbacks.append(EpochScoring(scoring='accuracy', name='train_accuracy', on_train=True))
    if lr_scheduler:
        callbacks.append(LRScheduler(policy=ReduceLROnPlateau, patience=lr_patience, min_lr=2e-5,
                                     factor=0.2))  # FIXME make customizable
    if use_checkpoints:
        callbacks.append(Checkpoint(dirname="skorch_cp", f_params=r"params_{}.pt".format(id), f_optimizer=None,
                                    f_criterion=None))
    if not wandb_run is None:
        callbacks.append(WandbLogger(wandb_run, save_model=False))
        callbacks.append(LearningRateLogger())
    if not categorical_indicator is None:
        categorical_indicator = torch.BoolTensor(categorical_indicator)
    mlp_skorch = NeuralNetClassifier(
        ResNet,
        # Shuffle training data on each epoch
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=optimizer,
        batch_size=max(batch_size, 1),  # if batch size is float, it will be reset during fit
        iterator_train__shuffle=True,
        module__d_numerical=1,  # will be changed when fitted
        module__categories=None,  # will be changed when fitted
        module__d_out=1,  # idem
        module__regression=False,
        module__categorical_indicator=categorical_indicator,
        verbose=verbose,
        callbacks=callbacks,
        **kwargs
    )
    return mlp_skorch
Skorch Model:
<class 'skorch.classifier.NeuralNetClassifier'>[uninitialized](
  module=<class 'tabular.bin.resnet.ResNet'>,
  module__activation=reglu,
  module__categorical_indicator=tensor([False, True, False, False, False, False, False, False]),
  module__categories=None,
  module__d=256,
  module__d_embedding=128,
  module__d_hidden_factor=2,
  module__d_numerical=1,
  module__d_out=1,
  module__hidden_dropout=0.2,
  module__n_layers=8,
  module__normalization=['batchnorm'],
  module__regression=False,
  module__residual_dropout=0.2,
)
I have 8 columns in X for training, 1 of which is a categorical column that is to be embedded through an embedding layer in the NN. From what I've found so far, that is the root of this error, since execution comes across this categorical class. But the forward method is supposed to have an embedding layer for it. Any idea what changes I might need to make?
Error stack:
Traceback (most recent call last):
File "/test.py", line 639, in <module>
model.fit(X_train, y_train)
File "/anaconda3/lib/python3.9/site-packages/skorch/classifier.py", line 142, in fit
return super(NeuralNetClassifier, self).fit(X, y, **fit_params)
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 917, in fit
self.partial_fit(X, y, **fit_params)
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 876, in partial_fit
self.fit_loop(X, y, **fit_params)
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 789, in fit_loop
self.run_single_epoch(dataset_train, training=True, prefix="train",
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 822, in run_single_epoch
for data in self.get_iterator(dataset, training=training):
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 84, in default_collate
return [default_collate(samples) for samples in transposed]
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 84, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 86, in default_collate
raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'pandas.core.arrays.categorical.Categorical'>
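One pattern that is known to trigger this collate error is handing skorch a pandas DataFrame that still contains a Categorical column, since torch's default_collate cannot batch pandas.Categorical values. Below is a hedged sketch, not the poster's code, of converting the categorical column to integer codes and passing a plain NumPy array instead; the names X_train and y_train are assumptions:
import numpy as np
import pandas as pd

def to_skorch_input(X: pd.DataFrame) -> np.ndarray:
    X = X.copy()
    for col in X.columns:
        if isinstance(X[col].dtype, pd.CategoricalDtype):
            # replace the Categorical with its integer codes; the embedding
            # lookup in ResNet.forward() still receives an index per row
            X[col] = X[col].cat.codes
    return X.to_numpy().astype(np.float32)

# usage (assumed names):
# model.fit(to_skorch_input(X_train), y_train.astype(np.int64))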

Scikit-Multiflow - Cannot take a larger sample than population when 'replace'=False

So I was trying to run the following code, where x is a feature vector with dimensions (2381,) and y is a label with dimension (1,) after being cast to a Numpy array.
from skmultiflow.meta import AdaptiveRandomForestClassifier
import numpy as np

import data

np.random.seed(1)


def main() -> None:
    dataset = data.get_full_dataset()
    metadata = data.get_metadata()
    training_batch = data.get_windows(dataset, metadata, data.get_initial_training_groups())
    streaming_batch = data.get_windows(dataset, metadata, data.get_incremental_learning_groups())
    initial_features = np.concatenate([dataset.feature_vectors for group, dataset in training_batch])
    initial_labels = np.concatenate([dataset.labels for group, dataset in training_batch])
    model = AdaptiveRandomForestClassifier()
    correct_count = 0
    n_samples = 0
    for x, y in zip(initial_features, initial_labels):
        y = np.asarray([y])
        y_prediction = model.predict(x)
        if y_prediction[0] == y:
            correct_count += 1
        model.partial_fit(x, y)
        n_samples += 1
        print(f"Accuracy: {correct_count / n_samples}")


if __name__ == "__main__":
    main()
However, I am getting the following error:
Traceback (most recent call last):
File "/home/nathan/Documents/Research/BodmasOnline/main.py", line 31, in <module>
main()
File "/home/nathan/Documents/Research/BodmasOnline/main.py", line 24, in main
model.partial_fit(x, y)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 313, in partial_fit
self._partial_fit(X[i], y[i], self.classes, weight[i])
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 328, in _partial_fit
self.ensemble[i].partial_fit(np.asarray([X]), np.asarray([y]),
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 569, in partial_fit
self.classifier.partial_fit(X, y, classes=classes, sample_weight=sample_weight)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/hoeffding_tree.py", line 394, in partial_fit
self._partial_fit(X[i], y[i], sample_weight[i])
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/hoeffding_tree.py", line 424, in _partial_fit
learning_node.learn_from_instance(X, y, sample_weight, self)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_nb_adaptive.py", line 54, in learn_from_instance
super().learn_from_instance(X, y, weight, ht)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_classification.py", line 58, in learn_from_instance
self.list_attributes = self._sample_features(get_dimensions(X)[1])
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_classification.py", line 72, in _sample_features
return self.random_state.choice(
File "mtrand.pyx", line 965, in numpy.random.mtrand.RandomState.choice
ValueError: Cannot take a larger sample than population when 'replace=False'
Can anyone help me out?
Going to answer my own question, since scikit-multiflow does not necessarily have the best documentation. The feature vector x has to have dimensions (1, n), which in this case yields (1, 2381). This can be achieved programmatically as follows:
from skmultiflow.meta import AdaptiveRandomForestClassifier
import numpy as np

import data

np.random.seed(1)


def main() -> None:
    dataset = data.get_full_dataset()
    metadata = data.get_metadata()
    training_batch = data.get_windows(dataset, metadata, data.get_initial_training_groups())
    streaming_batch = data.get_windows(dataset, metadata, data.get_incremental_learning_groups())
    initial_features = np.concatenate([dataset.feature_vectors for group, dataset in training_batch])
    initial_labels = np.concatenate([dataset.labels for group, dataset in training_batch])
    model = AdaptiveRandomForestClassifier()
    correct_count = 0
    n_samples = 0
    for x, y in zip(initial_features, initial_labels):
        x = np.expand_dims(x, axis=0)
        y = np.asarray([y])
        y_prediction = model.predict(x)
        if y_prediction[0] == y:
            correct_count += 1
        model.partial_fit(x, y)
        n_samples += 1
        print(f"Accuracy: {correct_count / n_samples}")


if __name__ == "__main__":
    main()

Error in build while using keras custom layer

I am trying to train an unsupervised classification model, for which I am using deep clustering with my model in Keras.
The code I am referring to for clustering is this.
While running the code I am getting an error in the custom layer while adding weights. Below you can see the code and the error.
import metrics
import numpy as np
from tensorflow.keras.layers import Layer, InputSpec
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from sklearn.cluster import KMeans


class ClusteringLayer(Layer):
    """
    Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
    sample belonging to each cluster. The probability is calculated with student's t-distribution.
    # Example
    ```
    model.add(ClusteringLayer(n_clusters=10))
    ```
    # Arguments
        n_clusters: number of clusters.
        weights: list of Numpy array with shape `(n_clusters, n_features)` witch represents the initial cluster centers.
        alpha: parameter in Student's t-distribution. Default to 1.0.
    # Input shape
        2D tensor with shape: `(n_samples, n_features)`.
    # Output shape
        2D tensor with shape: `(n_samples, n_clusters)`.
    """

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
        self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters')
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        """ student t-distribution, as same as used in t-SNE algorithm.
                 q_ij = 1/(1+dist(x_i, u_j)^2), then normalize it.
        Arguments:
            inputs: the variable containing data, shape=(n_samples, n_features)
        Return:
            q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
        """
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))
        return q

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        config = {'n_clusters': self.n_clusters}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Inf:
    def __init__(self, D1, D2, n_clusters):
        from tensorflow.keras.models import model_from_json
        self.n_clusters = n_clusters
        json_file = open(D1, 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        loaded_model = model_from_json(loaded_model_json)
        # load weights into new model
        loaded_model.load_weights(D2)
        print("Loaded model from disk")
        loaded_model.summary()
        self.model = loaded_model

    def create_model(self):
        hidden = self.model.get_layer(name='encoded').output
        self.encoder = Model(inputs=self.model.input, outputs=hidden)
        clustering_layer = ClusteringLayer(n_clusters=self.n_clusters)(hidden)
        self.model = Model(inputs=self.model.input, outputs=clustering_layer)
        self.model = model

    def compile(self, loss='kld', optimizer='adam'):
        self.model.compile(loss=loss, optimizer=optimizer)

    def fit(self, x, y=None, batch_size=16, maxiter=2e4, tol=1e-3, update_interval=140, save_dir='./results/temp'):
        print('Update interval', update_interval)
        save_interval = x.shape[0] / batch_size * 5
        print('Save interval', save_interval)
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        self.y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(self.y_pred)
        self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
        # Step : deep clustering
        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/dcec_log.csv', 'w')
        logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()
        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(q)  # update the auxiliary target distribution p
                # evaluate the clustering performance
                self.y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, self.y_pred), 5)
                    nmi = np.round(metrics.nmi(y, self.y_pred), 5)
                    ari = np.round(metrics.ari(y, self.y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, L=loss[0], Lc=loss[1], Lr=loss[2])
                    logwriter.writerow(logdict)
                    print('Iter', ite, ': Acc', acc, ', nmi', nmi, ', ari', ari, '; loss=', loss)
                # check stop criterion
                delta_label = np.sum(self.y_pred != y_pred_last).astype(np.float32) / self.y_pred.shape[0]
                y_pred_last = np.copy(self.y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break
            # train on batch
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(x=x[index * batch_size::],
                                                 y=[p[index * batch_size::], x[index * batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(x=x[index * batch_size:(index + 1) * batch_size],
                                                 y=[p[index * batch_size:(index + 1) * batch_size],
                                                    x[index * batch_size:(index + 1) * batch_size]])
                index += 1
            # save intermediate model
            if ite % save_interval == 0:
                # save DCEC model checkpoints
                print('saving model to:', save_dir + '/dcec_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/dcec_model_' + str(ite) + '.h5')
            ite += 1
        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/dcec_model_final.h5')
        self.model.save_weights(save_dir + '/dcec_model_final.h5')
My output layer is a dense layer with output dimension (?, 128).
I am getting the following error in the clustering layer.
File "C:/Users/u/Desktop/trained/inference.py", line 45, in build
self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters')
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 384, in add_weight
aggregation=aggregation)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\training\tracking\base.py", line 663, in _add_variable_with_custom_getter
**kwargs_for_getter)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\keras\engine\base_layer_utils.py", line 155, in make_variable
shape=variable_shape if variable_shape.rank else None)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\variables.py", line 259, in __call__
return cls._variable_v1_call(*args, **kwargs)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\variables.py", line 220, in _variable_v1_call
shape=shape)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\variables.py", line 198, in <lambda>
previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2495, in default_variable_creator
shape=shape)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\variables.py", line 263, in __call__
return super(VariableMetaclass, cls).__call__(*args, **kwargs)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 460, in __init__
shape=shape)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 604, in _init_from_args
initial_value() if init_from_fn else initial_value,
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\keras\engine\base_layer_utils.py", line 135, in <lambda>
init_val = lambda: initializer(shape, dtype=dtype)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\init_ops.py", line 533, in __call__
shape, -limit, limit, dtype, seed=self.seed)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\random_ops.py", line 239, in random_uniform
shape = _ShapeTensor(shape)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\ops\random_ops.py", line 44, in _ShapeTensor
return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\ops.py", line 1087, in convert_to_tensor
return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\ops.py", line 1145, in convert_to_tensor_v2
as_ref=False)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\ops.py", line 1224, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\constant_op.py", line 305, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\constant_op.py", line 246, in constant
allow_broadcast=True)
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\constant_op.py", line 284, in _constant_impl
allow_broadcast=allow_broadcast))
File "C:\Users\u\AppData\Local\Continuum\anaconda3\envs\test_env\lib\site-packages\tensorflow\python\framework\tensor_util.py", line 562, in make_tensor_proto
"supported type." % (type(values), values))
TypeError: Failed to convert object of type <class 'tuple'> to Tensor. Contents: (17, Dimension(128)). Consider casting elements to a supported type.
I have used an autoencoder's encoder part as the input. Following is the encoder part of the autoencoder.
ip = Input(shape=(256,256,1))
x = Conv2D(16, (3,3), padding='same')(ip)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.2)(x)
x = MaxPooling2D((2,2), padding='same')(x)
x = Flatten()(x)
x = Dense(128, name="encoded")(x)
Replace
input_dim = input_shape[1]
with
input_dim = input_shape[1].value
in the build() method of ClusteringLayer, so that input_dim will be 128 instead of Dimension(128).
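A short sketch of what the whole build() method would look like with that change, under the assumption that this is TF 1.x, where the elements of input_shape are Dimension objects rather than plain ints (on TF 2.x, input_shape[1] is already an int, and int(input_shape[-1]) works in both cases):
def build(self, input_shape):
    assert len(input_shape) == 2
    input_dim = input_shape[1].value  # Dimension(128) -> 128
    self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
    self.clusters = self.add_weight(shape=(self.n_clusters, input_dim),
                                    initializer='glorot_uniform', name='clusters')
    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
    self.built = True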
Replace
input_dim = input_shape[1].value
With
input_dim = input_shape[1]
and also Replace
if (index + 1) * batch_size > x.shape[0]:
    loss = self.model.train_on_batch(x=x[index * batch_size::], y=[p[index * batch_size::], x[index * batch_size::]])
    index = 0
else:
    loss = self.model.train_on_batch(x=x[index * batch_size:(index + 1) * batch_size], y=[p[index * batch_size:(index + 1) * batch_size], x[index * batch_size:(index + 1) * batch_size]])
    index += 1
With
if (index + 1) * batch_size > x.shape[0]:
    loss = self.model.train_on_batch(x=x[index * batch_size::], y=p[index * batch_size::])
    index = 0
else:
    loss = self.model.train_on_batch(x=x[index * batch_size:(index + 1) * batch_size], y=p[index * batch_size:(index + 1) * batch_size])
    index += 1

Python classification technique naive bayes

I am doing research on classification techniques. I found code online for Naive Bayes classification in Python; I have shared it below, but I am getting errors in it. Please help me solve them. The software I am using is Anaconda with Python 3.6.
The code is as follows:
import csv

def loadCsv(filename):
    lines = csv.reader(open(filename))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

import random

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

import math

def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.iteritems():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.iteritems():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.iteritems():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def main():
    splitRatio = 0.67
    filename = 'E:\iris.data.csv'
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train = {1} and test = {2} rows'). format(len(dataset), len(trainingSet), len(testSet))
    summaries = summarizeByClass(trainingSet)
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%').format(accuracy)

main()
And I am getting this as my traceback:
runfile('C:/Users/Lenovo/Desktop/Naive .py', wdir='C:/Users/Lenovo/Desktop')
Traceback (most recent call last):
File "<ipython-input-11-c6b2508abccc>", line 1, in <module>
runfile('C:/Users/Lenovo/Desktop/Naive .py', wdir='C:/Users/Lenovo/Desktop')
File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Lenovo/Desktop/Naive .py", line 109, in <module>
main()
File "C:/Users/Lenovo/Desktop/Naive .py", line 101, in main
dataset = loadCsv(filename)
File "C:/Users/Lenovo/Desktop/Naive .py", line 7, in loadCsv
dataset[i] = [float(x) for x in dataset[i]]
File "C:/Users/Lenovo/Desktop/Naive .py", line 7, in <listcomp>
dataset[i] = [float(x) for x in dataset[i]]
ValueError: could not convert string to float: 'Iris-setosa'
Please help me solve the problem. Thank you in advance
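The immediate failure is float('Iris-setosa'): loadCsv() tries to convert the class column of iris.data to a float. Below is a hedged sketch, not part of the original code, that converts only the feature columns and maps the species name to an integer; the label mapping is an assumption about the file's contents:
import csv

LABELS = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

def loadCsv(filename):
    with open(filename) as f:
        rows = [row for row in csv.reader(f) if row]
    dataset = []
    for row in rows:
        features = [float(x) for x in row[:-1]]  # the four numeric measurements
        label = LABELS[row[-1]]                  # species name -> integer code
        dataset.append(features + [label])
    return dataset
Note that the script would then hit a second, unrelated problem on Python 3.6: dict.iteritems() no longer exists, so summarizeByClass(), calculateClassProbabilities() and predict() need .items(), and print('...').format(...) needs to become print('...'.format(...)).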

TypeError: unsupported operand type(s) for -: 'list' and 'float'

Following is my Python code:
import csv
import random
import math

def separateByClass(dat):
    separated = {}
    for i in range(len(dat)):
        vector = dat[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector.pop()].append(vector)
    return separated

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.iteritems():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.iteritems():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.iteritems():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            print bestProb
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def main(str):
    # clustered data
    filename = 'a.csv'
    lines = csv.reader(open(filename, "rb"))
    a = list(lines)
    for i in range(len(a)):
        a[i] = [float(x) for x in a[i]]
    # main data
    filename = 'h.csv'
    lines = csv.reader(open(filename, "rb"))
    data = list(lines)
    for i in range(len(data)):
        data[i] = [float(x) for x in data[i]]
        data[i].append(a[i][1])
    s = separateByClass(data)
    count = True
    for key, values in s.items():
        if count:
            a = values
            count = False
        b = values
    trainingSet1, testSet1 = splitDataset(a, 0.67)
    trainingSet, testSet = splitDataset(b, 0.67)
    trainingSet.extend(trainingSet1)
    testSet.extend(testSet1)
    summaries = summarizeByClass(trainingSet)
    testset = str
    predictions = predict(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    result = predict(summaries, testset)
    returnValue.append(accuracy)
    returnValue.append(result)
    print returnValue

def ab():
    st = [70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,1.0]
    a = main(st)
    return a

ab()
The two files used, a.csv and h.csv, contain 2 and 14 columns respectively.
The code runs perfectly fine when the accuracy is not computed, i.e. when only predict() is used.
It gives the following error:
Traceback (most recent call last):
File "D:\nowedit\P.py", line 126, in <module>
ab()
File "D:\nowedit\P.py", line 124, in ab
a=main(st)
File "D:\nowedit\P.py", line 115, in main
predictions = predict(summaries, testSet)
File "D:\nowedit\P.py", line 60, in predict
probabilities = calculateClassProbabilities(summaries, inputVector)
File "D:\nowedit\P.py", line 56, in calculateClassProbabilities
probabilities[classValue] *= calculateProbability(x, mean, stdev)
File "D:\nowedit\P.py", line 46, in calculateProbability
exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
TypeError: unsupported operand type(s) for -: 'list' and 'float'
You can't use a list and a float as operands in this math function.
If you add this line to your calculateProbability(x, mean, stdev) function:
print("%s %s %s" % (type(x), type(mean), type(stdev)))
you will see that x is a list and mean is a float, which is exactly what the error states:
TypeError: unsupported operand type(s) for -: 'list' and 'float'
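In this particular script the list comes from main(): per the traceback, predictions = predict(summaries, testSet) passes the whole test set, so inside calculateClassProbabilities() each x = inputVector[i] is a full row (a list) rather than a single attribute value. A hedged sketch of the relevant lines of main(), using the getPredictions() helper that already exists in the script and calls predict() once per row:
summaries = summarizeByClass(trainingSet)
predictions = getPredictions(summaries, testSet)  # one prediction per test row
accuracy = getAccuracy(testSet, predictions)
result = predict(summaries, testset)              # a single instance is still fine here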
