Python classification technique: Naive Bayes

I am doing research on classification techniques. I found code online for Naive Bayes classification in Python and have shared it below, but I am getting errors when I run it. Please help me solve them. I am using Anaconda with Python 3.6.
The code is as follows:
import csv
def loadCsv(filename):
    lines = csv.reader(open(filename))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

import random
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

import math
def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.iteritems():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.iteritems():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.iteritems():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def main():
    splitRatio = 0.67
    filename = 'E:\iris.data.csv'
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train = {1} and test = {2} rows').format(len(dataset), len(trainingSet), len(testSet))
    summaries = summarizeByClass(trainingSet)
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%').format(accuracy)

main()
And I am getting this as my traceback:
runfile('C:/Users/Lenovo/Desktop/Naive .py', wdir='C:/Users/Lenovo/Desktop')
Traceback (most recent call last):
File "<ipython-input-11-c6b2508abccc>", line 1, in <module>
runfile('C:/Users/Lenovo/Desktop/Naive .py', wdir='C:/Users/Lenovo/Desktop')
File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Lenovo/Desktop/Naive .py", line 109, in <module>
main()
File "C:/Users/Lenovo/Desktop/Naive .py", line 101, in main
dataset = loadCsv(filename)
File "C:/Users/Lenovo/Desktop/Naive .py", line 7, in loadCsv
dataset[i] = [float(x) for x in dataset[i]]
File "C:/Users/Lenovo/Desktop/Naive .py", line 7, in <listcomp>
dataset[i] = [float(x) for x in dataset[i]]
ValueError: could not convert string to float: 'Iris-setosa'
Please help me solve the problem. Thank you in advance.
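For reference, the ValueError comes from the last column of iris.data, which holds the class name (for example 'Iris-setosa') and therefore cannot be passed to float(). A minimal sketch of a loader that converts only the numeric columns and keeps the label as a string is given below; note that under Python 3.6 the code above will also need separated.items() instead of the Python 2 separated.iteritems() (and likewise in calculateClassProbabilities and predict), and the format() calls belong inside the print() parentheses.

import csv

def loadCsv(filename):
    # Sketch only: convert every column except the last one (the class name) to float.
    with open(filename) as f:
        dataset = [row for row in csv.reader(f) if row]  # skip blank lines
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i][:-1]] + [dataset[i][-1]]
    return dataset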

Related

PyTorch transformer decoder in-place modification error (although I didn't use in-place operations)

I am designing a model structure that uses a Transformer encoder and decoder.
I train a classification model on the encoder output and a generative model on the decoder output (with the encoder output as its input), and export multiple results as output.
The following error occurred while training. I tracked it down using torch.autograd.set_detect_anomaly(True).
I saw posts about the same error on the PyTorch forum, but those were mostly caused by in-place operations such as += or x[:, 0] = 0, and the problem went away once they were fixed.
I didn't use any of these operations.
I tried changing unsqueeze() and squeeze() to view(), and also attaching clone() to the tensor manipulations, but the error is still not fixed.
What is the problem?
model code
import torch
import torch.nn as nn
import random
from torch.nn.utils.rnn import pad_sequence
import math
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertForQuestionAnswering
from tqdm import tqdm
import pandas as pd

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class SelfAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(SelfAttention, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads,
                                                    batch_first=True)

    def forward(self, x):
        query = x
        key = x
        value = x
        attn_output = self.multihead_attn(query, key, value, need_weights=False)
        return attn_output

class Encoder(nn.Module):
    def __init__(self, embedding_dim):
        super(Encoder, self).__init__()
        self.embedding_dim = embedding_dim
        # self.pos_encoder = PositionalEncoding()
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.embedding_dim, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer=self.encoder_layer, num_layers=6)
        self.feedforward = nn.Linear(self.embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.encoder(x)
        cls_out = torch.mean(out, dim=-2)
        cls_out = self.feedforward(cls_out)
        cls_out = self.sigmoid(cls_out)
        return out, cls_out

class Decoder(nn.Module):
    def __init__(self, embedding_dim):
        super(Decoder, self).__init__()
        # self.bert = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
        self.embedding_dim = embedding_dim
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=self.embedding_dim, nhead=8, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer=self.decoder_layer, num_layers=6)

    def forward(self, tgt, memory):
        out = self.decoder(tgt, memory)
        return out

class AlzhBERT(nn.Module):
    def __init__(self, embedding_dim):
        super(AlzhBERT, self).__init__()
        self.embedding_dim = embedding_dim
        self.max_sent_length = 7
        self.token_level_attn = nn.ModuleList([SelfAttention(self.embedding_dim, num_heads=8) for _ in range(10)])
        self.token_level_attn_single = SelfAttention(self.embedding_dim, num_heads=8)
        self.sentence_level_attn = SelfAttention(self.embedding_dim, num_heads=8)
        self.encoder = Encoder(embedding_dim=embedding_dim)
        self.decoder = Decoder(embedding_dim=embedding_dim)

    def forward(self, X_batch):
        i = 0
        enc_outs = {}
        dec_outs = {}
        for datastruct in X_batch:
            enc_outs[i] = []
            dec_outs[i] = []
            j = 0
            for section in datastruct.sections:
                print(i, " + ", j)
                inv = section.inv.requires_grad_(True).to(device)
                y_dec = section.next_uttr.requires_grad_(True).to(device)
                par = section.par
                # print(par)
                try:
                    tmp = par.dim()
                except AttributeError:
                    print(par)
                    print("attr err")
                    j = j+1
                    continue

                # par = par.permute(1,0,2)  # (seq_len, sent_len, embed) => self-attention in a single pass
                # several self_attentions
                # for p in par:
                result = self.token_level_attn_single(par.to(device).requires_grad_(True))[0]
                res = torch.mean(result, dim=-2).unsqueeze(0)
                res_sent = self.sentence_level_attn(res.to(device))[0]
                context = torch.mean(res_sent, dim=-3)
                inv_input = torch.mean(inv, dim=-2)

                # x_enc = torch.concat((inv_input, context))
                # x_enc = x_enc.view([1, -1, self.embedding_dim])
                enc_out, cls_out = self.encoder(torch.concat([inv_input, context]).unsqueeze(0))
                # y_dec = torch.mean(y_dec, dim=-2).to(device)
                # enc_out = torch.mean(enc_out, dim=-2).unsqueeze(0).to(device)
                dec_out = self.decoder(y_dec, enc_out.to(device))

                enc_outs[i].append(cls_out)
                dec_outs[i].append(dec_out)
                j = j+1

            enc_outs[i] = torch.tensor(enc_outs[i], requires_grad=True)
            i = i + 1
        return enc_outs, dec_outs
train code
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("device: ", device)
torch.autograd.set_detect_anomaly(True)

def train_loop(dataloader, model, loss_fn, optimizer, epochs):
    # dataloader = dataloader["train"]
    size = len(dataloader.dataset)
    writer = SummaryWriter()

    enc_optimizer = optimizer[0]
    dec_optimizer = optimizer[1]

    for epoch in range(epochs):
        enc_loss_hist = []
        dec_loss_hist = []
        accuracy = []
        print("======== epoch ", epoch, "==========\n")

        for i, (Xs, ys) in tqdm(enumerate(dataloader), desc="Train..."):
            X_folds, y_folds = cross_validation(10, Xs, ys)
            model.train()

            for X, y in zip(X_folds['train'], y_folds['train']):  # each X fold is a list of DataStructs
                # print("<Check Data>")
                # print("X 0: ", X[0])
                # print("label 0: ", y[0])

                # Prediction and Loss
                # X = batch_to_tensor(X)
                # X = torch.tensor(X).to(device)
                y = torch.tensor(y, dtype=torch.float32).to(device)
                enc_preds, dec_preds = model(X)

                for k in range(len(X)):
                    for t in range(len(enc_preds[k])):
                        enc_loss = loss_fn(y[k].to(device), enc_preds[k][t].to(device)).requires_grad_(True)
                        dec_loss = loss_fn(X[k].sections[t].next_uttr.to(device), dec_preds[k][t].to(device)).requires_grad_(True)

                        cls_out = torch.tensor(1 if enc_preds[k][t] >= 0.5 else 0)
                        cls_loss = torch.sum(cls_out == y[k])
                        accuracy.append(cls_loss)

                        # Backpropagation
                        enc_optimizer.zero_grad()
                        dec_optimizer.zero_grad()

                        enc_loss.backward(retain_graph=True)
                        enc_optimizer.step()

                        dec_loss.backward()
                        dec_optimizer.step()

                        enc_loss_hist.append(enc_loss)
                        dec_loss_hist.append(dec_loss)

            cross_validation_loop(X_folds["valid"], y_folds["valid"], model, loss_fn, epoch)

        enc_loss_save = torch.mean(torch.tensor(enc_loss_hist))
        dec_loss_save = torch.mean(torch.tensor(dec_loss_hist))
        accuracy_save = torch.mean(torch.tensor(accuracy, dtype=torch.float32))

        writer.add_scalar("Avg Enc Loss/train", enc_loss_save, epoch)
        writer.add_scalar("Avg Dec Loss/train", dec_loss_save, epoch)
        writer.add_scalar("Avg Accuracy/train", accuracy_save)

        if device == "cuda":
            saved_model_dir = "/home/juny/AlzheimerModel/checkpoint"
        else:
            saved_model_dir = "./saved_model"

        now = datetime.now()
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': enc_optimizer.state_dict(),
            'loss': [enc_loss_save, dec_loss_save],
        }, os.path.join('/home/juny/AlzheimerModel/checkpoint',
                        now.strftime("%Y-%m-%d-%H-%M") + "-e" + str(epoch) + ".pt"))
        torch.save(model.state_dict(), os.path.join(saved_model_dir, "saved_model" + now.strftime("%Y-%m-%d-%H-%M") + ".pt"))

        encloss, decloss, current = enc_loss_save, dec_loss_save.item(), i * len(X)
        print(f"enc loss: {encloss:>7f} dec loss: {decloss:>7f} [{current:>5d}/{size:>5d}]")

    writer.flush()
    writer.close()
error
C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\autograd\__init__.py:173: UserWarning: Error detected in NativeLayerNormBackward0. Traceback of forward call that caused the error:
File "C:/Users/usr/PycharmProjects/project/train.py", line 265, in <module>
train_loop(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=(enc_optimizer, dec_optimizer), epochs=epochs)
File "C:/Users/usr/PycharmProjects/project/train.py", line 47, in train_loop
enc_preds, dec_preds = model(X)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\usr\PycharmProjects\project\AlzhBERT.py", line 139, in forward
dec_out = self.decoder(y_dec, enc_out.to(device))
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\usr\PycharmProjects\project\AlzhBERT.py", line 84, in forward
out = self.decoder(tgt, memory)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\transformer.py", line 291, in forward
output = mod(output, memory, tgt_mask=tgt_mask,
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\transformer.py", line 578, in forward
x = self.norm3(x + self._ff_block(x))
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\modules\normalization.py", line 189, in forward
return F.layer_norm(
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\nn\functional.py", line 2503, in layer_norm
return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
(Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Train...: 0it [00:05, ?it/s]
Traceback (most recent call last):
File "C:/Users/usr/PycharmProjects/project/train.py", line 265, in <module>
train_loop(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=(enc_optimizer, dec_optimizer), epochs=epochs)
File "C:/Users/usr/PycharmProjects/project/train.py", line 65, in train_loop
loss.backward(retain_graph=True)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "C:\Users\usr\anaconda3\envs\pytorch-python3.8\lib\site-packages\torch\autograd\__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [768]] is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Process finished with exit code 1
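Judging from the traceback, one likely culprit is the ordering in the training loop: enc_loss.backward(retain_graph=True) is immediately followed by enc_optimizer.step(), which updates parameters in place, and only then does dec_loss.backward() run through the retained graph, which still expects the old parameter versions (the [768] tensor in the error matches the LayerNorm weight flagged above). A hedged sketch of a reordering that performs both backward passes before either optimizer mutates the weights, leaving the rest of the loop unchanged:

# Sketch only: run both backward passes before any in-place optimizer step.
enc_optimizer.zero_grad()
dec_optimizer.zero_grad()

enc_loss.backward(retain_graph=True)  # keep the graph alive for the second backward
dec_loss.backward()

enc_optimizer.step()                  # in-place parameter updates happen last
dec_optimizer.step()

# Equivalently, a single combined backward avoids retain_graph altogether:
# (enc_loss + dec_loss).backward()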

Getting TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class '...Categorical'>

I am trying to run a ResNet model, which I found in a research paper, through Skorch for classification. I am still learning the ways of Torch and Skorch, and I'm unable to work out what to fix to get this running.
ResNet class:
class ResNet(nn.Module):
    def __init__(
        self,
        *,
        d_numerical: int,
        categories: ty.Optional[ty.List[int]],
        d_embedding: int,
        d: int,
        d_hidden_factor: float,
        n_layers: int,
        activation: str,
        normalization: str,
        hidden_dropout: float,
        residual_dropout: float,
        d_out: int,
        regression: bool,
        categorical_indicator
    ) -> None:
        super().__init__()
        # categories = None  # TODO

        def make_normalization():
            return {'batchnorm': nn.BatchNorm1d, 'layernorm': nn.LayerNorm}[
                normalization[0]
            ](d)

        self.categorical_indicator = categorical_indicator  # Added
        self.regression = regression
        self.main_activation = deep.get_activation_fn(activation)
        self.last_activation = deep.get_nonglu_activation_fn(activation)
        self.residual_dropout = residual_dropout
        self.hidden_dropout = hidden_dropout

        d_in = d_numerical
        d_hidden = int(d * d_hidden_factor)
        if categories is not None:
            d_in += len(categories) * d_embedding
            category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
            self.register_buffer('category_offsets', category_offsets)
            self.category_embeddings = nn.Embedding(int(sum(categories)), d_embedding)
            nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5))
            print(f'{self.category_embeddings.weight.shape}')

        self.first_layer = nn.Linear(d_in, d)  # 1, 256
        self.layers = nn.ModuleList(
            [
                nn.ModuleDict(
                    {
                        'norm': make_normalization(),
                        'linear0': nn.Linear(
                            d, d_hidden * (2 if activation.endswith('glu') else 1)
                        ),
                        'linear1': nn.Linear(d_hidden, d),
                    }
                )
                for _ in range(n_layers)
            ]
        )
        self.last_normalization = make_normalization()
        self.head = nn.Linear(d, d_out)  # 256, 1

    def forward(self, x) -> Tensor:
        if not self.categorical_indicator is None:
            x_num = x[:, ~self.categorical_indicator].float()
            x_cat = x[:, self.categorical_indicator].long()  # TODO
        else:
            x_num = x
            x_cat = None
        x = []
        if x_num is not None:
            x.append(x_num)
        if x_cat is not None:
            x.append(
                self.category_embeddings(x_cat + self.category_offsets[None]).view(
                    x_cat.size(0), -1
                )
            )
        x = torch.cat(x, dim=-1)

        x = self.first_layer(x)
        for layer in self.layers:
            layer = ty.cast(ty.Dict[str, nn.Module], layer)
            z = x
            z = layer['norm'](z)
            z = layer['linear0'](z)
            z = self.main_activation(z)
            if self.hidden_dropout:
                z = F.dropout(z, self.hidden_dropout, self.training)
            z = layer['linear1'](z)
            if self.residual_dropout:
                z = F.dropout(z, self.residual_dropout, self.training)
            x = x + z
        x = self.last_normalization(x)
        x = self.last_activation(x)
        x = self.head(x)
        if not self.regression:
            x = x.squeeze(-1)
        return x
class InputShapeSetterResnet(skorch.callbacks.Callback):
    def __init__(self, regression=False, batch_size=None,
                 categorical_indicator=None):
        self.categorical_indicator = categorical_indicator
        self.regression = regression
        self.batch_size = batch_size

    def on_train_begin(self, net, X, y):
        print("categorical_indicator", self.categorical_indicator)
        if self.categorical_indicator is None:
            d_numerical = X.shape[1]
            categories = None
        else:
            d_numerical = X.shape[1] - sum(self.categorical_indicator)
            # categories = list((X[:, self.categorical_indicator].max(0) + 1).astype(int))
            categories = [sum(self.categorical_indicator)]
        net.set_params(module__d_numerical=d_numerical,
                       module__categories=categories,  # FIXME #lib.get_categories(X_cat),
                       module__d_out=2 if self.regression == False else 1)  # FIXME #D.info['n_classes'] if D.is_multiclass else 1,
        print("Numerical features: {}".format(d_numerical))
        print("Categories {}".format(categories))
Skorch Wrapper:
def create_resnet_skorch(id, wandb_run=None, use_checkpoints=True,
                         categorical_indicator=None, **kwargs):
    print(kwargs)
    if "verbose" not in kwargs:
        verbose = 0
    else:
        verbose = kwargs.pop("verbose")
    if "lr_scheduler" not in kwargs:
        lr_scheduler = False
    else:
        lr_scheduler = kwargs.pop("lr_scheduler")
    if "es_patience" not in kwargs.keys():
        es_patience = 40
    else:
        es_patience = kwargs.pop('es_patience')
    if "lr_patience" not in kwargs.keys():
        lr_patience = 30
    else:
        lr_patience = kwargs.pop('lr_patience')
    optimizer = kwargs.pop('optimizer')
    if optimizer == "adam":
        optimizer = Adam
    elif optimizer == "adamw":
        optimizer = AdamW
    elif optimizer == "sgd":
        optimizer = SGD
    device = kwargs.pop('device')
    if device == "cuda":  # ! only for CPU training, is cuda by default
        device = "cpu"
    batch_size = kwargs.pop('batch_size')

    callbacks = [InputShapeSetterResnet(categorical_indicator=categorical_indicator),
                 EarlyStopping(monitor="valid_loss",
                               patience=es_patience)]
    callbacks.append(EpochScoring(scoring='accuracy', name='train_accuracy', on_train=True))
    if lr_scheduler:
        callbacks.append(LRScheduler(policy=ReduceLROnPlateau, patience=lr_patience, min_lr=2e-5,
                                     factor=0.2))  # FIXME make customizable
    if use_checkpoints:
        callbacks.append(Checkpoint(dirname="skorch_cp", f_params=r"params_{}.pt".format(id), f_optimizer=None,
                                    f_criterion=None))
    if not wandb_run is None:
        callbacks.append(WandbLogger(wandb_run, save_model=False))
        callbacks.append(LearningRateLogger())
    if not categorical_indicator is None:
        categorical_indicator = torch.BoolTensor(categorical_indicator)

    mlp_skorch = NeuralNetClassifier(
        ResNet,
        # Shuffle training data on each epoch
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=optimizer,
        batch_size=max(batch_size, 1),  # if batch size is float, it will be reset during fit
        iterator_train__shuffle=True,
        module__d_numerical=1,  # will be change when fitted
        module__categories=None,  # will be change when fitted
        module__d_out=1,  # idem
        module__regression=False,
        module__categorical_indicator=categorical_indicator,
        verbose=verbose,
        callbacks=callbacks,
        **kwargs
    )
    return mlp_skorch
Skorch Model:
<class 'skorch.classifier.NeuralNetClassifier'>[uninitialized](
module=<class 'tabular.bin.resnet.ResNet'>,
module__activation=reglu,
module__categorical_indicator=tensor([False, True, False, False, False, False, False, False]),
module__categories=None,
module__d=256,
module__d_embedding=128,
module__d_hidden_factor=2,
module__d_numerical=1,
module__d_out=1,
module__hidden_dropout=0.2,
module__n_layers=8,
module__normalization=['batchnorm'],
module__regression=False,
module__residual_dropout=0.2,
)
I have 8 columns in X for training, 1 of which is a categorical column that is supposed to be embedded through an embedding layer in the NN. From what I've found so far, that column is the root of this error, since the data loading comes across this Categorical class during execution, even though the forward method has an embedding layer for it. Any idea what changes I need to make?
Error stack:
Traceback (most recent call last):
File "/test.py", line 639, in <module>
model.fit(X_train, y_train)
File "/anaconda3/lib/python3.9/site-packages/skorch/classifier.py", line 142, in fit
return super(NeuralNetClassifier, self).fit(X, y, **fit_params)
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 917, in fit
self.partial_fit(X, y, **fit_params)
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 876, in partial_fit
self.fit_loop(X, y, **fit_params)
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 789, in fit_loop
self.run_single_epoch(dataset_train, training=True, prefix="train",
File "/anaconda3/lib/python3.9/site-packages/skorch/net.py", line 822, in run_single_epoch
for data in self.get_iterator(dataset, training=training):
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 84, in default_collate
return [default_collate(samples) for samples in transposed]
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 84, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/anaconda3/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 86, in default_collate
raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'pandas.core.arrays.categorical.Categorical'>
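Judging from the collate calls in the stack, X_train is most likely still a pandas DataFrame whose categorical column has a Categorical dtype, so torch's default_collate receives an object it cannot batch. Since the module already does its own embedding via categorical_indicator, one possible direction is to encode that column as integer codes and hand skorch a plain float NumPy array; a sketch, where the column name cat_col is hypothetical:

import numpy as np

# Hypothetical column name; replace with the actual categorical column in X_train.
cat_col = "my_categorical_column"

X_train = X_train.copy()
X_train[cat_col] = X_train[cat_col].astype("category").cat.codes  # Categorical -> integer codes
X_train = X_train.to_numpy(dtype=np.float32)                      # plain array that default_collate can batch
y_train = np.asarray(y_train, dtype=np.int64)                     # integer targets for CrossEntropyLoss

model.fit(X_train, y_train)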

Scikit-Multiflow - Cannot take a larger sample than population when 'replace'=False

So I was trying to run the following code, where x is a feature vector with shape (2381,) and y is a label with shape (1,) after being cast to a NumPy array.
from skmultiflow.meta import AdaptiveRandomForestClassifier
import numpy as np
import data

np.random.seed(1)

def main() -> None:
    dataset = data.get_full_dataset()
    metadata = data.get_metadata()
    training_batch = data.get_windows(dataset, metadata, data.get_initial_training_groups())
    streaming_batch = data.get_windows(dataset, metadata, data.get_incremental_learning_groups())
    initial_features = np.concatenate([dataset.feature_vectors for group, dataset in training_batch])
    initial_labels = np.concatenate([dataset.labels for group, dataset in training_batch])
    model = AdaptiveRandomForestClassifier()
    correct_count = 0
    n_samples = 0
    for x, y in zip(initial_features, initial_labels):
        y = np.asarray([y])
        y_prediction = model.predict(x)
        if y_prediction[0] == y:
            correct_count += 1
        model.partial_fit(x, y)
        n_samples += 1
        print(f"Accuracy: {correct_count / n_samples}")

if __name__ == "__main__":
    main()
However, I am getting the following error:
Traceback (most recent call last):
File "/home/nathan/Documents/Research/BodmasOnline/main.py", line 31, in <module>
main()
File "/home/nathan/Documents/Research/BodmasOnline/main.py", line 24, in main
model.partial_fit(x, y)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 313, in partial_fit
self._partial_fit(X[i], y[i], self.classes, weight[i])
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 328, in _partial_fit
self.ensemble[i].partial_fit(np.asarray([X]), np.asarray([y]),
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/meta/adaptive_random_forests.py", line 569, in partial_fit
self.classifier.partial_fit(X, y, classes=classes, sample_weight=sample_weight)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/hoeffding_tree.py", line 394, in partial_fit
self._partial_fit(X[i], y[i], sample_weight[i])
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/hoeffding_tree.py", line 424, in _partial_fit
learning_node.learn_from_instance(X, y, sample_weight, self)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_nb_adaptive.py", line 54, in learn_from_instance
super().learn_from_instance(X, y, weight, ht)
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_classification.py", line 58, in learn_from_instance
self.list_attributes = self._sample_features(get_dimensions(X)[1])
File "/home/nathan/Documents/Research/BodmasOnline/venv/lib/python3.10/site-packages/skmultiflow/trees/nodes/random_learning_node_classification.py", line 72, in _sample_features
return self.random_state.choice(
File "mtrand.pyx", line 965, in numpy.random.mtrand.RandomState.choice
ValueError: Cannot take a larger sample than population when 'replace=False'
Can anyone help me out?
I'm going to answer my own question, since scikit-multiflow does not necessarily have the best documentation. The feature vector x has to have shape (1, n), which in this case is (1, 2381). This can be achieved programmatically as follows:
from skmultiflow.meta import AdaptiveRandomForestClassifier
import numpy as np
import data

np.random.seed(1)

def main() -> None:
    dataset = data.get_full_dataset()
    metadata = data.get_metadata()
    training_batch = data.get_windows(dataset, metadata, data.get_initial_training_groups())
    streaming_batch = data.get_windows(dataset, metadata, data.get_incremental_learning_groups())
    initial_features = np.concatenate([dataset.feature_vectors for group, dataset in training_batch])
    initial_labels = np.concatenate([dataset.labels for group, dataset in training_batch])
    model = AdaptiveRandomForestClassifier()
    correct_count = 0
    n_samples = 0
    for x, y in zip(initial_features, initial_labels):
        x = np.expand_dims(x, axis=0)
        y = np.asarray([y])
        y_prediction = model.predict(x)
        if y_prediction[0] == y:
            correct_count += 1
        model.partial_fit(x, y)
        n_samples += 1
        print(f"Accuracy: {correct_count / n_samples}")

if __name__ == "__main__":
    main()
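As a small aside, x.reshape(1, -1) achieves the same (1, 2381) shape as np.expand_dims(x, axis=0), and predict() needs the same 2-D shape as partial_fit():

for x, y in zip(initial_features, initial_labels):
    x = x.reshape(1, -1)             # (2381,) -> (1, 2381), same effect as np.expand_dims(x, axis=0)
    y = np.asarray([y])              # scalar label -> shape (1,)
    y_prediction = model.predict(x)  # predict expects the same 2-D shape as partial_fit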

Classification Techniques in Python

I have found two pieces of code online on classification techniques. One technique is Naive Bayes and the other is kNN. I have used two datasets: iris.data and pima-indians-diabetes.data.
The Pima Indians dataset works properly with the Naive Bayes algorithm, and iris.data works correctly with the kNN algorithm. But I want to compare the two algorithms, which is only possible if the same dataset runs in both of them.
I am attaching the Naive Bayes and kNN code with the swapped datasets, and the respective tracebacks.
Naive Bayes with iris.data
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math

def loadCsv(filename):
    lines = csv.reader(open(filename, "rt"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0

def main():
    filename = 'E:\iris.data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print(('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print(('Accuracy: {0}%').format(accuracy))

main()
and the traceback for this is:
runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
Traceback (most recent call last):
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
    execfile(filename, namespace)
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py", line 63
    print 'Train set: ' + repr(len(trainingSet))
                        ^
SyntaxError: invalid syntax
kNN with pima-indians-diabetes.data:
# Example of kNN implemented from Scratch in Python
import csv
import random
import math
import operator

def loadDataset(filename, split, trainingSet=[], testSet=[]):
    with open(filename, 'rt') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        for x in range(len(dataset)-1):
            for y in range(4):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])

def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def main():
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset('E:\pima-indians-diabetes.data.csv', split, trainingSet, testSet)
    print 'Train set: ' + repr(len(trainingSet))
    print 'Test set: ' + repr(len(testSet))
    # generate predictions
    predictions = []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

main()
And the traceback is:
runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
Traceback (most recent call last):
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    runfile('C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py', wdir='C:/Users/Lenovo/Desktop/EE Codes')
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
    execfile(filename, namespace)
  File "C:\Users\Lenovo\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/Lenovo/Desktop/EE Codes/Knn with prima.py", line 63
    print 'Train set: ' + repr(len(trainingSet))
                        ^
SyntaxError: invalid syntax
What is the problem with these two bits of code?
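Both tracebacks point at the same file ('Knn with prima.py', line 63), whose print statements use Python 2 syntax; under the Python 3.6 interpreter shipped with Anaconda they must be function calls, and dict.iteritems() no longer exists. A sketch of the minimal changes to the kNN script:

# Python 3 versions of the lines the tracebacks complain about (sketch only)
print('Train set: ' + repr(len(trainingSet)))
print('Test set: ' + repr(len(testSet)))

# in getResponse(): iteritems() was removed in Python 3
sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)

Naive Bayes with iris.data will then still fail on float('Iris-setosa'), since the last column of that file is a class name rather than a number; the loader sketch shown under the first question applies there as well.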

TypeError: unsupported operand type(s) for -: 'list' and 'float'

Following is my Python code:
import csv
import random
import math

def separateByClass(dat):
    separated = {}
    for i in range(len(dat)):
        vector = dat[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector.pop()].append(vector)
    return separated

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.iteritems():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.iteritems():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.iteritems():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            print bestProb
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

def main(str):
    # clustered data
    filename = 'a.csv'
    lines = csv.reader(open(filename, "rb"))
    a = list(lines)
    for i in range(len(a)):
        a[i] = [float(x) for x in a[i]]
    # main data
    filename = 'h.csv'
    lines = csv.reader(open(filename, "rb"))
    data = list(lines)
    for i in range(len(data)):
        data[i] = [float(x) for x in data[i]]
        data[i].append(a[i][1])
    s = separateByClass(data)
    count = True
    for key, values in s.items():
        if count:
            a = values
            count = False
        b = values
    trainingSet1, testSet1 = splitDataset(a, 0.67)
    trainingSet, testSet = splitDataset(b, 0.67)
    trainingSet.extend(trainingSet1)
    testSet.extend(testSet1)
    summaries = summarizeByClass(trainingSet)
    testset = str
    predictions = predict(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    result = predict(summaries, testset)
    returnValue.append(accuracy)
    returnValue.append(result)
    print returnValue

def ab():
    st = [70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,1.0]
    a = main(st)
    return a

ab()
The two files used, a.csv and h.csv, contain 2 and 14 columns respectively.
The code runs perfectly fine if the accuracy is not computed, i.e. only when predict() is used.
It gives the following error:
Traceback (most recent call last):
File "D:\nowedit\P.py", line 126, in <module>
ab()
File "D:\nowedit\P.py", line 124, in ab
a=main(st)
File "D:\nowedit\P.py", line 115, in main
predictions = predict(summaries, testSet)
File "D:\nowedit\P.py", line 60, in predict
probabilities = calculateClassProbabilities(summaries, inputVector)
File "D:\nowedit\P.py", line 56, in calculateClassProbabilities
probabilities[classValue] *= calculateProbability(x, mean, stdev)
File "D:\nowedit\P.py", line 46, in calculateProbability
exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
TypeError: unsupported operand type(s) for -: 'list' and 'float'
You can't use a list and a float as operands for this math function.
If you add this line to your calculateProbability(x, mean, stdev) function:
print("%s %s %s" % (type(x), type(mean), type(stdev)))
you will see that x is a list and mean is a float, which is exactly what the error states:
TypeError: unsupported operand type(s) for -: 'list' and 'float'
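A concrete fix, as a sketch: x ends up being a list because main() passes the whole test set to predict(), which expects a single row, so each inputVector[i] is an entire row rather than a number. Using getPredictions() for the test set (and keeping predict() for the single input vector) avoids the error:

# In main(): predict() takes one row; getPredictions() loops over the whole test set.
predictions = getPredictions(summaries, testSet)   # instead of predict(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
result = predict(summaries, testset)               # a single feature vector is fine here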
