get MAE and RMSE
do have tried like this
movies = pd.read_csv('ml-20m/movies.csv')
ratings = pd.read_csv('ml-20m/ratings.csv')
df = pd.merge(movies, ratings, on='movieId', how='inner')
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'title', 'rating']], reader)
trainSet, testSet = train_test_split(data, test_size=.25, random_state=0)
algo = SVD(random_state=0)
algo.fit(trainSet)
predictions = algo.test(testSet)
def MAE(predictions):
return accuracy.mae(predictions, verbose=False)
def RMSE(predictions):
return accuracy.rmse(predictions, verbose=False)
print("RMSE: ", RMSE(predictions))
print("MAE: ", MAE(predictions))
and get error
"Singleton array array(<surprise.dataset.DatasetAutoFolds object at 0x000001BD67B62490>,
dtype=object) cannot be considered a valid collection."
Related
I am currently working with multilabel text classification in the Arabic language using binary relevance and label power set, after I make all preprocessing that I need when I need to combine chi and mutual feature selection based on their weights, I am facing this problem
Found input variables with inconsistent numbers of samples: [28332, 24]
where my dataset has one column have the text and 24 columns as a target as shown in the image :
enter image description here
I am writing this code
`class Classifier:
def __init__(self):
self.merged_df = pd.read_csv(r"D:\project\Ymal.csv", encoding='utf-8')
self.train_df, self.test_df = train_test_split(self.merged_df,test_size=0.2,random_state=42)
self.vectorizer = CountVectorizer()
self.ModelsPerformance = {}
def train(self):
self.train_text = self.train_df['text']
self.test_text = self.test_df['text']
self.train_labels = self.train_df.drop(columns=['text'])
self.test_labels = self.test_df.drop(columns=['text'])
self.mlb = MultiLabelBinarizer()
self.train_labels = self.mlb.fit_transform(self.train_labels)
self.test_labels = self.mlb.transform(self.test_labels)
self.train_text_bow = self.vectorizer.fit_transform(self.train_text)
self.test_text_bow = self.vectorizer.transform(self.test_text)
self.chi2_selector = SelectKBest(chi2, k='all',)
self.mi_selector = SelectKBest(mutual_info_classif, k='all',)
self.chi2_features = self.chi2_selector.fit_transform(self.train_text_bow,self.train_labels)
self.mi_features = self.mi_selector.fit_transform(self.train_text_bow,self.train_labels)
self.weights_chi2 = self.chi2_selector.scores_
self.weights_mi = self.mi_selector.scores_
self.weights = (self.weights_chi2 + self.weights_mi ) / 2
self.top_features = np.argsort(self.weights)[-4000:] #[::-1]
self.train_combined_features = self.train_text_bow[:,self.top_features]
self.test_text_bow = self.vectorizer.transform(self.test_text)
self.test_combined_features = self.test_text_bow[:, self.top_features]
def metricsReport(self,modelName, test_labels, predictions):
hamLoss = hamming_loss(test_labels, predictions)
print("------" + modelName + " Model Metrics-----")
accuracy = accuracy_score(test_labels, predictions)
macroPrecision = precision_score(test_labels, predictions, average='macro')
macroRecall = recall_score(test_labels, predictions, average='macro')
macroF1 = f1_score(test_labels, predictions, average='macro')
microPrecision = precision_score(test_labels, predictions, average='micro')
microRecall = recall_score(test_labels, predictions, average='micro')
microF1 = f1_score(test_labels, predictions, average='micro')
weightedF1 = f1_score(test_labels, predictions, average='weighted')
# print metrics
print("Hamming Loss: {:.4f}".format(hamLoss))
print('Accuracy: {0:.4f}'.format(accuracy))
print('Macro Precision: {0:.4f}'.format(macroPrecision))
print('Macro Recall: {0:.4f}'.format(macroRecall))
print('Macro F1-measure: {0:.4f}'.format(macroF1))
print('Micro Precision: {0:.4f}'.format(microPrecision))
print('Micro Recall: {0:.4f}'.format(microRecall))
print('Micro F1-measure: {0:.4f}\n'.format(microF1))
print('Weighted F1-measure: {0:.4f}\n'.format(weightedF1))
def fitAlgorithms(self):
algorithms = [{'name': 'LinearSVC', 'model': LinearSVC(max_iter=12000, dual=False),
'params': {'C': [0.1, 1, 10]}},
{'name': 'KNN', 'model': KNeighborsClassifier(),
'params': {'n_neighbors': [5, 10, 15]}},
{'name': 'RandomForest', 'model': RandomForestClassifier(),
'params': {'n_estimators': [100, 300, 500]}},
{'name': 'LogisticRegression', 'model': LogisticRegression(),
'params': {'C': [0.1, 1, 10]}},
{'name': 'DecisionTree', 'model': DecisionTreeClassifier(),
'params': {'max_depth': [5, 10, 15]}},
{'name': 'MultinomialNB', 'model': MultinomialNB(),
'params': {'alpha': [0.1, 1, 10]}}
]
for algorithm in algorithms:
model = algorithm['model']
name = algorithm['name']
params = algorithm['params']
# Fit the binary relevance and label powerset classifiers before the grid search
binary_relevance_classifier = BinaryRelevance(model)
binary_relevance_classifier.fit(self.train_combined_features, self.train_labels)
labelPowerSet_classifier = LabelPowerset(model)
labelPowerSet_classifier.fit(self.train_combined_features, self.train_labels)
print(f"Performing GridSearchCV for {name}...")
clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
clf.fit(self.train_combined_features, self.train_labels)
best_params = clf.best_params_
print(f"Best parameters for {name}: {best_params}")
model.set_params(**best_params)
binary_relevance_preds = binary_relevance_classifier.predict(self.test_combined_features)
self.metricsReport(f"Binary Relevance with {name}", self.test_labels, binary_relevance_preds)
labelPowerSet_preds = labelPowerSet_classifier.predict(self.test_combined_features)
self.metricsReport(f"Label Powerset with {name}", self.test_labels, labelPowerSet_preds)
self.ModelsPerformance[name] = clf.best_score_
return self.ModelsPerformance
# Create an instance of the Classifier
classifier = Classifier()
# Invoke the training method
classifier.train()
# Invoke the fitAlgorithms() method
classifier.fitAlgorithms()
but this basic problem is this error I referee it above
please any one can help me and if any one can optimize this ?
I believe that error is clear but I cant avoid this , also i tried the do this to sure the shape but it fine
print("train_text_bow shape:", train_text_bow.shape) print("train_labels shape:", train_labels.shape) train_text_bow shape: (28332, 121714) train_labels shape: (28332, 24)t
I need just to avoid this error
Here is my code. The packages imported are not shown. I am trying to feed the CIFAR-10 test data into alexnet. The dictionary at the end needs to be sorted so I can find the most common classification. Please help, I have tried everything!
............................................................................................................................................................................................................................................................................................................
alexnet = models.alexnet(pretrained=True)
transform = transforms.Compose([ #[1]
transforms.Resize(256), #[2]
transforms.CenterCrop(224), #[3]
transforms.ToTensor(), #[4]
transforms.Normalize( #[5]
mean=[0.485, 0.456, 0.406], #[6]
std=[0.229, 0.224, 0.225] #[7]
)])
# Getting the CIFAR-10 dataset
dataset = CIFAR10(root='data/', download=True, transform=transform)
test_dataset = CIFAR10(root='data/', train=False, transform=transform)
classes = dataset.classes
#print(classes)
torch.manual_seed(43)
val_size = 10000
train_size = len(dataset) - val_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])
#print(len(train_ds), len(val_ds))
batch_size=100
train_loader = DataLoader(train_ds, batch_size, shuffle=True, num_workers=8, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size, num_workers=8, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size, num_workers=8, pin_memory=True)
with open("/home/shaan/Computer Science/CS4442/Ass4/imagenet_classes.txt") as f:
classes = eval(f.read())
holder = []
dic = {}
current = ''
#data_iter = iter(test_loader)
#images,labels = data_iter.next()
#alexnet.eval()
with torch.no_grad():
for data in test_loader:
images, labels = data
out = alexnet(images)
#print(out.shape)
for j in range(0,batch_size):
sorted, indices = torch.sort(out,descending=True)
percentage = F.softmax(out,dim=1)[j]*100
results = [(classes[i.item()],percentage[i].item()) for i in indices[j][:5]]
holder.append(results[0][0])
holder.sort()
for z in holder:
if current != z:
count = 1
dic[z] = count
current = z
else:
count = count + 1
dic[z] = count
current = z
This is where im getting the error:
for w in sorted(dic, key=dic.get, reverse=True):
print(w, dic[w])
This line is the problem
sorted, indices = torch.sort(out,descending=True)
You created a variable named sorted, which is exactly the same name as sorted function you call when it error.
Just change this to something else like
sorted_out, indices = torch.sort(out,descending=True)
When trying to build my data set an error of "TypeError: 'set' object is not subscriptable" is received.
dataDir = '/content/drive/My Drive/Colab Notebooks/HW 3/' # Directory with input files
trainFile = 'q2train.csv' # Training examples
labelFile = 'q2label.csv' # Test label
validFile = 'q2valid.csv' # Valid Files
train = pd.read_csv(dataDir+trainFile)
valid = pd.read_csv(dataDir+validFile)
label = pd.read_csv(dataDir+labelFile)
data_sets = {
'train',
'label',
'valid'}
def get_data(data_set_name, test_prop=0.2, seed=2019):
"""returns data for training, testing, and data characteristics"""
data = data_sets[data_set_name]
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=test_prop,
random_state=seed)
nF = X.shape[1] # number of features
nC = len(np.unique(y)) # number of classes
nTrain, nTest = len(y_train), len(y_test)
print("\nData set: %s" %data_set_name)
print("\tNumber of features %d" %nF)
print("\tNumber of output classes = %d" %(nC))
print("\tNumber of training examples = %d" %(nTrain))
print("\tNumber of testing examples = %d" %(nTest))
return X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest
for name in data_set:
X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest = get_data(name)
Any help would be appreciated, thanks in advance.
Use a dictionary:
train = pd.read_csv(dataDir+trainFile)
valid = pd.read_csv(dataDir+validFile)
label = pd.read_csv(dataDir+labelFile)
data_sets = {
'train': train,
'label': label,
'valid': valid
}
Then data_sets[data_set_name] will retrieve the dataset you want.
Hello i have problem with GridSearchCV it works perfectly on mnist_dataset but not on my own data, and i don't know why.
# df = pd.read_csv('bank-full.csv',sep=';')
# print(df.head())
#
# print(df.shape)
#
# print(df.columns)
# print(df.info)
# df.columns = [col.replace('"', '') for col in df.columns]
#
#
# df.drop(columns=['day', 'poutcome'], axis =1 , inplace=True)
#
#
# print(df.head())
# print(df.shape)
#
# le = preprocessing.LabelEncoder()
# df.job = le.fit_transform(df.job)
# df.education = le.fit_transform(df.education)
# df.housing = le.fit_transform(df.housing)
# df.loan = le.fit_transform(df.loan)
# #df.poutcome = le.fit_transform(df.poutcome)
# df.month = le.fit_transform(df.month)
# df.contact = le.fit_transform(df.contact)
# df.marital = le.fit_transform(df.marital)
# df.default = le.fit_transform(df.default)
# df.y = le.fit_transform(df.y)
#
#
#
# print(df.head())
#
# X = df.iloc[:, 0:14]
# y = df.iloc[:, 14]
# X = np.array(X, dtype="float64")
# y = np.array(y,dtype="float64")
#
# scaler = Normalizer()
# X = scaler.fit_transform(X)
#
#
#
#
# x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
# model = LogisticRegression(penalty='l2', max_iter=1000)
# model.fit(x_train, y_train)
# prediction = model.predict(x_test)
# from sklearn.metrics import accuracy_score
# print("ACC: {} ".format(accuracy_score(y_test, prediction)))
#
#
# print(x_train.shape)
#
# nn = Sequential()
# nn.add(Dense(120,input_dim = 14, activation='relu'))
# nn.add(Dense(240,activation='relu'))
#
#
# nn.add(Dense(1))
# nn.add(Activation('sigmoid'))
#
# nn.compile(loss=keras.losses.binary_crossentropy,
# optimizer='sgd',
# metrics=['accuracy'])
#
# nn.fit(x_train, y_train,
# batch_size=10,
# epochs=10,
# verbose=1,
#
# validation_data=(x_test, y_test))
#
# loss_acc = nn.evaluate(x_test, y_test, verbose=0)
# print('Test loss:', loss_acc[0])
# print('Test accuracy:', loss_acc[1])
data = bm.load_data('bank-full.csv')
data = bm.preprocess_data(data)
X,y = bm.split_data(data)
scaler = Normalizer()
X = scaler.fit_transform(X)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
start = time()
model = KerasClassifier(build_fn=nnmodel.create_model())
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = np.array([50, 100, 150])
batches = np.array([5, 10, 20])
param_grid = dict(optimizer=optimizers, nb_epoch=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(x_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
print("total time:", time() - start)
this commented section, is just a simple keras model, that works perfectrly but below if i try gridSearchCV on this model, it gives me this errors:
https://pastebin.com/mhJLSXAS , for example if i run this program https://www.kaggle.com/shujunge/gridsearchcv-with-keras it works perferctly byt on my data it's not, does somebody know why ?
Scikitlearn builds each time new model. Script has to build a classifier with specific parameters inside the grid search method. So you have to send the method name as an argument, not the result of it.
Probably nnmodel.create_model is your function which creates a new model based on parameters. So try to change:
build_fn=nnmodel.create_model()
To:
build_fn=nnmodel.create_model
import pandas as pd
import tensorflow as tf
import tempfile``
CSV_COLUMNS = [ ]
train_file = '/home/nick/
test_file = '/home/nick/
def input_fn(data_file, num_epochs, shuffle):
#"""Input builder function."""
df_data = pd.read_csv(
tf.gfile.Open(data_file),
names=CSV_COLUMNS,
skipinitialspace=True,
engine="python",
skiprows=1)
# remove NaN elements
df_data = df_data.dropna(how="any", axis=0)
labels = df_data["NPK"].apply(lambda x: "<10" in x).astype(int)
return tf.estimator.inputs.pandas_input_fn(
x=df_data,
y=labels,
batch_size=100,
num_epochs=num_epochs,
shuffle=shuffle,
num_threads=5)
DA = tf.feature_column.categorical_column_with_vocabulary_list( )
LO = tf.contrib.layers.sparse_column_with_hash_bucket( )
deep_columns = [tf.feature_column.indicator_column(DA) tf.feature_column.indicator_column(PD)
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.DNNClassifier(
feature_columns=deep_columns,
hidden_units=[1024, 512, 256],
optimizer=tf.train.ProximalAdagradOptimizer(
learning_rate=0.1,
l1_regularization_strength=0.001
))
# set num_epochs to None to get infinite stream of data.
m.fit(
input_fn=input_fn(train_file, num_epochs=None, shuffle=True),
steps=20000)
# set steps to None to run evaluation until all data consumed.
results = m.evaluate(
input_fn=input_fn(test_file, num_epochs=1, shuffle=False),
steps=None)
print("model directory = %s" % model_dir)
#in the result we have accuracy, precision auc and other things. How can I choose them?
for key in sorted(results):
print("%s: %s" % (key, results[key]))
I'd like to know how I can change threshold value in the evaluation of the deep model. This is the code, if you run it you can see that this value is 0.5, I'd like to change it from o to 1 to improve the model
I hope you can help me
thank you