I have 4 types of data.
each one has been pre-processed using:
x1,y1=Standardisation
x2,y2=Normalisation
x3,y3=Rescale
and one is completely unprocessed (x,y).
I have applied logistic regression to each like this:
#Building Logistic Regression model on the UNPROCESSED DATA
from sklearn.metrics import accuracy_score
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
lr_predict = lr_model.predict(x_test)
print('Logistic Regression - ',accuracy_score(lr_predict,y_test))
#Building Logistic Regression model on the NORMALISED DATA
from sklearn.linear_model import LogisticRegression
lr_norm = LogisticRegression()
lr_norm.fit(x1_train, y1_train)
y_pred = lr_norm.predict(x1_test)
print("Accuracy of logistic regression on test set with Rescaled features: {:.2f}".format(lr_norm.score(x1_test, y1_test)))
and so on...
I want to make one graph, not sure which, that best represents the performance through its accuracy score, or whatever else there may be... but of the other models I wish to test down below:
svm_model = SVC(kernel='linear')
svm_model.fit(x_train,y_train)
svc_predict = svm_model.predict(x_test)
print('SVM - ',accuracy_score(svc_predict,y_test))
print('\t\t\t\tTRAIN DATA\n')
print(classification_report(y_train, svm_model.predict(x_train), target_names=encoder.inverse_transform([0,1,2])))
print('\n')
print('\t\t\t\tTEST DATA\n')
print(classification_report(y_test, svm_model.predict(x_test), target_names=encoder.inverse_transform([0,1,2])))
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
nb_predict = nb_model.predict(x_test)
print('Naive bayes - ',accuracy_score(nb_predict,y_test))
dt_model = DecisionTreeClassifier(max_leaf_nodes=3)
dt_model.fit(x_train,y_train)
dt_predict = dt_model.predict(x_test)
print('Decision Tree - ',accuracy_score(dt_predict,y_test))
rfc_model = RandomForestClassifier(max_depth=3)
rfc_model.fit(x_train,y_train)
rfc_predict = rfc_model.predict(x_test)
print('Random Forest - ',accuracy_score(rfc_predict,y_test))
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train,y_train)
knn_predict = knn_model.predict(x_test)
print('knn - ',accuracy_score(knn_predict,y_test))
Hope this makes sense..
#preprare data
pre_processing=[('NOT PROCESSED', None)]
pre_processing.append(('RESCALED', MinMaxScaler(feature_range=(0, 1))))
pre_processing.append(('STANDARDIZED', StandardScaler()))
pre_processing.append(('NORMALIZED', Normalizer()))
# prepare models
models = []
models.append(( 'LR' , LogisticRegression(max_iter=10000)))
models.append(( 'LDA' , LinearDiscriminantAnalysis()))
models.append(( 'KNN' , KNeighborsClassifier()))
models.append(( 'CART' , DecisionTreeClassifier()))
models.append(( 'NB' , GaussianNB()))
models.append(( 'SVM' , SVC(probability=True)))
results = []
names = []
higher_acc=0
standard=0
best_model=''
for process in pre_processing:
globals()['df_'+process[0]] = pd.DataFrame(index=None, columns=None)
for algo in models:
estimators = [process,algo]
model = Pipeline(estimators)
ss = ShuffleSplit(n_splits=10, test_size=test_size, random_state=seed)
names.append(algo[0])
for scoring in performance_metrix:
cv_results = cross_val_score(model, X_train, Y_train, cv=ss, scoring=scoring)
globals()['df_'+process[0]].loc[algo[0],scoring]= '%s\u00B1%s'%(round(cv_results.mean()*100.0,2),round(cv_results.std()*100.0,2))
if performance_metrix.index(scoring)==0:
results.append(cv_results)
if cv_results.mean()*100.0 > higher_acc:
higher_acc=cv_results.mean()*100.0
standard=cv_results.std()*100.0
best_model=process[0], algo[0]
elif cv_results.mean()*100.0 == higher_acc:
if cv_results.std()*100.0 < standard:
higher_acc=cv_results.mean()*100.0
best_model=process[0], algo[0]
print('For %s data we produced:\n\n'%(process[0]),globals()['df_'+process[0]],'\n\n')
# boxplot algorithm comparison
fig = pyplot.figure()
fig.suptitle('Algorithms accuracy comparison for %s data'%(process[0]))
ax = fig.add_subplot(111)
pyplot.boxplot(results[:len(models)])
ax.set_xticklabels(names)
pyplot.show()
# Create a pipeline that standardizes the data then creates a model
print("The overall best performance was the one obtained with %s data, using %s algorithm. \nIt's Accuracy resulted to be %s with a standard deviation of %s" %(best_model[0],best_model[1],round(higher_acc,2),round(standard,2)))
datasets = {
"Unprocessed": (x_train, x_test, y_train, y_test),
"Standardisation": (x1_train, x1_test, y1_train, y1_test),
"Normalisation": (x2_train, x2_test, y2_train, y2_test),
"Rescale": (x3_train, x3_test, y3_train, y3_test),
}
models = {
"Logistic Regression": LogisticRegression(),
"Decision Tree": DecisionTreeClassifier(max_leaf_nodes=3),
"Random Forest": RandomForestClassifier(max_depth=3)
}
def evaluate_model(model, dataset):
x_train, x_test, y_train, y_test = data
model.fit(x_train, y_train)
pred = model.predict(x_test)
return accuracy_score(pred, y_test)
model_scores_for_datasets = {}
for dataset_name, dataset in datasets.items():
dataset_scores = {}
for model_name, model in models.items():
model_score = evaluate_model(model, dataset)
dataset_scores[model_name] = model_score
model_scores_for_datasets[dataset_name] = dataset_scores
Here, model_scores_for_datasets will contain the accuracy results for every dataset for each model and will look something like:
{
"Unprocessed" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Standardisation" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Normalisation" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Rescale" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
}
You now have the results for each dataset and can create your required plots. Something along these line:
for dataset_name, scores in model_scores_for_datasets.items():
# For example:
# dataset_name will be "Unprocessed"
# scores will be a dict like so:
# {
# "Logistic Regression" : 10,
# "Decision Tree": 5,
# "Random Forest": 20
# }
generate_plot(dataset_name scores)
Of course, you need to figure out the generate_plot function. Hope this helps and gives you some idea.
Related
I am currently working with multilabel text classification in the Arabic language using binary relevance and label power set, after I make all preprocessing that I need when I need to combine chi and mutual feature selection based on their weights, I am facing this problem
Found input variables with inconsistent numbers of samples: [28332, 24]
where my dataset has one column have the text and 24 columns as a target as shown in the image :
enter image description here
I am writing this code
`class Classifier:
def __init__(self):
self.merged_df = pd.read_csv(r"D:\project\Ymal.csv", encoding='utf-8')
self.train_df, self.test_df = train_test_split(self.merged_df,test_size=0.2,random_state=42)
self.vectorizer = CountVectorizer()
self.ModelsPerformance = {}
def train(self):
self.train_text = self.train_df['text']
self.test_text = self.test_df['text']
self.train_labels = self.train_df.drop(columns=['text'])
self.test_labels = self.test_df.drop(columns=['text'])
self.mlb = MultiLabelBinarizer()
self.train_labels = self.mlb.fit_transform(self.train_labels)
self.test_labels = self.mlb.transform(self.test_labels)
self.train_text_bow = self.vectorizer.fit_transform(self.train_text)
self.test_text_bow = self.vectorizer.transform(self.test_text)
self.chi2_selector = SelectKBest(chi2, k='all',)
self.mi_selector = SelectKBest(mutual_info_classif, k='all',)
self.chi2_features = self.chi2_selector.fit_transform(self.train_text_bow,self.train_labels)
self.mi_features = self.mi_selector.fit_transform(self.train_text_bow,self.train_labels)
self.weights_chi2 = self.chi2_selector.scores_
self.weights_mi = self.mi_selector.scores_
self.weights = (self.weights_chi2 + self.weights_mi ) / 2
self.top_features = np.argsort(self.weights)[-4000:] #[::-1]
self.train_combined_features = self.train_text_bow[:,self.top_features]
self.test_text_bow = self.vectorizer.transform(self.test_text)
self.test_combined_features = self.test_text_bow[:, self.top_features]
def metricsReport(self,modelName, test_labels, predictions):
hamLoss = hamming_loss(test_labels, predictions)
print("------" + modelName + " Model Metrics-----")
accuracy = accuracy_score(test_labels, predictions)
macroPrecision = precision_score(test_labels, predictions, average='macro')
macroRecall = recall_score(test_labels, predictions, average='macro')
macroF1 = f1_score(test_labels, predictions, average='macro')
microPrecision = precision_score(test_labels, predictions, average='micro')
microRecall = recall_score(test_labels, predictions, average='micro')
microF1 = f1_score(test_labels, predictions, average='micro')
weightedF1 = f1_score(test_labels, predictions, average='weighted')
# print metrics
print("Hamming Loss: {:.4f}".format(hamLoss))
print('Accuracy: {0:.4f}'.format(accuracy))
print('Macro Precision: {0:.4f}'.format(macroPrecision))
print('Macro Recall: {0:.4f}'.format(macroRecall))
print('Macro F1-measure: {0:.4f}'.format(macroF1))
print('Micro Precision: {0:.4f}'.format(microPrecision))
print('Micro Recall: {0:.4f}'.format(microRecall))
print('Micro F1-measure: {0:.4f}\n'.format(microF1))
print('Weighted F1-measure: {0:.4f}\n'.format(weightedF1))
def fitAlgorithms(self):
algorithms = [{'name': 'LinearSVC', 'model': LinearSVC(max_iter=12000, dual=False),
'params': {'C': [0.1, 1, 10]}},
{'name': 'KNN', 'model': KNeighborsClassifier(),
'params': {'n_neighbors': [5, 10, 15]}},
{'name': 'RandomForest', 'model': RandomForestClassifier(),
'params': {'n_estimators': [100, 300, 500]}},
{'name': 'LogisticRegression', 'model': LogisticRegression(),
'params': {'C': [0.1, 1, 10]}},
{'name': 'DecisionTree', 'model': DecisionTreeClassifier(),
'params': {'max_depth': [5, 10, 15]}},
{'name': 'MultinomialNB', 'model': MultinomialNB(),
'params': {'alpha': [0.1, 1, 10]}}
]
for algorithm in algorithms:
model = algorithm['model']
name = algorithm['name']
params = algorithm['params']
# Fit the binary relevance and label powerset classifiers before the grid search
binary_relevance_classifier = BinaryRelevance(model)
binary_relevance_classifier.fit(self.train_combined_features, self.train_labels)
labelPowerSet_classifier = LabelPowerset(model)
labelPowerSet_classifier.fit(self.train_combined_features, self.train_labels)
print(f"Performing GridSearchCV for {name}...")
clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
clf.fit(self.train_combined_features, self.train_labels)
best_params = clf.best_params_
print(f"Best parameters for {name}: {best_params}")
model.set_params(**best_params)
binary_relevance_preds = binary_relevance_classifier.predict(self.test_combined_features)
self.metricsReport(f"Binary Relevance with {name}", self.test_labels, binary_relevance_preds)
labelPowerSet_preds = labelPowerSet_classifier.predict(self.test_combined_features)
self.metricsReport(f"Label Powerset with {name}", self.test_labels, labelPowerSet_preds)
self.ModelsPerformance[name] = clf.best_score_
return self.ModelsPerformance
# Create an instance of the Classifier
classifier = Classifier()
# Invoke the training method
classifier.train()
# Invoke the fitAlgorithms() method
classifier.fitAlgorithms()
but this basic problem is this error I referee it above
please any one can help me and if any one can optimize this ?
I believe that error is clear but I cant avoid this , also i tried the do this to sure the shape but it fine
print("train_text_bow shape:", train_text_bow.shape) print("train_labels shape:", train_labels.shape) train_text_bow shape: (28332, 121714) train_labels shape: (28332, 24)t
I need just to avoid this error
get MAE and RMSE
do have tried like this
movies = pd.read_csv('ml-20m/movies.csv')
ratings = pd.read_csv('ml-20m/ratings.csv')
df = pd.merge(movies, ratings, on='movieId', how='inner')
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'title', 'rating']], reader)
trainSet, testSet = train_test_split(data, test_size=.25, random_state=0)
algo = SVD(random_state=0)
algo.fit(trainSet)
predictions = algo.test(testSet)
def MAE(predictions):
return accuracy.mae(predictions, verbose=False)
def RMSE(predictions):
return accuracy.rmse(predictions, verbose=False)
print("RMSE: ", RMSE(predictions))
print("MAE: ", MAE(predictions))
and get error
"Singleton array array(<surprise.dataset.DatasetAutoFolds object at 0x000001BD67B62490>,
dtype=object) cannot be considered a valid collection."
I am trying to convert a hyperparameter tuning algorithm to a MultiOutput regression setup, can someone please help me create DMatrix for the same. Here is the code for reference:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
if useTrainCV:
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
alg.set_params(n_estimators=cvresult.shape[0])
#Fit the algorithm on the data
alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
#Predict training set:
dtrain_predictions = alg.predict(dtrain[predictors])
dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
#Print model report:
print("\nModel Report")
print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
Any further clarification required, please comment.TIA!
When trying to build my data set an error of "TypeError: 'set' object is not subscriptable" is received.
dataDir = '/content/drive/My Drive/Colab Notebooks/HW 3/' # Directory with input files
trainFile = 'q2train.csv' # Training examples
labelFile = 'q2label.csv' # Test label
validFile = 'q2valid.csv' # Valid Files
train = pd.read_csv(dataDir+trainFile)
valid = pd.read_csv(dataDir+validFile)
label = pd.read_csv(dataDir+labelFile)
data_sets = {
'train',
'label',
'valid'}
def get_data(data_set_name, test_prop=0.2, seed=2019):
"""returns data for training, testing, and data characteristics"""
data = data_sets[data_set_name]
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=test_prop,
random_state=seed)
nF = X.shape[1] # number of features
nC = len(np.unique(y)) # number of classes
nTrain, nTest = len(y_train), len(y_test)
print("\nData set: %s" %data_set_name)
print("\tNumber of features %d" %nF)
print("\tNumber of output classes = %d" %(nC))
print("\tNumber of training examples = %d" %(nTrain))
print("\tNumber of testing examples = %d" %(nTest))
return X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest
for name in data_set:
X_train, X_test, y_train, y_test, nF, nC, nTrain, nTest = get_data(name)
Any help would be appreciated, thanks in advance.
Use a dictionary:
train = pd.read_csv(dataDir+trainFile)
valid = pd.read_csv(dataDir+validFile)
label = pd.read_csv(dataDir+labelFile)
data_sets = {
'train': train,
'label': label,
'valid': valid
}
Then data_sets[data_set_name] will retrieve the dataset you want.
I'm trying to train models with different classifiers. and want to save each classifiers classification report in csv file to get score values of classes.But when am trying to do this following error came up.
I will really appreciate if some one can help me to resolve this issue, i can do testing individually with each classifiers and save to file. but it will increase my code that i really don't want.
raise ValueError("Mix type of y not allowed, got types %s" % ys_types)
ValueError: Mix type of y not allowed, got types {'multiclass', 'continuous-
multioutput'}
**this is the code am trying with:**
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,
GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd
,Phone,Base_ID,Cell_ID,ULTraffic,TCPSYNAtteDelay,date
1,13548599180,54,251,469,28,20160718034300.0
2,13548599180,1,46,1273,31,20160725023800.0
3,13548599180,54,251,1273,25,20160714012400.0
4,13548599180,54,251,1273,31,20160714085200.0
5,13548599180,54,251,1273,31,20160718034200.0
6,13548599180,54,251,1273,25,20160713082200.0
7,13548599180,54,251,2373,33,20160713091800.0
8,13548599180,54,251,2373,33,20160713091800.0
9,13548599180,54,251,639,31,20160714015300.0
10,13548599180,92,278,690,28,20160704010900.0
11,13548599180,92,278,693,28,20160704010900.0
12,13548599180,92,278,689,31,20160704010900.0
13,13548599180,92,278,1060,25,20160704010900.0
14,13548599180,54,251,1339,29,20160713091500.0
15,13548599180,54,251,1202,29,20160713091500.0
16,13548599180,54,251,975,26,20160713091500.0
17,13548599180,54,251,745,29,20160714015300.0
18,13548599180,54,251,707,27,20160714015300.0
url = 'path\stackflow_sample.csv'
names = ['Phone','Base_ID', 'Cell_ID', 'ULTraffic',
'TCPSYNAtteDelay','date']
data = pd.read_csv(url, names=names,low_memory=False)
a=data.loc[:,('Phone','Base_ID', 'Cell_ID', 'ULTraffic',
'TCPSYNAtteDelay','date')]
data_dia = a.Phone ; x_1 = a.drop('Phone',axis = 1 )
x_train, x_test, y_train, y_test = train_test_split(x_1, data_dia,
test_size=0.3, random_state=42)
classifiers = [
KNeighborsClassifier(3)]
DecisionTreeClassifier()
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis()]
# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)
for clf in classifiers:
clf.fit(x_train, y_train)
name = clf.__class__.__name__
print("=" * 30)
print(name)
print('****Results****')
train_predictions = clf.predict(x_test)
acc = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))
train_predictions = clf.predict_proba(x_test)
ll = log_loss(y_test, train_predictions)
print("Log Loss: {}".format(ll))
log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
log = log.append(log_entry)
report = classification_report(y_test, train_predictions)
print("log:",log)
print("=" * 30)
**This is the function am using to save classification_report results of
classifiers in other file func.py **
def classifaction_report_csv(report):
report_data = []
lines = report.split('\n')
for line in lines[2:-3]:
row = {}
row_data = line.split(' ')
row['class'] = row_data[0]
row['precision'] = float(row_data[1])
row['recall'] = float(row_data[2])
row['f1_score'] = float(row_data[3])
row['support'] = float(row_data[4])
report_data.append(row)
dataframe = pd.DataFrame.from_dict(report_data)
dataframe.to_csv('classification_report.csv', index = False)