I am using an SVM, and the execution of my program gets stuck in model.fit(X_test, y_test), the call that fits the SVM model. How can I fix that? Here is my code:
# Make Predictions with Naive Bayes On The Iris Dataset
import collections
from csv import reader
from math import sqrt, exp, pi
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals.six import StringIO
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Function to split the dataset
def splitdataset(balance_data, column_count):
    """Separate the target from the features and make a 70/30 split.

    Column 0 of ``balance_data`` is used as the target and columns
    ``1`` up to (but not including) ``column_count`` as the features.

    Returns the tuple ``(X, Y, X_train, X_test, y_train, y_test)``.
    """
    raw_values = balance_data.values
    features = raw_values[:, 1:column_count]
    target = raw_values[:, 0]

    # A fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=100
    )
    return features, target, X_train, X_test, y_train, y_test
def importdata():
    """Load ``dataExtended.txt`` and integer-encode its categorical columns.

    The file is read as comma-separated text from the current working
    directory. A short summary (length, shape, column count, first rows)
    is printed, then the ``gold`` and ``Program`` columns are replaced by
    their pandas category codes.

    Returns:
        A ``(balance_data, column_count)`` tuple: the encoded DataFrame and
        its number of columns.
    """
    balance_data = pd.read_csv('dataExtended.txt', sep=',')
    # Only the column count is needed by callers; the row count is not used.
    column_count = balance_data.shape[1]

    # Print the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Number of columns ", column_count)

    # Print the first observations (before encoding)
    print("Dataset: ", balance_data.head())

    # Replace each categorical column by its integer category codes.
    for column in ('gold', 'Program'):
        balance_data[column] = balance_data[column].astype('category').cat.codes
    return balance_data, column_count
# Driver code
def main():
    """Fit a linear SVM on the imported data and print evaluation reports.

    After the standard report on the raw predictions, a second set of
    metrics is printed for the "confident" samples only: a sample is kept
    when exactly one of its first two class probabilities reaches the 0.7
    threshold, and it is dropped otherwise.
    """
    print("hey")

    # Building phase
    data, column_count = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data, column_count)

    # Linear-kernel SVM; probability=True is needed for predict_proba below.
    model = svm.SVC(kernel='linear', probability=True)
    print('before fitting')
    # NOTE(review): the model is fitted on the test split and then evaluated
    # on that same split; X_train / y_train are never used.
    model.fit(X_test, y_test)
    print('fitting over')

    predicted = model.predict(X_test)
    print('prediction over')
    print(metrics.classification_report(y_test, predicted))
    print('classification over')
    print(metrics.confusion_matrix(y_test, predicted))

    probs = model.predict_proba(X_test)
    threshold = 0.7

    # Keep only the samples where one class clears the threshold and the
    # other does not; everything else is discarded from both lists.
    y_test_list = []
    y_pred_list = []
    for truth, class_probs in zip(y_test, probs):
        first_confident = class_probs[0] >= threshold
        second_confident = class_probs[1] >= threshold
        if first_confident and not second_confident:
            label = 0
        elif second_confident and not first_confident:
            label = 1
        else:
            continue
        y_test_list.append(truth)
        y_pred_list.append(label)

    print('confusion matrix\n', confusion_matrix(y_test_list, y_pred_list))
    print('classification report\n', classification_report(y_test_list, y_pred_list))
    print('accuracy score', accuracy_score(y_test_list, y_pred_list))

    mse = metrics.mean_squared_error(y_test_list, y_pred_list)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_list, y_pred_list))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
if __name__=="__main__":
main()
This is most likely due to the parameter probability set to True when you initialize the model. As you can read in the docs:
probability: bool, default=False
Whether to enable probability
estimates. This must be enabled prior to calling fit, will slow down
that method as it internally uses 5-fold cross-validation, and
predict_proba may be inconsistent with predict.
This issue has been discussed on StackOverflow here and here.
Related
I am a total beginner and I am trying to compare different methods of handling missing data. To evaluate the effect of each method (dropping rows with missing values, dropping columns with more than 40% missingness, imputing with the mean, imputing with KNN), I compare the LDA accuracy and the logistic-regression accuracy on the training set for a dataset with 10% missing values and a dataset with 20% missing values against the results for the original complete dataset. Unfortunately, I get pretty much the same results, even between the complete dataset and the dataset with 20% missingness. I don't know what I am doing wrong.
from numpy import nan
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Load the churn dataset with injected missing values, then evaluate LDA
# (3-fold cross-validation) and logistic regression (80/20 hold-out) on it.
#dataset = read_csv('telecom_churn_rev10.csv')
dataset = read_csv('telecom_churn_rev20.csv')
# Every NaN is replaced by 0 here, so no missing values remain for the
# fillna / SimpleImputer steps further down.
dataset = dataset.replace(nan, 0)
values = dataset.values
# Columns 1..10 are used as features and column 0 as the target.
X = values[:,1:11]
y = values[:,0]
# X and y were extracted above, before this fill is applied to the DataFrame.
dataset.fillna(dataset.mean(), inplace=True)
#dataset.fillna(dataset.mode(), inplace=True)
print(dataset.isnull().sum())
imputer = SimpleImputer(missing_values = nan, strategy = 'mean')
# The imputed array is only used for the "Missing" count printed below.
transformed_values = imputer.fit_transform(X)
print('Missing: %d' % isnan(transformed_values).sum())
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
# The cross-validation is run on X, not on transformed_values.
result = cross_val_score(model, X, y, cv = cv, scoring = 'accuracy')
print('Accuracy: %.3f' % result.mean())
#print('Accuracy: %.3f' % result.mode())
print(dataset.describe())
print(dataset.head(20))
# Hold-out evaluation with logistic regression on standardized features.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# The scaler is fitted on the training split only and reused for the test split.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# The returned accuracy is neither stored nor printed.
accuracy_score(y_test,y_pred)
from sklearn import metrics
# Make predictions on the full X.
# NOTE(review): X is unscaled here, while the classifier was fitted on the
# StandardScaler-transformed X_train.
expected = y
predicted = classifier.predict(X)
# Summarize the fit of the model on the full data.
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# Make predictions on the (scaled) test split.
expected = y_test
predicted = classifier.predict(X_test)
# Summarize the fit of the model on the test split.
print(metrics.confusion_matrix(expected, predicted))
print(metrics.classification_report(expected, predicted))
You replace all your missing values with 0 on this line: dataset = dataset.replace(nan, 0). After this line, you have a full dataset with no missing values, so the .fillna() and SimpleImputer() calls that follow have nothing left to impute.
Ideally I should get the same result from both calls, since score is nothing but R-squared, but I am not sure why the results come out different.
# Fit a linear regression on the California housing data and print R-squared
# on the test split in two ways: estimator.score and metrics.r2_score.
from sklearn.datasets import california_housing
data = california_housing.fetch_california_housing()
# Bare expressions: they only display something in an interactive session.
data.data.shape
data.feature_names
data.target_names
import pandas as pd
house_data = pd.DataFrame(data.data, columns=data.feature_names)
house_data.describe()
house_data['Price'] = data.target
# The first eight columns are the features; the last column ('Price') is the target.
X = house_data.iloc[:, 0:8].values
y = house_data.iloc[:, -1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)
# Fitting a linear regression to the Training set
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
# Check R-squared on the test data
from sklearn.metrics import mean_squared_error, r2_score
y_pred = linear_model.predict(X_test)
print(linear_model.score(X_test, y_test))
# NOTE(review): r2_score's signature is (y_true, y_pred); here the predictions
# are passed first, which is why the two printed values differ.
print(r2_score(y_pred, y_test))
Output
0.5957643114594776
0.34460597952465033
from the docs: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html
sklearn.metrics.r2_score(y_true, y_pred,...)
You are passing y_true and y_pred the wrong way around. If you switch them you get the correct result.
print(linear_model.score(X_test, y_test))
print(r2_score(y_test, y_pred))
0.5957643114594777
0.5957643114594777
Let's take data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
data = load_breast_cancer()
X = data.data
y = data.target
I want to create model using only first principal component and calculate AUC for it.
My work so far
# Standardize, project onto one principal component, fit a logistic
# regression on that component and get class probabilities.
scaler = StandardScaler()
# NOTE(review): X_train is not defined anywhere in the snippet shown — confirm
# whether X was intended.
scaler.fit(X_train)
X_scaled = scaler.transform(X)
pca = PCA(n_components=1)
# NOTE(review): the PCA is fitted on X, not on X_scaled; X_scaled is not used
# again in this snippet.
principalComponents = pca.fit_transform(X)
# Wrap the single component in a one-column DataFrame.
principalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1'])
clf = LogisticRegression()
clf = clf.fit(principalDf, y)
# predict_proba returns one column per class, i.e. a 2-D array.
pred = clf.predict_proba(principalDf)
But while I'm trying to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
Following error occurs :
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
But it didn't solve the issue (it outputs) :
multilabel-indicator format is not supported
Do you have any idea how can I perform AUC on this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Scale -> keep only the first principal component -> logistic regression.
# Wrapping the steps in a Pipeline fits the scaler and the PCA on the
# training split only.
scaler = StandardScaler()
pca = PCA(n_components=1)
clf = LogisticRegression()
ppl = Pipeline([("scaler", scaler), ("pca", pca), ("clf", clf)])
ppl.fit(X_train, y_train)

# A ROC curve needs continuous scores, not hard 0/1 labels: take the
# predicted probability of the positive class (column 1).
preds = ppl.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)
print("AUC:", metrics.roc_auc_score(y_test, preds))

# metrics.plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator (scikit-learn >= 1.0) is its replacement.
metrics.RocCurveDisplay.from_estimator(ppl, X_test, y_test)
The problem is that predict_proba returns a column for each class. Generally with binary classification, your classes are 0 and 1, so you want the probability of the second class, so it's quite common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]
I have no idea why I'm receiving this error. I am trying to implement XGBoost, but it fails with the error "ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric." even after I've one-hot encoded my categorical data. If anyone knows what is causing this and a possible solution, I'd greatly appreciate it. Here is my code, written in Python:
# Artificial Neural Networks - With XGBoost
# PRE PROCESS
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding Categorical Data
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode feature columns 1 and 2 and pass the other columns through.
# sparse_threshold=0 forces a dense result, so the transformer never tries to
# stack the object-dtype passthrough columns into a sparse matrix (the step
# that raises "For a sparse output, all columns should be a numeric ...").
# NOTE(review): every passthrough column must still be numeric for the float
# conversion below — confirm against the CSV's column layout.
ct = ColumnTransformer([('encoder', OneHotEncoder(), [1, 2])],
                       remainder='passthrough',
                       sparse_threshold=0)
# The builtin float replaces the np.float alias, which was removed in NumPy 1.24.
X = np.array(ct.fit_transform(X), dtype=float)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fitting XGBoost to the training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train)

# Predicting the Test set Results
y_pred = classifier.predict(x_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
# The training features are named x_train (lower case) above; the original
# X_train was an undefined name.
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=10)
accuracies.mean()
accuracies.std()
I'm unable to match LGBM's cv score by hand.
Here's a MCVE:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unshuffled KFold is already deterministic. Passing random_state without
# shuffle=True has no effect on the splits and raises ValueError on current
# scikit-learn, so it is left out.
folds = KFold(5)
params = {'random_state': 42}

# Cross-validated AUC as computed by LightGBM itself.
# NOTE(review): the early_stopping_rounds keyword was removed in LightGBM 4.0
# in favour of callbacks — confirm against the installed version.
results = lgb.cv(params, lgb.Dataset(X_train, y_train), folds=folds,
                 num_boost_round=1000, early_stopping_rounds=100,
                 metrics=['auc'])
print('LGBM\'s cv score: ', results['auc-mean'][-1])

# The same folds evaluated by hand, using the number of rounds kept by lgb.cv.
clf = lgb.LGBMClassifier(**params, n_estimators=len(results['auc-mean']))
val_scores = []
for train_idx, val_idx in folds.split(X_train):
    clf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    val_scores.append(roc_auc_score(y_train.iloc[val_idx], clf.predict_proba(X_train.iloc[val_idx])[:,1]))
print('Manual score: ', np.mean(np.array(val_scores)))
I was expecting the two CV scores to be identical - I have set random seeds, and done exactly the same thing. Yet they differ.
Here's the output I get:
LGBM's cv score: 0.9851513530737058
Manual score: 0.9903622177441328
Why? Am I not using LGMB's cv module correctly?
You are splitting X into X_train and X_test.
For cv you split X_train into 5 folds while manually you split X into 5 folds. i.e you use more points manually than with cv.
change results = lgb.cv(params, lgb.Dataset(X_train, y_train) to results = lgb.cv(params, lgb.Dataset(X, y)
Furthermore, the parameters themselves can differ. For example, the number of threads used by LightGBM changes the result. During cv the models are fitted in parallel, so the number of threads used might differ from your manual, sequential training.
EDIT after 1st correction:
You can achieve the same results using manual splitting / cv using this code:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Unshuffled KFold is already deterministic. Passing random_state without
# shuffle=True has no effect on the splits and raises ValueError on current
# scikit-learn, so it is left out.
folds = KFold(5)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}

# Cross-validated AUC as computed by LightGBM itself.
# NOTE(review): the early_stopping_rounds / verbose_eval keywords were removed
# in LightGBM 4.0 in favour of callbacks — confirm against the installed version.
data_all = lgb.Dataset(X_train, y_train)
results = lgb.cv(params, data_all,
                 folds=folds.split(X_train),
                 num_boost_round=1000,
                 early_stopping_rounds=100)
print('LGBM\'s cv score: ', results['auc-mean'][-1])

# The same folds trained by hand with the number of rounds kept by lgb.cv.
val_scores = []
for train_idx, val_idx in folds.split(X_train):
    # reference=data_all makes each fold's Dataset reuse the feature binning
    # built on the whole of X_train, as lgb.cv does.
    data_trd = lgb.Dataset(X_train.iloc[train_idx],
                           y_train.iloc[train_idx],
                           reference=data_all)
    gbm = lgb.train(params,
                    data_trd,
                    num_boost_round=len(results['auc-mean']),
                    verbose_eval=100)
    val_scores.append(roc_auc_score(y_train.iloc[val_idx], gbm.predict(X_train.iloc[val_idx])))
print('Manual score: ', np.mean(np.array(val_scores)))
yields
LGBM's cv score: 0.9914524426410262
Manual score: 0.9914524426410262
What makes the difference is the line reference=data_all. During cv, the binning of the variables (see the LightGBM docs) is constructed using the whole dataset (X_train), while in your manual for loop it was built on the training subset (X_train.iloc[train_idx]). By passing a reference to the dataset containing all the data, LightGBM will reuse the same binning, giving the same results.