Problem with plotting decision regions for classification model - python

I have a problem with plotting decision regions for a Logistic Regression classification model. Can somebody help me and explain how to do that? I have put the Colab link to this project here -> https://colab.research.google.com/drive/1JqFyoAk0zithy4esfjiyo6MdB12iBndi?usp=sharing
Dataset from Kaggle -> https://www.kaggle.com/datasets/muratkokludataset/date-fruit-datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
# Display configuration: fixed-point float formatting for NumPy and pandas
# output, a fixed NumPy seed, and seaborn's default plot theme.
np.set_printoptions(suppress=True, edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: f'{x:.8f}'))
np.random.seed(42)
sns.set()
desired_width = 320
pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 12)
# Load the Excel sheet and keep raw_data untouched by working on a copy.
raw_data = pd.read_excel(io='/content/Date_Fruit_Datasets.xlsx',
sheet_name='Date_Fruit_Datasets')
data = raw_data.copy()
# Quick inspection of the frame. Apart from data.info(), these bare
# expressions only show a result when evaluated interactively (e.g. in Colab).
data.head(n=10)
data.describe().transpose()
data.info()
data.shape
# Creating data and target
X = data.drop(columns='Class')
y = data['Class']
X.shape
y.shape
# Encoding target (string class labels -> integer codes)
encoder = LabelEncoder()
y = encoder.fit_transform(y=y)
# Creating train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Scaling data: the scaler is fitted on the training split only and then
# applied to the test split. From here on X_train / X_test hold the scaler's
# output, while X is still the original, unscaled DataFrame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)
# Creating classifier, fitting and predicting (on the scaled splits)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X=X_test)
y_pred_proba = classifier.predict_proba(X=X_test)
# Computing final reports and scores
score = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=encoder.classes_)
# NOTE(review): this rebinds the name confusion_matrix from the imported
# function to its result, so the function cannot be called again afterwards.
confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Compare y_true and y_pred in DataFrame
results = pd.DataFrame(data={
'y_true': y_test,
'y_pred': y_pred
})
# Creating Data Frame with predict proba
# NOTE(review): this repeats the predict_proba call whose result is already
# stored in y_pred_proba above.
predict_proba = pd.DataFrame(data=classifier.predict_proba(X=X_test), columns=encoder.classes_)
# Saving results to csv
results.to_csv(path_or_buf='/content/data_fruit_predictions.csv')
predict_proba.to_csv(path_or_buf='/content/data_fruit_predict_proba.csv')
# Plotting decision regions
# NOTE(review): the classifier was fitted on the StandardScaler output
# (X_train), but the call below passes the raw, unscaled X.values together
# with the full y. The model is therefore evaluated on a different scale than
# it was trained on, which is a likely cause of the "No contour levels were
# found within the data range" warning. Passing the scaled X_train / y_train
# is the first thing to try.
# NOTE(review): the filler dicts cover feature indices 1..33, which includes
# index 1. If mlxtend plots features (0, 1) by default, the fillers should
# presumably start at index 2 -- confirm against the mlxtend documentation.
# Also confirm that the hard-coded 34 matches the number of columns in X, and
# that value = 1.5 / width = 0.75 are sensible for the (scaled) feature range.
value = 1.5
width = 0.75
plt.figure(figsize=(10, 8))
plot_decision_regions(X=X.values, y=y, clf=classifier,
filler_feature_values={i: value for i in range(1, 34)},
filler_feature_ranges={i: width for i in range(1, 34)}, legend=2)
plt.show()
After calling plot_decision_regions, PyCharm shows me warnings like:
UserWarning: No contour levels were found within the data range.
ax.contour(xx, yy, Z, cset.levels,
and
UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
ax.scatter(x=x_data,

Related

Getting 'ValueError: Expected 2D array, got 1D array instead: array=[-0.27861589].' in Python when predicting with SVR regression

I am having trouble solving the array dimension problem shown in the code. When I try to compute y_predict, a ValueError is raised. Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#importing dataset
dataset = pd.read_csv('Position_Salaries.csv')
# X keeps a 2-D shape (a single column) because of the 1:2 slice; y is
# selected as one column and then reshaped into a (10, 1) column vector so
# that StandardScaler accepts it.
X = dataset.iloc[:,1:2].values
y = dataset.iloc[:,2].values
# NOTE(review): the hard-coded 10 ties this line to a 10-row file;
# np.reshape(y, (-1, 1)) would work for any length.
y=np.reshape(y,(10,1))
#Splitting dataset into training set and test set
# (disabled: the split is wrapped in a string literal, so it never runs)
'''from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)'''
#Feature scaling
# Separate scalers for X and y so predictions can be mapped back to the
# original y scale with sc_y.inverse_transform.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)
######## SVR regression
from sklearn.svm import SVR
svr_regressor = SVR(kernel='rbf') #rbf = gaussian kernel
# NOTE(review): y is still a (10, 1) column vector here; SVR.fit presumably
# expects a 1-D target, so y.ravel() would avoid a shape warning -- confirm.
svr_regressor.fit(X, y)
#Prediction of given value using SVR regression
# X should already be a single-column 2-D array at this point, so this
# reshape leaves it unchanged.
X = np.reshape(X,(-1, 1))
# NOTE(review): this is the line that raises the reported ValueError.
# predict() yields a 1-D array (the message shows array=[-0.27861589]), while
# sc_y.inverse_transform expects 2-D input. Reshaping the prediction first,
# i.e. svr_regressor.predict(...).reshape(-1, 1), is the fix the error
# message itself suggests for single-feature data.
y_predict = sc_y.inverse_transform(svr_regressor.predict(sc_X.transform([[6.5]])))
########### Visualization of svr model
# Both axes are in scaled units here, since X and y were overwritten with
# their standardized versions above.
plt.scatter(X, y, color = 'blue')
plt.plot(X, svr_regressor.predict(X), color = 'red')
plt.show()
I am getting error:
ValueError: Expected 2D array, got 1D array instead:
array=[-0.27861589].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

sklearn.plot_tree how to visualize class_labels for classification task?

I have recently done my trial code on Decision Tree. It is working perfectly fine except for one thing. The tree being plotted does not contain class names in it. Do I do something wrong?
Please see code below and a picture of the data set.
#Import Data#
import pandas as pd
data_set = pd.read_excel(r"C:\Users\User\Desktop\Tree.xlsx")
print(data_set.head())
#Set Features and Training Targets#
# Two input columns and three target columns: the tree below is therefore
# fitted against three target columns at once.
features_names=["Money","Debt"]
target_names=["Mood1", "Mood2", "Mood3"]
features = data_set[features_names]
targets = data_set[target_names]
print(features)
print(targets)
#Set Training Set and Test Set#
# Positional split: the first 10 rows are used for training, the rest for testing.
train_features = features[:10]
train_targets = targets[:10]
test_features = features[10:]
test_targets = targets[10:]
print (train_features)
print (train_targets)
print(test_features)
print(test_targets)
#Estimating Tree#
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth = 3)
dt = dt.fit(train_features, train_targets)
# score() is printed for the training rows first, then for the held-out rows.
print(dt.score(train_features, train_targets))
print(dt.score(test_features, test_targets))
#Plotting the Tree#
from sklearn import tree
import matplotlib.pyplot as plt
# NOTE(review): dt is a DecisionTreeRegressor, and scikit-learn documents
# class_names as "Only relevant for classification" -- which is why no class
# names appear in the plot. Showing class names requires a
# DecisionTreeClassifier fitted on a single categorical target column.
tree.plot_tree(dt, feature_names=features_names, class_names=target_names, filled = True)
plt.show()
In regression tasks, visualizing class labels will not work; the documentation states that the class_names parameter is "Only relevant for classification".
In this case, your target variable Mood could be made categorical, with its values represented in a single column. Once this is done, you can set
tree.plot_tree(clf, class_names=True)
for symbolic representation of class names
or
class_names = ['setosa', 'versicolor', 'virginica']
tree.plot_tree(clf, class_names=class_names)
for the specific class names.
Full Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# Load iris and hold out a test split (fixed seed for reproducibility).
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# A deliberately small tree (at most 3 leaves) keeps the plots readable.
clf = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
clf.fit(X_train, y_train)
# Each plot gets its own figure: plot_tree draws on the current axes, so two
# back-to-back calls on the same figure would leave only the last tree visible.
# Symbolic class name representation
plt.figure()
tree.plot_tree(clf, class_names=True)
# Specific class name representation
# (converted to a plain list of str, the documented type for class_names)
class_names = list(iris['target_names'])
plt.figure()
tree.plot_tree(clf, class_names=class_names)
# Needed to actually display the figures when run as a script.
plt.show()

Calculating AUC for LogisticRegression model

Let's take data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Load the breast-cancer dataset and split it into the feature matrix X and
# the target vector y.
data = load_breast_cancer()
X = data.data
y = data.target
I want to create a model using only the first principal component and calculate the AUC for it.
My work so far
# Standardize the features before PCA.
scaler = StandardScaler()
# NOTE(review): X_train is never defined in this snippet -- only X and y
# exist -- so this line raises NameError as written; scaler.fit(X) is
# presumably what was meant.
scaler.fit(X_train)
X_scaled = scaler.transform(X)
# Keep only the first principal component.
# NOTE(review): PCA is fitted on the raw X, so X_scaled above is never used;
# pca.fit_transform(X_scaled) is presumably what was intended.
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1'])
# NOTE(review): LogisticRegression is not among the imports shown for this
# snippet (it lives in sklearn.linear_model).
clf = LogisticRegression()
clf = clf.fit(principalDf, y)
# predict_proba returns one column per class -- shape (569, 2) here, according
# to the error message quoted below -- so a single column (e.g. [:, 1]) has to
# be selected before the scores are passed to roc_curve.
pred = clf.predict_proba(principalDf)
But while I'm trying to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
Following error occurs :
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
But it didn't solve the issue (it outputs) :
multilabel-indicator format is not supported
Do you have any idea how can I perform AUC on this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Load the data and hold out a test split so the AUC is measured on unseen data.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Scale -> project onto the first principal component -> logistic regression,
# chained in a Pipeline so every step is fitted on the training split only.
# n_components=1 because the question asks for the first component alone.
scaler = StandardScaler()
pca = PCA(n_components=1)
clf = LogisticRegression()
ppl = Pipeline([("scaler", scaler), ("pca", pca), ("clf", clf)])
ppl.fit(X_train, y_train)
# ROC/AUC need continuous scores, not hard 0/1 labels: take the predicted
# probability of the positive class (label 1, i.e. the second column).
preds = ppl.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)
# Area under the ROC curve computed from the points above.
print("AUC:", metrics.auc(fpr, tpr))
# metrics.plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is its replacement.
metrics.RocCurveDisplay.from_estimator(ppl, X_test, y_test)
The problem is that predict_proba returns a column for each class. Generally with binary classification, your classes are 0 and 1, so you want the probability of the second class, so it's quite common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]

Dataset indices for predicted values is not matching with those for actual values

I am a Python novice trying to solve a regression problem with neural networks. I am at the stage where I want to plot the predicted vs. actual values and then determine the regression coefficient.
Model training
#import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline
#importing the dataset
data = pd.read_csv("PPV_dataset.csv")
# Features are every column except "PPV"; the target is the "PPV" column.
# NOTE(review): the axis is passed positionally (the bare 1); newer pandas
# versions presumably require axis=1 or columns=["PPV"] as a keyword -- confirm.
X = np.array(data.drop(["PPV"],1))
y = np.array(data["PPV"])
#model training & prediction
nn = MLPRegressor(hidden_layer_sizes=(100,), activation = 'logistic', solver = 'sgd')
# No random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
nn.fit(X_train, y_train)
pred = nn.predict(X_test)
#indices of test set
# NOTE(review): the indentation of the loop bodies below was lost in this
# paste; the loops are presumably nested (row loop -> i loop -> if -> append).
# NOTE(review): rows are matched by value, so a test row is matched by every
# row of X that has identical feature values. Duplicate rows in X therefore
# add extra entries to indices, which is consistent with the reported
# 13 actual values versus 12 predictions.
# NOTE(review): the lookup is not needed at all -- y_test returned by
# train_test_split above already holds the actual values, in the same order
# as X_test and therefore as pred.
a = X_test
indices = []
for row in range(len(X)):
for i in range(len(a)):
if np.all(a[i]==X[row]):
indices.append(row)
#listing actual values in an array
actual_values = []
for i in range(len(indices)):
actual_values.append(y[indices[i]])
Comparing actual to predicted values
len(actual_values)
13
len(pred)
12
Image of dataset
You should use the matplotlib and seaborn libraries for plotting your graph,
and for the coefficient of determination use r_sq = nn.score(X_test, y_test)
I recommend using seaborn.lmplot() in your case.
For Robert's particular case, I suggest:
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

added Standardscaler but receive errors in Cross Validation and the correlation matrix

This is the code I built to apply a multiple linear regression. I added a standard scaler to fix the Y-intercept p-value, which was not significant, but now the CV RMSE results at the end have changed and no longer make sense, and I receive an error in the code for plotting the correlation matrix: AttributeError: 'numpy.ndarray' object has no attribute 'corr'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from scipy.stats.stats import pearsonr
# Import Excel File
data = pd.read_excel("C:\\Users\\AchourAh\\Desktop\\Multiple_Linear_Regression\\SP Level Reasons Excels\\SP000273701_PL14_IPC_03_09_2018_Reasons.xlsx",'Sheet1') #Import Excel file
# Replace null values of the whole dataset with 0
data1 = data.fillna(0)
print(data1)
# Extraction of the independent and dependent variables
# X: columns at positions 1..7, Y: column at position 9 (all rows).
X = data1.iloc[0:len(data1),[1,2,3,4,5,6,7]] #Extract the column of the COPCOR SP we are going to check its impact
Y = data1.iloc[0:len(data1),9] #Extract the column of the PAUS SP
# Data Splitting to train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.25,random_state=1)
# Standardize the features: fitted on the training split, applied to both.
# NOTE(review): per the answer below, StandardScaler returns NumPy arrays, so
# X_train / X_test are no longer DataFrames after these lines.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Statistical Analysis of the training set with Statsmodels
# NOTE(review): this rebinds X -- previously the 7-column feature frame -- to
# the constant-augmented, scaled training matrix. Later code that indexes X
# (the cross-validation loop) therefore works on this matrix.
X = sm.add_constant(X_train) # add a constant to the model
est = sm.OLS(Y_train, X).fit()
print(est.summary()) # print the results
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm = LinearRegression() # create an lm object of LinearRegression Class
lm.fit(X_train,Y_train) # train our LinearRegression model using the training set of data - dependent and independent variables as parameters. Teaching lm that Y_train values are all corresponding to X_train.
print(lm.intercept_)
print(lm.coef_)
# Test-set RMSE of the full model.
mse_test = mean_squared_error(Y_test, lm.predict(X_test))
print(math.sqrt(mse_test))
# Data Splitting to train and test set of the reduced data
X_1 = data1.iloc[0:len(data1),[1,2]] #Extract the column of the COPCOR SP we are going to check its impact
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_1, Y, test_size =0.25,random_state=1)
# The same scaler object is re-fitted here on the reduced training split;
# X_train2 / X_test2 become NumPy arrays as well.
X_train2 = ss.fit_transform(X_train2)
X_test2 = ss.transform(X_test2)
# Statistical Analysis of the reduced model with Statsmodels
X_reduced = sm.add_constant(X_train2) # add a constant to the model
est_reduced = sm.OLS(Y_train2, X_reduced).fit()
print(est_reduced.summary()) # print the results
# Fitting a Linear Model for the reduced model with Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm1 = LinearRegression() #create an lm object of LinearRegression Class
lm1.fit(X_train2, Y_train2)
print(lm1.intercept_)
print(lm1.coef_)
# Test-set RMSE of the reduced model.
mse_test1 = mean_squared_error(Y_test2, lm1.predict(X_test2))
print(math.sqrt(mse_test1))
#Cross Validation and Training again the model
from sklearn.model_selection import KFold
from sklearn import model_selection
# NOTE(review): random_state is given without shuffle=True; recent
# scikit-learn versions presumably reject this combination -- confirm.
kf = KFold(n_splits=6, random_state=1)
# NOTE(review): the loop body's indentation was lost in this paste, so which
# of the following lines belong to the loop cannot be determined from here.
# NOTE(review): the fold indices come from X_train2, but they are applied to
# X (the constant-augmented full-model matrix) and to Y (the target of the
# whole dataset, a pandas Series indexed by label) rather than to
# X_train2 / Y_train2. This mismatch is a likely reason the CV RMSE values
# look wrong -- verify.
for train_index, test_index in kf.split(X_train2):
print("Train:", train_index, "Validation:",test_index)
X_train1, X_test1 = X[train_index], X[test_index]
Y_train1, Y_test1 = Y[train_index], Y[test_index]
# cross_val_score returns negated MSE for this scorer; the -1 factor turns the
# values back into positive MSE before the square root is taken.
results = -1 * model_selection.cross_val_score(lm1, X_train1, Y_train1,scoring='neg_mean_squared_error', cv=kf)
print(np.sqrt(results))
#RMSE values interpretation
print(math.sqrt(mse_test1))
print(math.sqrt(results.mean()))
#Good model built no overfitting or underfitting (Barely Same for test and training :Goal of Cross validation but low prediction accuracy = Value is big
import seaborn
# NOTE(review): this is the line behind the reported AttributeError --
# X_train2 is a NumPy array after scaling and has no .corr method; wrapping
# it as pd.DataFrame(X_train2) restores it.
Corr=X_train2.corr(method='pearson')
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask=np.zeros_like(Corr)
mask[np.triu_indices_from(mask)]=True
seaborn.heatmap(Corr,cmap='RdYlGn_r',vmax=1.0,vmin=-1.0,mask=mask, linewidths=2.5)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
enter code here
Do you have an idea how to fix the issue ?
I'm guessing the problem lies with:
Corr=X_train2.corr(method='pearson')
.corr is a pandas dataframe method but X_train2 is a numpy array at that stage. If a dataframe/series is passed into StandardScaler, a numpy array is returned. Try replacing the above with:
Corr=pd.DataFrame(X_train2).corr(method='pearson')
or make use of numpy.corrcoef or numpy.correlate in their respective forms.

Categories

Resources