Input data cannot be a list XGBoost - python

Here is my code.
import pandas as pd
import numpy as np
import json
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
training_data = pd.read_csv('/Users/aus10/Desktop/MLB_Data/Test_Training_Data/MLB_Training_Data.csv')
df_model = training_data.copy()
scaler = StandardScaler()
features = [['OBS', 'Runs']]
for feature in features:
df_model[feature] = scaler.fit_transform(df_model[feature])
test_data = pd.read_csv('/Users/aus10/Desktop/MLB_Data/Test_Training_Data/Test_Data.csv')
X = training_data.iloc[:,1] #independent columns
y = training_data.iloc[:,-1] #target column
X = X.values.reshape(-1,1)
results = []
# fit final model
model = XGBRegressor(objective="reg:squarederror", random_state=42)
model.fit(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (
round(mean_squared_error(y_train, y_train_pred),2),
round(mean_squared_error(y_test, y_test_pred),2)
))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
# define one new data instance
index = 0
count = 0
while count < len(test_data):
team = test_data.loc[index].at['Team']
OBS = test_data.loc[index].at['OBS']
Xnew = [[ OBS ]]
# make a prediction
ynew = model.predict(Xnew)
# show the inputs and predicted outputs
results.append(
{
'Team': team,
'Runs': (round(ynew[0],2))
})
index += 1
count += 1
sorted_results = sorted(results, key=lambda k: k['Runs'], reverse=True)
df = pd.DataFrame(sorted_results, columns=[
'Team', 'Runs'])
writer = pd.ExcelWriter('/Users/aus10/Desktop/MLB_Data/ML/Results/Projected_Runs_XGBoost.xlsx', engine='xlsxwriter') # pylint: disable=abstract-class-instantiated
df.to_excel(writer, sheet_name='Sheet1', index=False)
df.style.set_properties(**{'text-align': 'center'})
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
writer.save()
and the error I'm getting is TypeError: Input data can not be a list.
The data in test_data comes from a CSV file with a team name and an OBS value (a float),
for example: NYY 0.324
Every way to solve it I've seen is just to put it in a 2d array like I did - Xnew = [[ OBS ]],
but I'm still getting the error.
Is there something else I need to do to the test_data coming in? I tried using values.reshape, but that didn't fix it either.

model.predict does not accept a plain Python list here (hence the TypeError), so convert Xnew to a 2-D NumPy array first:
Xnew = np.array(Xnew).reshape((1,-1))

Related

Undersampling numpy array

I have a train set with 10192 samples of '0' and 2512 samples of '1'.
I've applied a PCA on the set to reduce the dimensionality.
I want to undersample this numpy array.
Here's my code :
df = read_csv("train.csv")
X = df.drop(['label'], axis = 1)
y = df['label']
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = PCA(n_components = 19)
model.fit(X_train)
X_train_pca = model.transform(X_train)
X_validation_pca = model.transform(X_validation)
X_train = np.array(X_train_pca)
X_validation = np.array(X_validation_pca)
y_train = np.array(y_train)
y_validation = np.array(y_validation)
How can I undersample '0' class from X_train numpy array?
Try this after loading the CSV into df:
# Split the frame by class first, so that each count is tied to its label
# value instead of to the frequency order in which value_counts() returns them.
df_class_0 = df[df['label'] == 0]
df_class_1 = df[df['label'] == 1]
# class count
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)
# Draw (without replacement) as many class-0 rows as there are class-1 rows.
# Pass random_state=<int> to sample() if the draw has to be reproducible.
df_class_0_under = df_class_0.sample(count_class_1)
# Balanced frame: the sampled class-0 rows followed by all class-1 rows.
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)
Then perform all calculations on df_test_under data frame.
Alternatively use RandomUnderSampler:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

When switching from Scalar to PolynomialFeatures get an error

I am trying to switch from using a scaler to transform my data to using PolynomialFeatures (quadratic.fit_transform).
here is my code
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
training_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Training_data.csv")
test_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Test_Data.csv")
X = training_data.iloc[:,1:4] #independent columns
y = training_data.iloc[:,-1] #target column
model = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)
model.fit(X_quad,y)
results = []
index = 0
count = 0
while count < len(test_data):
name = test_data.loc[index].at['Player_Name']
Scrambling = test_data.loc[index].at['Scrambling']
Total_Putts_GIR = test_data.loc[index].at['Total_Putts_GIR']
SG_Putting = test_data.loc[index].at['SG_Putting']
Xnew = [[ Scrambling, Total_Putts_GIR, SG_Putting ]]
# make a prediction
ynew = model.predict(Xnew)
# show the inputs and predicted outputs
results.append(
{
'Name': name,
'Projection': (round(ynew[0],2))
}
)
index += 1
count += 1
sorted_results = sorted(results, key=lambda k: k['Projection'], reverse=True)
df = pd.DataFrame(sorted_results, columns=[
'Name', 'Projection'])
writer = pd.ExcelWriter('/Users/aus10/Desktop/PGA/Regressions/Linear_Regressions/Results/Projections_LR_LL.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', index=False)
df.style.set_properties(**{'text-align': 'center'})
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
writer.save()
However, when I run this I get an error saying
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 10 is different from 3)
is there another step I need to add? Not sure why it's changing the size of my input data.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
training_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Training_data.csv")
test_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Test_Data.csv")

X = training_data.iloc[:, 1:4]  # independent columns
y = training_data.iloc[:, -1]  # target column

# Chaining PolynomialFeatures and LinearRegression in one pipeline means every
# predict() call expands its input to the same polynomial terms the regression
# was fitted on, so raw 3-column input can be passed directly.
degree = 2
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Split first and fit on the training part only, so the "test" metrics below
# are computed on rows the model has not seen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
y_pred = y_test_pred

print('MSE train: %.3f, test: %.3f' % (
    round(mean_squared_error(y_train, y_train_pred), 2),
    round(mean_squared_error(y_test, y_test_pred), 2)
))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

# Predict every player in a single call. A plain array (no column names) is
# passed, mirroring the nested-list input that was used row by row before.
feature_columns = ['Scrambling', 'Total_Putts_GIR', 'SG_Putting']
projections = model.predict(test_data[feature_columns].to_numpy())
results = [
    {'Name': name, 'Projection': round(projection, 2)}
    for name, projection in zip(test_data['Player_Name'], projections)
]

# Highest projection first.
sorted_results = sorted(results, key=lambda k: k['Projection'], reverse=True)
df = pd.DataFrame(sorted_results, columns=['Name', 'Projection'])

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)

# ExcelWriter.save() no longer exists in pandas 2.x; the context manager writes
# and closes the workbook. Exporting through the Styler is what actually
# carries the centred alignment into the sheet (a bare df.style call whose
# result is thrown away has no effect).
with pd.ExcelWriter('/Users/aus10/Desktop/PGA/Regressions/Linear_Regressions/Results/Projections_LR_LR.xlsx', engine='xlsxwriter') as writer:
    df.style.set_properties(**{'text-align': 'center'}).to_excel(writer, sheet_name='Sheet1', index=False)

Saving and loading neupy algorithm with dill library can return different predictions for the same time period?

First of all thank you for reading this, and thank you in advance if you can help.
This is the algorithm that I'm using for supervised learning:
# Define neural network
cgnet = algorithms.LevenbergMarquardt(
connection=[
layers.Input(XTrain.shape[1]),
layers.Relu(6),
layers.Linear(1)
],
mu_update_factor=2,
mu=0.1,
shuffle_data=True,
verbose=True,
decay_rate=0.1,
addons=[algorithms.WeightElimination]
)
Cross validation results are good (k=10):
[0.16767815652364237, 0.13396493112368024, 0.19033966833586402, 0.12023567250054788, 0.11826824035439124, 0.13115856672872392, 0.14250003819441104, 0.12729442202775898, 0.31073760721487326, 0.19299511349686768]
[0.9395976956178138, 0.9727526340820827, 0.9410503161549465, 0.9740922179654977, 0.9764171089773663, 0.9707258917808179, 0.9688830174583372, 0.973160633351555, 0.8551738446276884, 0.936661707991699]
MEA: 0.16 (+/- 0.11)
R2: 0.95 (+/- 0.07)
After training I have saved the algorithm with dill:
with open('network-storage.dill', 'wb') as f:
dill.dump(cgnet, f)
Then, if I load the network with dill and predict with the X values of the entire training set, I get the same R2 (0.9691), so up to this point everything is OK. These are the results:
If I try to do the same thing with only the last few years [2018-2022], I get this (prediction of y with X training values from 2018 to 2022):
Instead of this (prediction of y with X training values from 1992 to 2022):
Why do I get different predictions for the same period when I load a different range of X values?
(X input from 1992 to 2022: the y prediction for 1992 to 2022 is OK. X input from 2018 to 2022: the y prediction for 2018 to 2022 is not OK.)
This is the code:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import dill
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import KFold
from scipy.interpolate import Rbf
from scipy import stats
from neupy import layers, environment, algorithms
from neupy import plots
# Import data
data = pd.read_excel('DataAL_Incremento.xlsx', index_col=0, header=1).iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,-1]]
data.columns = ['PPO4L(in)','PPO4(in)','NH4L(in)','NH4(in)','NO3L(in)','NNO3(in)','CBOL(in)', 'CBO(in)','Temp(In)','Temp(alb)','Tair ','Tdew',
'Wvel','Cl_aL(in)','Cl_a(in)','ODL(in)','OD(in)','Qin(in)','ODalb','PPO4(alb)','NNO3(alb)']
# Add filtered data
tmp0 = data.iloc[:,[9, 6, 14]].rolling(9, center=False, axis=0).mean()
tmp0.columns = ['Temp(alb)_09','CBOL(in)_09','Cl_a(in)_09']
tmp1 = data.iloc[:,[9, 6, 14]].rolling(15, center=False, axis=0).mean()
tmp1.columns = ['Temp(alb)_15', 'CBOL(in)_15','Cl_a(in)_15']
tmp2 = data.iloc[:,[9, 6, 14]].rolling(31, center=False, axis=0).mean()
tmp2.columns = ['Temp(alb)_31', 'CBOL(in)_31','Cl_a(in)_31']
data = pd.concat((data, tmp0, tmp1, tmp2), axis=1)
# Drop empty records
data = data.dropna()
# Define data
X = data.loc[:, ['CBOL(in)', 'CBO(in)','Temp(In)','Temp(alb)','Tair ','Cl_aL(in)','Cl_a(in)','OD(in)','Temp(alb)_31', 'CBOL(in)_31','Cl_a(in)_31']]
y = data.loc[:, ['ODalb']]
years = data.index.year
yearsTrain = range(1992,2022)
yearsTest = 2019,2020,2021
#yearsTrain, yearsTest = train_test_split(np.unique(years), test_size=0.2, train_size=0.8, random_state=None)
# Select rows by calendar year. Inside DataFrame.query(), `@name` refers to a
# Python variable from the surrounding scope (here `years`, `yearsTrain` and
# `yearsTest`) rather than to a column of the frame.
XTrain = X.query('@years in @yearsTrain')
yTrain = y.query('@years in @yearsTrain').values.ravel()
XTest = X.query('@years in @yearsTest')
yTest = y.query('@years in @yearsTest').values.ravel()
# Kept as a DataFrame (not flattened) so predictions can be added as a column.
results = y.query('@years in @yearsTest')
#===============================================================================
# Neural network
#===============================================================================
# Define neural network
cgnet = algorithms.LevenbergMarquardt(
connection=[
layers.Input(XTrain.shape[1]),
layers.Relu(6),
layers.Linear(1)
],
mu_update_factor=2,
mu=0.1,
shuffle_data=True,
verbose=True,
decay_rate=0.1,
addons=[algorithms.WeightElimination]
)
# Scale
XScaler = StandardScaler()
XScaler.fit(XTrain)
XTrainScaled = XScaler.transform(XTrain)
XTestScaled = XScaler.transform(XTest)
yScaler = StandardScaler()
yScaler.fit(yTrain.reshape(-1, 1))
yTrainScaled = yScaler.transform(yTrain.reshape(-1, 1)).ravel()
yTestScaled = yScaler.transform(yTest.reshape(-1, 1)).ravel()
# Train
cgnet.train(XTrainScaled, yTrainScaled, XTestScaled, yTestScaled, epochs=30)
yEstTrain = yScaler.inverse_transform(cgnet.predict(XTrainScaled).reshape(-1, 1)).ravel()
mae = np.mean(np.abs(yTrain-yEstTrain))
results['ANN'] = yScaler.inverse_transform(cgnet.predict(XTestScaled).reshape(-1, 1)).ravel()
# Metrics
mse = np.mean((yTrain-yEstTrain)**2)
mseTes = np.mean((yTest-results['ANN'])**2)
maeTes = np.mean(np.abs(yTest-results['ANN']))
meantrain = np.mean(yTrain)
ssTest = (yTrain-meantrain)**2
r2=(1-(mse/(np.mean(ssTest))))
meantest = np.mean(yTest)
ssTrain = (yTest-meantest)**2
r2Tes=(1-(mseTes/(np.mean(ssTrain))))
# Plot results
print("NN MAE: %f (All), %f (Test) " % (mae, maeTes))
print ("NN MSE: %f (All), %f (Test) " % (mse, mseTes))
print ("NN R2: %f (All), %f (Test) " % (r2, r2Tes))
results.plot()
plt.show(block=True)
plots.error_plot(cgnet)
plt.show(block=True)
plt.scatter(yTest,results['ANN'])
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show(block=True)
#===============================================================================
# Save algorithms - Neural network
#===============================================================================
with open('network-storage.dill', 'wb') as f:
dill.dump(cgnet, f)
#===============================================================================
# Load algorithms - Neural network
#===============================================================================
#Prepare data
dataVal = pd.read_excel('DataAL_IncrementoTeste.xlsx', index_col=0, header=1).iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,-1]]
dataVal.columns = ['PPO4L(in)','PPO4(in)','NH4L(in)','NH4(in)','NO3L(in)','NNO3(in)','CBOL(in)', 'CBO(in)','Temp(In)','Temp(alb)','Tair ','Tdew',
'Wvel','Cl_aL(in)','Cl_a(in)','ODL(in)','OD(in)','Qin(in)','ODalb','PPO4(alb)','NNO3(alb)']
# Add filtered data
tmp0 = dataVal.iloc[:,[9, 6, 14]].rolling(9, center=False, axis=0).mean()
tmp0.columns = ['Temp(alb)_09','CBOL(in)_09','Cl_a(in)_09']
tmp1 = dataVal.iloc[:,[9, 6, 14]].rolling(15, center=False, axis=0).mean()
tmp1.columns = ['Temp(alb)_15', 'CBOL(in)_15','Cl_a(in)_15']
tmp2 = dataVal.iloc[:,[9, 6, 14]].rolling(31, center=False, axis=0).mean()
tmp2.columns = ['Temp(alb)_31', 'CBOL(in)_31','Cl_a(in)_31']
dataVal = pd.concat((dataVal, tmp0, tmp1, tmp2), axis=1)
# Drop empty records (removes adjacent columns)
dataVal = dataVal.dropna()
# Define data
Xval = dataVal.loc[:, ['CBOL(in)', 'CBO(in)','Temp(In)','Temp(alb)','Tair ','Cl_aL(in)','Cl_a(in)','OD(in)','Temp(alb)_31', 'CBOL(in)_31','Cl_a(in)_31']]
yval = dataVal.loc[:, ['ODalb']]
years = dataVal.index.year
yearsTrain = range(2018,2022)
# Select the validation rows by calendar year. Inside DataFrame.query(),
# `@name` refers to a Python variable from the surrounding scope (`years`,
# `yearsTrain`) rather than to a column of the frame.
XFinalVal = Xval.query('@years in @yearsTrain')
yFinalVal = yval.query('@years in @yearsTrain').values.ravel()
# Kept as a DataFrame (not flattened) so predictions can be added as a column.
resultsVal = yval.query('@years in @yearsTrain')
# Load algorithms
with open('network-storage.dill', 'rb') as f:
cgnet = dill.load(f)
# Scale X
XScaler = StandardScaler()
XScaler.fit(XFinalVal)
XFinalScaled = XScaler.transform(XFinalVal)
# Scale y
yScaler = StandardScaler()
yScaler.fit(yFinalVal.reshape(-1, 1))
yTrainScaled = yScaler.transform(yFinalVal.reshape(-1, 1)).ravel()
# Predict
y_predicted = yScaler.inverse_transform(cgnet.predict(XFinalScaled).reshape(-1, 1)).ravel()
resultsVal['ANN'] = y_predicted
scoreMean = metrics.mean_absolute_error(yFinalVal, y_predicted)
scoreR2 = metrics.r2_score(yFinalVal, y_predicted)
print(scoreMean)
print(scoreR2)
plt.scatter(yFinalVal,y_predicted)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show(block=True)
resultsVal.plot()
plt.show(block=True)
#===============================================================================
# Cross validation - Neural network
#===============================================================================
XScaler = StandardScaler()
XScaler.fit(XTrain)
XTrainScaled = XScaler.transform(XTrain)
XTestScaled = XScaler.transform(XTest)
yScaler = StandardScaler()
yScaler.fit(yTrain.reshape(-1, 1))
yTrainScaled = yScaler.transform(yTrain.reshape(-1, 1)).ravel()
yTestScaled = yScaler.transform(yTest.reshape(-1, 1)).ravel()
kfold = KFold(n_splits=10, shuffle=True, random_state=None)
scoresMean = []
scoresR2 = []
for train, test in kfold.split(XTrainScaled):
x_train, x_test = XTrainScaled[train], XTrainScaled[test]
y_train, y_test = yTrainScaled[train], yTrainScaled[test]
cgnet = algorithms.LevenbergMarquardt(
connection=[
layers.Input(XTrain.shape[1]),
layers.Relu(6),
layers.Linear(1)
],
mu_update_factor=2,
mu=0.1,
shuffle_data=True,
verbose=True,
decay_rate=0.1,
addons=[algorithms.WeightElimination]
)
cgnet.train(x_train, y_train, epochs=100)
y_predicted = cgnet.predict(x_test)
scoreMean = metrics.mean_absolute_error(y_test, y_predicted)
scoreR2 = metrics.r2_score(y_test, y_predicted)
scoresMean.append(scoreMean)
scoresR2.append(scoreR2)
print(scoresMean)
print(scoresR2)
scoresMean = np.array(scoresMean)
scoresR2 = np.array(scoresR2)
print("MEA: %0.2f (+/- %0.2f)" % (scoresMean.mean(), scoresMean.std() * 2))
print("R2: %0.2f (+/- %0.2f)" % (scoresR2.mean(), scoresR2.std() * 2))
I think that one of the problems might be the scaling that you apply before training. In the training stage you fit the scaler on the training data:
XScaler = StandardScaler()
XScaler.fit(XTrain)
But after loading the network with dill, you fit a new scaler on different data (specifically, the validation data):
XScaler = StandardScaler()
XScaler.fit(XFinalVal)
In the second case, the inputs are scaled differently from anything the network saw during training. The new scaling can produce a different distribution of samples from the one the network expects.
To make the results of training reproducible, you also need to save XScaler and load it together with the network.
Everything I've described is also true for yScaler.

Make prediction from Pandas DataFrame

I am very new to DataScience/Pandas in general. I mainly followed this and could get it to work using different classifiers.
import pandas as pd
import src.helper as helper
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
# Headings
headings = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing',
'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
# Load the data
shrooms = pd.read_csv('data/shrooms_no_header.csv', names=headings, converters={"header": float})
# Replace the ? in 'stalk-root' with 0
shrooms.loc[shrooms['stalk-root'] == '?', 'stalk-root'] = np.nan
shrooms.fillna(0, inplace=True)
# Remove columns with only one unique value
for col in shrooms.columns.values:
if len(shrooms[col].unique()) <= 1:
print("Removing column {}, which only contains the value: {}".format(col, shrooms[col].unique()[0]))
shrooms.drop(col, axis=1, inplace=True)
# Col to predict later
col_predict = 'class'
# Binary Encoding
all_cols = list(shrooms.columns.values)
all_cols.remove(col_predict)
helper.encode(shrooms, [col_predict])
# Expand Shrooms DataFrame to Binary Values
helper.expand(shrooms, all_cols)
# Remove the class we want to predict
x_all = list(shrooms.columns.values)
x_all.remove(col_predict)
# Set Train/Test ratio
ratio = 0.7
# Split the DF
df_train, df_test, X_train, Y_train, X_test, Y_test = helper.split_df(shrooms, col_predict, x_all, ratio)
# Try different classifier
# TODO: Batch Use to compare
classifier = GradientBoostingClassifier(n_estimators=1000)
# TODO: Optimize Hyperparamter (where applicable)
# Time the training
timer_start = time.process_time()
classifier.fit(X_train, Y_train)
timer_stop = time.process_time()
time_diff = timer_stop - timer_start
# Get the score
score_train = classifier.score(X_train, Y_train)
score_test = classifier.score(X_test, Y_test)
print('Train Score {}, Test Score {}, Time {}'.format(score_train, score_test, time_diff))
# TODO: Test a manual DataFrame
The "helpers" are functions I don't quite understand fully, but they work:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
def split_df(df, y_col, x_cols, ratio, random_state=None):
    """Randomly split ``df`` into a training part and a test part.

    Each row is assigned to the training part independently with probability
    ``ratio``, so the realised split is only approximately
    ``ratio : 1 - ratio``.

    Args:
        df: DataFrame to split.
        y_col: Name of the column holding the target values.
        x_cols: List of feature column names.
        ratio: Probability (0..1) that a row lands in the training part,
            e.g. 0.7.
        random_state: Optional integer seed for a reproducible split. When
            omitted, NumPy's global random state is used, as before.

    Returns:
        Tuple ``(train, test, x_train, y_train, x_test, y_test)``: the two
        DataFrame parts followed by their feature/target values as NumPy
        arrays.
    """
    if random_state is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.RandomState(random_state).rand(len(df))
    mask = draws < ratio

    train = df[mask]
    test = df[~mask]
    x_train, y_train = train[x_cols].values, train[y_col].values
    x_test, y_test = test[x_cols].values, test[y_col].values
    return train, test, x_train, y_train, x_test, y_test
def encode(df, columns):
    """Label-encode the given columns of ``df`` in place.

    Each listed column is replaced by the integer codes produced by its own
    ``LabelEncoder``.

    Args:
        df: DataFrame to modify in place.
        columns: Names of the columns to encode.

    Returns:
        Dict mapping each column name to its fitted ``LabelEncoder``, so the
        identical mapping can later be applied to new data with
        ``encoders[col].transform(...)``. Existing callers may ignore it.
    """
    encoders = {}
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].values)
        encoders[col] = le
    return encoders
def expand(df, list_columns):
    """Replace categorical columns by 0/1 indicator columns, in place.

    For every column in ``list_columns`` and every distinct value in it, a
    column named ``"<col>_is_<value>"`` is added that holds 1.0 where the row
    has that value and 0.0 elsewhere. The original columns are dropped
    afterwards.

    Args:
        df: DataFrame to modify in place.
        list_columns: Names of the columns to expand.
    """
    for col in list_columns:
        for colvalue in df[col].unique():
            newcol_name = "{}_is_{}".format(col, colvalue)
            # One vectorised assignment; cast to float so the indicator
            # columns hold 1.0 / 0.0.
            df[newcol_name] = (df[col] == colvalue).astype(float)
    df.drop(list_columns, inplace=True, axis=1)
def correlation_to(df, col):
    """Show a bar chart of absolute correlations between ``col`` and the other columns.

    Args:
        df: DataFrame whose columns are correlated via ``df.corr()``.
        col: Name of the column the other columns are compared against.
    """
    desc_corr_values = df.corr()[col].abs().sort_values(ascending=False)

    # The largest entry is skipped; presumably this is ``col`` correlated
    # with itself (1.0), which carries no information.
    y_values = list(desc_corr_values.values)[1:]
    xlabels = list(desc_corr_values.keys())[1:]
    x_values = range(0, len(y_values))

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.bar(x_values, y_values)
    ax.set_title('The correlation of all features with {}'.format(col), fontsize=20)
    ax.set_ylabel('Pearson correlatie coefficient [abs waarde]', fontsize=16)
    plt.xticks(x_values, xlabels, rotation='vertical')
    plt.show()
I would like to have a "manual" test, such as entering x attributes and getting a prediction based on that.
So for example, I hardcode a DataFrame like the following:
manual = pd.DataFrame({
"cap-shape": ["x"],
"cap-surface": ["s"],
"cap-color": ["n"],
"bruises": ["f"],
"odor": ["n"],
"gill-attachment": ["a"],
"gill-spacing": ["c"],
"gill-size": ["b"],
"gill-color": ["y"],
"stalk-shape": ["e"],
"stalk-root": ["?"],
"stalk-surface-above-ring": ["s"],
"stalk-surface-below-ring": ["s"],
"stalk-color-above-ring": ["o"],
"stalk-color-below-ring": ["o"],
"veil-type": ["p"],
"veil-color": ["o"],
"ring-number": ["o"],
"ring-type": ["p"],
"spore-print-color": ["o"],
"population": ["c"],
"habitat": ["l"]
})
How would I apply the same encoding? My code says helper.encode(manual, [col_predict]) but the manual ofc does not have a col_predict?
Please bear in mind I am a complete beginner. I searched the web a lot, but I cannot find a proper source/tutorial that shows how to test a single set.
The full code can be found here.
Try this:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
data = pd.read_csv('agaricus-lepiota.data.txt', header=None)  # read data
data.rename(columns={0: 'y'}, inplace=True)  # rename predict column (edible or not)
le = LabelEncoder()  # encoder used for label encoding
# fit_transform refits `le` on every column, so each column gets its own
# integer codes; afterwards `le` only remembers the last column it saw.
data = data.apply(lambda x: le.fit_transform(x))
# Keyword form: the positional axis argument, drop('y', 1), is rejected by
# pandas 2.x.
X = data.drop(columns='y')  # X without predict column
y = data['y']  # predict column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = GradientBoostingClassifier()  # you can pass arguments
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)  # predictions for the objects in the test part
print(accuracy_score(y_test, y_pred))  # check accuracy
I think you can read more about this on the scikit-learn site.
Is this example what you want?
To check your manual data:
# Encode the manual row with encoders fitted on the *training* columns.
# Refitting an encoder on the single manual row would turn every value into
# code 0, whatever the letter is.
raw = pd.read_csv('agaricus-lepiota.data.txt', header=None)
# Column 0 is the target; the remaining columns are the features. This
# assumes `manual` lists its columns in the same order as the data file.
manual_encoded = [
    LabelEncoder().fit(raw[src]).transform(manual[dst])[0]
    for src, dst in zip(raw.columns[1:], manual.columns)
]
print(clf.predict([manual_encoded]))

Python Bayes heart prediction, results are not accurate

I'm trying to make a heart disease prediction program using Naive Bayes. When I finished the classifier, cross-validation showed a mean accuracy of 80%. However, when I try to make a prediction on a given sample, the prediction is wrong! The dataset is the heart disease dataset from the UCI repository; it contains 303 samples. There are two classes, 0: healthy and 1: ill. When I make a prediction on a sample taken from the dataset, it does not predict the true value, except for very few samples. Here is the code:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
class Predict:
    """Fit a Gaussian Naive Bayes classifier on the heart-disease data and classify one sample."""

    def Read_Clean(self, dataset):
        """Load the dataset and return it as an all-float DataFrame.

        Args:
            dataset: Path (or file-like object) of the header-less CSV file.

        Returns:
            DataFrame with the 14 named columns; entries given as '?' are
            replaced by the mean of their column.
        """
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        df = pd.read_csv(dataset, names=header_row)
        # Missing entries are written as '?' in the file.
        df = df.replace('[?]', np.nan, regex=True).astype(float)
        # Column-mean imputation done in pandas; sklearn.preprocessing.Imputer,
        # which did this before, no longer exists in scikit-learn >= 0.22.
        return df.fillna(df.mean())

    def Train_Test_Split_data(self, dataset):
        """Split the cleaned data into train/test features and binary targets.

        'Num' is collapsed to 1 when it is greater than 0 and to 0 otherwise.

        Returns:
            ``X_train, X_test, Y_train, Y_test`` (80/20 split, fixed seed).
        """
        Y = dataset['Num'].apply(lambda x: 1 if x > 0 else 0)
        X = dataset.drop('Num', axis=1)
        validation_size = 0.20
        seed = 42
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
        return X_train, X_test, Y_train, Y_test

    def Scaler(self, X_train, X_test):
        """Standardise both parts with statistics learned from ``X_train`` only."""
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, X_test

    def Cross_Validate(self, clf, X_train, Y_train, cv=5):
        """Print and return the mean F1 score of ``clf`` over ``cv`` folds.

        Returns:
            ``(mean_score, per_fold_scores)``.
        """
        scores = cross_val_score(clf, X_train, Y_train, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))
        return score, scores

    def Fit_Score(self, clf, X_train, Y_train, X_test, Y_test, label='x'):
        """Fit ``clf``, print its train and test scores, and return the test score."""
        clf.fit(X_train, Y_train)
        fit_score = clf.score(X_train, Y_train)
        pred_score = clf.score(X_test, Y_test)
        print("%s: fit score %.5f, predict score %.5f" % (label, fit_score, pred_score))
        return pred_score

    def ReturnPredictionValue(self, clf, sample):
        """Return the class ``clf`` predicts for the single feature vector ``sample``."""
        # NOTE(review): `sample` is handed to the classifier exactly as given.
        # PredictionMain fits the classifier on StandardScaler-transformed
        # features, so a raw (unscaled) sample is on a different scale from
        # the training data.
        y = clf.predict([sample])
        return y[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        """Train on ``dataset_path``, report scores, and return the prediction for ``sample``."""
        data = self.Read_Clean(dataset_path)
        X_train, X_test, Y_train, Y_test = self.Train_Test_Split_data(data)
        X_train, X_test = self.Scaler(X_train, X_test)
        self.NB = GaussianNB()
        self.Fit_Score(self.NB, X_train, Y_train, X_test, Y_test, label='NB')
        self.Cross_Validate(self.NB, X_train, Y_train, 10)
        return self.ReturnPredictionValue(self.NB, sample)
When I run:
if __name__ == '__main__':
    # One feature vector: 13 values, i.e. every column except the target 'Num'.
    sample = [41.0, 0.0, 2.0, 130.0, 204.0, 0.0, 2.0, 172.0, 0.0, 1.4, 1.0, 0.0, 3.0]
    p = Predict()
    # Call form of print: valid on both Python 2 and Python 3.
    print("Prediction value: {}".format(p.PredictionMain(sample)))
The result is:
NB: fit score 0.84711, predict score 0.83607 CV scores mean: 0.8000
Prediction value: 1
I get 1 instead of 0 (this sample is already one of the dataset samples).
I did this for more than one sample from the dataset and I get wrong result most of the time, it's as if the accuracy is not 80%!
Any help would be appreciated.
Thanks in advance.
Edit:
Problem solved using Pipeline. The final code is:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
class Predict:
    """Heart-disease classifier built as a StandardScaler + GaussianNB pipeline."""

    def __init__(self):
        # Feature table and binary target; filled in by Split_Dataset.
        self.X = []
        self.Y = []

    def Read_Clean(self, dataset):
        """Load the dataset and return it as an all-float DataFrame.

        Args:
            dataset: Path (or file-like object) of the header-less CSV file.

        Returns:
            DataFrame with the 14 named columns; entries given as '?' are
            replaced by the mean of their column.
        """
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        df = pd.read_csv(dataset, names=header_row)
        # Missing entries are written as '?' in the file.
        df = df.replace('[?]', np.nan, regex=True).astype(float)
        # Column-mean imputation done in pandas; sklearn.preprocessing.Imputer,
        # which did this before, no longer exists in scikit-learn >= 0.22.
        return df.fillna(df.mean())

    def Split_Dataset(self, df):
        """Store the features in ``self.X`` and the binary target in ``self.Y``.

        'Num' is collapsed to 1 when it is greater than 0 and to 0 otherwise.
        """
        self.Y = df['Num'].apply(lambda x: 1 if x > 0 else 0)
        self.X = df.drop('Num', axis=1)

    def Create_Pipeline(self):
        """Return an unfitted pipeline: standardisation followed by Gaussian Naive Bayes.

        Keeping the scaler inside the pipeline means it is applied both when
        fitting and when predicting.
        """
        estimators = [
            ('standardize', StandardScaler()),
            ('bayes', GaussianNB()),
        ]
        return Pipeline(estimators)

    def Cross_Validate(self, clf, cv=5):
        """Print the mean F1 score of ``clf`` over ``cv`` folds of the stored data."""
        scores = cross_val_score(clf, self.X, self.Y, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))

    def Fit_Score(self, clf, label='x'):
        """Fit ``clf`` on the stored data and print its score on that same data."""
        clf.fit(self.X, self.Y)
        fit_score = clf.score(self.X, self.Y)
        print("%s: fit score %.5f" % (label, fit_score))

    def ReturnPredictionValue(self, clf, sample):
        """Return the class ``clf`` predicts for the single feature vector ``sample``."""
        y = clf.predict([sample])
        return y[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        """Train the pipeline on ``dataset_path``, report scores, and return the prediction for ``sample``."""
        # Call form of print: valid on both Python 2 and Python 3.
        print("dataset: " + dataset_path)
        data = self.Read_Clean(dataset_path)
        self.Split_Dataset(data)
        self.model = self.Create_Pipeline()
        self.Fit_Score(self.model, label='NB')
        self.Cross_Validate(self.model, 10)
        return self.ReturnPredictionValue(self.model, sample)
Now making a prediction on the same sample in the question returns [0] which is the true value. Actually by running the following method:
def CheckTrue(self):
    """Count the stored samples whose cross-validated prediction equals the true label.

    Prints the count and also returns it.
    """
    # Imported locally: the module header of this snippet imports
    # cross_val_score and train_test_split, but not cross_val_predict.
    from sklearn.model_selection import cross_val_predict

    clf = self.Create_Pipeline()
    out = cross_val_predict(clf, self.X, self.Y)
    # Position-by-position comparison over however many samples are stored,
    # instead of a hard-coded row count.
    c = int((out == np.asarray(self.Y)).sum())
    print("Samples with true values: {}".format(c))
    return c
I get 249 true samples using the pipeline code, whereas I got only 150 before.
You're not applying StandardScaler to the sample. The classifier expects scaled data because it was trained on the output of StandardScaler.transform, but the sample is not scaled the same way as the training data.
It is easy to make such mistakes when combining multiple steps (scaling, preprocessing, classification) manually. To avoid such issues it is a good idea to use scikit-learn Pipeline.

Categories

Resources