PySpark - How to get precision / recall / ROC from TrainValidationSplit?

PySpark - How to get precision / recall / ROC from TrainValidationSplit? - python

My current approach to evaluate different parameters for LinearSVC and get the best one:
tokenizer = Tokenizer(inputCol="Text", outputCol="words")
wordsData = tokenizer.transform(df)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
LSVC = LinearSVC()
rescaledData = idfModel.transform(featurizedData)
paramGrid = ParamGridBuilder()\
.addGrid(LSVC.maxIter, [1])\
.addGrid(LSVC.regParam, [0.001, 10.0])\
.build()
crossval = TrainValidationSplit(estimator=LSVC,
estimatorParamMaps=paramGrid,
evaluator=MulticlassClassificationEvaluator(metricName="weightedPrecision"),
testRatio=0.01)
cvModel = crossval.fit(rescaledData.select("KA", "features").selectExpr("KA as label", "features as features"))
bestModel = cvModel.bestModel
Now I would like to get the basic parameters of ML (like precision, recall etc.), how do I get those?

You can try this
from pyspark.mllib.evaluation import MulticlassMetrics
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)
You can check this link for further info
https://spark.apache.org/docs/2.1.0/mllib-evaluation-metrics.html

Related

GridSearchCV with LightFM

I'm a newbie so my apologies if something I ask might be to obvious and my english is not quite good. I'm stuck in doing a custom grid search with cross validation with LightFM which does not come with those functions. It seem the way I split the dataset is wrong but I do not understand why since I've replicated the code of the function random_train_test_split to get the folds. The error I get is Incorrect number of features in item_features.
I'm stuck and I do not know how to go on.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse
def create_processed_dataset():
"""
One-Time execution
Returns:
embeddings.csv and observations.csv
"""
output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
os.makedirs(output_path, exist_ok=True)
"""
Data imports
"""
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
# Load mappings and filter them if a corresponding embedding is found
mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None, names=["movie_id", "movie_name", "movie_uri"])
mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
# Create a pandas dataframe with embeddings
embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
embeddings.set_index("movie_id", inplace=True)
ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)
def generate_list_of_hyper_parameters(parameters_grid):
return (
{y: z for y, z in zip(parameters_grid.keys(), x)}
for x in itertools.product(*parameters_grid.values())
)
def create_csr_from_dataset(observations, embeddings):
dataset = Dataset(item_identity_features=True, user_identity_features=False)
feature_names = [str(i) for i in range(0, 200)]
dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
num_items, num_fts = dataset.item_features_shape()
print(f'Num items: {num_items}, num_features: {num_fts}.')
interactions, weights = dataset.build_interactions(
observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
)
item_features = []
for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
for x, y in row.items():
item_features.append((item_id, {x: y}))
item_features = dataset.build_item_features(item_features)
return interactions, item_features
def folding(interactions, k_folds=10):
if not scipy.sparse.issparse(interactions):
return None
coo = interactions.tocoo()
kf = KFold(n_splits=k_folds) # Define the split - into 2 folds
shape = interactions.shape
uids, iids, data = (coo.row, coo.col, coo.data)
def to_coo_matrix(indexes):
return scipy.sparse.coo_matrix(
(data[indexes], (uids[indexes], iids[indexes])),
shape=shape,
dtype=coo.dtype,
)
return [
(to_coo_matrix(train_index), to_coo_matrix(validation_index))
for train_index, validation_index in kf.split(data)
]
def grid_search(parameters_grid, k_fold, interactions, item_features=None):
results = []
for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
print(f"{hyper_params} && current_fold:{current_fold}")
model = LightFM(**hyper_params)
model.fit(train, epochs=50, item_features=item_features, num_threads=6)
score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
results.append((score, hyper_params, model))
print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
results.sort(key=lambda x: x[0])
return results
def main():
observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
interactions, item_features = create_csr_from_dataset(observations, embeddings)
train, test = random_train_test_split(interactions, test_percentage=0.2)
print(embeddings.head())
num_movies = len(embeddings.index)
num_ratings = len(observations.index)
num_users = observations.user_id.unique().size
sparsity = 1 - num_ratings / (num_users * num_movies)
print(
f"num_users: {num_users}, num_movies: {num_movies}, "
f"num_observations: {num_ratings}, "
f"sparsity: ~{sparsity * 100}"
)
model = LightFM()
# parametri da testare
param_grid = {
'no_components': range(10, 110, 10),
'learning_rate': [0.01, 0.05, 0.1],
'item_alpha': [0.0001, 0.001, 0.01],
'user_alpha': [0.0001, 0.001, 0.01],
}
results = grid_search(param_grid, 10, train, item_features=item_features)
print(results[0][0])
# grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
# grid.fit(train)
#
# # stampare i migliori parametri
# print("Best parameters found: ", grid.best_params_)
if __name__ == "__main__":
main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806

The sensitivity does not improve despite making multiple changes in model and dataset

I have a CNN model which I run on the dataset which is linked here for viewing : data
I have tried using sensitivity and specificity provided by Keras and also tried the one using scikit learn. I want help to understand if something is wrong with my code? I understand the model performance depends on a lot of things but I want to know if there is something wrong with the code that is giving me sensitivity different for Keras and different for scikit learn. Also, there are misclassifications. how can I improve my model results?
My code looks like below:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
#import keras as k
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
#from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import TimeDistributed
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn import metrics
def confusion_metrics (conf_matrix):
# save confusion matrix and slice into four pieces
TP = conf_matrix[1][1]
TN = conf_matrix[0][0]
FP = conf_matrix[0][1]
FN = conf_matrix[1][0]
print('True Positives:', TP)
print('True Negatives:', TN)
print('False Positives:', FP)
print('False Negatives:', FN)
# calculate accuracy
conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))
# calculate mis-classification
conf_misclassification = 1- conf_accuracy
# calculate the sensitivity
conf_sensitivity = (TP / float(TP + FN))
# calculate the specificity
conf_specificity = (TN / float(TN + FP))
# calculate precision
conf_precision = (TN / float(TN + FP))
# calculate f_1 score
conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))
print('-'*50)
print(f'Accuracy: {round(conf_accuracy,2)}')
print(f'Mis-Classification: {round(conf_misclassification,2)}')
print(f'Sensitivity: {round(conf_sensitivity,2)}')
print(f'Specificity: {round(conf_specificity,2)}')
print(f'Precision: {round(conf_precision,2)}')
print(f'f_1 Score: {round(conf_f1,2)}')
def og_build_model_less_layer(n_rows,n_cols):
ecg_input = Input(shape=(n_cols,n_rows), name='ecg_signal')
print('model_input shape:' , ecg_input.shape)
c1 = Conv1D(80, 2,name = 'conv_1',kernel_initializer="glorot_uniform")(ecg_input)
b1 = BatchNormalization(name = 'BN_1')(c1) #a1 = Activation('relu')(b1)
d1 = Dropout(0.4,name = 'drop_1')(b1)
c2 = Conv1D(80,2,name = 'conv_2',kernel_initializer="glorot_uniform")(d1)
b2 = BatchNormalization(name = 'BN_2')(c2)
d2 = Dropout(0.6,name = 'drop_2')(b2)
c3 = Conv1D(80, 2,name = 'conv_3',kernel_initializer="glorot_uniform")(d2)
b3 = BatchNormalization(name = 'BN_3')(c3)
d3 = Dropout(0.4,name = 'drop_3')(b3)
c4 = Conv1D(80, 2,name = 'conv_4',kernel_initializer="glorot_uniform")(d3)
b4 = BatchNormalization(name = 'BN_4')(c4)
d4 = Dropout(0.6,name = 'drop_4')(b4)
c5 = Conv1D(80, 2,name = 'conv_5',kernel_initializer="glorot_uniform")(d4)
b5 = BatchNormalization(name = 'BN_5')(c5)
d5 = Dropout(0.5,name = 'drop_5')(b5)
fl = Flatten(name='fl')(d5)
den1 = Dense(256,name='den1')(fl)
den = Dense(30,name='den2')(den1)
drp = Dropout(0.5)(den)
output = Dense(1, activation='sigmoid')(drp)
opt = Adam(learning_rate=1e-4)
sens = tf.keras.metrics.SensitivityAtSpecificity(0.15)
spec = tf.keras.metrics.SpecificityAtSensitivity(0.15)
model = Model(inputs=ecg_input, outputs=output, name='model')
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy',sens,spec])
print(model.summary)
return model
train_df = pd.read_pickle('data/train_ecg_gl.pkl')
train_df = train_df.dropna()
train_df = train_df.sort_values(by='Time', ascending=True)#, na_position='first')
test_df = pd.read_pickle('data/test_ecg_gl.pkl')
test_df = test_df.dropna()
test_df = test_df.sort_values(by='Time', ascending=True)
df = pd.concat([train_df, test_df], ignore_index=True)
df = df.sort_values(by='Time')
data = df.iloc[:,1:161].values
data=data[...,None]
labels = df['hypo_label'].values
train_data = train_df.iloc[:,1:161].values
train_data=train_data[...,None]
train_labels = train_df['hypo_label'].values
test_data = test_df.iloc[:,1:161].values
test_data=test_data[...,None]
test_labels = test_df['hypo_label'].values
xtrain,ytrain = train_data,train_labels
xtest,ytest = test_data,test_labels
n_cols = data.shape[2]
n_rows = data.shape[1]
batch_size,lr , verbose , epochs , val_split = 45 ,0.01, 1, 40, 0.1
early_stopping_callback = EarlyStopping(monitor = 'loss', patience = 10, mode = 'min', restore_best_weights = True)
cb_lr_reducer = ReduceLROnPlateau(monitor='loss', factor= 0.1, patience=10, min_lr= 1e-5)
model = og_build_model_less_layer(n_cols,n_rows)
model.fit(x = xtrain, y = ytrain, epochs=epochs,verbose=verbose,batch_size=batch_size,validation_split=val_split, shuffle=False,callbacks=[cb_lr_reducer, early_stopping_callback])
_, taccuracy,tsensitivity,tspecificity = model.evaluate(xtest, ytest, batch_size=batch_size, verbose=verbose)
print('Model Test 0.7*0.3 Accuracy:' , taccuracy)
print('Model Test 0.7*0.3 sensitivity:' , tsensitivity)
print('Model Test 0.7*0.3 specificity:' , specificity)
y_pred = model.predict(xtest)
y_pred = y_pred.flatten()
print(y_pred)
#print(p_pred.round(2))
# extract the predicted class labels
y_pred = np.where(y_pred < 0.5, 0, 1)
# Creating the confusion matrix
cm = metrics.confusion_matrix(ytest, y_pred)
# Assigning columns names
cm_df = pd.DataFrame(cm, columns = ['Predicted Negative', 'Predicted Positive'],index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
print(cm_df)
confusion_metrics(cm)
After I run this code for the data linked here, it gives me following output:
Model Test 0.7*0.3 Accuracy: 0.654349148273468
Model Test 0.7*0.3 sensitivity: 0.9166133999824524
Model Test 0.7*0.3 specificity: 0.9982390403747559
##################################################
##################################################
##################################################
[0.00757153 0.00837034 0.02366774 ... 0.5926605 0.59990513 0.56060743]
Predicted Negative Predicted Positive
Actual Negative 29073 2160
Actual Positive 14531 1107
True Positives: 1107
True Negatives: 29073
False Positives: 2160
False Negatives: 14531
--------------------------------------------------
Accuracy: 0.64
Mis-Classification: 0.36
Sensitivity: 0.07
Specificity: 0.93
Precision: 0.93
f_1 Score: 0.13

The performance metrics improved when I used my custom validation set which is a stratified split of 80-20 from training.

None Type error in Python when running streamlit

Hi I am trying create an App in python that will allow users to choose which classification model they want to implement on one of three open source data in SK-Learn library The code is the following:
import streamlit as st
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
st.title("Streamlit example")
st.write("""
# Explore different classifier
which one is the best?
""")
dataset_name = st.sidebar.selectbox("Select Dataset", ("Iris","Breast Cancer","Wine Dataset") )
classifier_name = st.sidebar.selectbox("Select Classifier", ("KNN","SVM","Random Forest") )
def get_dataset(dataset_name):
if dataset_name == "Iris":
data = datasets.load_iris()
elif dataset_name == "Breast Cancer":
data = datasets.load_breast_cancer()
else:
data = datasets.load_wine()
X = data.data
y = data.target
return X, y
X, y = get_dataset(dataset_name)
st.write("Shape of datset", X.shape)
st.write("Number of classes", len(np.unique(y)))
def add_parameter_ui(clf_name):
params = dict()
if clf_name =="KNN":
K = st.sidebar.slider("K",1,15)
params["K"] = K
elif clf_name =="SVM":
C = st.sidebar.slider("C", 0.01,10.0)
params["C"] = C
else:
max_depth = st.sidebar.slider("max_depth", 2,15)
n_estimators = st.sidebar.slider("n_estimators",1,100)
params["max_depth"]= max_depth
params["n_estimators"] = n_estimators
return params
params = add_parameter_ui(classifier_name)
def get_classifier(clf_name,params):
if clf_name == "KNN":
clf = KNeighborsClassifier(n_neighbors=params['K'])
elif clf_name == "SVM":
clf = SVC(C= params['C'])
else:
clf = RandomForestClassifier(n_estimators=params["n_estimators"],max_depth=params["max_depth"],random_state=1234)
return clf
clf = get_classifier(classifier_name,params)
The error is:
clf = KNeighborsClassifier(n_neighbors=params['K'])
TypeError: 'NoneType' object is not subscriptable
I know the error is supposed to be self-explanatory but I tried to state clf = None but still get the same error and i'm asking someone to put me in the right direction.

The problem is in your add_parameter_ui function. you are not returning a value in the case of clf_name is KNN or SVM and this causes params in the main code to be None so calling params['K'] is not because 'NoneType' object is not subscriptable.
Here is the fixed code:
def add_parameter_ui(clf_name):
params = dict()
if clf_name =="KNN":
K = st.sidebar.slider("K",1,15)
params["K"] = K
return params
elif clf_name =="SVM":
C = st.sidebar.slider("C", 0.01,10.0)
params["C"] = C
return params
# If Random Forest
max_depth = st.sidebar.slider("max_depth", 2,15)
n_estimators = st.sidebar.slider("n_estimators",1,100)
params["max_depth"]= max_depth
params["n_estimators"] = n_estimators
return params

LightGBM vs Sklearn LightGBM- Mistake in Implementation- Exact same parameters giving different results

While passing the exact same parameters to LightGBM and sklearn's implementation of LightGBM, I am getting different results. Initially, I was getting the exact same results on doing this, however, I made some changes to my code and now I can't find out why they're not coming the same. This means that the performance metrics and feature importance are coming differently. Please help me figure it out, I can't figure out the mistake I am making. It could either be a mistake in the way I am implementing LightGBM using the original library or in sklearn's implementation. Link for explanation on why we should get identical results - light gbm - python API vs Scikit-learn API
x_train, x_test, y_train, y_test = train_test_split(df_dummy[df_merge.columns], labels, test_size=0.25,random_state=42)
n_folds = 5
lgb_train = lgb.Dataset(x_train, y_train)
def objective(params, n_folds = n_folds):
"""Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
print(params)
params['max_depth'] = int(params['max_depth'])
params['num_leaves'] = int(params['num_leaves'])
params['min_child_samples'] = int(params['min_child_samples'])
params['subsample_freq'] = int(params['subsample_freq'])
# Perform n_fold cross validation with hyperparameters
# Use early stopping and evalute based on ROC AUC
cv_results = lgb.cv(params, lgb_train, nfold=n_folds, num_boost_round=10000,
early_stopping_rounds=100, metrics='auc')
# Extract the best score
best_score = max(cv_results['auc-mean'])
# Loss must be minimized
loss = 1 - best_score
num_iteration = int(np.argmax(cv_results['auc-mean']) + 1)
of_connection = open(out_file, 'a')
writer = csv.writer(of_connection)
writer.writerow([loss, params, num_iteration])
# Dictionary with information for evaluation
return {'loss': loss, 'params': params, 'status': STATUS_OK, 'estimators': num_iteration}
space = {
'min_child_samples': hp.quniform('min_child_samples', 5, 100, 5),
'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
'max_depth' : hp.quniform('max_depth', 3, 10, 1),
'subsample' : hp.quniform('subsample', 0.6, 1, 0.05),
'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
'subsample_freq': hp.quniform('subsample_freq',0,10,1),
'min_gain_to_split': hp.quniform('min_gain_to_split', 0.01, 0.1, 0.01),
'learning_rate' : 0.05,
'objective' : 'binary',
}
out_file = 'results/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'estimators'])
of_connection.close()
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10)
bayes_trials_results = sorted(trials.results, key = lambda x: x['loss'])
results = pd.read_csv('results/gbm_trials.csv')
# Sort with best scores on top and reset index for slicing
results.sort_values('loss', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()
best_bayes_estimators = int(results.loc[0, 'estimators'])
best['max_depth'] = int(best['max_depth'])
best['num_leaves'] = int(best['num_leaves'])
best['min_child_samples'] = int(best['min_child_samples'])
num_boost_round=int(best_bayes_estimators * 1.1)
best['objective'] = 'binary'
best['boosting_type'] = 'gbdt'
best['subsample_freq'] = int(best['subsample_freq'])
#Actual LightGBM
best_gbm = lgb.train(params=best, train_set=lgb_train, num_boost_round=num_boost_round)
print('Plotting feature importances...')
ax = lgb.plot_importance(best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["importance_gain"] = best_gbm.feature_importance(importance_type='gain')
feature_imp["importance_split"] = best_gbm.feature_importance(importance_type='split')
feature_imp.to_clipboard()
y_pred_score = best_gbm.predict(x_test)
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
print("threshold is {}".format(threshold))
y_pred = np.where(y_pred_score>=threshold, 1, 0)
print(roc_auc_score(y_test,y_pred_score))
print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
f1_score_list.append(f1_score(y_test,y_pred))
accuracy_score_list.append(accuracy_score(y_test,y_pred))
precision_score_list.append(precision_score(y_test,y_pred))
recall_score_list.append(recall_score(y_test,y_pred))
performance_metrics = pd.DataFrame(
{'thresholds':thresholds,
'roc_auc_score':roc_auc_score_list,
'f1_score':f1_score_list,
'accuracy_score':accuracy_score_list,
'precision_score':precision_score_list,
'recall_score':recall_score_list })
performance_metrics.transpose().to_clipboard()
#Sklearn's Implementation of LightGBM
best_sk = dict(best)
del best_sk['min_gain_to_split']
sk_best_gbm = lgb.LGBMClassifier(**best_sk, n_estimators=num_boost_round, learning_rate=0.05, min_split_gain=best['min_gain_to_split'])
sk_best_gbm.fit(x_train, y_train)
sk_best_gbm.get_params()
print('Plotting feature importances...')
ax = lgb.plot_importance(sk_best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["Importance"] = sk_best_gbm.feature_importances_
feature_imp.to_clipboard()
y_pred_score = sk_best_gbm.predict_proba(x_test)[:,1]
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
print("threshold is {}".format(threshold))
y_pred = np.where(y_pred_score>=threshold, 1, 0)
print(roc_auc_score(y_test,y_pred_score))
print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
f1_score_list.append(f1_score(y_test,y_pred))
accuracy_score_list.append(accuracy_score(y_test,y_pred))
precision_score_list.append(precision_score(y_test,y_pred))
recall_score_list.append(recall_score(y_test,y_pred))
performance_metrics = pd.DataFrame(
{'thresholds':thresholds,
'roc_auc_score':roc_auc_score_list,
'f1_score':f1_score_list,
'accuracy_score':accuracy_score_list,
'precision_score':precision_score_list,
'recall_score':recall_score_list })
performance_metrics.transpose().to_clipboard()

Use MLPRegressor to get better results as linear model

My Problem is to get better result in MSE and i hope in R2 as in linear model.
So i used the sklearn.neural_network.MLPRegressor library to compare it
def testfit(v,k,subset=2,hls=(50,50,50,10),acv='identity'):
# prep variables
n = len(v)
n1 = n/subset
X = getX(v[0:n1],k)
y = gety(v[0:n1],k)
# define models
nn1 = MLPRegressor(hidden_layer_sizes=hls, activation=acv, solver='adam', alpha=0.01,batch_size='auto',
learning_rate='constant', learning_rate_init=0.1, power_t=1, max_iter=50000, shuffle=True,
random_state=None, tol=0.00001, verbose=False, warm_start=False, momentum=0.9,
nesterovs_momentum=True, early_stopping=False, validation_fraction=0.5, beta_1=0.9, beta_2=0.999,
epsilon=1e-10)
ols = linear_model.LinearRegression()
# run models
st = time.time()
fnnw = nn1.fit(X,y)
nnw_dur = time.time() - st
st = time.time()
flin = ols.fit(X,y)
ols_dur = time.time() - st
# run gof
X2 = getX(v[n1:n],k)
y2 = gety(v[n1:n],k)
# neural network
# in-sample
yn = fnnw.predict(X)
gin = pearsonr(y,yn)[0]**2
ginse = sum((y-yn)**2)
# out-sample
yn2 = fnnw.predict(X2)
oin = pearsonr(y2,yn2)[0]**2
oinse = sum((y2-yn2)**2)
# ols
# in.sample
yl = flin.predict(X)
gil = pearsonr(y,yl)[0]**2
gilse = sum((y-yl)**2)
yl2 = flin.predict(X2)
oil = pearsonr(y2,yl2)[0]**2
oilse = sum((y2-yl2)**2)
plt.subplot(321)
plt.plot(y2)
plt.plot(yl2)
enter image description here
The best case in this scenario is that my neural network NNW MSE in FORCAST +1 is smaller than OLS MSE FORCAST +1
Or is it not possible to get a smaller Error in nn as in linear model in this way

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

PySpark - How to get precision / recall / ROC from TrainValidationSplit? - python

Related

GridSearchCV with LightFM

The sensitivity does not improve despite making multiple changes in model and dataset

None Type error in Python when running streamlit

LightGBM vs Sklearn LightGBM- Mistake in Implementation- Exact same parameters giving different results

Use MLPRegressor to get better results as linear model

Categories

Resources