PySpark - How to get precision / recall / ROC from TrainValidationSplit?
My current approach to evaluating different parameters for LinearSVC and getting the best one:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LinearSVC
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

tokenizer = Tokenizer(inputCol="Text", outputCol="words")
wordsData = tokenizer.transform(df)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

LSVC = LinearSVC()

paramGrid = ParamGridBuilder()\
    .addGrid(LSVC.maxIter, [1])\
    .addGrid(LSVC.regParam, [0.001, 10.0])\
    .build()

# TrainValidationSplit takes trainRatio (the fraction used for training), not testRatio
crossval = TrainValidationSplit(estimator=LSVC,
                                estimatorParamMaps=paramGrid,
                                evaluator=MulticlassClassificationEvaluator(metricName="weightedPrecision"),
                                trainRatio=0.99)

cvModel = crossval.fit(rescaledData.selectExpr("KA as label", "features"))
bestModel = cvModel.bestModel
Now I would like to get the basic evaluation metrics (precision, recall, etc.). How do I get those?
You can try this
from pyspark.mllib.evaluation import MulticlassMetrics
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)
You can check this link for further info
https://spark.apache.org/docs/2.1.0/mllib-evaluation-metrics.html
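Since the title also asks about ROC: MulticlassMetrics does not expose it, but for a binary label you can use BinaryClassificationMetrics from the same package. A short sketch reusing the predictionAndLabels RDD built above; note that for a meaningful ROC curve the first element of each pair should be a raw score (e.g. from the rawPrediction column) rather than a hard 0/1 prediction:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

binMetrics = BinaryClassificationMetrics(predictionAndLabels)
print("Area under ROC = %s" % binMetrics.areaUnderROC)
print("Area under PR = %s" % binMetrics.areaUnderPR)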
Related
GridSearchCV with LightFM
I'm a newbie, so my apologies if something I ask might be too obvious; my English is not quite good. I'm stuck doing a custom grid search with cross-validation for LightFM, which does not come with those functions. It seems the way I split the dataset is wrong, but I do not understand why, since I've replicated the code of the function random_train_test_split to get the folds. The error I get is Incorrect number of features in item_features. I'm stuck and I do not know how to go on.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse

def create_processed_dataset():
    """
    One-time execution
    Returns: embeddings.csv and observations.csv
    """
    output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
    os.makedirs(output_path, exist_ok=True)
    # Data imports
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
    vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
    # Load mappings and filter them if a corresponding embedding is found
    mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None,
                           names=["movie_id", "movie_name", "movie_uri"])
    mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
    mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
    # Create a pandas dataframe with embeddings
    embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
    embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
    embeddings.set_index("movie_id", inplace=True)
    ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
    embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
    ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)

def generate_list_of_hyper_parameters(parameters_grid):
    return (
        {y: z for y, z in zip(parameters_grid.keys(), x)}
        for x in itertools.product(*parameters_grid.values())
    )

def create_csr_from_dataset(observations, embeddings):
    dataset = Dataset(item_identity_features=True, user_identity_features=False)
    feature_names = [str(i) for i in range(0, 200)]
    dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    num_items, num_fts = dataset.item_features_shape()
    print(f'Num items: {num_items}, num_features: {num_fts}.')
    interactions, weights = dataset.build_interactions(
        observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
    )
    item_features = []
    for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
        for x, y in row.items():
            item_features.append((item_id, {x: y}))
    item_features = dataset.build_item_features(item_features)
    return interactions, item_features

def folding(interactions, k_folds=10):
    if not scipy.sparse.issparse(interactions):
        return None
    coo = interactions.tocoo()
    kf = KFold(n_splits=k_folds)  # Define the split
    shape = interactions.shape
    uids, iids, data = (coo.row, coo.col, coo.data)

    def to_coo_matrix(indexes):
        return scipy.sparse.coo_matrix(
            (data[indexes], (uids[indexes], iids[indexes])),
            shape=shape,
            dtype=coo.dtype,
        )

    return [
        (to_coo_matrix(train_index), to_coo_matrix(validation_index))
        for train_index, validation_index in kf.split(data)
    ]

def grid_search(parameters_grid, k_fold, interactions, item_features=None):
    results = []
    for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
        for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
            print(f"{hyper_params} && current_fold:{current_fold}")
            model = LightFM(**hyper_params)
            model.fit(train, epochs=50, item_features=item_features, num_threads=6)
            score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
            results.append((score, hyper_params, model))
            print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
    results.sort(key=lambda x: x[0])
    return results

def main():
    observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
    embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
    interactions, item_features = create_csr_from_dataset(observations, embeddings)
    train, test = random_train_test_split(interactions, test_percentage=0.2)
    print(embeddings.head())
    num_movies = len(embeddings.index)
    num_ratings = len(observations.index)
    num_users = observations.user_id.unique().size
    sparsity = 1 - num_ratings / (num_users * num_movies)
    print(
        f"num_users: {num_users}, num_movies: {num_movies}, "
        f"num_observations: {num_ratings}, "
        f"sparsity: ~{sparsity * 100}"
    )
    model = LightFM()
    # parameters to test
    param_grid = {
        'no_components': range(10, 110, 10),
        'learning_rate': [0.01, 0.05, 0.1],
        'item_alpha': [0.0001, 0.001, 0.01],
        'user_alpha': [0.0001, 0.001, 0.01],
    }
    results = grid_search(param_grid, 10, train, item_features=item_features)
    print(results[0][0])
    # grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
    # grid.fit(train)
    #
    # # print the best parameters
    # print("Best parameters found: ", grid.best_params_)

if __name__ == "__main__":
    main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097 
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05 
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806
The sensitivity does not improve despite making multiple changes in model and dataset
I have a CNN model which I run on the dataset linked here for viewing: data. I have tried using the sensitivity and specificity metrics provided by Keras and also the ones from scikit-learn. I want help understanding whether something is wrong with my code. I understand the model performance depends on a lot of things, but I want to know if there is something wrong with the code that gives me one sensitivity for Keras and a different one for scikit-learn. Also, there are misclassifications. How can I improve my model results? My code looks like below:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
#import keras as k
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
#from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import TimeDistributed, Input, Conv1D, BatchNormalization, Dropout, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn import metrics

def confusion_metrics(conf_matrix):
    # save confusion matrix and slice into four pieces
    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    print('True Positives:', TP)
    print('True Negatives:', TN)
    print('False Positives:', FP)
    print('False Negatives:', FN)
    # calculate accuracy
    conf_accuracy = (float(TP + TN) / float(TP + TN + FP + FN))
    # calculate mis-classification
    conf_misclassification = 1 - conf_accuracy
    # calculate the sensitivity
    conf_sensitivity = (TP / float(TP + FN))
    # calculate the specificity
    conf_specificity = (TN / float(TN + FP))
    # calculate precision
    conf_precision = (TN / float(TN + FP))
    # calculate f_1 score
    conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))
    print('-' * 50)
    print(f'Accuracy: {round(conf_accuracy, 2)}')
    print(f'Mis-Classification: {round(conf_misclassification, 2)}')
    print(f'Sensitivity: {round(conf_sensitivity, 2)}')
    print(f'Specificity: {round(conf_specificity, 2)}')
    print(f'Precision: {round(conf_precision, 2)}')
    print(f'f_1 Score: {round(conf_f1, 2)}')

def og_build_model_less_layer(n_rows, n_cols):
    ecg_input = Input(shape=(n_cols, n_rows), name='ecg_signal')
    print('model_input shape:', ecg_input.shape)
    c1 = Conv1D(80, 2, name='conv_1', kernel_initializer="glorot_uniform")(ecg_input)
    b1 = BatchNormalization(name='BN_1')(c1)
    #a1 = Activation('relu')(b1)
    d1 = Dropout(0.4, name='drop_1')(b1)
    c2 = Conv1D(80, 2, name='conv_2', kernel_initializer="glorot_uniform")(d1)
    b2 = BatchNormalization(name='BN_2')(c2)
    d2 = Dropout(0.6, name='drop_2')(b2)
    c3 = Conv1D(80, 2, name='conv_3', kernel_initializer="glorot_uniform")(d2)
    b3 = BatchNormalization(name='BN_3')(c3)
    d3 = Dropout(0.4, name='drop_3')(b3)
    c4 = Conv1D(80, 2, name='conv_4', kernel_initializer="glorot_uniform")(d3)
    b4 = BatchNormalization(name='BN_4')(c4)
    d4 = Dropout(0.6, name='drop_4')(b4)
    c5 = Conv1D(80, 2, name='conv_5', kernel_initializer="glorot_uniform")(d4)
    b5 = BatchNormalization(name='BN_5')(c5)
    d5 = Dropout(0.5, name='drop_5')(b5)
    fl = Flatten(name='fl')(d5)
    den1 = Dense(256, name='den1')(fl)
    den = Dense(30, name='den2')(den1)
    drp = Dropout(0.5)(den)
    output = Dense(1, activation='sigmoid')(drp)
    opt = Adam(learning_rate=1e-4)
    sens = tf.keras.metrics.SensitivityAtSpecificity(0.15)
    spec = tf.keras.metrics.SpecificityAtSensitivity(0.15)
    model = Model(inputs=ecg_input, outputs=output, name='model')
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy', sens, spec])
    print(model.summary())
    return model

train_df = pd.read_pickle('data/train_ecg_gl.pkl')
train_df = train_df.dropna()
train_df = train_df.sort_values(by='Time', ascending=True)#, na_position='first')
test_df = pd.read_pickle('data/test_ecg_gl.pkl')
test_df = test_df.dropna()
test_df = test_df.sort_values(by='Time', ascending=True)
df = pd.concat([train_df, test_df], ignore_index=True)
df = df.sort_values(by='Time')
data = df.iloc[:, 1:161].values
data = data[..., None]
labels = df['hypo_label'].values
train_data = train_df.iloc[:, 1:161].values
train_data = train_data[..., None]
train_labels = train_df['hypo_label'].values
test_data = test_df.iloc[:, 1:161].values
test_data = test_data[..., None]
test_labels = test_df['hypo_label'].values
xtrain, ytrain = train_data, train_labels
xtest, ytest = test_data, test_labels
n_cols = data.shape[2]
n_rows = data.shape[1]
batch_size, lr, verbose, epochs, val_split = 45, 0.01, 1, 40, 0.1
early_stopping_callback = EarlyStopping(monitor='loss', patience=10, mode='min', restore_best_weights=True)
cb_lr_reducer = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, min_lr=1e-5)
model = og_build_model_less_layer(n_cols, n_rows)
model.fit(x=xtrain, y=ytrain, epochs=epochs, verbose=verbose, batch_size=batch_size,
          validation_split=val_split, shuffle=False,
          callbacks=[cb_lr_reducer, early_stopping_callback])
_, taccuracy, tsensitivity, tspecificity = model.evaluate(xtest, ytest, batch_size=batch_size, verbose=verbose)
print('Model Test 0.7*0.3 Accuracy:', taccuracy)
print('Model Test 0.7*0.3 sensitivity:', tsensitivity)
print('Model Test 0.7*0.3 specificity:', tspecificity)
y_pred = model.predict(xtest)
y_pred = y_pred.flatten()
print(y_pred)
#print(p_pred.round(2))
# extract the predicted class labels
y_pred = np.where(y_pred < 0.5, 0, 1)
# Creating the confusion matrix
cm = metrics.confusion_matrix(ytest, y_pred)
# Assigning column names
cm_df = pd.DataFrame(cm, columns=['Predicted Negative', 'Predicted Positive'],
                     index=['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
print(cm_df)
confusion_metrics(cm)
After I run this code on the data linked here, it gives me the following output:
Model Test 0.7*0.3 Accuracy: 0.654349148273468
Model Test 0.7*0.3 sensitivity: 0.9166133999824524
Model Test 0.7*0.3 specificity: 0.9982390403747559
##################################################
##################################################
##################################################
[0.00757153 0.00837034 0.02366774 ... 0.5926605 0.59990513 0.56060743]
                 Predicted Negative  Predicted Positive
Actual Negative               29073                2160
Actual Positive               14531                1107
True Positives: 1107
True Negatives: 29073
False Positives: 2160
False Negatives: 14531
--------------------------------------------------
Accuracy: 0.64
Mis-Classification: 0.36
Sensitivity: 0.07
Specificity: 0.93
Precision: 0.93
f_1 Score: 0.13
The performance metrics improved when I used a custom validation set: a stratified 80-20 split of the training data.
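For reference, a minimal sketch of such a stratified split (the names xtrain, ytrain, model, epochs, batch_size, and the callbacks follow the question's code; the fixed random_state is my own assumption):
from sklearn.model_selection import train_test_split

# Stratified 80/20 split: train and validation keep the same class ratio
x_tr, x_val, y_tr, y_val = train_test_split(
    xtrain, ytrain, test_size=0.2, stratify=ytrain, random_state=42)

model.fit(x=x_tr, y=y_tr, epochs=epochs, batch_size=batch_size,
          validation_data=(x_val, y_val), shuffle=False,
          callbacks=[cb_lr_reducer, early_stopping_callback])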
NoneType error in Python when running Streamlit
Hi, I am trying to create an app in Python that will allow users to choose which classification model they want to apply to one of three open-source datasets in the scikit-learn library. The code is the following:
import streamlit as st
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

st.title("Streamlit example")
st.write("""
# Explore different classifier
which one is the best?
""")

dataset_name = st.sidebar.selectbox("Select Dataset", ("Iris", "Breast Cancer", "Wine Dataset"))
classifier_name = st.sidebar.selectbox("Select Classifier", ("KNN", "SVM", "Random Forest"))

def get_dataset(dataset_name):
    if dataset_name == "Iris":
        data = datasets.load_iris()
    elif dataset_name == "Breast Cancer":
        data = datasets.load_breast_cancer()
    else:
        data = datasets.load_wine()
    X = data.data
    y = data.target
    return X, y

X, y = get_dataset(dataset_name)
st.write("Shape of dataset", X.shape)
st.write("Number of classes", len(np.unique(y)))

def add_parameter_ui(clf_name):
    params = dict()
    if clf_name == "KNN":
        K = st.sidebar.slider("K", 1, 15)
        params["K"] = K
    elif clf_name == "SVM":
        C = st.sidebar.slider("C", 0.01, 10.0)
        params["C"] = C
    else:
        max_depth = st.sidebar.slider("max_depth", 2, 15)
        n_estimators = st.sidebar.slider("n_estimators", 1, 100)
        params["max_depth"] = max_depth
        params["n_estimators"] = n_estimators
        return params

params = add_parameter_ui(classifier_name)

def get_classifier(clf_name, params):
    if clf_name == "KNN":
        clf = KNeighborsClassifier(n_neighbors=params['K'])
    elif clf_name == "SVM":
        clf = SVC(C=params['C'])
    else:
        clf = RandomForestClassifier(n_estimators=params["n_estimators"],
                                     max_depth=params["max_depth"], random_state=1234)
    return clf

clf = get_classifier(classifier_name, params)
The error is:
clf = KNeighborsClassifier(n_neighbors=params['K'])
TypeError: 'NoneType' object is not subscriptable
I know the error is supposed to be self-explanatory, but I tried setting clf = None and still get the same error, and I'm asking someone to point me in the right direction.
The problem is in your add_parameter_ui function: you are not returning a value when clf_name is KNN or SVM, so params in the main code ends up None, and calling params['K'] then fails because a 'NoneType' object is not subscriptable. Here is the fixed code:
def add_parameter_ui(clf_name):
    params = dict()
    if clf_name == "KNN":
        K = st.sidebar.slider("K", 1, 15)
        params["K"] = K
        return params
    elif clf_name == "SVM":
        C = st.sidebar.slider("C", 0.01, 10.0)
        params["C"] = C
        return params
    # If Random Forest
    max_depth = st.sidebar.slider("max_depth", 2, 15)
    n_estimators = st.sidebar.slider("n_estimators", 1, 100)
    params["max_depth"] = max_depth
    params["n_estimators"] = n_estimators
    return params
LightGBM vs Sklearn LightGBM - Mistake in Implementation - Exact same parameters giving different results
While passing the exact same parameters to LightGBM and sklearn's implementation of LightGBM, I am getting different results. Initially I was getting the exact same results when doing this; however, I made some changes to my code and now I can't find out why they no longer match. This means that the performance metrics and the feature importances come out differently. Please help me figure out the mistake I am making; it could be either in the way I am implementing LightGBM using the original library or in sklearn's implementation. Link for an explanation of why we should get identical results: light gbm - python API vs Scikit-learn API
import csv
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_dummy[df_merge.columns], labels,
                                                    test_size=0.25, random_state=42)

n_folds = 5
lgb_train = lgb.Dataset(x_train, y_train)

def objective(params, n_folds=n_folds):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    print(params)
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_child_samples'] = int(params['min_child_samples'])
    params['subsample_freq'] = int(params['subsample_freq'])
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evaluate based on ROC AUC
    cv_results = lgb.cv(params, lgb_train, nfold=n_folds, num_boost_round=10000,
                        early_stopping_rounds=100, metrics='auc')
    # Extract the best score
    best_score = max(cv_results['auc-mean'])
    # Loss must be minimized
    loss = 1 - best_score
    num_iteration = int(np.argmax(cv_results['auc-mean']) + 1)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, num_iteration])
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK, 'estimators': num_iteration}

space = {
    'min_child_samples': hp.quniform('min_child_samples', 5, 100, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'subsample_freq': hp.quniform('subsample_freq', 0, 10, 1),
    'min_gain_to_split': hp.quniform('min_gain_to_split', 0.01, 0.1, 0.01),
    'learning_rate': 0.05,
    'objective': 'binary',
}

out_file = 'results/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'estimators'])
of_connection.close()

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10)

bayes_trials_results = sorted(trials.results, key=lambda x: x['loss'])
results = pd.read_csv('results/gbm_trials.csv')

# Sort with best scores on top and reset index for slicing
results.sort_values('loss', ascending=True, inplace=True)
results.reset_index(inplace=True, drop=True)
results.head()

best_bayes_estimators = int(results.loc[0, 'estimators'])
best['max_depth'] = int(best['max_depth'])
best['num_leaves'] = int(best['num_leaves'])
best['min_child_samples'] = int(best['min_child_samples'])
num_boost_round = int(best_bayes_estimators * 1.1)
best['objective'] = 'binary'
best['boosting_type'] = 'gbdt'
best['subsample_freq'] = int(best['subsample_freq'])

# Actual LightGBM
best_gbm = lgb.train(params=best, train_set=lgb_train, num_boost_round=num_boost_round)

print('Plotting feature importances...')
ax = lgb.plot_importance(best_gbm, max_num_features=15)
plt.show()

feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["importance_gain"] = best_gbm.feature_importance(importance_type='gain')
feature_imp["importance_split"] = best_gbm.feature_importance(importance_type='split')
feature_imp.to_clipboard()

y_pred_score = best_gbm.predict(x_test)

roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4, 0.5, 0.6, 0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score >= threshold, 1, 0)
    print(roc_auc_score(y_test, y_pred_score))
    print(f1_score(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    roc_auc_score_list.append(roc_auc_score(y_test, y_pred_score))
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))

performance_metrics = pd.DataFrame({
    'thresholds': thresholds,
    'roc_auc_score': roc_auc_score_list,
    'f1_score': f1_score_list,
    'accuracy_score': accuracy_score_list,
    'precision_score': precision_score_list,
    'recall_score': recall_score_list,
})
performance_metrics.transpose().to_clipboard()

# Sklearn's implementation of LightGBM
best_sk = dict(best)
del best_sk['min_gain_to_split']
sk_best_gbm = lgb.LGBMClassifier(**best_sk, n_estimators=num_boost_round, learning_rate=0.05,
                                 min_split_gain=best['min_gain_to_split'])
sk_best_gbm.fit(x_train, y_train)
sk_best_gbm.get_params()

print('Plotting feature importances...')
ax = lgb.plot_importance(sk_best_gbm, max_num_features=15)
plt.show()

feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["Importance"] = sk_best_gbm.feature_importances_
feature_imp.to_clipboard()

y_pred_score = sk_best_gbm.predict_proba(x_test)[:, 1]

roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4, 0.5, 0.6, 0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score >= threshold, 1, 0)
    print(roc_auc_score(y_test, y_pred_score))
    print(f1_score(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    roc_auc_score_list.append(roc_auc_score(y_test, y_pred_score))
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))

performance_metrics = pd.DataFrame({
    'thresholds': thresholds,
    'roc_auc_score': roc_auc_score_list,
    'f1_score': f1_score_list,
    'accuracy_score': accuracy_score_list,
    'precision_score': precision_score_list,
    'recall_score': recall_score_list,
})
performance_metrics.transpose().to_clipboard()
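As a sanity check, the equivalence described in the linked question can be tested in isolation. The sketch below is my own minimal example (synthetic data and arbitrary parameters, not the asker's pipeline); if the two predictions agree here but differ in your pipeline, the divergence likely comes from the surrounding code (for example the parameter post-processing or differing round counts) rather than from the two APIs themselves:
import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
params = {'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 31, 'seed': 1234}

# Native API: 50 boosting rounds
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=50)
pred_native = booster.predict(X)

# sklearn wrapper: same parameters, with n_estimators playing the role of num_boost_round
clf = lgb.LGBMClassifier(n_estimators=50, **params)
clf.fit(X, y)
pred_sklearn = clf.predict_proba(X)[:, 1]

print(np.allclose(pred_native, pred_sklearn))  # expected: True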
Use MLPRegressor to get better results than a linear model
My problem is getting a better result in MSE (and, I hope, in R²) than a linear model, so I used sklearn.neural_network.MLPRegressor to compare the two:
import time
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor

def testfit(v, k, subset=2, hls=(50, 50, 50, 10), acv='identity'):
    # prep variables
    n = len(v)
    n1 = n // subset
    X = getX(v[0:n1], k)
    y = gety(v[0:n1], k)
    # define models
    nn1 = MLPRegressor(hidden_layer_sizes=hls, activation=acv, solver='adam', alpha=0.01,
                       batch_size='auto', learning_rate='constant', learning_rate_init=0.1,
                       power_t=1, max_iter=50000, shuffle=True, random_state=None,
                       tol=0.00001, verbose=False, warm_start=False, momentum=0.9,
                       nesterovs_momentum=True, early_stopping=False, validation_fraction=0.5,
                       beta_1=0.9, beta_2=0.999, epsilon=1e-10)
    ols = linear_model.LinearRegression()
    # run models
    st = time.time()
    fnnw = nn1.fit(X, y)
    nnw_dur = time.time() - st
    st = time.time()
    flin = ols.fit(X, y)
    ols_dur = time.time() - st
    # run goodness-of-fit on the hold-out part
    X2 = getX(v[n1:n], k)
    y2 = gety(v[n1:n], k)
    # neural network
    # in-sample
    yn = fnnw.predict(X)
    gin = pearsonr(y, yn)[0]**2
    ginse = sum((y - yn)**2)
    # out-of-sample
    yn2 = fnnw.predict(X2)
    oin = pearsonr(y2, yn2)[0]**2
    oinse = sum((y2 - yn2)**2)
    # ols
    # in-sample
    yl = flin.predict(X)
    gil = pearsonr(y, yl)[0]**2
    gilse = sum((y - yl)**2)
    # out-of-sample
    yl2 = flin.predict(X2)
    oil = pearsonr(y2, yl2)[0]**2
    oilse = sum((y2 - yl2)**2)
    plt.subplot(321)
    plt.plot(y2)
    plt.plot(yl2)
The best case in this scenario would be that my neural network's MSE on FORECAST +1 is smaller than the OLS MSE on FORECAST +1. Or is it not possible to get a smaller error with a neural network than with a linear model in this way?