How to perform grid search for Multiple ML Models

How to perform grid search for Multiple ML Models - python

Normally we use GridSearchCV for performing grid search on hyperparameters of one particular model, like for example:
model_ada = AdaBoostClassifier()
params_ada = {'n_estimators':[10,20,30,50,100,500,1000], 'learning_rate':[0.5,1,2,5,10]}
grid_ada = GridSearchCV(estimator = model_ada, param_grid = params_ada, scoring = 'accuracy', cv = 5, verbose = 1, n_jobs = -1)
grid_ada.fit(X_train, y_train)
Is there any technique or function which allows us to perform grid search on ML models themselves? For example, I want to do as given below:
models = {'model_gbm':GradientBoostingClassifier(), 'model_rf':RandomForestClassifier(), 'model_dt':DecisionTreeClassifier(), 'model_svm':SVC(), 'model_ada':AdaBoostClassifier()}
params_gbm = {'learning_rate':[0.1,0.2,0.3,0.4], 'n_estimators':[50,100,500,1000,2000]}
params_rf = {'n_estimators':[50,100,500,1000,2000]}
params_dt = {'splitter':['best','random'], 'max_depth':[1, 5, 10, 50, 100]}
params_svm = {'C':[1,2,5,10,50,100,500], 'kernel':['rbf','poly','sigmoid','linear']}
params_ada = {'n_estimators':[10,20,30,50,100,500,1000], 'learning_rate':[0.5,1,2,5,10]}
params = {'params_gbm':params_gbm, 'params_rf':params_rf, 'params_dt':params_dt, 'params_svm':params_svm, 'params_ada':params_ada}
grid_ml = "that function"(models = models, params = params)
grid_ml.fit(X_train, y_train)
where "that function" is the function which I need to use to perform this type of operation.

Even I faced a similar issue, but couldn't find a predefined package/method that could possibly achieve this. Hence I wrote my own function to achieve this :
def Algo_search(models , params):
max_score = 0
max_model = None
max_model_params = None
for i,j in zip(models.keys() , models.values() ):
gs = GridSearchCV(estimator=j,param_grid=params[i])
a = gs.fit(X_train,y_train)
score = gs.score(X_test,y_test)
if score > max_score:
max_score = score
max_model = gs.best_estimator_
max_model_params = gs.best_params_
return max_score, max_model, max_model_params
#Data points
models = {'model_gbm':GradientBoostingClassifier(), 'model_rf':RandomForestClassifier(),
'model_dt':DecisionTreeClassifier(), 'model_svm':SVC(), 'model_ada':AdaBoostClassifier()}
params_gbm = {'learning_rate':[0.1,0.2,0.3,0.4], 'n_estimators':[50,100,500,1000,2000]}
params_rf = {'n_estimators':[50,100,500,1000,2000]}
params_dt = {'splitter':['best','random'], 'max_depth':[1, 5, 10, 50, 100]}
params_svm = {'C':[1,2,5,10,50,100,500], 'kernel':['rbf','poly','sigmoid','linear']}
params_ada = {'n_estimators':[10,20,30,50,100,500,1000], 'learning_rate':[0.5,1,2,5,10]}
params = {'model_gbm':params_gbm, 'model_rf':params_rf, 'model_dt':params_dt, 'model_svm':params_svm, 'model_ada':params_ada}
grid_ml = Algo_search(models = models, params = params)

It should be straightforward to perform multiple GridSearchCV then compare the results.
Below is a complete example on how to achieve this.
Note that there is a room for improvement, I will leave it to you. However, this is just to give you some insights of the idea.
from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier, \
RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
def get_param(model_name, params):
"""
Not the most sufficient way.
I recommend to have params and models
in OrderedDict() instead.
"""
for k, v in params.items():
mn = str(model_name).upper().split('_')
for k_ in str(k).upper().split('_'):
if k_ in mn:
return v
def models_gridSearchCV(models, params, scorer, X, y):
all_results = dict.fromkeys(models.keys(), [])
best_model = {'model_name': None,
'best_estimator': None,
'best_params': None,
'best_score': -9999999}
for model_name, model in models.items():
print("Processing {} ...".format(model_name))
# or use OrderedDict() and zip(models, params) above
# so there will be no need to check
param = get_param(model_name, params)
if param is None:
continue
clf = GridSearchCV(model, param, scoring=scorer)
clf.fit(X, y)
all_results[model_name] = clf.cv_results_
if clf.best_score_ > best_model.get('best_score'):
best_model['model_name'] = model_name
best_model['best_estimator'] = clf.best_estimator_
best_model['best_params'] = clf.best_params_
best_model['best_score'] = clf.best_score_
return best_model, all_results
### TEST ###
iris = datasets.load_iris()
X, y = iris.data, iris.target
# OrderedDict() is recommended here
# to maintain order between models and params
models = {'model_gbm': GradientBoostingClassifier(),
'model_rf': RandomForestClassifier(),
'model_dt': DecisionTreeClassifier(),
'model_svm': SVC(),
'model_ada': AdaBoostClassifier()}
params_gbm = {'learning_rate': [0.1, 0.2], 'n_estimators': [50, 100]}
params_rf = {'n_estimators': [50, 100]}
params_dt = {'splitter': ['best', 'random'], 'max_depth': [1, 5]}
params_svm = {'C': [1, 2, 5], 'kernel': ['rbf', 'linear']}
params_ada = {'n_estimators': [10, 100], 'learning_rate': [0.5, 1]}
# OrderedDict() is recommended here
# to maintain order between models and params
params = {'params_gbm': params_gbm,
'params_rf': params_rf,
'params_dt': params_dt,
'params_svm': params_svm,
'params_ada': params_ada}
best_model, all_results = models_gridSearchCV(models, params, 'accuracy', X, y)
print(best_model)
# print(all_results)
Result
Processing model_gbm ...
Processing model_rf ...
Processing model_dt ...
Processing model_svm ...
Processing model_ada ...
{'model_name': 'model_svm', 'best_estimator': SVC(C=5),
'best_params': {'C': 5, 'kernel': 'rbf'}, 'best_score': 0.9866666666666667}

Related

GridSearchCV with LightFM

I'm a newbie so my apologies if something I ask might be to obvious and my english is not quite good. I'm stuck in doing a custom grid search with cross validation with LightFM which does not come with those functions. It seem the way I split the dataset is wrong but I do not understand why since I've replicated the code of the function random_train_test_split to get the folds. The error I get is Incorrect number of features in item_features.
I'm stuck and I do not know how to go on.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse
def create_processed_dataset():
"""
One-Time execution
Returns:
embeddings.csv and observations.csv
"""
output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
os.makedirs(output_path, exist_ok=True)
"""
Data imports
"""
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
# Load mappings and filter them if a corresponding embedding is found
mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None, names=["movie_id", "movie_name", "movie_uri"])
mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
# Create a pandas dataframe with embeddings
embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
embeddings.set_index("movie_id", inplace=True)
ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)
def generate_list_of_hyper_parameters(parameters_grid):
return (
{y: z for y, z in zip(parameters_grid.keys(), x)}
for x in itertools.product(*parameters_grid.values())
)
def create_csr_from_dataset(observations, embeddings):
dataset = Dataset(item_identity_features=True, user_identity_features=False)
feature_names = [str(i) for i in range(0, 200)]
dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
num_items, num_fts = dataset.item_features_shape()
print(f'Num items: {num_items}, num_features: {num_fts}.')
interactions, weights = dataset.build_interactions(
observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
)
item_features = []
for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
for x, y in row.items():
item_features.append((item_id, {x: y}))
item_features = dataset.build_item_features(item_features)
return interactions, item_features
def folding(interactions, k_folds=10):
if not scipy.sparse.issparse(interactions):
return None
coo = interactions.tocoo()
kf = KFold(n_splits=k_folds) # Define the split - into 2 folds
shape = interactions.shape
uids, iids, data = (coo.row, coo.col, coo.data)
def to_coo_matrix(indexes):
return scipy.sparse.coo_matrix(
(data[indexes], (uids[indexes], iids[indexes])),
shape=shape,
dtype=coo.dtype,
)
return [
(to_coo_matrix(train_index), to_coo_matrix(validation_index))
for train_index, validation_index in kf.split(data)
]
def grid_search(parameters_grid, k_fold, interactions, item_features=None):
results = []
for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
print(f"{hyper_params} && current_fold:{current_fold}")
model = LightFM(**hyper_params)
model.fit(train, epochs=50, item_features=item_features, num_threads=6)
score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
results.append((score, hyper_params, model))
print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
results.sort(key=lambda x: x[0])
return results
def main():
observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
interactions, item_features = create_csr_from_dataset(observations, embeddings)
train, test = random_train_test_split(interactions, test_percentage=0.2)
print(embeddings.head())
num_movies = len(embeddings.index)
num_ratings = len(observations.index)
num_users = observations.user_id.unique().size
sparsity = 1 - num_ratings / (num_users * num_movies)
print(
f"num_users: {num_users}, num_movies: {num_movies}, "
f"num_observations: {num_ratings}, "
f"sparsity: ~{sparsity * 100}"
)
model = LightFM()
# parametri da testare
param_grid = {
'no_components': range(10, 110, 10),
'learning_rate': [0.01, 0.05, 0.1],
'item_alpha': [0.0001, 0.001, 0.01],
'user_alpha': [0.0001, 0.001, 0.01],
}
results = grid_search(param_grid, 10, train, item_features=item_features)
print(results[0][0])
# grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
# grid.fit(train)
#
# # stampare i migliori parametri
# print("Best parameters found: ", grid.best_params_)
if __name__ == "__main__":
main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806

None Type error in Python when running streamlit

Hi I am trying create an App in python that will allow users to choose which classification model they want to implement on one of three open source data in SK-Learn library The code is the following:
import streamlit as st
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
st.title("Streamlit example")
st.write("""
# Explore different classifier
which one is the best?
""")
dataset_name = st.sidebar.selectbox("Select Dataset", ("Iris","Breast Cancer","Wine Dataset") )
classifier_name = st.sidebar.selectbox("Select Classifier", ("KNN","SVM","Random Forest") )
def get_dataset(dataset_name):
if dataset_name == "Iris":
data = datasets.load_iris()
elif dataset_name == "Breast Cancer":
data = datasets.load_breast_cancer()
else:
data = datasets.load_wine()
X = data.data
y = data.target
return X, y
X, y = get_dataset(dataset_name)
st.write("Shape of datset", X.shape)
st.write("Number of classes", len(np.unique(y)))
def add_parameter_ui(clf_name):
params = dict()
if clf_name =="KNN":
K = st.sidebar.slider("K",1,15)
params["K"] = K
elif clf_name =="SVM":
C = st.sidebar.slider("C", 0.01,10.0)
params["C"] = C
else:
max_depth = st.sidebar.slider("max_depth", 2,15)
n_estimators = st.sidebar.slider("n_estimators",1,100)
params["max_depth"]= max_depth
params["n_estimators"] = n_estimators
return params
params = add_parameter_ui(classifier_name)
def get_classifier(clf_name,params):
if clf_name == "KNN":
clf = KNeighborsClassifier(n_neighbors=params['K'])
elif clf_name == "SVM":
clf = SVC(C= params['C'])
else:
clf = RandomForestClassifier(n_estimators=params["n_estimators"],max_depth=params["max_depth"],random_state=1234)
return clf
clf = get_classifier(classifier_name,params)
The error is:
clf = KNeighborsClassifier(n_neighbors=params['K'])
TypeError: 'NoneType' object is not subscriptable
I know the error is supposed to be self-explanatory but I tried to state clf = None but still get the same error and i'm asking someone to put me in the right direction.

The problem is in your add_parameter_ui function. you are not returning a value in the case of clf_name is KNN or SVM and this causes params in the main code to be None so calling params['K'] is not because 'NoneType' object is not subscriptable.
Here is the fixed code:
def add_parameter_ui(clf_name):
params = dict()
if clf_name =="KNN":
K = st.sidebar.slider("K",1,15)
params["K"] = K
return params
elif clf_name =="SVM":
C = st.sidebar.slider("C", 0.01,10.0)
params["C"] = C
return params
# If Random Forest
max_depth = st.sidebar.slider("max_depth", 2,15)
n_estimators = st.sidebar.slider("n_estimators",1,100)
params["max_depth"]= max_depth
params["n_estimators"] = n_estimators
return params

Why is target encoder encoding some values as NaN?

I am using a target encoder from category_encoders to encode a feature, here is the code I m using:
from category_encoders import TargetEncoder
def encode_large_features(features, X_train, X_test, y_train):
print('target encoding features ...')
for _ in features:
target_encoder = TargetEncoder(_)
target_encoder.fit(X_train[_], y_train)
name = _ + '_encoded'
X_train[name] = target_encoder.transform(X_train[_])
X_train.drop([_], axis=1, inplace=True)
X_test[name] = target_encoder.transform(X_test[_])
X_test.drop([_], axis=1, inplace=True)
return X_train, X_test
the target encoder encodes some values as NaN and I dont know why? here is an example:

Faced the same issue: Raised Issue n Repo
Found a workaround by Building a Custom KFold-Target Encoder which is better than the library version. KFold Target Encoder is less susceptible to data leakage / fewer chances of overfitting.
This will not return NaN in the training Dataset like category_encoder library.
Below example: chid is a categorical column apply KFoldTargetEncoder on it.
Libraries required:
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn import base
Training Dataset:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):
self.colnames = colnames
self.targetName = targetName
self.n_fold = n_fold
self.verbosity = verbosity
self.discardOriginal_col = discardOriginal_col
def fit(self, X, y=None):
return self
def transform(self,X):
assert(type(self.targetName) == str)
assert(type(self.colnames) == str)
assert(self.colnames in X.columns)
assert(self.targetName in X.columns)
mean_of_target = X[self.targetName].mean()
kf = KFold(n_splits = self.n_fold, shuffle = False, random_state=2019)
col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
X[col_mean_name] = np.nan
for tr_ind, val_ind in kf.split(X):
X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
X[col_mean_name].fillna(mean_of_target, inplace = True)
if self.verbosity:
encoded_feature = X[col_mean_name].values
print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
self.targetName,
np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
if self.discardOriginal_col:
X = X.drop(self.targetName, axis=1)
return X
Fit_Transform on Training Data:
targetc_chid = KFoldTargetEncoderTrain('chid','target',n_fold=5)
train_df = targetc_chid.fit_transform(train_df)
Test Dataset:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
def __init__(self,train,colNames,encodedName):
self.train = train
self.colNames = colNames
self.encodedName = encodedName
def fit(self, X, y=None):
return self
def transform(self,X):
mean = self.train[[self.colNames,
self.encodedName]].groupby(
self.colNames).mean().reset_index()
dd = {}
for row in tqdm(mean.itertuples(index=False)):
dd[row[0]] = row[1]
X[self.encodedName] = X[self.colNames]
X[self.encodedName] = X[self.encodedName].map(dd.get)
return X
Fit on Test Data:
test_targetc_chid = KFoldTargetEncoderTest(train_df,'chid','chid_Kfold_Target_Enc')
valid_df = test_targetc_chid.fit_transform(valid_df)

Classification Problem: iterate different K for KNN

I am currently working on a classification problem (tweet sentiment analysis) and I would like to include a for loop for different K-values (KNN) in the classifiers list below.
I know that I could just go with:
KNeighborsClassifier(3), KNeighborsClassifier(5)... But I am trying to implement the rather elegant solution with a for loop.
Unfortunately, trying to create an empty list and add the different K values to it and then including it in the classifiers = [] list does not work properly. Do you have any good recommendations?
My code:
classifiers = [
KNeighborsClassifier(3),
LogisticRegression(),
SVC(kernel = "rbf", C = 0.025, probability = True),
NuSVC(probability = True),
DecisionTreeClassifier(),
RandomForestClassifier(),
GradientBoostingClassifier(),
MultinomialNB(),
BernoulliNB()]
for clf in classifiers:
clf.fit(train_tf, y_train)
name = clf.__class__.__name__
expectation = y_train
train_prediction = clf.predict(train_tf)
acc = accuracy_score(expectation, train_prediction)
pre = precision_score(expectation, train_prediction)
rec = recall_score(expectation, train_prediction)
f1 = f1_score(expectation, train_prediction)
fig, ax = plt.subplots(1,2, figsize=(14,4))
plt.suptitle(f'{name} \n', fontsize = 18)
plt.subplots_adjust(top = 0.8)
skplt.metrics.plot_confusion_matrix(expectation, train_prediction, ax=ax[0])
skplt.metrics.plot_confusion_matrix(expectation, train_prediction, normalize=True, ax = ax[1])
plt.show()
print(f"for the {name} we receive the following values:")
print("Accuracy: {:.3%}".format(acc))
print('Precision score: {:.3%}'.format(pre))
print('Recall score: {:.3%}'.format(rec))
print('F1 score: {:.3%}'.format(f1))
If you need any more info, just let me know :) Thank you very much in advance!

Something like this could work, where you basically just freeze the iteration of KNeighbors until all the neighbor values are exhausted.
classifiers = [
KNeighborsClassifier(3),
LogisticRegression(),
SVC(kernel = "rbf", C = 0.025, probability = True),
NuSVC(probability = True),
DecisionTreeClassifier(),
RandomForestClassifier(),
GradientBoostingClassifier(),
MultinomialNB(),
BernoulliNB()]
n_neighbors = [3, 5, 6, 10, 15] # or whatever
class_iter = iter(classifiers)
clf = next(class_iter)
while True:
try:
if isinstance(clf, KNeighborsClassifier) and any(n_neighbors):
neighbor_val = n_neighbors.pop()
clf.set_params(n_neighbors=neighbor_val)
else:
clf = next(class_iter)
#rest of code here
clf.fit(train_tf, y_train)
name = clf.__class__.__name__
expectation = y_train
train_prediction = clf.predict(train_tf)
acc = accuracy_score(expectation, train_prediction)
pre = precision_score(expectation, train_prediction)
rec = recall_score(expectation, train_prediction)
f1 = f1_score(expectation, train_prediction)
fig, ax = plt.subplots(1,2, figsize=(14,4))
plt.suptitle(f'{name} \n', fontsize = 18)
plt.subplots_adjust(top = 0.8)
skplt.metrics.plot_confusion_matrix(expectation, train_prediction, ax=ax[0])
skplt.metrics.plot_confusion_matrix(expectation, train_prediction, normalize=True, ax = ax[1])
plt.show()
print(f"for the {name} we receive the following values:")
print("Accuracy: {:.3%}".format(acc))
print('Precision score: {:.3%}'.format(pre))
print('Recall score: {:.3%}'.format(rec))
print('F1 score: {:.3%}'.format(f1))
except StopIteration:
break

LightGBM vs Sklearn LightGBM- Mistake in Implementation- Exact same parameters giving different results

While passing the exact same parameters to LightGBM and sklearn's implementation of LightGBM, I am getting different results. Initially, I was getting the exact same results on doing this, however, I made some changes to my code and now I can't find out why they're not coming the same. This means that the performance metrics and feature importance are coming differently. Please help me figure it out, I can't figure out the mistake I am making. It could either be a mistake in the way I am implementing LightGBM using the original library or in sklearn's implementation. Link for explanation on why we should get identical results - light gbm - python API vs Scikit-learn API
x_train, x_test, y_train, y_test = train_test_split(df_dummy[df_merge.columns], labels, test_size=0.25,random_state=42)
n_folds = 5
lgb_train = lgb.Dataset(x_train, y_train)
def objective(params, n_folds = n_folds):
"""Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
print(params)
params['max_depth'] = int(params['max_depth'])
params['num_leaves'] = int(params['num_leaves'])
params['min_child_samples'] = int(params['min_child_samples'])
params['subsample_freq'] = int(params['subsample_freq'])
# Perform n_fold cross validation with hyperparameters
# Use early stopping and evalute based on ROC AUC
cv_results = lgb.cv(params, lgb_train, nfold=n_folds, num_boost_round=10000,
early_stopping_rounds=100, metrics='auc')
# Extract the best score
best_score = max(cv_results['auc-mean'])
# Loss must be minimized
loss = 1 - best_score
num_iteration = int(np.argmax(cv_results['auc-mean']) + 1)
of_connection = open(out_file, 'a')
writer = csv.writer(of_connection)
writer.writerow([loss, params, num_iteration])
# Dictionary with information for evaluation
return {'loss': loss, 'params': params, 'status': STATUS_OK, 'estimators': num_iteration}
space = {
'min_child_samples': hp.quniform('min_child_samples', 5, 100, 5),
'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
'max_depth' : hp.quniform('max_depth', 3, 10, 1),
'subsample' : hp.quniform('subsample', 0.6, 1, 0.05),
'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
'subsample_freq': hp.quniform('subsample_freq',0,10,1),
'min_gain_to_split': hp.quniform('min_gain_to_split', 0.01, 0.1, 0.01),
'learning_rate' : 0.05,
'objective' : 'binary',
}
out_file = 'results/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'estimators'])
of_connection.close()
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10)
bayes_trials_results = sorted(trials.results, key = lambda x: x['loss'])
results = pd.read_csv('results/gbm_trials.csv')
# Sort with best scores on top and reset index for slicing
results.sort_values('loss', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()
best_bayes_estimators = int(results.loc[0, 'estimators'])
best['max_depth'] = int(best['max_depth'])
best['num_leaves'] = int(best['num_leaves'])
best['min_child_samples'] = int(best['min_child_samples'])
num_boost_round=int(best_bayes_estimators * 1.1)
best['objective'] = 'binary'
best['boosting_type'] = 'gbdt'
best['subsample_freq'] = int(best['subsample_freq'])
#Actual LightGBM
best_gbm = lgb.train(params=best, train_set=lgb_train, num_boost_round=num_boost_round)
print('Plotting feature importances...')
ax = lgb.plot_importance(best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["importance_gain"] = best_gbm.feature_importance(importance_type='gain')
feature_imp["importance_split"] = best_gbm.feature_importance(importance_type='split')
feature_imp.to_clipboard()
y_pred_score = best_gbm.predict(x_test)
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
print("threshold is {}".format(threshold))
y_pred = np.where(y_pred_score>=threshold, 1, 0)
print(roc_auc_score(y_test,y_pred_score))
print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
f1_score_list.append(f1_score(y_test,y_pred))
accuracy_score_list.append(accuracy_score(y_test,y_pred))
precision_score_list.append(precision_score(y_test,y_pred))
recall_score_list.append(recall_score(y_test,y_pred))
performance_metrics = pd.DataFrame(
{'thresholds':thresholds,
'roc_auc_score':roc_auc_score_list,
'f1_score':f1_score_list,
'accuracy_score':accuracy_score_list,
'precision_score':precision_score_list,
'recall_score':recall_score_list })
performance_metrics.transpose().to_clipboard()
#Sklearn's Implementation of LightGBM
best_sk = dict(best)
del best_sk['min_gain_to_split']
sk_best_gbm = lgb.LGBMClassifier(**best_sk, n_estimators=num_boost_round, learning_rate=0.05, min_split_gain=best['min_gain_to_split'])
sk_best_gbm.fit(x_train, y_train)
sk_best_gbm.get_params()
print('Plotting feature importances...')
ax = lgb.plot_importance(sk_best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["Importance"] = sk_best_gbm.feature_importances_
feature_imp.to_clipboard()
y_pred_score = sk_best_gbm.predict_proba(x_test)[:,1]
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
print("threshold is {}".format(threshold))
y_pred = np.where(y_pred_score>=threshold, 1, 0)
print(roc_auc_score(y_test,y_pred_score))
print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
roc_auc_score_list.append(roc_auc_score(y_test,y_pred_score))
f1_score_list.append(f1_score(y_test,y_pred))
accuracy_score_list.append(accuracy_score(y_test,y_pred))
precision_score_list.append(precision_score(y_test,y_pred))
recall_score_list.append(recall_score(y_test,y_pred))
performance_metrics = pd.DataFrame(
{'thresholds':thresholds,
'roc_auc_score':roc_auc_score_list,
'f1_score':f1_score_list,
'accuracy_score':accuracy_score_list,
'precision_score':precision_score_list,
'recall_score':recall_score_list })
performance_metrics.transpose().to_clipboard()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to perform grid search for Multiple ML Models - python

Related

GridSearchCV with LightFM

None Type error in Python when running streamlit

Why is target encoder encoding some values as NaN?

Classification Problem: iterate different K for KNN

LightGBM vs Sklearn LightGBM- Mistake in Implementation- Exact same parameters giving different results

Categories

Resources