Increasing SVM model accuracy using hyperparameters - Python

I'm building a language detection model that classifies one word at a time, based on how letters appear within the word rather than how words appear in a sentence. The languages in my training data are English, Hungarian, and Latin, and the data is arranged so that each line contains a single word.
I first tried logistic regression and got an accuracy of around 80%, so I switched to an SVM. I then ran a grid search, but the results did not improve significantly.
I need the accuracy to be around 96%. Should I consider a different model, or can this approach be improved?
You can find the data files [here][1] and the [Google Colab notebook][2].
Here are the confusion matrices: first SVM [SVM][3], first logistic regression [LR][4], optimized SVM [Optimized SVM][5].
import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import linear_model
from sklearn import pipeline
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
import time
from sklearn.utils import resample
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#load the drive
drive.mount('/content/gdrive/',force_remount=True)
#load raw training files
hun_df = pd.read_csv("/content/gdrive/MyDrive/language_detection_files/hun_words.txt", encoding="utf-8", header=None, names=["Hungarian"])  # pass encoding as a keyword; positionally it would be read as the sep argument
eng_df = pd.read_csv("/content/gdrive/MyDrive/language_detection_files/eng_words.txt", encoding="utf-8", header=None, names=["English"])
lat_df = pd.read_csv("/content/gdrive/MyDrive/language_detection_files/lat_words.txt", encoding="utf-8", header=None, names=["Latin"])
#regular expression to clean the data
string_pattern = r'\W|\d+|[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]'
regex_pattern = re.compile(string_pattern) # compile string pattern to re.Pattern object
#natural language processing toolkit
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('hungarian'))
#cleaning and putting the words into a list
data_hun=[]
lang_hun=[]
for i, line in hun_df.iterrows():
    line = line['Hungarian']
    line = line.lower()
    line = re.sub(regex_pattern, '', line)
    if line not in stop_words and 1 < len(line) < 14:  # trying to downsample to balance the classes
        data_hun.append(line)
        lang_hun.append("Hungarian")  # len(data_hun) is 254211
stop_words = set(stopwords.words('english'))
data_eng=[]
lang_eng=[]
for i, line in eng_df.iterrows():
    line = line['English']
    line = line.lower()
    line = re.sub(regex_pattern, '', line)  # regex_pattern is already compiled above
    if line not in stop_words and 1 < len(line) < 14:  # trying to downsample to balance the classes
        data_eng.append(line)
        lang_eng.append("English")  # len(data_eng) is 336610
data_lat=[]
lang_lat=[]
for i, line in lat_df.iterrows():
    line = line['Latin']
    line = line.lower()
    line = re.sub(regex_pattern, '', line)
    if 1 < len(line) < 14:
        data_lat.append(line)
        lang_lat.append("Latin")  # len(data_lat) is 185249
number_of_samples = 10000
# downsample and balance the data
lat = resample(data_lat, replace=True, n_samples=number_of_samples, random_state=42)
hun = resample(data_hun, replace=True, n_samples=number_of_samples, random_state=42)
eng = resample(data_eng, replace=True, n_samples=number_of_samples, random_state=42)
lang_lat = lang_lat[0:number_of_samples]
lang_hun = lang_hun[0:number_of_samples]
lang_eng = lang_eng[0:number_of_samples]
df = pd.DataFrame({"Text": eng + hun + lat,
                   "Language": lang_eng + lang_hun + lang_lat})
print(df.shape)
# splitting the data to train and test
X, y = df.iloc[:, 0], df.iloc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# vectorize the cleaned text: character n-grams of length 1-5 rather than whole words
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 5), analyzer='char')
LR_pipeline = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('clf', linear_model.LogisticRegression(solver='newton-cg', max_iter=1000))  # other solvers: 'lbfgs', 'sag', 'saga'
])
# prepare, fit, and time the logistic regression model
start = time.time()
LR_pipeline.fit(X_train,y_train)
end = round(time.time()-start,2)
end
y_predicted=LR_pipeline.predict(X_test)
acc=(metrics.accuracy_score(y_test,y_predicted))*100
print(acc,'%')
# logistic regression model metrics
print(classification_report(y_test, y_predicted))
# prepare data for the SVM model and fit it
Tfidf_vect = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 5), analyzer='char')
Tfidf_vect.fit(X_train)  # fit on the training split only; fitting on all of df would leak test data into the vocabulary
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',verbose=True)
start = time.time()
SVM.fit(X_train_Tfidf, y_train)
end = round(time.time()-start,2)
end
y_predicted=SVM.predict(X_test_Tfidf)
acc=(metrics.accuracy_score(y_test,y_predicted))*100
print(acc,'%')
#SVM classification metrics
print(classification_report(y_test, y_predicted))
# The two confusion matrices side by side
plot_confusion_matrix(SVM, X_test_Tfidf, y_test)
plot_confusion_matrix(LR_pipeline, X_test, y_test)
plt.show()
# define parameter grids and run grid searches to tune the hyperparameters
# note: with kernel='linear', gamma is ignored, so the gamma values in the linear grids only multiply the runtime
param_grid = {'C': [100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train_Tfidf, y_train)
# wider C range, linear kernel
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train_Tfidf, y_train)
# same C range, RBF kernel (gamma matters here)
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train_Tfidf, y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
'''
best parameters
{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')
'''
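Since the best kernel came back linear, a possible speed-up (a sketch, not part of the original notebook): gamma has no effect with kernel='linear', so the linear grids above effectively only vary C, and LinearSVC optimizes the linear case directly, typically running much faster on large sparse TF-IDF matrices.
from sklearn.svm import LinearSVC

# linear-only search; gamma does not apply here, so only C is tuned
linear_grid = GridSearchCV(LinearSVC(max_iter=10000),  # higher max_iter helps convergence on TF-IDF features
                           {'C': [0.01, 0.1, 1, 10, 100]},
                           refit=True, verbose=3)
linear_grid.fit(X_train_Tfidf, y_train)
print(linear_grid.best_params_, linear_grid.best_score_)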
SVM_opt = svm.SVC(C=1, kernel='linear', degree=3, gamma=1,verbose=True)
start = time.time()
SVM_opt.fit(X_train_Tfidf, y_train)
end = round(time.time()-start,2)
end
y_predicted=SVM_opt.predict(X_test_Tfidf)
acc=(metrics.accuracy_score(y_test,y_predicted))*100
print(acc,'%')
# metrics and confusion matrix
print(classification_report(y_test, y_predicted))
plot_confusion_matrix(SVM_opt, X_test_Tfidf, y_test)
plt.show()
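For parity with LR_pipeline above, the tuned SVM can also be wrapped in a Pipeline (a sketch reusing names from this notebook), so the vectorizer is always fitted on the training split alone:
SVM_pipeline = pipeline.Pipeline([
    ('vectorizer', feature_extraction.text.TfidfVectorizer(ngram_range=(1, 5), analyzer='char')),
    ('clf', svm.SVC(C=1, kernel='linear'))
])
SVM_pipeline.fit(X_train, y_train)
print(classification_report(y_test, SVM_pipeline.predict(X_test)))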
[1]: https://drive.google.com/drive/folders/1nKINXfDG4IbEw4YlUzGTaRpJh4CFwGlC?usp=sharing
[2]: https://colab.research.google.com/drive/1FJNd7vXcHnrNZYz9aM56U1TalhVBCvO7?usp=sharing
[3]: https://i.stack.imgur.com/SeDh7.png
[4]: https://i.stack.imgur.com/Ksb7m.png
[5]: https://i.stack.imgur.com/Iw35J.png

Related

Scoring multiple output variables with scikit-learn

I am making a regressor that is supposed to output two continuous variables, a and b.
The problem: when you use cross_val_score from scikit-learn to evaluate performance, by default you get a single score across the output variables. I want a score for each one; specifically, I want the R2 measure for both a and b.
I haven't been able to figure out how to do this. Can anyone help?
Reproducible code below:
import pandas as pd
import math
import os
import numpy as np
from sklearn import linear_model
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def trainModel():
    # Array to store scores
    nested_scores = np.zeros(NUM_TRIALS)
    # Loop for each trial
    for i in range(NUM_TRIALS):
        # Choose inner and outer cross-validation splitters
        inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
        outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
        # Parameter search in the inner cross-validation
        clf = GridSearchCV(estimator=reg, param_grid=p_grid, cv=inner_cv)
        clf.fit(X_values, y_values)
        # Model evaluation in the outer cross-validation
        nested_score = cross_val_score(clf, X=X_values, y=y_values, cv=outer_cv, scoring='r2')
        nested_scores[i] = nested_score.mean()
        print("nested_score", nested_score)
    print("R2:", nested_scores)
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 6)))
df.columns = ['a', 'b', 'c', 'd', 'e', 'f']
# PRE-PROCESSING
NUM_COLUMNS = df.shape[1]
X_values = np.array(df.iloc[:, [0, 1, 2, 3]])
y_values = np.array(df.iloc[:, [NUM_COLUMNS - 2, NUM_COLUMNS - 1]])
print("pre-processing done!")
# MODEL TRAINING
NUM_TRIALS = 10
# ELASTIC NET
print("\nELASTIC NET")
p_grid = {"alpha": [0.2, 0.5, 1, 1.5, 2, 3],
          "l1_ratio": [0.2, 0.3, 0.4, 0.5, 1]}
reg = ElasticNet(random_state=0)
trainModel()
# NEURAL NETWORK
print("\nNEURAL NETWORK")
p_grid = {"alpha": [0.2, 0.5, 1, 1.5, 2, 3],
          "hidden_layer_sizes": list(range(1, 6))}
reg = MLPRegressor(solver='lbfgs', random_state=1)
trainModel()
Dummy data:
So basically I have two y-values, and I want the R2 statistic for each of the variables instead of one statistic across both. Let me know if you have any questions.
X_values: the 4 input variables
y_values: the 2 output variables
Output for 10 trials with 4-fold cross-validation for an Elastic Net and a Neural Network model; the last line with "R2:" is the average over the folds.
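One way to get per-output R2 values, sketched under the assumption that it replaces the cross_val_score call inside trainModel: collect out-of-fold predictions with cross_val_predict, then pass multioutput='raw_values' to r2_score so each column of y gets its own score.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score

# out-of-fold predictions for both outputs at once
y_pred = cross_val_predict(clf, X_values, y_values, cv=outer_cv)
# one R2 per output column instead of a single averaged score
print(r2_score(y_values, y_pred, multioutput='raw_values'))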

GridSearchCV gives different results than my manual tuning procedure

I get different results when I run a grid search with sklearn's GridSearchCV versus tuning manually.
The first code block is my procedure using GridSearchCV:
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X = folded_train.drop(columns=["10_fold", "class_encoded"])
y = folded_train["class_encoded"]
ten_fold = folded_train["10_fold"]
logo = LeaveOneGroupOut()
cross_val_groups = logo.split(X, y, ten_fold)
classifier = Pipeline([("sampling", RandomUnderSampler()),
                       ("classifier", ensemble.RandomForestClassifier(n_jobs=-1))])
param_grid = {
    "classifier__n_estimators": [100, 200, 300, 400, 600],
    "classifier__max_depth": [1, 3, 5, 7],
    "classifier__criterion": ["gini", "entropy"]
}
model = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring="roc_auc",
    verbose=10,
    n_jobs=1,
    cv=cross_val_groups
)
model.fit(X, y)
And I am trying to do the same procedure manually. Here is my code:
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X = folded_train.drop(columns=["10_fold", "class_encoded"])
y = folded_train["class_encoded"]
ten_fold = folded_train["10_fold"]
number_of_estimators = [100, 200, 300]
maximum_depths = [1, 3, 5, 7]
criterions = ["gini", "entropy"]
logo = LeaveOneGroupOut()
for criterion in criterions:
    for max_depth in maximum_depths:
        for n_of_estimator in number_of_estimators:
            for train_index, val_index in logo.split(X, y, ten_fold):
                aPipeline = Pipeline(steps=[('sampling', RandomUnderSampler()),
                                            ('classifier', ensemble.RandomForestClassifier(criterion=criterion,
                                                                                           max_depth=max_depth,
                                                                                           n_estimators=n_of_estimator,
                                                                                           n_jobs=-1))])
                X_trn, X_vl = X.iloc[train_index], X.iloc[val_index]
                y_trn, y_vl = y.iloc[train_index], y.iloc[val_index]
                aPipeline.fit(X_trn, y_trn)
                predictions = aPipeline.predict(X_vl)
                print("Criterion", criterion, "Max depth", max_depth, "Number of estimators", n_of_estimator,
                      "score", metrics.roc_auc_score(y_vl, predictions))
With sklearn's GridSearchCV, I obtained the following scores (roc_auc) for specific parameters:
For criterion = "gini", max_depth = 1 and n_estimators = 100:
[0.786, 0.799, 0.789, 0.796, 0.775, 0.776, 0.779, 0.788, 0.770, 0.769] for each cv iteration
And with my manual execution for the same parameters I get:
[0.730, 0.749, 0.714, 0.710, 0.732, 0.724, 0.711, 0.724, 0.715, 0.734]
The same holds for other parameter combinations. What factors could lead to this kind of situation?
Note: I found this, but it does not answer my problem: Why GridSearchCV model results are different than the model I manually tuned?
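Two differences between the procedures are worth checking; the sketch below is illustrative and assumes a binary target. First, GridSearchCV's "roc_auc" scorer ranks continuous scores from predict_proba, while the manual loop passes hard labels from predict to roc_auc_score, which generally yields a lower AUC. Second, RandomUnderSampler and RandomForestClassifier are both stochastic, so unseeded runs are not comparable.
# seed the stochastic steps so repeated runs match
aPipeline = Pipeline(steps=[('sampling', RandomUnderSampler(random_state=0)),
                            ('classifier', ensemble.RandomForestClassifier(random_state=0, n_jobs=-1))])
aPipeline.fit(X_trn, y_trn)
# inside the manual loop: score with probabilities, as the 'roc_auc' scorer does
proba = aPipeline.predict_proba(X_vl)[:, 1]
print(metrics.roc_auc_score(y_vl, proba))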

Bad MSE while using Pipes

I'm trying to predict prices from a dataset that I scraped. I've never used Python for this (I usually use the tidyverse), but this time I wanted to explore sklearn's Pipeline.
So here is the code snippet:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
df = pd.read_csv("https://raw.githubusercontent.com/norhther/idealista/main/idealistaBCN.csv")
df.drop("info", axis = 1, inplace = True)
df["floor"].fillna(1, inplace=True)
df.drop("neigh", axis = 1, inplace = True)
df.dropna(inplace = True)
df = df[df["habs"] < 11]
X = df.drop("price", axis = 1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
ct = ColumnTransformer(
    [("standardScaler", StandardScaler(), ["habs", "m2", "floor"]),
     ("onehot", OneHotEncoder(), ["type"])],
    remainder="passthrough")
pipe = Pipeline(steps=[("Transformer", ct),
                       ("svr", SVR())])
param_grid = {
    "svr__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "svr__degree": range(3, 6),
    "svr__gamma": ['scale', 'auto'],
    "svr__coef0": np.linspace(0.01, 1, 2)
}
search = GridSearchCV(pipe, param_grid, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
search.fit(X_train, y_train)
print(search.best_score_)
pipe = Pipeline(steps=[("Transformer", ct),
                       ("svr", SVR(coef0=search.best_params_["svr__coef0"],
                                   degree=search.best_params_["svr__degree"],
                                   kernel=search.best_params_["svr__kernel"]))])
from sklearn.metrics import mean_squared_error
pipe.fit(X_train, y_train)
preds = pipe.predict(X_train)
mean_squared_error(preds, y_train)
Here search.best_score_ is -443829697806.1671, and the MSE is 608953977916.3896.
I think I messed something up, maybe with the transformer, but I'm not completely sure; this seems like an exaggerated MSE. I did a fairly similar approach with tidymodels and got much better results.
So I want to know whether something is wrong with the transformer, or whether the model is just this bad.
The reason is that you did not include C in the parameter grid, and you need to cover a whole range of C values in the fit. If we fit with the default C = 1, you can see where the problem lies:
import matplotlib.pyplot as plt
o = pipe.named_steps["Transformer"].fit_transform(X_train)
mdl = SVR(C=1)
mdl.fit(o,y_train)
plt.scatter(mdl.predict(o),y_train)
There are some price values that are 10x the average (1e7 versus a median of 5e5). With MSE or R^2, the score is heavily dominated by these extreme values. So we need to follow the data more closely, and that is controlled by C, which you can read more about here. We try a range:
ct = ColumnTransformer(
    [("standardScaler", StandardScaler(), ["habs", "m2", "floor"]),
     ("onehot", OneHotEncoder(), ["type"])],
    remainder="passthrough")
pipe = Pipeline(steps=[("Transformer", ct),
                       ("svr", SVR())])
# other kernels to try: 'poly', 'rbf', 'sigmoid'
param_grid = {
    "svr__kernel": ['rbf'],
    "svr__gamma": ['auto'],
    "svr__coef0": [1, 2],
    "svr__C": [1e-03, 1e-01, 1e1, 1e3, 1e5, 1e7]
}
search = GridSearchCV(pipe, param_grid, scoring=['neg_mean_squared_error'],
                      refit='neg_mean_squared_error')
search.fit(X_train, y_train)
print(search.best_score_)
-132061065775.25969
Your y values are large, and the MSE will be on the order of the variance of your y values, so if we check that:
y_train.var()
545423126823.4545
132061065775.25969 / y_train.var()
0.24212590057261346
That is pretty OK: you reduce the MSE to about 25% of the variance. We can check this with the test data; in this case the chosen C happens to hold up:
from sklearn.metrics import mean_squared_error
o = pipe.named_steps["Transformer"].fit_transform(X_train)
mdl = SVR(C=10000000.0, coef0=1, gamma='auto')
mdl.fit(o, y_train)
o_test = pipe.named_steps["Transformer"].transform(X_test)  # transform only: refitting on the test set would leak its statistics
pred = mdl.predict(o_test)
print(mean_squared_error(y_test, pred), mean_squared_error(y_test, pred) / y_test.var())
plt.scatter(mdl.predict(o_test), y_test)
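Since the extreme prices dominate the squared error, another option (a sketch added here, not part of the original answer) is to fit on a log-transformed target with TransformedTargetRegressor; the C=10.0 below is an arbitrary starting point, not a tuned value.
from sklearn.compose import TransformedTargetRegressor

log_pipe = TransformedTargetRegressor(
    regressor=Pipeline(steps=[("Transformer", ct), ("svr", SVR(C=10.0))]),
    func=np.log1p,           # fit on log(1 + price) to tame the heavy tail
    inverse_func=np.expm1,   # predictions come back on the original price scale
)
log_pipe.fit(X_train, y_train)
print(mean_squared_error(y_test, log_pipe.predict(X_test)))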

MLPRegressor working but results don't make any sense

I am building a neural network with my research data in two ways: with a statistical program (SPSS) and with Python.
I am using scikit-learn's MLPRegressor. The problem is that while my code apparently runs, the results make no sense. The R2 score should be around 0.70 (it is -4147.64), and the correlation shown in the graph should be almost linear (instead it is a straight line at a constant distance from the X axis). Also, both axes should have values ranging from 0 to 180, which is not the case (X goes from 20 to 100, y from -4100 to -3500).
If any of you can give me a hand, I would really appreciate it.
Thank you!
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
vhdata = pd.read_csv('vhrawdata.csv')
vhdata.head()
X = vhdata[['PA NH4', 'PH NH4', 'PA K', 'PH K', 'PA NH4 + PA K', 'PH NH4 + PH K', 'PA IS', 'PH IS']]
y = vhdata['PMI']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)
nnref = MLPRegressor(hidden_layer_sizes=[4], activation='logistic', solver='sgd', alpha=1,
                     learning_rate='constant', learning_rate_init=0.6, max_iter=40000,
                     momentum=0.3).fit(X_train, y_train)
y_predictions= nnref.predict(X_test)
print('Accuracy of NN classifier on training set (R2 score): {:.2f}'.format(nnref.score(X_train_norm, y_train)))
print('Accuracy of NN classifier on test set (R2 score): {:.2f}'.format(nnref.score(X_test_norm, y_test)))
plt.figure()
plt.scatter(y_test,y_predictions, marker = 'o', color='red')
plt.xlabel('PMI expected (hrs)')
plt.ylabel('PMI predicted (hrs)')
plt.title('Correlation of PMI predicted by MLP regressor and the actual PMI')
plt.show()
You have a couple of issues. First, it is important to use the right scaler or normalization when working with an MLP. NNs work best with inputs between 0 and 1, so consider using sklearn's MinMaxScaler to accomplish this.
So:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)
Should be:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)  # transform only: the scaler must be fitted on training data alone
Next, you are training on the unscaled data but then computing your scores on the scaled data. Meaning:
nnref = MLPRegressor(hidden_layer_sizes=[4], activation='logistic', solver='sgd', alpha=1,
                     learning_rate='constant', learning_rate_init=0.6, max_iter=40000,
                     momentum=0.3).fit(X_train, y_train)
should be:
nnref = MLPRegressor(hidden_layer_sizes=[4], activation='logistic', solver='sgd', alpha=1,
                     learning_rate='constant', learning_rate_init=0.6, max_iter=40000,
                     momentum=0.3).fit(X_train_norm, y_train)
And...
y_predictions= nnref.predict(X_test)
Should be:
y_predictions= nnref.predict(X_test_norm)
Additional notes...
Scoring on your training data says little about generalization: the model has already seen those samples, so a high training score mostly reflects memorization rather than predictive power, and a large gap between training and test scores is the classic sign of overfitting.
Well, I found a mistake: you train the model on samples that weren't normalized:
nnref = MLPRegressor(...).fit(X_train, y_train)
But later you're trying to score it on normalized samples:
nnref.score(X_train_norm, y_train)
"Also the x and y axis should have values ranging from 0 to 180, which is not the case (X from 20 to 100, y from -4100 to -3500)"
Scikit-learn does not change values by itself. If X is not in the range you expect, you've changed it somewhere, or your expectation about the X values is incorrect.
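Putting the two answers together, a minimal corrected sketch (assuming the same column setup as the question): fit MinMaxScaler on the training split only, train on the scaled features, and score on the scaled test split.
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor

scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)   # fit the scaler on training data only
X_test_norm = scaler.transform(X_test)

nnref = MLPRegressor(hidden_layer_sizes=[4], activation='logistic', solver='sgd', alpha=1,
                     learning_rate='constant', learning_rate_init=0.6, max_iter=40000,
                     momentum=0.3).fit(X_train_norm, y_train)  # train on the scaled features

y_predictions = nnref.predict(X_test_norm)     # predict on the scaled test features
print('Test R2: {:.2f}'.format(nnref.score(X_test_norm, y_test)))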

How to print estimated coefficients after a GridSearchCV fit of a model? (SGDRegressor)

I am new to scikit-learn, but it did what I was hoping for. Now, maddeningly, the only remaining issue is that I can't find how to print (or, even better, write to a small text file) all the coefficients it estimated and all the features it selected. What is the way to do this?
The same goes for SGDClassifier, but I think it applies to all base estimators that can be fit, with or without cross-validation. Full script below.
import scipy as sp
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn import grid_search
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
def main():
    print("Started.")
    # n = 10**6
    # notreatadapter = iopro.text_adapter('S:/data/controls/notreat.csv', parser='csv')
    # X = notreatadapter[1:][0:n]
    # y = notreatadapter[0][0:n]
    notreatdata = pd.read_stata('S:/data/controls/notreat.dta')
    notreatdata = notreatdata.iloc[:10000, :]
    X = notreatdata.iloc[:, 1:]
    y = notreatdata.iloc[:, 0]
    n = y.shape[0]
    print("Data loaded.")
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
    print("Data split.")
    scaler = StandardScaler()
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply same transformation to test data
    print("Data scaled.")
    # build a model
    model = SGDClassifier(penalty='elasticnet', n_iter=np.ceil(10**6 / n), shuffle=True)
    # model.fit(X, y)
    print("CV starts.")
    # run grid search
    param_grid = [{'alpha': 10.0**-np.arange(1, 7), 'l1_ratio': [.05, .15, .5, .7, .9, .95, .99, 1]}]
    gs = grid_search.GridSearchCV(model, param_grid, n_jobs=8, verbose=1)
    gs.fit(X_train, y_train)
    print("Scores for alphas:")
    print(gs.grid_scores_)
    print("Best estimator:")
    print(gs.best_estimator_)
    print("Best score:")
    print(gs.best_score_)
    print("Best parameters:")
    print(gs.best_params_)

if __name__ == '__main__':
    mp.freeze_support()
    main()
The SGDClassifier instance fitted with the best hyperparameters is stored in gs.best_estimator_. The coef_ and intercept_ are the fitted parameters of that best model.
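To write them to a small text file, a minimal sketch (the file names are placeholders): pull coef_ and intercept_ off gs.best_estimator_ and save them with numpy.
best = gs.best_estimator_
np.savetxt('coefficients.txt', best.coef_)   # one row of coefficients per class for a classifier
with open('intercept.txt', 'w') as f:
    f.write(str(best.intercept_))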
From an estimator, you can get the coefficients with the coef_ attribute.
From a pipeline, you can get the model with the named_steps attribute, then get the coefficients with coef_.
From a grid search, you can get the best model with best_estimator_, then reach the pipeline step through named_steps, and then get coef_.
Example:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", LinearSVC())
])
# from pipe:
pipe.fit(X, y)
coefs = pipe.named_steps.model.coef_
# from gridsearch:
gs_svc_model = GridSearchCV(estimator=pipe,
                            param_grid={'model__C': [.01, .1, 10, 100, 1000]},
                            cv=5,
                            n_jobs=-1)
gs_svc_model.fit(X, y)
coefs = gs_svc_model.best_estimator_.named_steps.model.coef_
I think you might be looking for the estimated parameters of the "best" model rather than the hyperparameters determined through the grid search. You can plug the best hyperparameters from the grid search ('alpha' and 'l1_ratio' in your case) back into the model ('SGDClassifier' in your case) and train it again. You can then read the parameters off the fitted model object.
The code could be something like this:
model2 = SGDClassifier(penalty='elasticnet', n_iter=np.ceil(10**6 / n), shuffle=True,
                       alpha=gs.best_params_['alpha'], l1_ratio=gs.best_params_['l1_ratio'])
model2.fit(X_train, y_train)  # fit with the chosen hyperparameters before reading coef_
print(model2.coef_)
