I'm trying to use RFE from scikit-learn with an estimator from statsmodels NegativeBinomial.
So I created my own class:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm
class MyEstimator(BaseEstimator):
    """Wrap a statsmodels formula-API GLM so it can be passed to scikit-learn's RFE."""

    def __init__(self, formula_, data_, family_):
        # NOTE(review): the parameter is named `formula_`, but `formula` is
        # what is passed on the next line; no `formula` is defined in this snippet.
        self.model = sm.formula.glm(formula, data=data_, family=family_)

    def fit(self, **kwargs):
        # NOTE(review): accepts keyword arguments only and ignores them.
        # The value returned by self.model.fit() is discarded, and coef_ is
        # read from self.model rather than from that returned object.
        self.model.fit()
        self.coef_ = self.model.params.values

    def predict(self, X):
        # NOTE(review): `np` is not imported in this snippet.
        result = self.model.predict(X)
        return np.array(result)
# Build a three-feature DataFrame from the make_friedman1 sample data.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
# NOTE(review): `pd` is not imported in this snippet.
dataset = pd.DataFrame({'X1':X[:,0], 'X2':X[:,1], 'X3':X[:,2], 'y':y})
estimator = MyEstimator("y ~ X1 + X2 + X3", dataset, sm.families.NegativeBinomial())
selector = RFE(estimator, n_features_to_select=5, step=1)
# fit() is called here without X and y; the TypeError quoted below reports
# exactly those two arguments as missing.
selector = selector.fit()
But I get this error:
TypeError: fit() missing 2 required positional arguments: 'X' and 'y'
Does anyone have an idea?
You can modify your code to require endog and exog variables, instead of using the formula API:
import numpy as np
import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm
class MyEstimator(BaseEstimator):
    """scikit-learn style wrapper around a statsmodels GLM.

    Provides ``fit``/``predict`` and a ``coef_`` attribute so the estimator
    can be handed to ``sklearn.feature_selection.RFE``.

    Parameters
    ----------
    family_ : statsmodels family instance
        Passed as ``family`` to ``sm.GLM``
        (for example ``sm.families.NegativeBinomial()``).
    """

    def __init__(self, family_):
        self.family_ = family_

    def fit(self, exog, endog):
        """Fit the GLM of ``endog`` on ``exog`` and expose the coefficients.

        Sets ``model`` (the GLM), ``results_`` (the object returned by
        ``model.fit()``) and ``coef_`` (its parameters as an ndarray).
        Returns ``self``.
        """
        self.model = sm.GLM(endog, exog, family=self.family_)
        # Keep the results object: it carries the estimated parameters that
        # predict() needs.
        self.results_ = self.model.fit()
        self.coef_ = np.asarray(self.results_.params)
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X`` as an ndarray."""
        # GLM.predict on the *model* takes the parameter vector as its first
        # argument; the results object supplies the fitted parameters itself.
        return np.asarray(self.results_.predict(X))
# Run recursive feature elimination with the GLM wrapper on the sample data.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = MyEstimator(sm.families.NegativeBinomial())
selector = RFE(estimator, n_features_to_select=5, step=1)
# y is reshaped to a single column before being passed to fit().
selector = selector.fit(X, y.reshape(-1,1))
# The comment after the print is the ranking reported by the author's run.
print(selector.ranking_)
# [1 1 3 1 1 5 1 6 4 2]
I am spot-checking a bunch of regression models. How do I fit multiple ML models? Would I use a for loop and call model.fit on each one?
# Variables
# Regularisation strengths tried for the Lasso and Ridge models.
alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


# function
def get_models(model=None):
    """Return a dict mapping a model name to an unfitted regressor.

    Parameters
    ----------
    model : dict, optional
        Existing dict to add the regressors to. A new dict is created when
        omitted, so repeated calls do not share state.

    Returns
    -------
    dict
        The filled dict. Lasso and Ridge get one entry per value in
        ``alpha``, keyed ``"Lasso-<alpha>"`` / ``"Ridge-<alpha>"``.
    """
    if model is None:
        model = {}
    model['lr'] = LinearRegression()
    for values in alpha:
        # One key per alpha: a fixed key would keep only the last alpha.
        model["Lasso-" + str(values)] = Lasso(alpha=values)
        model["Ridge-" + str(values)] = Ridge(alpha=values)
    model["Huber"] = HuberRegressor()
    model["Lars"] = Lars()
    model["Lasso_l"] = LassoLars()
    model["PA"] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    model["RANSAC"] = RANSACRegressor()
    model["SGD"] = SGDRegressor(max_iter=1000, tol=1e-3)
    model["theil"] = TheilSenRegressor()
    model["cart"] = DecisionTreeRegressor()
    model["extra"] = ExtraTreeRegressor()
    model["svml"] = SVR(kernel='linear')
    model["svmp"] = SVR(kernel='poly')
    return model


#Loaded data and have X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
#fitting models
Yes, once your dict is filled with get_models(), you can fit the models with a for loop:
# Iterating a dict directly yields its keys (the model names), so loop over
# items() to get each estimator object.
for name, model in models.items():
    model.fit(X_train, y_train)
You can easily loop through several scikit-learn models and do all the fitting too. Try the sample code directly below, and take a look at the links towards the bottom of my post.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import datasets
from sklearn.linear_model import SGDClassifier, LogisticRegression, \
Perceptron, PassiveAggressiveClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct, Matern, StationaryKernelMixin, WhiteKernel
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from utilities import *
from universal_params import *
def gen_classification_data(n=None):
    """
    uses the iris data

    :param n: optional sample budget. When given, ``int(n/2)`` samples are
        taken from the start and ``int(n/2)`` from the end of the data set
        (capped so the two slices never overlap). When falsy, or when
        ``int(n/2)`` is 0, the full data set is returned.
    :return: x, y
    """
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    if n:
        half = min(int(n / 2), len(x) // 2)
        # half == 0 would make x[-0:] select every row, so skip subsetting.
        if half > 0:
            # Stack along axis 0 (samples) and keep the result; x and y are
            # cut with the same slices so rows and labels stay aligned.
            x = np.concatenate((x[:half], x[-half:]), axis=0)
            y = np.concatenate((y[:half], y[-half:]), axis=0)
    return x, y
# Linear classifiers and their parameter grids.
# Each entry is an (estimator class, parameter grid) pair. The `**name`
# entries merge in grid fragments that are not defined in this snippet
# (presumably they come from `universal_params` — confirm).
# NOTE(review): whether the 'log' loss name is accepted depends on the
# installed scikit-learn version — confirm.
linear_models_n_params = [
    (SGDClassifier,
     {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
      'alpha': [0.0001, 0.001, 0.1],
      **penalty_12none
      }),
    (LogisticRegression,
     {**penalty_12, **max_iter, **tol, ** warm_start, **C,
      'solver': ['liblinear']
      }),
    (Perceptron,
     {**penalty_all, **alpha, **n_iter, **eta0, **warm_start
      }),
    (PassiveAggressiveClassifier,
     {**C, **n_iter, **warm_start,
      'loss': ['hinge', 'squared_hinge'],
      })
]
# The "small" linear grid is the very same list object as the full one.
linear_models_n_params_small = linear_models_n_params
# Support-vector classifiers and their full parameter grids, as
# (estimator class, parameter grid) pairs. `**name` fragments are defined
# outside this snippet.
svm_models_n_params = [
    (SVC,
     {**C, **kernel, **degree, **gamma, **coef0, **shrinking, **tol, **max_iter_inf2}),
    (NuSVC,
     {**nu, **kernel, **degree, **gamma, **coef0, **shrinking, **tol
      }),
    (LinearSVC,
     { **C, **penalty_12, **tol, **max_iter,
      'loss': ['hinge', 'squared_hinge'],
      })
]
# Reduced grids for the same three estimators: fewer fragments per model and
# the `*_small` variants of nu and C.
svm_models_n_params_small = [
    (SVC,
     {**kernel, **degree, **shrinking
      }),
    (NuSVC,
     {**nu_small, **kernel, **degree, **shrinking
      }),
    (LinearSVC,
     { **C_small,
      'penalty': ['l2'],
      'loss': ['hinge', 'squared_hinge'],
      })
]
# Neighbour/centroid based estimators and their parameter grids, as
# (estimator class, parameter grid) pairs.
# NOTE(review): KMeans is imported from sklearn.cluster but is run with
# isClassification=True like the others — confirm big_loop handles it.
# NOTE(review): the accepted KMeans 'algorithm' values depend on the
# installed scikit-learn version — confirm.
neighbor_models_n_params = [
    (KMeans,
     {'algorithm': ['auto', 'full', 'elkan'],
      'init': ['k-means++', 'random']}),
    (KNeighborsClassifier,
     {**n_neighbors, **neighbor_algo, **neighbor_leaf_size, **neighbor_metric,
      'weights': ['uniform', 'distance'],
      'p': [1, 2]
      }),
    (NearestCentroid,
     {**neighbor_metric,
      'shrink_threshold': [1e-3, 1e-2, 0.1, 0.5, 0.9, 2]
      }),
    (RadiusNeighborsClassifier,
     {**neighbor_radius, **neighbor_algo, **neighbor_leaf_size, **neighbor_metric,
      'weights': ['uniform', 'distance'],
      'p': [1, 2],
      'outlier_label': [-1]
      })
]
# Gaussian-process classifier grid: four kernel instances are tried, with a
# single value each for max_iter_predict and n_restarts_optimizer.
gaussianprocess_models_n_params = [
    (GaussianProcessClassifier,
     {**warm_start,
      'kernel': [RBF(), ConstantKernel(), DotProduct(), WhiteKernel()],
      'max_iter_predict': [500],
      'n_restarts_optimizer': [3],
      })
]
# Naive Bayes: GaussianNB with an empty grid.
# NOTE(review): this list is not referenced by any run_* function in this
# snippet.
bayes_models_n_params = [
    (GaussianNB, {})
]
# Full parameter grid for the multi-layer perceptron classifier.
# `**name` fragments are defined outside this snippet.
nn_models_n_params = [
    (MLPClassifier,
     { 'hidden_layer_sizes': [(16,), (64,), (100,), (32, 32)],
      'activation': ['identity', 'logistic', 'tanh', 'relu'],
      **alpha, **learning_rate, **tol, **warm_start,
      'batch_size': ['auto', 50],
      'max_iter': [1000],
      'early_stopping': [True, False],
      'epsilon': [1e-8, 1e-5]
      })
]
# Reduced MLP grid: two layer layouts, three activations, early stopping
# always on, and the `learning_rate_small` fragment.
nn_models_n_params_small = [
    (MLPClassifier,
     { 'hidden_layer_sizes': [(64,), (32, 64)],
      'batch_size': ['auto', 50],
      'activation': ['identity', 'tanh', 'relu'],
      'max_iter': [500],
      'early_stopping': [True],
      **learning_rate_small
      })
]
# Tree-based classifiers and their full parameter grids, as
# (estimator class, parameter grid) pairs.
# NOTE(review): whether `min_impurity_split` is still an accepted parameter
# depends on the installed scikit-learn version — confirm.
tree_models_n_params = [
    (RandomForestClassifier,
     {'criterion': ['gini', 'entropy'],
      **max_features, **n_estimators, **max_depth,
      **min_samples_split, **min_impurity_split, **warm_start, **min_samples_leaf,
      }),
    (DecisionTreeClassifier,
     {'criterion': ['gini', 'entropy'],
      **max_features, **max_depth, **min_samples_split, **min_impurity_split, **min_samples_leaf
      }),
    (ExtraTreesClassifier,
     {**n_estimators, **max_features, **max_depth,
      **min_samples_split, **min_samples_leaf, **min_impurity_split, **warm_start,
      'criterion': ['gini', 'entropy']})
]
# Reduced grids for the same three estimators, built from the `*_small`
# fragments and without criterion, min_impurity_split or warm_start.
tree_models_n_params_small = [
    (RandomForestClassifier,
     {**max_features_small, **n_estimators_small, **min_samples_split, **max_depth_small, **min_samples_leaf
      }),
    (DecisionTreeClassifier,
     {**max_features_small, **max_depth_small, **min_samples_split, **min_samples_leaf
      }),
    (ExtraTreesClassifier,
     {**n_estimators_small, **max_features_small, **max_depth_small,
      **min_samples_split, **min_samples_leaf})
]
def _maybe_scale(x, normalize_x):
    """Return ``x`` passed through ``StandardScaler().fit_transform`` when ``normalize_x`` is true, else ``x`` unchanged."""
    return StandardScaler().fit_transform(x) if normalize_x else x


def run_linear_models(x, y, small = True, normalize_x = True):
    """Run ``big_loop`` over the linear-model grids (small or full)."""
    return big_loop(linear_models_n_params_small if small else linear_models_n_params,
                    _maybe_scale(x, normalize_x), y, isClassification=True)


def run_svm_models(x, y, small = True, normalize_x = True):
    """Run ``big_loop`` over the SVM grids (small or full)."""
    return big_loop(svm_models_n_params_small if small else svm_models_n_params,
                    _maybe_scale(x, normalize_x), y, isClassification=True)


def run_neighbor_models(x, y, normalize_x = True):
    """Run ``big_loop`` over the neighbour-model grids."""
    return big_loop(neighbor_models_n_params,
                    _maybe_scale(x, normalize_x), y, isClassification=True)


def run_gaussian_models(x, y, normalize_x = True):
    """Run ``big_loop`` over the Gaussian-process grid."""
    return big_loop(gaussianprocess_models_n_params,
                    _maybe_scale(x, normalize_x), y, isClassification=True)


def run_nn_models(x, y, small = True, normalize_x = True):
    """Run ``big_loop`` over the MLP grids (small or full)."""
    return big_loop(nn_models_n_params_small if small else nn_models_n_params,
                    _maybe_scale(x, normalize_x), y, isClassification=True)


def run_tree_models(x, y, small = True, normalize_x = True):
    """Run ``big_loop`` over the tree-model grids (small or full)."""
    return big_loop(tree_models_n_params_small if small else tree_models_n_params,
                    _maybe_scale(x, normalize_x), y, isClassification=True)


# The default worker count is clamped to at least 1: on a single-core machine
# cpu_count()-1 is 0. NOTE(review): assumes big_loop forwards n_jobs to a
# joblib-style backend, where 0 is not a valid worker count — confirm.
def run_all(x, y, small = True, normalize_x = True, n_jobs=max(1, cpu_count()-1)):
    """Run ``big_loop`` over every model family in one call.

    The Gaussian-process grid is included only when ``small`` is false; the
    neighbour grid is always included. ``n_jobs`` is forwarded to ``big_loop``.
    """
    all_params = (linear_models_n_params_small if small else linear_models_n_params) + \
                 (nn_models_n_params_small if small else nn_models_n_params) + \
                 ([] if small else gaussianprocess_models_n_params) + \
                 neighbor_models_n_params + \
                 (svm_models_n_params_small if small else svm_models_n_params) + \
                 (tree_models_n_params_small if small else tree_models_n_params)
    return big_loop(all_params,
                    _maybe_scale(x, normalize_x), y,
                    isClassification=True, n_jobs=n_jobs)
if __name__ == '__main__':
    # Script entry point: generate the classification data and run every
    # model family with a single job.
    x, y = gen_classification_data()
    run_all(x, y, n_jobs=1)
Here are a couple examples that you can follow.
https://github.com/PyDataBlog/Python-for-Data-Science/blob/master/Tutorials/Yellow%20brick.ipynb
https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf
https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
I have been trying to get the feature names on my model for quite some time now but have a hard time understanding how to do it. I have tried many posts on here but can't get it to work. Here is my code:
loading the classes I need to combine tfidfvectorizer with other features
from sklearn.base import TransformerMixin, BaseEstimator
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks one entry out of its input by key."""

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected
class FeatureTypeSelector(TransformerMixin, BaseEstimator):
    """Select the columns that belong to one feature type.

    Parameters
    ----------
    feature_type : str
        A key of ``FEATURE_TYPES`` ('categorical' or 'continuous'). An
        unknown key raises ``KeyError`` at construction time.
    """

    # Column names per feature type.
    # NOTE(review): 'COLULMN_C' is kept exactly as written; it may be a
    # misspelling of 'COLUMN_C' — confirm against the data.
    FEATURE_TYPES = {
        'categorical': [
            'COLUMN_A', 'COLUMN_B'
        ],
        'continuous': [
            'COLULMN_C', 'COLUMN_D'
        ]
    }

    def __init__(self, feature_type):
        # Store the constructor argument under its own name: the inherited
        # get_params() (used by clone(), repr() and grid search) reads every
        # __init__ parameter back with getattr(self, <name>).
        self.feature_type = feature_type
        self.columns = self.FEATURE_TYPES[feature_type]

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``."""
        return X[self.columns]
class RowToDictTransformer(TransformerMixin, BaseEstimator):
    """Turn the input into a lazy stream of its rows."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a generator over the second element of each ``X.iterrows()`` pair."""
        return (row for _, row in X.iterrows())
Then the code to put everything in a pipeline and run the regressor
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
# Create the preprocessor
# Three parallel branches whose outputs make_union concatenates: TF-IDF on
# the text column, min-max scaling on the continuous columns, and dict
# vectorisation of the categorical columns.
# NOTE(review): TfidfVectorizer (used here) and LinearRegression (used
# below) are not among the imports shown with this snippet.
preprocessor = make_union(
    make_pipeline(
        ItemSelector(key='TEXT_COLUMN'),
        TfidfVectorizer(lowercase=False, min_df=1),
    ),
    make_pipeline(
        FeatureTypeSelector('continuous'),
        MinMaxScaler(),
    ),
    make_pipeline(
        FeatureTypeSelector('categorical'),
        RowToDictTransformer(),
        DictVectorizer(sparse=False), # set sparse=True if you get MemoryError
    ),
)
# fit and transform the data
# The transformed output is not kept; model.fit below receives x_train again.
preprocessor.fit_transform(x_train)
# choose some estimator
# estimator = MultinomialNB()
estimator = LinearRegression()
# Create the model
model = make_pipeline(preprocessor, estimator)
# Training the model
model.fit(x_train, y_train)
# Predicting the model
predicted = model.predict(x_test)
I can run model.coef_ to get all the coefficients, but I want to see which weight is applied to each item of the TEXT_COLUMN. I have tried calling get_feature_names() and tried passing the names through the pipeline, but with no success (most of Google's results are purple by now).
Can anyone give me a bit of guidance on how to pass the feature names to the end of the pipeline? The ideal result would be a dataframe with the feature (row from the TEXT_COLUMN) and its feature_weight as the value.
happy friday.
I am trying to create a pipeline for multiple classifiers.
I started off by finding this
Unfortunately it is a little bit above my skill level right now and I could not get it to work properly, I ended up going the more verbose and lengthy way:
def multi_tester(X_train, y_train):
    """Grid-search four text-classification pipelines in turn and print each best score.

    Every pipeline shares the same CountVectorizer -> TfidfTransformer front
    end and differs only in its final 'clf' step.
    """
    pipe_1 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    pipe_2 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', ExtraTreesClassifier())
    ])
    pipe_3 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', AdaBoostClassifier())
    ])
    pipe_4 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', GradientBoostingClassifier())
    ])
    # One parameter grid per pipeline, in the same order as `pips`.
    # MultiOutputClassifier() is called without an estimator here; the
    # TypeError quoted below reports that missing 'estimator' argument.
    # NOTE(review): the quoted traceback shows these keys as
    # 'clf__estimator', so the code that actually ran may differ from this
    # listing.
    pars = [
        {'clf': [MultiOutputClassifier()]},
        {'clf': [ExtraTreesClassifier()]},
        {'clf': [AdaBoostClassifier()]},
        {'clf': [GradientBoostingClassifier()]}
    ]
    pips = [pipe_1, pipe_2, pipe_3, pipe_4]
    pip_names = ['MultiOutputClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier']
    # Four metrics are computed for every candidate.
    scoring = {'AUC': 'roc_auc',
               'F1': metrics.make_scorer(metrics.f1_score),
               'recall': metrics.make_scorer(metrics.recall_score),
               'precision': metrics.make_scorer(metrics.precision_score)}
    print ("starting Gridsearch")
    for i in range(len(pars)):
        # NOTE(review): refit=False is combined with multi-metric scoring, yet
        # gs.best_score_ is read below — confirm that attribute is available
        # in this configuration.
        gs = GridSearchCV(pips[i], pars[i], scoring = scoring,
                          cv = 5, verbose=2, refit=False, n_jobs=-1, return_train_score = True)
        gs = gs.fit(X_train, y_train)
        print ("finished Gridsearch for: ", pip_names[i])
        print (gs.best_score_)
Unfortunately I think I declared either the estimators or the params incorrectly because when I run this:
multi_tester(X_train, y_train)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-32-fd713f1ef4da> in <module>
----> 1 multi_tester(X_train, y_train)
<ipython-input-30-287aec48dbda> in multi_tester(X_train, y_train)
25
26 pars = [
---> 27 {'clf__estimator': [MultiOutputClassifier()]},
28 {'clf__estimator': [ExtraTreesClassifier()]},
29 {'clf__estimator': [AdaBoostClassifier()]},
TypeError: __init__() missing 1 required positional argument: 'estimator'
I've gone over the documentation and thought the way I instantiated it covered the default params but, I clearly got it wrong.
If you have any suggestions or input on how I could deal with this it would be greatly appreciated.
-#####################################################################################################-
Also for future reference in case you end up finding stackoverflow to attempt the David Batista code, this is how far I got:
and here are some SO questions that I thought were really helpful:
1
2
For now I can't troubleshoot it but it seems like figuring out the params in a way that they are a list will fix the issues.
class ClfSwitcher(BaseEstimator):
    """Estimator shell that forwards every call to an interchangeable classifier."""

    def __init__(self, estimator = SGDClassifier(),):
        """
        Wrap a classifier so that it can be swapped as a single parameter.
        :param estimator: sklearn object - The classifier to delegate to
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y); extra keyword arguments are not forwarded."""
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        predictions = self.estimator.predict(X)
        return predictions

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        probabilities = self.estimator.predict_proba(X)
        return probabilities

    def score(self, X, y):
        """Return the wrapped classifier's score on (X, y)."""
        result = self.estimator.score(X, y)
        return result
And then I defined some of the parameters for the models I want to look at like this:
# Candidate settings keyed by classifier class name.
# NOTE(review): the ValueError quoted below names the 'ExtraTreesClassifier'
# key: its value here is a dict, while the message asks for a sequence or
# np.ndarray as the value of each parameter.
search_space = [{
    'ExtraTreesClassifier': { 'n_estimators': [200] },
    'RandomForestClassifier': { 'n_estimators': [200] },
    'AdaBoostClassifier': { 'n_estimators': [200] },
    'GradientBoostingClassifier': { 'n_estimators': [200], 'learning_rate': [0.8, 1.0] },
    'SVC': [{'kernel': ['linear'], 'C': [1, 10]}, {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]}],
    'MultiOutputClassifier': { 'n_estimators': [200] }
}]
finally the pipeline was defined in this way:
# Text pipeline: token counts -> TF-IDF -> the switchable classifier step.
# ClfSwitcher() is built with its default estimator.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', ClfSwitcher())
])
and the respective scoring:
# Metrics computed for every candidate: 'AUC' uses the built-in 'roc_auc'
# scorer name, the other three wrap sklearn metric functions with make_scorer.
scoring = {'AUC': 'roc_auc',
           'F1': metrics.make_scorer(metrics.f1_score),
           'recall': metrics.make_scorer(metrics.recall_score),
           'precision': metrics.make_scorer(metrics.precision_score)}
but when I run the grid search it returns an error:
# 10-fold grid search over `search_space`, refitting on the 'AUC' metric.
# This constructor call is where the ValueError quoted below is raised.
grid = GridSearchCV(estimator = pipeline, param_grid = search_space, cv = 10, scoring = scoring, return_train_score = True,
                    n_jobs = -1, refit = 'AUC')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-26-7b76d42489a9> in <module>
1 grid = GridSearchCV(estimator = pipeline, param_grid = search_space, cv = 10, scoring = scoring, return_train_score = True,
----> 2 n_jobs = -1, refit = 'AUC')
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in __init__(self, estimator, param_grid, scoring, n_jobs, iid, refit, cv, verbose, pre_dispatch, error_score, return_train_score)
1143 return_train_score=return_train_score)
1144 self.param_grid = param_grid
-> 1145 _check_param_grid(param_grid)
1146
1147 def _run_search(self, evaluate_candidates):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _check_param_grid(param_grid)
369 raise ValueError("Parameter values for parameter ({0}) need "
370 "to be a sequence(but not a string) or"
--> 371 " np.ndarray.".format(name))
372
373 if len(v) == 0:
ValueError: Parameter values for parameter (ExtraTreesClassifier) need to be a sequence(but not a string) or np.ndarray.
I am trying to make this estimator scikit-learn-compatible so that I can search the parameter space with GridSearchCV.
EDIT:
I have modified the script as suggested (see below).
the fit signature is modified to fit(self, X, y)
All parameters are passed in __init__
There is still a compatibility issue with GridSearchCV, possibly because the estimator is a multilabel classifier.
ValueError: Can't handle mix of multilabel-indicator and continuous-multioutput
But that is beside the point; the attribute error is now gone. So we can safely conclude that the suggested modifications made the estimator scikit-learn-compatible.
Final code script:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
class LogisticClassifier(BaseEstimator, ClassifierMixin):
    """Softmax (multinomial logistic) classifier trained by gradient descent with momentum.

    Parameters
    ----------
    basis : {'rectifier', None}
        'rectifier' maps each sample through the random projection ``A``
        followed by ``max(., 0)``; any other value uses the sample as is.
        A constant 1 is appended in both cases.
    itrs : int
        Number of gradient steps taken by ``fit``.
    learn_rate : float
        Step size.
    reg : float
        Weight of the ``reg * W`` term in the gradient.
    momentum : float
        Weight of the previous gradient added to the current one.
    proj_layer_size : int
        Number of rows of the random projection matrix ``A``.

    Notes
    -----
    ``get_params``/``set_params`` are inherited from ``BaseEstimator``; they
    work because every constructor argument is stored under its own name.
    The previous hand-written ``get_params`` read ``report_cost`` and
    ``iseed``, which are never set, and omitted ``basis``.
    """

    def __init__(self, basis=None, itrs=100, learn_rate=0.1, reg=0.1, momentum=0.5, proj_layer_size=10):
        self.basis = basis
        self.itrs = itrs
        self.learn_rate = learn_rate
        self.reg = reg
        self.momentum = momentum
        self.proj_layer_size = proj_layer_size
        # Learned state, filled in by fit().
        self.W = []
        self.A = None

    @property
    def basisfunc(self):
        """Per-sample basis function selected by ``basis``.

        Looked up on access so that ``set_params(basis=...)`` and ``clone``
        keep the chosen basis.
        """
        if self.basis == 'rectifier':
            return self.rectifier_basis
        return self.identity

    def identity(self, x):
        """Return the sample ``x`` with a constant 1 appended."""
        return np.hstack((x, 1))

    def rectifier_basis(self, x):
        """Return ``max(A @ x, 0)`` with a constant 1 appended."""
        xn = np.dot(self.A, x)
        return self.identity(np.maximum(xn, 0))

    def basismap(self, X):
        """Apply the basis function to every row of ``X`` and stack the results."""
        basis = self.basisfunc
        new_dimensions = basis(X[0, :]).shape[0]
        Xn = np.zeros((X.shape[0], new_dimensions))
        for i, xi in enumerate(X):
            Xn[i, :] = basis(xi)
        return Xn

    def fit(self, X, Y):
        """Train on ``X`` with indicator targets ``Y`` (``Y.shape[1]`` is read as the number of outputs).

        Draws ``A`` and ``W`` with ``np.random.uniform``, runs ``itrs``
        gradient steps and returns the list of training-loss values, one per
        step.
        """
        self.A = np.random.uniform(-1, 1, (self.proj_layer_size, X.shape[1]))
        Xn = self.basismap(X)
        self.W = np.array(np.random.uniform(-0.1, 0.1, (Y.shape[1], Xn.shape[1])))
        costs_train = []
        previous_grad = np.zeros(self.W.shape)
        for _ in range(self.itrs):
            grad = self.grad(Xn, Y)
            self.W = self.W - self.learn_rate*(grad + self.momentum*previous_grad)
            previous_grad = grad
            # Reuse the already-mapped Xn instead of re-mapping X every step.
            costs_train.append(self._loss_mapped(Xn, Y))
        return costs_train

    def softmax(self, Z):
        """Return the row-wise softmax of ``Z`` (values clipped to [-1e3, 1e3] first)."""
        Z = np.clip(Z, -1e3, 1e3)
        # Shift every row by its maximum before exponentiating: the result is
        # mathematically unchanged, but exp() can no longer overflow
        # (np.exp(1e3) is inf in float64, which turned rows into nan).
        Z = Z - np.max(Z, axis=1, keepdims=True)
        numerator = np.exp(Z)
        return numerator/np.sum(numerator, axis=1, keepdims=True)

    def predict(self, X):
        """Return the softmax outputs for ``X``, one row per sample.

        NOTE(review): these are continuous scores, not labels; this is
        presumably what triggers the "mix of multilabel-indicator and
        continuous-multioutput" error with scoring='accuracy' — confirm.
        """
        Xn = self.basismap(X)
        return self.softmax(np.dot(Xn, self.W.T))

    def grad(self, Xn, Y):
        """Return the gradient with respect to ``W`` for already-mapped inputs ``Xn``."""
        Yh = self.softmax(np.dot(Xn, self.W.T))
        return -np.dot(Y.T-Yh.T, Xn)/Xn.shape[0] + self.reg*self.W

    def _loss_mapped(self, Xn, Y):
        """Return the loss for already-mapped inputs ``Xn``."""
        Yh = self.softmax(np.dot(Xn, self.W.T))
        # NOTE(review): the penalty is subtracted here while grad() adds
        # reg*W; the formula is kept as written — confirm the intended sign
        # and scaling.
        return -np.mean(np.mean(Y*np.log(Yh))) - self.reg*np.trace(np.dot(self.W, self.W.T))/self.W.shape[0]

    def loss(self, X, Y):
        """Return the loss of the current weights on raw inputs ``X`` and targets ``Y``."""
        return self._loss_mapped(self.basismap(X), Y)
#make data
# Three-class toy problem with two informative features; the labels are then
# binarised into one indicator column per class.
X, Y = make_classification(n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3,
n_clusters_per_class=1, random_state=31)
lb = LabelBinarizer()
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25, random_state=5)
#model optimization
# Grid over the step size and the regularisation weight.
param_grid = {'learn_rate': [0.1, 0.01, 0.001],
'reg': [0.001, 0.01]
}
clf = LogisticClassifier(basis='rectifier')
gs_cv = GridSearchCV(clf, param_grid, scoring='accuracy').fit(Xtrain, Ytrain)
print('Best hyperparameters: %r' % gs_cv.best_params_)
In the get_params method you read self.itrs, but your object has no such attribute.
I also suggest that you change the fit signature to something like fit(self, X, y), and that you:
Pass all the parameters in __init__.
Split X and y into train and test sets using sklearn.cross_validation.train_test_split.
That would make your code more sklearn-like and more compatible with the library's functions.