I am trying different machine learning projects from Kaggle to make myself better. Here is the model that I am using:
from sklearn.ensemble import RandomForestClassifier
# Titanic tutorial script as posted in the question: builds one-hot encoded
# features, then tries to train a random forest and write predictions to CSV.
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
# NOTE(review): this is an assignment, not a call. It rebinds the instance
# attribute `fit` to the tuple (X, y), so no training ever happens.
model.fit = (X, y)
# Fails here with NotFittedError because the forest was never fitted.
predictions = model.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index = False)
print('Your submission was successfully saved!')
Here is the error I get:
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
/tmp/ipykernel_33/1528591149.py in <module>
9 forest_clf = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
10 forest_clf.fit = (X, y)
---> 11 predictions = forest_clf.predict(X_test)
12
13 output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict(self, X)
806 The predicted classes.
807 """
--> 808 proba = self.predict_proba(X)
809
810 if self.n_outputs_ == 1:
/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X)
846 classes corresponds to that in the attribute :term:`classes_`.
847 """
--> 848 check_is_fitted(self)
849 # Check data
850 X = self._validate_X_predict(X)
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
1220
1221 if not fitted:
-> 1222 raise NotFittedError(msg % {"name": type(estimator).__name__})
1223
1224
NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
I think this is an example of the estimator cloning itself, but I am not sure which line is the issue here. This is the Titanic project that is seen on Kaggle, whose tutorial code I have copied amidst trying to learn. Any help is appreciated.
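As @Blackgaurd pointed out, just change model.fit = (X, y) to model.fit(X, y).
Your current code overwrites the fit method of your Random Forest Classifier with the tuple (X, y) instead of calling it, so the model is never trained and predict raises NotFittedError.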
Full code of yours with correction:
from sklearn.ensemble import RandomForestClassifier

# Predictor columns used by the tutorial model, and the target column.
features = ["Pclass", "Sex", "SibSp", "Parch"]
y = train_data["Survived"]

# pd.get_dummies turns the selected columns into model-ready indicator features
# for both the training and the test set.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)  # <- line of code fixed: fit is called, not assigned to
predictions = model.predict(X_test)

# Write one row per test passenger in the Kaggle submission layout.
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": predictions})
output.to_csv("submission.csv", index=False)
print("Your submission was successfully saved!")
Related
I am attempting to build a stacking classifier using multiple combinations of available models; however, when a RandomForestClassifier is among them, the loop throws the error below. The error message is followed by the code I have attempted:
'RandomForestClassifier' object has no attribute 'estimators_'. Did you mean: 'estimator_'?
import numpy as np
# Question script: tries every pair of base models as stacking estimators and
# every model as final estimator, recording test accuracy for each combination.
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
RF = RandomForestClassifier(n_estimators=500, random_state=1250, criterion='entropy', max_depth=2, min_impurity_decrease=0.5)
RF1 = RandomForestClassifier(n_estimators=500, random_state=1250, criterion='entropy', max_depth=2, min_impurity_decrease=0.2, oob_score=True)
ABC = AdaBoostClassifier(random_state=1250)
GBC = GradientBoostingClassifier(random_state=1250)
# The same four objects serve both as base estimators and as final estimators.
stackModels = [RF, RF1, GBC, ABC]
from itertools import combinations
# Every unordered pair of models, selected by index pairs from combinations().
classifier_combinations = [ list(np.array(stackModels)[list(x)]) for x in list(combinations(range(len(stackModels)), 2))]
# NOTE(review): the key is spelled 'final_estimaor' here, but the last line of
# this script appends to 'final_estimator'; that append would raise KeyError.
Stackresults = {'estimators': [],'final_estimaor': [], 'accuracy': []}
# NOTE(review): indentation was lost in this listing; the two loops are
# presumably nested, with the remaining lines as the inner loop body.
for list_class in classifier_combinations:
for classify in stackModels:
# NOTE(review): `estimators` receives a plain list of estimator objects. The
# traceback's validation message mentions "(string, estimator) tuples", so this
# is the likely cause of the AttributeError -- confirm against the docs.
CLASS = StackingClassifier(estimators = list_class, final_estimator=classify)
CLASS.fit(X_train, y_train)
ypred = CLASS.predict(X_test)
accuracy = accuracy_score(y_test, ypred)
Stackresults['accuracy'].append(accuracy)
Stackresults['estimators'].append(list_class)
Stackresults['final_estimator'].append(classify)
FULL TRACEBACK:
/var/folders/dr/9wh_z8y10fl79chj86pq7knc0000gn/T/ipykernel_7755/3533362225.py in <module>
24 for classify in stackModels:
25 CLASS = StackingClassifier(estimators = list_class, final_estimator=classify)
---> 26 CLASS.fit(X_train, y_train)
27 ypred = CLASS.predict(X_test)
28 accuracy = accuracy_score(y_test, ypred)
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
486 self._le = LabelEncoder().fit(y)
487 self.classes_ = self._le.classes_
--> 488 return super().fit(X, self._le.transform(y), sample_weight)
489
490 #if_delegate_has_method(delegate="final_estimator_")
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
148 # all_estimators contains all estimators, the one to be fitted and the
149 # 'drop' string.
--> 150 names, all_estimators = self._validate_estimators()
151 self._validate_final_estimator()
152
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_base.py in _validate_estimators(self)
245 " of (string, estimator) tuples."
...
--> 188 return iter(self.estimators_)
189
190
AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'
I'm building a sentiment analyzer. I built a model that successfully predicts the sentiment of texts, but I can't figure out how to save my entire model with pickle. I can save clf, but I can't save the vectorizer correctly.
In the function trainModel, I return featuresTrain and save it after training my model. After loading both files I run predict(), which gives the error mentioned in the title after it runs vectorizer.transform(). I thought featuresTrain contained the fitted vocabulary, so I'm confused. Any insights?
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
def trainModel(df, category, quantity):
# Train a multinomial logistic-regression classifier on n-gram counts of the
# text column `category`, with labels derived from the column `quantity`.
# Returns (fitted classifier, training feature matrix).
# NOTE(review): indentation was lost in this listing; every line down to the
# `return` presumably belongs to the function body.
# NOTE(review): the `df` argument is overwritten immediately, so whatever the
# caller passes is ignored in favour of this CSV file.
df = pd.read_csv('/Users/NOT/Desktop/VSA/datasets/cleanedData.csv')
# Random 80% sample for training; the test set is built by dropping every row
# that occurs more than once in df + train (i.e. the sampled rows).
# NOTE(review): rows already duplicated inside df are presumably dropped from
# the test set as well -- confirm.
train = df.sample(frac=0.8)
test = pd.concat([df,train]).drop_duplicates(keep=False)
# Fits the module-level `vectorizer` in place; the vocabulary lives in that
# object, not in the matrix returned by fit_transform.
featuresTrain = vectorizer.fit_transform(train[category].values.astype('U'))
featuresTest = vectorizer.transform(test[category].values.astype('U'))
# Label mapping: 4 -> 2, 2 -> 1, anything else -> 0.
trainLabels = [2 if sentiment==4 else 1 if sentiment==2 else 0 for sentiment in train[quantity]]
testLabels = [2 if sentiment==4 else 1 if sentiment==2 else 0 for sentiment in test[quantity]]
clf = sklearn.linear_model.LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 100000)
clf.fit(featuresTrain, trainLabels)
# NOTE(review): featuresTest and testLabels are computed but never used here.
# NOTE(review): the second return value is the transformed training matrix,
# not the fitted vectorizer.
return clf, featuresTrain
model, vector = trainModel(data, 'tweet', 'sentiment')
def predict(modelName, text):
# Classify one text with the fitted classifier `modelName` and return its label.
# NOTE(review): indentation was lost in this listing; the lines down to the
# `return` presumably form the function body.
# NOTE(review): uses the module-level `vectorizer`, so it only works if that
# exact object has been fitted in the current session.
vec = vectorizer.transform([text])
prediction = list(modelName.predict(vec))[0]
probs = modelName.predict_proba(vec)
# Override the prediction with 1 when neither probability column 0 nor column 2
# exceeds 0.6 (presumably the two extreme sentiment classes -- confirm).
if probs[0][0] <= .6 and probs[0][2] <= .6:
prediction = 1
return prediction
filenameP = '/Users/NOT/Desktop/VSA/SMmodel/sentimentAnalysisModel_pkl'
filenameVP = '/Users/NOT/Desktop/VSA/SMmodel/sentimentAnalysisVectorizer_pkl'
# Save the classifier, then what is meant to be the vectorizer.
# NOTE(review): the files are opened inline and never explicitly closed.
pickle.dump(model, open(filenameP, 'wb'))
# NOTE(review): `vector` is the second value returned by trainModel, i.e. the
# training feature matrix, not the fitted CountVectorizer object.
pickle.dump(vector, open(filenameVP, "wb"))
LM = pickle.load(open(filenameP, 'rb'))
# NOTE(review): LVM is loaded but never passed to predict(), which reads the
# module-level `vectorizer` instead.
LVM = pickle.load(open(filenameVP, 'rb'))
sentiment = predict(LM, transcript)
Error Traceback:
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
<ipython-input-33-f35d352629a9> in <module>
1 LM = pickle.load(open(filenameP, 'rb'))
2 LVM = pickle.load(open(filenameVP, 'rb'))
----> 3 sentiment = predict(LM, transcript)
<ipython-input-30-f97b97e7bbd5> in predict(modelName, text)
8 def predict(modelName, text):
9
---> 10 vec = vectorizer.transform([text])
11 prediction = list(modelName.predict(vec))[0]
12 probs = modelName.predict_proba(vec)
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in transform(self, raw_documents)
1250 "Iterable over raw text documents expected, "
1251 "string object received.")
-> 1252 self._check_vocabulary()
1253
1254 # use the same matrix-building strategy as fit_transform
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _check_vocabulary(self)
470 self._validate_vocabulary()
471 if not self.fixed_vocabulary_:
--> 472 raise NotFittedError("Vocabulary not fitted or provided")
473
474 if len(self.vocabulary_) == 0:
NotFittedError: Vocabulary not fitted or provided
I'm trying to build a classifier with XGBoost, and I fit it with RandomizedSearchCV.
Here is the code of my function:
def xgboost_classifier_rscv(x,y):
# Tune an XGBoost classifier with RandomizedSearchCV on text features and
# return (y_test, predicted labels, predicted probabilities) for a held-out
# 20% split. `x` is presumably an iterable of raw text documents, since it is
# fed to CountVectorizer -- confirm with the caller.
# NOTE(review): indentation was lost in this listing; every line down to the
# `return` presumably belongs to the function body.
# NOTE(review): train_test_split, CountVectorizer, TfidfTransformer and
# f1_score are used below but not imported in this function; they must come
# from elsewhere.
from scipy import stats
from xgboost import XGBClassifier
from sklearn.metrics import fbeta_score, make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
#splitting the dataset into training and test parts
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
#bag of words implementation
cv = CountVectorizer()
x_train = cv.fit_transform(x_train).toarray()
#TF-IDF implementation
vector = TfidfTransformer()
x_train = vector.fit_transform(x_train).toarray()
# NOTE(review): x_test only goes through the count vectorizer; the fitted
# TfidfTransformer `vector` is not applied to it, unlike x_train.
x_test = cv.transform(x_test)
# Metrics evaluated for every candidate; the keys are the names that `refit`
# can refer to.
scorers = {
'f1_score':make_scorer(f1_score),
'precision_score': make_scorer(precision_score),
'recall_score': make_scorer(recall_score),
'accuracy_score': make_scorer(accuracy_score)
}
# Search space: scipy distributions are sampled, lists are chosen from.
param_dist = {'n_estimators': stats.randint(150, 1000),
'learning_rate': stats.uniform(0.01, 0.59),
'subsample': stats.uniform(0.3, 0.6),
'max_depth': [3, 4, 5, 6, 7, 8, 9],
'colsample_bytree': stats.uniform(0.5, 0.4),
'min_child_weight': [1, 2, 3, 4]
}
# NOTE(review): the next line has an unbalanced ')' and refers to an undefined
# `numFolds`; it looks like a leftover fragment of a deleted statement.
n_folds = numFolds)
skf = StratifiedKFold(n_splits=3, shuffle = True)
# NOTE(review): `xgb_model` is not defined anywhere in this function.
gridCV = RandomizedSearchCV(xgb_model,
param_distributions = param_dist,
cv = skf,
n_iter = 5,
scoring = scorers,
verbose = 3,
n_jobs = -1,
return_train_score=True,
# NOTE(review): this passes the metric function itself. The traceback shows a
# callable refit being invoked as refit(results), which is what raises the
# "missing ... 'y_pred'" TypeError; a scorer name string is presumably intended.
refit = precision_score)
gridCV.fit(x_train,y_train)
best_pars = gridCV.best_params_
print("best params : ", best_pars)
xgb_predict = gridCV.predict(x_test)
xgb_pred_prob = gridCV.predict_proba(x_test)
# NOTE(review): the accompanying answer states that grid_scores_ has been
# removed in favour of cv_results_.
print('best scores : ', gridCV.grid_scores_)
scores = [x[1] for x in gridCV.grid_scores_]
print("best scores : ", scores)
return y_test, xgb_predict, xgb_pred_prob
When I run the code, I get an error, reported below:
TypeError Traceback (most recent call last)
<ipython-input-30-9adf84d48e5c> in <module>
1 print("********** Xgboost classifier *************")
2 start_time = time.monotonic()
----> 3 y_test, xgb_predict, xgb_pred_prob = xgboost_classifier_rscv(x,y)
4 end_time = time.monotonic()
5 print("the time consumed is : ", timedelta(seconds=end_time - start_time))
<ipython-input-29-e0c6ae026076> in xgboost_classifier_rscv(x, y)
70 # verbose=3, random_state=1001, refit='precision_score' )
71
---> 72 gridCV.fit(x_train,y_train)
73 best_pars = gridCV.best_params_
74 print("best params : ", best_pars)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
858 # parameter set.
859 if callable(self.refit):
--> 860 self.best_index_ = self.refit(results)
861 if not isinstance(self.best_index_, numbers.Integral):
862 raise TypeError('best_index_ returned is not an integer')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
TypeError: precision_score() missing 1 required positional argument: 'y_pred'
When I do the same thing but with GridSearchCV instead of RandomizedSearchCV, the code runs without any problems!
The refit argument must be the string 'precision_score' (in quotes, matching the key in scorers), not the function precision_score, like this:
gridCV = RandomizedSearchCV(xgb_model,
param_distributions = param_dist,
cv = skf,
n_iter = 5,
scoring = scorers,
verbose = 3,
n_jobs = -1,
return_train_score=True,
# refit is given as a string here, naming the 'precision_score' entry of
# `scorers`. Passing the function instead makes the search call it as
# refit(results), which is what produced the TypeError in the question.
refit = 'precision_score')
Another error:
grid_scores_ has been removed, so change it to cv_results_ (in the third- and fourth-from-last lines of the function):
# cv_results_ is a dict keyed by column name, so it cannot be indexed per
# candidate the way the old grid_scores_ list was (iterating it yields keys).
print('best scores : ', gridCV.cv_results_)
# Mean cross-validated test score of every sampled candidate for the refit
# metric; with multi-metric scoring the column is 'mean_test_<scorer name>'.
scores = list(gridCV.cv_results_['mean_test_precision_score'])
One more error:
You have not defined xgb_model, so add a definition before it is used:
xgb_model = XGBClassifier(n_jobs = -1, random_state = 42)
I'm having problems using the Keras functional API to estimate a vector by maximizing a likelihood.
First I minimize the error vector by maximizing the likelihood under the probability layer (i.e. minimizing its negative log-likelihood loss), and then I want to use the mean vector layer to rank embeddings similar to xc_hat.
The code is as follows:
import random as rdn
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
import tensorflow_probability as tfp
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
tfd = tfp.distributions
n_observations = 2000
n_features = 5
d_dim = 3
lr = 0.005
# Generate toy data
def make_relations(x_tr, y_tr, c_tr):
    """Build (sample, in-cluster partner, cluster centre) training triples.

    The clusters are visited once per entry of ``y_tr``, so a cluster with
    ``m`` members is visited ``m`` times and contributes its triples each
    time. Clusters with fewer than three members are skipped, and so are
    samples that coincide (``np.isclose``) with their cluster centre.

    Args:
        x_tr: Sequence of sample vectors, aligned with ``y_tr``.
        y_tr: Cluster label of each sample; each label must index ``c_tr``.
        c_tr: Cluster centres, indexable by label.

    Returns:
        Tuple ``(xa, xb, xc)`` of row-stacked arrays with the same number of
        rows: ``xa`` holds the samples, ``xb`` a randomly chosen other member
        (by position) of the same cluster, ``xc`` the cluster centre.

    Raises:
        ValueError: If no triple could be built, e.g. every cluster has fewer
            than three members.
    """
    xa, xb, xc = [], [], []
    # Membership depends only on the label, so build each list once instead
    # of rescanning all samples for every repeated label.
    members_by_label = {}
    for label in y_tr:
        members = members_by_label.get(label)
        if members is None:
            members = [k for k, lab in zip(x_tr, y_tr) if lab == label]
            members_by_label[label] = members
        if len(members) < 3:
            continue
        centre = c_tr[label]
        for i, x in enumerate(members):
            if np.isclose(x, centre).all():
                continue
            # Candidate partners: every member except the one at position i.
            others = members[:i] + members[i + 1:]
            xa.append(x)
            xc.append(centre)
            xb.append(rdn.choice(others))
    if not xa:
        # np.vstack([]) would raise an opaque ValueError; say what went wrong.
        raise ValueError(
            "no relations could be built: no cluster has at least 3 members "
            "with a sample distinct from its centre"
        )
    return np.vstack(xa), np.vstack(xb), np.vstack(xc)
# Toy data: Gaussian blobs whose number of centres is 20% of the sample count.
X, Y, C = make_blobs(n_samples=n_observations,
                     n_features=n_features,
                     centers=int(n_observations*0.2),
                     return_centers=True)
x_a, x_b, x_c = make_relations(X, Y, C)
# Split the three row-aligned arrays in ONE call so that row i of Xa, Xb and
# Xc still belongs to the same relation afterwards. Separate calls without a
# shared random_state shuffle each array differently and break the alignment.
(Xa_train, Xa_test,
 Xb_train, Xb_test,
 Xc_train, Xc_test) = train_test_split(x_a, x_b, x_c, test_size=.4)
# Add a leading axis of size 1, matching the (None, n_features) model inputs.
Xa_train = Xa_train[np.newaxis]
Xb_train = Xb_train[np.newaxis]
Xc_train = Xc_train[np.newaxis]
Xa_test = Xa_test[np.newaxis]
Xb_test = Xb_test[np.newaxis]
Xc_test = Xc_test[np.newaxis]
# Loss: negative log-probability of the target under the predicted distribution.
neg_log_likelihood = lambda y, rv_y: -rv_y.log_prob(y)
# All-ones targets shaped (1, number of rows, d_dim).
ones_train = tf.keras.backend.ones((1, Xc_train.shape[1], d_dim)).numpy()
ones_test = tf.keras.backend.ones((1, Xc_test.shape[1], d_dim)).numpy()
# Build model.
xa_xb = tf.keras.layers.Input(shape=(None, n_features), name='Xa-Xb')
# Branch 1: sigmoid dense projection of the 'Xa-Xb' input (fed Xa - Xb at fit time).
L_xa_xb = tf.keras.layers.Dense(d_dim, activation='sigmoid', name='L_Xa-Xb')(xa_xb)
# Branch 2: sigmoid dense projection of the 'Xb' input.
xb = tf.keras.layers.Input(shape=(None, n_features), name='Xb')
L_xb = tf.keras.layers.Dense(d_dim, activation='sigmoid', name='L_Xb')(xb)
# 'mean_vector' is the element-wise sum of the two projections.
mu = tf.keras.layers.Add(name='mean_vector')([L_xa_xb, L_xb])
# Branch 3: linear dense projection of the 'Xc' input.
xc = tf.keras.layers.Input(shape=(None, n_features), name='Xc')
L_xc = tf.keras.layers.Dense(d_dim, name='L_Xc')(xc)
# 'error_vector' = projected Xc minus the mean vector.
error_vector = tf.keras.layers.Subtract(name='error_vector')([L_xc, mu])
# Output distribution: Normal whose loc is the error vector and whose scale is
# exp(error vector), so both parameters come from the same tensor.
p_xc_given_xa_xb = tfp.layers.DistributionLambda(
lambda t: tfd.Normal(loc=t, scale=tf.exp(t)), name='Gaussian')(error_vector)
model = tf.keras.Model(inputs=[xa_xb, xb, xc],
outputs=p_xc_given_xa_xb, name="inner_model")
model.compile(
optimizer=tf.optimizers.Adam(learning_rate=lr),
loss=neg_log_likelihood)
# Targets are the all-ones arrays; the loss is the negative log-probability of
# those targets under the output distribution.
model.fit([Xa_train - Xb_train, Xb_train, Xc_train], ones_train,
validation_data=([Xa_test - Xb_test, Xb_test, Xc_test], ones_test),
epochs=1000,
verbose=True)
# After trained rebuild the part of the model I will use for prediction
xa_xb = model.get_layer('Xa-Xb')
# Rebuilds a two-input model ending at 'mean_vector' from the trained layers.
# NOTE(review): get_layer() returns the layer object (an InputLayer for the
# 'Xa-Xb' and 'Xb' names, per the traceback), not a tensor. Calling the Dense
# layer on it is what raises "'InputLayer' object has no attribute 'shape'".
# Presumably the layer's tensor (e.g. its .input / .output) should be passed
# instead -- confirm against the Keras functional API docs.
L_xa_xb = model.get_layer('L_Xa-Xb')(xa_xb)
xb = model.get_layer('Xb')
L_xb = model.get_layer('L_Xb')(xb)
# NOTE(review): `xc` here is the 'mean_vector' output, reusing the name that
# earlier held the 'Xc' input; `model` is likewise rebound to the sub-model.
xc = model.get_layer('mean_vector')([L_xa_xb, L_xb])
model = tf.keras.Model(inputs=[xa_xb, xb],
outputs=xc, name="inner_model")
xc_hat = model([Xa_test - Xb_test, Xb_test])
The idea is to estimate xc. However, I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-57-d94b1e8a583c> in <module>()
2
3 xa_xb = model.get_layer('Xa-Xb')
----> 4 L_xa_xb = model.get_layer('L_Xa-Xb')(xa_xb)
5
6 xb = model.get_layer('Xb')
1 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
966 with base_layer_utils.autocast_context_manager(
967 self._compute_dtype):
--> 968 outputs = self.call(cast_inputs, *args, **kwargs)
969 self._handle_activity_regularization(inputs, outputs)
970 self._set_mask_metadata(inputs, outputs, input_masks)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/layers/core.py in call(self, inputs)
1178
1179 def call(self, inputs):
-> 1180 rank = inputs.shape.rank
1181 if rank is not None and rank > 2:
1182 # Broadcasting is required for the inputs.
AttributeError: 'InputLayer' object has no attribute 'shape'
I'm using Google Colaboratory.
I am trying to train an XGBoost classification model, which I have done several times before. This time I am trying to do a hyperparameter grid search with cross-validation using xgboost.cv. Every time I run my code it raises a KeyError.
I also tried using just xgboost.train with some default parameters; when I use the resulting model to predict on the same DMatrix, it predicts everything as null.
Here is my DMatrix. I have missing values in 4 features, for which I specified missing = np.nan in the DMatrix:
xgbmat_train = xgb.DMatrix(X_train.values,label=
Y_train.values,missing=np.nan,weight = train_weights)
xgbmat_test = xgb.DMatrix(X_test.values,label=Y_test.values,missing=np.nan,weight=test_weights)
These are my initial parameters
initial_params = {'learning_rate':0.1,'n_estimators':1000,'objective':'binary:logistic','booster':'gbtree','reg_alpha':0,
# Base parameter dict passed to xgb.cv; 'max_depth' and 'min_child_weight' are
# overwritten on each grid-search iteration.
# NOTE(review): 'n_estimators', 'missing', 'n_jobs' and 'silent' look like
# scikit-learn-wrapper arguments rather than native booster parameters --
# TODO confirm against the installed xgboost version.
'reg_lambda':1,'max_depth':5,'min_child_weight':1,'gamma':0,'subsample':0.8,'colsample_bytree':0.8,
'scale_pos_weight':1,'missing':np.nan,'seed':27,'eval_metric':'auc','n_jobs':32,'silent':True}
These are my gridsearch parameters
gridsearch_params = [(max_depth,min_child_weight)
for max_depth in range(4,10)
for min_child_weight in range(1,6)]
Below is the loop where I am doing a gridsearch
max_auc = 0.0
# Manual grid search: run xgb.cv for every (max_depth, min_child_weight) pair
# and remember the pair with the highest mean test AUC.
# NOTE(review): best_params starts as an empty string, so the final print's
# best_params[0] would raise IndexError if no candidate ever beats max_auc.
best_params = ''
print(gc.collect())
# NOTE(review): indentation was lost in this listing, so the extent of the
# loop body cannot be determined from here.
for max_depth, min_child_weight in gridsearch_params:
print(gc.collect())
# NOTE(review): the string literal below is split across two lines, which is
# not valid as written; presumably a line-wrapping artifact of the post.
print("CV with max_depth = {}, min_child_weight=
{}".format(max_depth,min_child_weight))
initial_params['max_depth'] = max_depth
initial_params['min_child_weight'] = min_child_weight
# The KeyError: 'best_msg' in the traceback is raised inside this call, in the
# early-stopping callback.
cv_results = xgb.cv(initial_params,
xgbmat_train,
num_boost_round = 200,
seed = 42,
stratified = True,
shuffle=True,
nfold=3,
metrics={'auc'},
early_stopping_rounds = 50)
# Best mean test AUC over the boosting rounds, and the result of argmax() on
# that same column.
mean_auc = cv_results['test-auc-mean'].max()
boost_rounds = cv_results['test-auc-mean'].argmax()
# NOTE(review): this appends cv_results to itself; the result is only printed.
cv_results = cv_results.append(cv_results)
if mean_auc > max_auc:
max_auc = mean_auc
best_params = (max_depth,min_child_weight)
print(gc.collect())
print(cv_results)
print(mean_auc)
print(boost_rounds)
# NOTE(review): the label says "aucpr", but the value tracked above comes from
# the 'test-auc-mean' column.
print("Best param: {}, {}, aucpr: {}".format(best_params[0],best_params[1],max_auc))
This is the error I am getting while running the above code
KeyError Traceback (most recent call
last)
<ipython-input-15-f546ef27594f> in <module>
15 nfold=3,
16 metrics={'auc'},
---> 17 early_stopping_rounds = 50)
18 mean_auc = cv_results['test-auc-mean'].max()
19 boost_rounds = cv_results['test-auc-mean'].argmax()
~/anaconda3/lib/python3.7/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
461 end_iteration=num_boost_round,
462 rank=0,
--> 463 evaluation_result_list=res))
464 except EarlyStopException as e:
465 for k in results:
~/anaconda3/lib/python3.7/site-packages/xgboost/callback.py in callback(env)
243 best_msg=state['best_msg'])
244 elif env.iteration - best_iteration >= stopping_rounds:
--> 245 best_msg = state['best_msg']
246 if verbose and env.rank == 0:
247 msg = "Stopping. Best iteration:\n{}\n\n"
KeyError: 'best_msg'
I tried filling NAs with -9999.0 and specifying the same value in the missing argument of DMatrix, but it throws the same error. I am working against a hard deadline; any help will be deeply appreciated.