I'm trying to run an average model with Python, but when I run it I get a ValueError. What is wrong, and how can I fix my code?
My code looks like this:
xx_train = x_train[cols_selected_boruta]
xx_test = x_test[cols_selected_boruta]
Two functions:
def mean_absolute_percentage_error(y, yhat):
    # MAPE: mean of the absolute relative errors
    return np.mean(np.abs((y - yhat) / y))

def ml_error(model_name, y, yhat):
    # collect MAE, MAPE and RMSE in a one-row DataFrame
    mae = mean_absolute_error(y, yhat)
    mape = mean_absolute_percentage_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    return pd.DataFrame({'Model Name': model_name, 'MAE': mae, 'MAPE': mape, 'RMSE': rmse}, index=[0])
When I run this snippet, I get the error:
aux1 = xx_test.copy()
aux1['sales'] = y_test.copy()

# prediction
aux2 = aux1[['store', 'sales']].groupby('store').mean().reset_index().rename(columns={'sales': 'predictions'})
aux1 = pd.merge(aux1, aux2, how='left', on='store')
yhat_baseline = aux1['predictions']

# performance
baseline_result = ml_error('Average Model', np.expm1(y_test), np.expm1(yhat_baseline))
Full error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[96], line 10
7 yhat_baseline = aux1['predictions']
9 #performance
---> 10 baseline_result = ml_error('Average Model', np.expm1(y_test), np.expm1(yhat_baseline))
12 baseline_result
Cell In[95], line 6, in ml_error(model_name, y, yhat)
5 def ml_error( model_name, y, yhat ):
----> 6 mae = mean_absolute_error( y, yhat )
7 mape = mean_absolute_percentage_error( y, yhat )
8 rmse = np.sqrt( mean_squared_error( y, yhat ) )
File ~/opt/anaconda3/envs/ds_em_prod/lib/python3.9/site-packages/sklearn/metrics/_regression.py:191, in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
135 def mean_absolute_error(
136 y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
137 ):
138 """Mean absolute error regression loss.
139
140 Read more in the :ref:`User Guide <mean_absolute_error>`.
(...)
189 0.85...
190 """
--> 191 y_type, y_true, y_pred, multioutput = _check_reg_targets(
192 y_true, y_pred, multioutput
193 )
194 check_consistent_length(y_true, y_pred, sample_weight)
195 output_errors = np.average(np.abs(y_pred - y_true), weights=sample_weight, axis=0)
File ~/opt/anaconda3/envs/ds_em_prod/lib/python3.9/site-packages/sklearn/metrics/_regression.py:94, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
60 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
61 """Check that y_true and y_pred belong to the same regression task.
62
63 Parameters
(...)
92 the dtype argument passed to check_array.
93 """
---> 94 check_consistent_length(y_true, y_pred)
95 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
96 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File ~/opt/anaconda3/envs/ds_em_prod/lib/python3.9/site-packages/sklearn/utils/validation.py:332, in check_consistent_length(*arrays)
330 uniques = np.unique(lengths)
331 if len(uniques) > 1:
--> 332 raise ValueError(
333 "Found input variables with inconsistent numbers of samples: %r"
334 % [int(l) for l in lengths]
335 )
ValueError: Found input variables with inconsistent numbers of samples: [802942, 41396]
I was expecting to get baseline_result.
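The traceback shows the two inputs to ml_error have different lengths: y_test has 802942 samples while yhat_baseline has 41396, so they cannot come from the same test split. A minimal sanity check, assuming the variable names above and that y_test is a pandas Series (the reset_index step is a guess at the fix, not a confirmed cause):

# both inputs to ml_error must have the same number of samples
print(len(y_test), len(yhat_baseline))  # here: 802942 vs 41396

# if y_test really belongs to this test split, resetting indices before
# the assignment avoids silent index-based alignment in pandas
aux1 = xx_test.reset_index(drop=True).copy()
aux1['sales'] = y_test.reset_index(drop=True)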
I'm brand new to Python and machine learning, and I'm surely missing something.
I'm training a RandomForest model through nested CV for hyperparameter tuning, plus RFECV, using a pipeline. I retrieved best_estimator_.n_features and it still shows the 17 original features, not the 3 that RFECV narrowed them down to.
X is a DataFrame of 1182 rows × 17 columns.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
clf = RandomForestClassifier(random_state=42, n_jobs=-1, criterion='entropy', bootstrap=False)
space = {'n_estimators': [900, 1000, 1100],
         'max_depth': [25, 50, 100],
         'min_samples_split': [500, 750, 1000],
         'min_samples_leaf': [32, 64]}
search = GridSearchCV(clf, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)
rfe = RFECV(estimator=RandomForestClassifier())
ppln = Pipeline(steps=[('rfe',rfe),('grid',search)])
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(ppln, X, y.ravel(), scoring='accuracy', cv=cv_outer, n_jobs=-1)
ppln.fit(X, y.ravel())
After fitting the pipeline, I tried to predict new data (fixt) with the original 17 features. However, the error message was: "ValueError: Number of features of the model must match the input. Model n_features is 17 and input n_features is 3."
fixtureXLS = pd.read_excel('aaafixtures.xlsx')
fixtureXLS.to_csv('bbbfixtures.csv', encoding='utf-8')
fixt = pd.read_csv('bbbfixtures.csv')
fixt = fixt.loc[:, ~fixt.columns.str.contains('^Unnamed')]
if 'Result' in fixt.columns:
    fixt = fixt.drop(['Result'], axis=1)
fixt is a DataFrame of 287 rows × 17 columns.
fixt['Predicted'] = ppln.predict(fixt)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-164-e54f4c6f6e05> in <module>
----> 1 temp = ppln.predict(fixt)
~\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
117
118 # lambda, but not partial, allows help() to work with update_wrapper
--> 119 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
120 # update the docstring of the returned function
121 update_wrapper(out, self.fn)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X, **predict_params)
406 for _, name, transform in self._iter(with_final=False):
407 Xt = transform.transform(Xt)
--> 408 return self.steps[-1][-1].predict(Xt, **predict_params)
409
410 @if_delegate_has_method(delegate='_final_estimator')
~\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
117
118 # lambda, but not partial, allows help() to work with update_wrapper
--> 119 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
120 # update the docstring of the returned function
121 update_wrapper(out, self.fn)
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in predict(self, X)
485 """
486 self._check_is_fitted('predict')
--> 487 return self.best_estimator_.predict(X)
488
489 @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in predict(self, X)
627 The predicted classes.
628 """
--> 629 proba = self.predict_proba(X)
630
631 if self.n_outputs_ == 1:
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in predict_proba(self, X)
671 check_is_fitted(self)
672 # Check data
--> 673 X = self._validate_X_predict(X)
674
675 # Assign chunk of trees to jobs
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in _validate_X_predict(self, X)
419 check_is_fitted(self)
420
--> 421 return self.estimators_[0]._validate_X_predict(X, check_input=True)
422
423 @property
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
394 n_features = X.shape[1]
395 if self.n_features_ != n_features:
--> 396 raise ValueError("Number of features of the model must "
397 "match the input. Model n_features is %s and "
398 "input n_features is %s "
ValueError: Number of features of the model must match the input. Model n_features is 17 and input n_features is 3
I transformed fixt down to the 3 features and called the pipeline's predict:
X_new = rfe.transform(fixt)
print(X_new.shape[1])
fixt['Predicted'] = ppln.predict(X_new)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-161-02280f45be5a> in <module>
----> 1 fixt['Predicted'] = ppln.predict(X_new)
~\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
117
118 # lambda, but not partial, allows help() to work with update_wrapper
--> 119 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
120 # update the docstring of the returned function
121 update_wrapper(out, self.fn)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X, **predict_params)
405 Xt = X
406 for _, name, transform in self._iter(with_final=False):
--> 407 Xt = transform.transform(Xt)
408 return self.steps[-1][-1].predict(Xt, **predict_params)
409
~\anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in transform(self, X)
82 return np.empty(0).reshape((X.shape[0], 0))
83 if len(mask) != X.shape[1]:
---> 84 raise ValueError("X has a different shape than during fitting.")
85 return X[:, safe_mask(X, mask)]
86
ValueError: X has a different shape than during fitting.
Can you shed some light on this, please?
I don't know if there is an automated way to do it, but I created a new pipeline with a RandomForestClassifier taken from the best estimator of the previous pipeline, fitted it, and then predicted. I had to apply RFE first, though.
Instead of ppln.fit(X, y.ravel()), the final code was:
params = search.best_estimator_.get_params()  # hyperparameters found by the inner grid search
rfc = RandomForestClassifier(**params)
ppln_new = Pipeline(steps=[('rfe', rfe), ('pred', rfc)])
ppln_new.fit(X, y.ravel())
fixt['Predicted'] = ppln_new.predict(fixt)  # raw 17-feature frame; the pipeline applies RFE itself
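A quick check of that design, assuming rfe is the fitted RFECV from above: the selector and the final estimator are now trained on consistent feature sets inside one pipeline, so new 17-feature data can go straight into predict.

# RFECV exposes how many features survived selection
print(rfe.n_features_)             # e.g. 3
print(ppln_new.predict(fixt)[:5])  # predictions from the raw 17-column frame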
I am testing some open-source code but I keep running into this error. I am passing a CRF model into cross_val_predict, and that's where the error starts. pystruct is used for the CRF model. The model is built from sparse matrices, which I think is causing the problem, but I am not entirely sure.
This is the code:
def evaluate_dataset(config):
    # Parameters
    folds = config['folds']
    results = {}
    crf = config['clf']
    sample_size = config['sample_size'] if 'sample_size' in config else None
    paths = list(Path(config['dir']).iterdir())
    preprocessor = SectionPreprocess(ground_truth=True)
    paths, X, y = preprocessor.preprocess(paths, sample_size=sample_size)

    # Dataset information
    results['num_binaries'] = len(y)
    results['avg_code_fraction'] = np.mean([np.mean(yy) for yy in y])
    results['params'] = crf.get_params()

    # Cross-validation
    # logreg = LogisticRegression()
    results['cv_folds'] = folds
    start = time.time()
    y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=1)
    end = time.time()
    results['cv_time'] = (end - start) / folds

    values = [
        (metrics.accuracy_score(true, pred),) + metrics.precision_recall_fscore_support(true, pred, average='binary')
        for (true, pred) in zip(y, y_pred)
    ]
    accuracy, precision, recall, f1, _ = zip(*values)
    results['cv_metrics'] = {
        'accuracy': (np.mean(accuracy), np.std(accuracy)),
        'precision': (np.mean(precision), np.std(precision)),
        'recall': (np.mean(recall), np.std(recall)),
        'f1': (np.mean(f1), np.std(f1)),
    }
    return results
evaluation_folder = Path('evaluation')
evaluation_folder.mkdir(exist_ok=True)
for (name, config) in configurations.items():
    print(config)
    %time results = evaluate_dataset(config)
    print("## " + name)
    print(results)
    print('')
    file_path = evaluation_folder / (name + '.json')
    with file_path.open('w') as f:
        json.dump(results, f)
And this is the error:
ValueError Traceback (most recent call last)
in
in evaluate_dataset(config)
24
25 start = time.time()
---> 26 y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=1)
27 end = time.time()
28 results['cv_time'] = (end - start) / folds
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
399 prediction_blocks = parallel(delayed(_fit_and_predict)(
400 clone(estimator), X, y, train, test, verbose, fit_params, method)
--> 401 for train, test in cv_iter)
402
403 # Concatenate the predictions
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in (.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method)
472 estimator.fit(X_train, **fit_params)
473 else:
--> 474 estimator.fit(X_train, y_train, **fit_params)
475 func = getattr(estimator, method)
476 predictions = func(X_test)
~/Google Drive (gdk244@nyu.edu)/MoMa Lab /CRF/code_section_identification/crf_models/CRFModel.py in fit(self, X, y, **fit_params)
74 verbose=self.verbose
75 )
---> 76 self._ssvm.fit(X, y)
77
78 return self
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/learners/frankwolfe_ssvm.py in fit(self, X, Y, constraints, initialize)
295 self._frank_wolfe_batch(X, Y)
296 else:
--> 297 self._frank_wolfe_bc(X, Y)
298 except KeyboardInterrupt:
299 pass
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/learners/frankwolfe_ssvm.py in _frank_wolfe_bc(self, X, Y)
220 i = perm[j]
221 x, y = X[i], Y[i]
--> 222 y_hat, delta_joint_feature, slack, loss = find_constraint(self.model, x, y, w)
223 # ws and ls
224 ws = delta_joint_feature * self.C
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/utils/inference.py in find_constraint(model, x, y, w, y_hat, relaxed, compute_difference)
63
64 if y_hat is None:
---> 65 y_hat = model.loss_augmented_inference(x, y, w, relaxed=relaxed)
66 joint_feature = model.joint_feature
67 if getattr(model, 'rescale_C', False):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/models/crf.py in loss_augmented_inference(self, x, y, w, relaxed, return_energy)
104 pairwise_potentials = self._get_pairwise_potentials(x, w)
105 edges = self._get_edges(x)
--> 106 loss_augment_unaries(unary_potentials, np.asarray(y), self.class_weight)
107
108 return inference_dispatch(unary_potentials, pairwise_potentials, edges,
utils.pyx in utils.loss_augment_unaries (src/utils.c:5132)()
ValueError: Buffer dtype mismatch, expected 'double' but got Python object
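The failing call is loss_augment_unaries(unary_potentials, np.asarray(y), self.class_weight), whose Cython signature expects double buffers; "got Python object" means one of those arrays has dtype=object. A hedged sketch of the kind of dtype normalization that often resolves this, assuming y is the list of label sequences and crf is the estimator from the config (the exact structure of X depends on your CRF wrapper, so treat this as a pattern rather than a drop-in fix):

import numpy as np

# class weights must be a float64 ('double') array, not a list of
# Python objects, for the Cython buffer check to pass
if getattr(crf, 'class_weight', None) is not None:
    crf.class_weight = np.asarray(crf.class_weight, dtype=np.float64)

# each label sequence as a contiguous integer ndarray, not a list
y = [np.ascontiguousarray(yy, dtype=np.int64) for yy in y]

# likewise, make each feature array float64 rather than an object
# array wrapping sparse matrices (densify only if memory allows)
X = [np.ascontiguousarray(xx.toarray() if hasattr(xx, 'toarray') else xx,
                          dtype=np.float64) for xx in X]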
I need to write a metric that would allow me to score some models on an extremely unbalanced multiclass classification problem.
For this purpose I have written my own custom metric for tensorflow.keras:
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

weight_vector = [class_weights[i] for i in range(n_classes)]

# adapted from https://www.kaggle.com/guglielmocamporese/macro-f1-score-keras
def weighted_macro_f1_score(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, n_classes))
    y_pred = tf.reshape(y_pred, shape=(-1, n_classes))
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.one_hot(y_pred, depth=n_classes)
    aux = np.array(weight_vector, dtype=np.float32)
    weights = np.vstack([aux] * y_true.shape[0])
    tp = K.sum(K.cast(y_true * y_pred * weights, 'float'))
    fp = K.sum(K.cast((1 - y_true) * y_pred * weights, 'float'))
    fn = K.sum(K.cast(y_true * (1 - y_pred) * weights, 'float'))
    p = tp / (tp + fp + K.epsilon())
    # print(f"p = {p}")
    r = tp / (tp + fn + K.epsilon())
    # print(f"r = {r}")
    f1 = 2 * p * r / (p + r + K.epsilon())
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)
When I test it on some numpy arrays in eager mode I get the expected results.
However, when I feed it to model.fit I get this error:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=metrics,
    sample_weight_mode="temporal")

history = model.fit(X, Y, epochs=20, sample_weight=sample_weight)
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-6dbe62cf6d4e> in <module>()
3 loss='categorical_crossentropy',
4 metrics=metrics,
----> 5 sample_weight_mode="temporal")
6
7 history = model.fit(X, Y, epochs=20, sample_weight=sample_weight)
10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/checkpointable/base.py in _method_wrapper(self, *args, **kwargs)
440 self._setattr_tracking = False # pylint: disable=protected-access
441 try:
--> 442 method(self, *args, **kwargs)
443 finally:
444 self._setattr_tracking = previous_value # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in compile(self, optimizer, loss, metrics, loss_weights, sample_weight_mode, weighted_metrics, target_tensors, distribute, **kwargs)
497 targets=self.targets,
498 skip_target_indices=skip_target_indices,
--> 499 sample_weights=self.sample_weights)
500
501 # Prepare gradient updates and state updates.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _handle_metrics(self, outputs, skip_target_indices, targets, sample_weights, masks, return_stateful_result)
1842 output,
1843 output_mask,
-> 1844 return_stateful_result=return_stateful_result))
1845 metric_results.extend(
1846 self._handle_per_output_metrics(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _handle_per_output_metrics(self, metrics_dict, y_true, y_pred, mask, weights, return_stateful_result)
1798 # In graph mode, we build the sub-graph for both the stateful and the
1799 # stateless fns.
-> 1800 stateful_metric_result = _call_stateful_fn(stateful_fn)
1801 metric_result = _call_stateless_fn(metric_fn)
1802 _track_metric_tensors(metric_name, metric_result,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _call_stateful_fn(fn)
1771 def _call_stateful_fn(fn):
1772 return training_utils.call_metric_function(
-> 1773 fn, y_true, y_pred, weights=weights, mask=mask)
1774
1775 def _call_stateless_fn(fn):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training_utils.py in call_metric_function(metric_fn, y_true, y_pred, weights, mask)
850 """Invokes metric function and returns the metric result tensor."""
851 if mask is None:
--> 852 return metric_fn(y_true, y_pred, sample_weight=weights)
853
854 mask = math_ops.cast(mask, y_pred.dtype)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/metrics.py in __call__(self, *args, **kwargs)
436 The metric value tensor.
437 """
--> 438 update_op = self.update_state(*args, **kwargs)
439 with ops.control_dependencies([update_op]):
440 result_t = self.result()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/metrics.py in decorated(metric_obj, *args, **kwargs)
96 """Decorated function with `add_update()`."""
97
---> 98 update_op = update_state_fn(*args, **kwargs)
99 if update_op is not None: # update_op will be None in eager execution.
100 metric_obj.add_update(update_op, inputs=True)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/metrics.py in update_state(self, y_true, y_pred, sample_weight)
647 y_pred, y_true, sample_weight)
648
--> 649 matches = self._fn(y_true, y_pred, **self._fn_kwargs)
650 return super(MeanMetricWrapper, self).update_state(
651 matches, sample_weight=sample_weight)
<ipython-input-11-030fcade0122> in weighted_macro_f1_score(y_true, y_pred)
8
9 aux = np.array(weight_vector, dtype=np.float32)
---> 10 weights = np.vstack([aux]*y_true.shape[0])
11
12 tp = K.sum(K.cast(y_true*y_pred*weights, 'float'))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_shape.py in __rmul__(self, other)
408 A Dimension whose value is the product of `self` and `other`.
409 """
--> 410 return self * other
411
412 def __floordiv__(self, other):
TypeError: __index__ returned non-int (type NoneType)
I am a bit baffled by this result. The metric worked just fine without the weight-balancing modifications, but now it refuses to work.
What is going on, and how do I fix it?
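The traceback points at weights = np.vstack([aux] * y_true.shape[0]): when Keras builds the metric graph, the batch dimension of y_true is a symbolic Dimension whose value is None, so multiplying a list by it raises the TypeError (in eager mode the shape is a concrete int, which is why the numpy test passed). A sketch of one way around it, keeping the per-class weights as a rank-1 tensor and letting broadcasting supply the batch dimension:

def weighted_macro_f1_score(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, n_classes))
    y_pred = tf.reshape(y_pred, shape=(-1, n_classes))
    y_pred = tf.one_hot(tf.argmax(y_pred, axis=-1), depth=n_classes)
    # rank-1 constant of shape (n_classes,); broadcasting against the
    # (batch, n_classes) tensors replaces np.vstack over a None dimension
    weights = tf.constant(weight_vector, dtype=tf.float32)
    tp = K.sum(K.cast(y_true * y_pred, 'float32') * weights)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float32') * weights)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float32') * weights)
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())
    f1 = 2 * p * r / (p + r + K.epsilon())
    return tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)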
I am trying to predict item prices using DNNRegressor and I can't figure out this error that keeps coming up. I created TF numeric and categorical columns from a pandas DataFrame and fed them into the DNNRegressor. There is not much help online regarding this particular error.
Please help me fix this error. Thanks.
AttributeError Traceback (most recent call last)
<ipython-input-27-790ecef8c709> in <module>()
92
93 if __name__ == '__main__':
---> 94 main()
<ipython-input-27-790ecef8c709> in main()
81 # learning_rate=0.1, l1_regularization_strength=0.001))
82 est = tf.estimator.DNNRegressor(feature_columns = feature_columns, hidden_units = [10, 10], model_dir = 'data')
---> 83 est.train(input_fn = get_train_input_fn(Xtrain, ytrain), steps = 500)
84 scores = est.evaluate(input_fn = get_test_input_fn(Xtest, ytest))
85 print('Loss Score: {0:f}' .format(scores['average_loss']))
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps)
239 hooks.append(training.StopAtStepHook(steps, max_steps))
240
--> 241 loss = self._train_model(input_fn=input_fn, hooks=hooks)
242 logging.info('Loss for final step: %s.', loss)
243 return self
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks)
628 input_fn, model_fn_lib.ModeKeys.TRAIN)
629 estimator_spec = self._call_model_fn(features, labels,
--> 630 model_fn_lib.ModeKeys.TRAIN)
631 ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
632 all_hooks.extend(hooks)
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode)
613 if 'config' in model_fn_args:
614 kwargs['config'] = self.config
--> 615 model_fn_results = self._model_fn(features=features, **kwargs)
616
617 if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
389 dropout=dropout,
390 input_layer_partitioner=input_layer_partitioner,
--> 391 config=config)
392 super(DNNRegressor, self).__init__(
393 model_fn=_model_fn, model_dir=model_dir, config=config)
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _dnn_model_fn(features, labels, mode, head, hidden_units, feature_columns, optimizer, activation_fn, dropout, input_layer_partitioner, config)
100 net = feature_column_lib.input_layer(
101 features=features,
--> 102 feature_columns=feature_columns)
103
104 for layer_id, num_hidden_units in enumerate(hidden_units):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in input_layer(features, feature_columns, weight_collections, trainable)
205 ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
206 """
--> 207 _check_feature_columns(feature_columns)
208 for column in feature_columns:
209 if not isinstance(column, _DenseColumn):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _check_feature_columns(feature_columns)
1660 name_to_column = dict()
1661 for column in feature_columns:
-> 1662 if column.name in name_to_column:
1663 raise ValueError('Duplicate feature column name found for columns: {} '
1664 'and {}. This usually means that these columns refer to '
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in name(self)
2451 @property
2452 def name(self):
-> 2453 return '{}_indicator'.format(self.categorical_column.name)
2454
2455 def _transform_feature(self, inputs):
AttributeError: 'str' object has no attribute 'name'
And below is the code:
def get_train_input_fn(Xtrain, ytrain):
    return tf.estimator.inputs.pandas_input_fn(
        x=Xtrain,
        y=ytrain,
        batch_size=30,
        num_epochs=None,
        shuffle=True)

def get_test_input_fn(Xtest, ytest):
    return tf.estimator.inputs.pandas_input_fn(
        x=Xtest,
        y=ytest,
        batch_size=32,
        num_epochs=1,
        shuffle=False)
def main():
    Xtrain, Xtest, ytrain, ytest = train_test_split(merc, ytr, test_size=0.4, random_state=42)

    feature_columns = []
    brand_rating = tf.feature_column.numeric_column('brand_rating')
    feature_columns.append(brand_rating)
    sentiment = tf.feature_column.numeric_column('description_polarity')
    feature_columns.append(sentiment)
    item_condition = tf.feature_column.numeric_column('item_condition_id')
    feature_columns.append(item_condition)
    shipping = tf.feature_column.indicator_column('shipping')
    feature_columns.append(shipping)
    name = tf.feature_column.embedding_column('item_name', 34)  # (column name, dimension (no. of unique values ** 0.25))
    feature_columns.append(name)
    general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
    feature_columns.append(general)
    sc1 = tf.feature_column.categorical_column_with_hash_bucket('SC1', 120)
    feature_columns.append(sc1)
    sc2 = tf.feature_column.categorical_column_with_hash_bucket('SC2', 900)
    feature_columns.append(sc2)
    print(feature_columns)

    # est = tf.estimator.DNNRegressor(feature_columns, hidden_units=[10, 10], optimizer=tf.train.ProximalAdagradOptimizer(
    #     learning_rate=0.1, l1_regularization_strength=0.001))
    est = tf.estimator.DNNRegressor(feature_columns=feature_columns, hidden_units=[10, 10], model_dir='data')
    est.train(input_fn=get_train_input_fn(Xtrain, ytrain), steps=500)
The first argument to tf.feature_column.embedding_column must be a categorical column, not a string. See API spec.
The offending line in your code is:
tf.feature_column.embedding_column('item_name', 34)
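A sketch of the corrected construction, with a hypothetical hash bucket size (pick one suited to the column's cardinality):

# wrap the raw column name in a categorical column first, then embed it
item_name_cat = tf.feature_column.categorical_column_with_hash_bucket('item_name', 1000)
name = tf.feature_column.embedding_column(item_name_cat, 34)
feature_columns.append(name)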
After using
general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
and other feature_column.categorical_column_with..., you should use
general_indicator = tf.feature_column.indicator_column(general)
and then append it to your feature_columns list.
feature_columns.append(general_indicator)
I want to use GridSearchCV to find the best parameters, with f1 as the scoring metric.
If I remove the scoring argument, everything works and I get no errors.
Here is my code:
from sklearn import grid_search

parameters = {'n_neighbors': (1, 3, 5, 10, 15),
              'weights': ('uniform', 'distance'),
              'algorithm': ('ball_tree', 'kd_tree', 'brute'),
              'leaf_size': (5, 10, 20, 30, 50)}
reg = grid_search.GridSearchCV(estimator=neigh, param_grid=parameters, scoring="f1")
train_classifier(reg, X_train, y_train)
train_f1_score = predict_labels(reg, X_train, y_train)
print reg.best_params_
print "F1 score for training set: {}".format(train_f1_score)
print "F1 score for test set: {}".format(predict_labels(reg, X_test, y_test))
When I execute it I get pages upon pages of errors, and I cannot make heads or tails of them :(
ValueError Traceback (most recent call last)
<ipython-input-17-3083ff8a20ea> in <module>()
3 parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
4 reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
----> 5 train_classifier(reg, X_train, y_train)
6 train_f1_score = predict_labels(reg, X_train, y_train)
7 print reg.best_params_
<ipython-input-9-b56ce25fd90b> in train_classifier(clf, X_train, y_train)
5 print "Training {}...".format(clf.__class__.__name__)
6 start = time.time()
----> 7 clf.fit(X_train, y_train)
8 end = time.time()
9 print "Done!\nTraining time (secs): {:.3f}".format(end - start)
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
802 self._iterating = True
803
--> 804 while self.dispatch_one_batch(iterator):
805 pass
806
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
660 return False
661 else:
--> 662 self._dispatch(tasks)
663 return True
664
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
568
569 if self._pool is None:
--> 570 job = ImmediateComputeBatch(batch)
571 self._jobs.append(job)
572 self.n_dispatched_batches += 1
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
181 # Don't delay the application, to avoid keeping the input
182 # arguments in memory
--> 183 self.results = batch()
184
185 def get(self):
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1604 score = scorer(estimator, X_test)
1605 else:
-> 1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
//anaconda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, estimator, X, y_true, sample_weight)
88 else:
89 return self._sign * self._score_func(y_true, y_pred,
---> 90 **self._kwargs)
91
92
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
637 return fbeta_score(y_true, y_pred, 1, labels=labels,
638 pos_label=pos_label, average=average,
--> 639 sample_weight=sample_weight)
640
641
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
754 average=average,
755 warn_for=('f-score',),
--> 756 sample_weight=sample_weight)
757 return f
758
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
982 else:
983 raise ValueError("pos_label=%r is not a valid label: %r" %
--> 984 (pos_label, present_labels))
985 labels = [pos_label]
986 if labels is None:
ValueError: pos_label=1 is not a valid label: array(['no', 'yes'],
dtype='|S3')
It seems your label array has the values 'no' and 'yes'; you should convert them to a binary 1/0 numerical representation, because the error says the scoring function cannot tell which labels count as 0 and which as 1.
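A minimal sketch of that conversion, assuming y_train and y_test are numpy arrays of strings:

import numpy as np

# map 'yes' -> 1 and 'no' -> 0 so pos_label=1 (the default) exists
y_train = np.where(y_train == 'yes', 1, 0)
y_test = np.where(y_test == 'yes', 1, 0)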
Another way to solve it, without modifying your label array, is to build a scorer that knows which label is positive:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, pos_label="yes")
reg = grid_search.GridSearchCV(estimator=neigh, param_grid=parameters, scoring=f1_scorer)