Xgboost 'DataFrame' object has no attribute 'num_row' - python

I am working on a multi-class classification problem using xgboost.
The shape of my data is
print(train_ohe.shape, test_ohe.shape)
# (43266, 190) (18543, 190)
Custom F1 eval function and model training code
def f1_eval(y_pred, dtrain):
    """Custom evaluation metric reporting the weighted-F1 error.

    Args:
        y_pred: Predictions handed over by the booster; they are rounded
            before being scored.
        dtrain: Data container exposing ``get_label()`` for the true labels.

    Returns:
        A ``(name, value)`` tuple: ``'f1_err'`` and ``1 - weighted F1``.
    """
    labels = dtrain.get_label()
    rounded = np.round(y_pred)
    weighted_f1 = f1_score(labels, rounded, average='weighted')
    return 'f1_err', 1 - weighted_f1
def train_model(algo, train, test, predictors, useTrainCV=True,
                cv_folds=5, early_stopping_rounds=50):
    """Optionally size ``n_estimators`` with ``xgb.cv``, then fit and print F1.

    Args:
        algo: Estimator providing ``get_params``, ``set_params``, ``fit``,
            ``predict`` and ``predict_proba``.
        train: Training data, indexed by ``predictors`` and by the
            module-level ``target`` column name.
        test: Test data, indexed the same way as ``train``.
        predictors: Names of the feature columns.
        useTrainCV: When true, run cross-validation first and set
            ``n_estimators`` to the number of rows of the CV result.
        cv_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.
    """
    if useTrainCV:
        xgb_param = algo.get_params()
        xgb_train = xgb.DMatrix(train[predictors].values,
                                label=train[target].values)
        xgb_test = xgb.DMatrix(test[predictors].values)
        print(xgb_train.num_row())
        print(xgb_test.num_row())
        # This call is kept exactly as posted: it is the statement the
        # traceback in the question points at.
        cv_result = xgb.cv(xgb_param,
                           train,
                           num_boost_round=xgb_param['n_estimators'],
                           nfold=cv_folds,
                           metrics='f1_eval',
                           early_stopping_rounds=early_stopping_rounds)
        algo.set_params(n_estimators=cv_result.shape[0])

    # Fit on the training frame using the custom F1 metric.
    train_features = train[predictors]
    algo.fit(train_features, train[target], eval_metric=f1_eval)

    # Predictions on the training data.
    train_predictions = algo.predict(train_features)
    train_pred_prob = algo.predict_proba(train_features)[:,1]

    # Training-set report.
    print("Model performance")
    print("F1 Score Train {}".format(f1_score(train[target].values,train_predictions)))

    # Test-set predictions and report.
    test_predictions = algo.predict(test[predictors])
    print("F1 Score Test {}".format(f1_score(test[target].values,test_predictions)))
Here is my XgbClassifier code. Trying to find the number of estimators for a high learning rate.
# Name of the label column; every other column is used as a feature.
target = 'Complaint-Status'
# Compare with != instead of `not in`: against a string, `x not in target`
# is a substring test, so a column whose name happens to be a substring of
# 'Complaint-Status' (e.g. 'Status') would be dropped from the features too.
predictors = [x for x in train_ohe.columns if x != target]
# Baseline classifier: a fairly high learning rate with a large
# n_estimators ceiling, to be trimmed by cross-validation in train_model.
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='multi:softmax',
                     nthread=8,
                     scale_pos_weight=1,
                     seed=145)
train_model(xgb1, train_ohe, test_ohe, predictors)
I am getting the following AttributeError, saying 'DataFrame' object has no attribute 'num_row', at the xgb.cv line in the train_model function.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-116-5933227c171d> in <module>
18 seed=145)
19 print(xgb1.get_params())
---> 20 train_model(xgb1, train_ohe, test_ohe, predictors)
21 # xgb_param = xgb1.get_params()
22 # cv_folds=5
<ipython-input-114-a9df39c19abf> in train_model(algo, train, test, predictors, useTrainCV, cv_folds, early_stopping_rounds)
19 nfold=cv_folds,
20 metrics='f1_eval',
---> 21 early_stopping_rounds=early_stopping_rounds)
22 algo.set_params(n_estimators=cv_result.shape[0])
23
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
413 results = {}
414 cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
--> 415 stratified, folds, shuffle)
416
417 # setup callbacks
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in mknfold(dall, nfold, param, seed, evals, fpreproc, stratified, folds, shuffle)
246 # Do standard k-fold cross validation
247 if shuffle is True:
--> 248 idx = np.random.permutation(dall.num_row())
249 else:
250 idx = np.arange(dall.num_row())
/opt/virtual_env/py3/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
4374 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4375 return self[name]
-> 4376 return object.__getattribute__(self, name)
4377
4378 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'num_row'

Saw your post when I was searching around for the same error.
Your second parameter train of the code:
cv_result = xgb.cv(xgb_param,
train,
num_boost_round=xgb_param['n_estimators'],
nfold=cv_folds,
metrics='f1_eval',
early_stopping_rounds=early_stopping_rounds)
algo.set_params(n_estimators=cv_result.shape[0])
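should be an xgb.DMatrix rather than a pandas DataFrame (xgb.cv calls dtrain.num_row() internally, which a DataFrame does not have). You already build one as xgb_train, so you can pass that instead, or create one such as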
train = xgb.DMatrix(X_train, y_train)
hope this helps

Related

Train Pytorch Autoencoder with custom dataset

I am new to PyTorch. I was able to build an autoencoder model and train it using the MNIST dataset.
However, I need to train the model using a custom dataset.
I am getting the error 'ToTensor' object is not iterable when I try to train with the custom dataset.
Below is a code of my dataset class
class AutoEncoderDataSet(Dataset):
    """Image-folder dataset yielding ``(input, target)`` pairs for an autoencoder.

    Both elements of a pair come from the same image file, each passed
    through the transform supplied at construction time.
    """

    def __init__(self, in_dir, transform):
        """Collect the paths of all entries directly inside ``in_dir``.

        Args:
            in_dir: Directory whose entries are treated as image files.
            transform: Callable applied to every opened image.
        """
        self._transforms = transform
        self.img_paths = [os.path.join(in_dir, name)
                          for name in os.listdir(in_dir)]

    def __getitem__(self, index):
        """Return the transformed image at ``index`` twice, as ``(x, y)``."""
        # Use the transform stored on the instance rather than whatever a
        # module-level ``transform`` name happens to hold, so the dataset
        # honours its constructor argument. The file is opened once and the
        # transform is still applied separately for each element of the pair.
        img = Image.open(self.img_paths[index])
        x = self._transforms(img)
        y = self._transforms(img)
        return x, y

    def __len__(self):
        """Return the number of collected image paths."""
        return len(self.img_paths)
Here is how I am generating the dataloader
# Compose iterates over its argument, so the transforms must be given as a
# list even when there is only one of them.
transform = transforms.Compose([torchvision.transforms.ToTensor()])
train_dataset = AutoEncoderDataSet('./datasets/train/', transform)
batch_size = 512
# The closing parenthesis of this call was missing in the posted snippet.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
When I try to train using the data generated with the custom dataset class, I am getting the error mentioned above.
Below is code for training the model
epochs = 2048
for epoch in range(epochs):
    loss = 0
    for batch_features, _ in train_loader:
        # flatten the mini-batch to shape [N, 250*250] and move it to the
        # active device
        batch_features = batch_features.view(-1, 250*250).to(device)

        # clear gradients left over from the previous step; PyTorch
        # accumulates them across backward passes otherwise
        optimizer.zero_grad()

        # forward pass through the autoencoder
        outputs = model(batch_features)

        # reconstruction loss: the input itself is the target
        train_loss = criterion(outputs, batch_features)

        # back-propagate, then apply the parameter update
        train_loss.backward()
        optimizer.step()

        # accumulate this mini-batch's loss into the epoch total
        loss += train_loss.item()

    # average the accumulated loss over the number of mini-batches
    loss = loss / len(train_loader)

    # report the epoch's mean reconstruction loss
    print("epoch : {}/{}, recon loss = {:.8f}".format(epoch + 1, epochs, loss))
And this is the error I am getting
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11164/1462449221.py in <module>
3 for epoch in range(epochs):
4 loss = 0
----> 5 for batch_features, _ in test_loader:
6 # reshape mini-batch data to [N, 784] matrix
7 # load it to the active device
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torchvision\datasets\folder.py in __getitem__(self, index)
232 sample = self.loader(path)
233 if self.transform is not None:
--> 234 sample = self.transform(sample)
235 if self.target_transform is not None:
236 target = self.target_transform(target)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torchvision\transforms\transforms.py in __call__(self, img)
58
59 def __call__(self, img):
---> 60 for t in self.transforms:
61 img = t(img)
62 return img
TypeError: 'ToTensor' object is not iterable
Any suggestions would be greatly appreciated.

Xgboost predicting everything as Null

I am trying to train an XGBoost classification model, which I have done several times before. This time I am trying to do a hyperparameter grid search with cross-validation using xgboost.cv. Every time I run my code it raises a KeyError (full traceback below).
I also tried using just xgboost.train with some default parameters; when I use that model to predict on the same DMatrix, it predicts everything as null.
Here is my DMatrix, where I have missing values in 4 features, for which I specified missing = np.nan in DMatrix
xgbmat_train = xgb.DMatrix(X_train.values,label=
Y_train.values,missing=np.nan,weight = train_weights)
xgbmat_test = xgb.DMatrix(X_test.values,label=Y_test.values,missing=np.nan,weight=test_weights)
These are my initial parameters
initial_params = {'learning_rate':0.1,'n_estimators':1000,'objective':'binary:logistic','booster':'gbtree','reg_alpha':0,
'reg_lambda':1,'max_depth':5,'min_child_weight':1,'gamma':0,'subsample':0.8,'colsample_bytree':0.8,
'scale_pos_weight':1,'missing':np.nan,'seed':27,'eval_metric':'auc','n_jobs':32,'silent':True}
These are my gridsearch parameters
gridsearch_params = [(max_depth,min_child_weight)
for max_depth in range(4,10)
for min_child_weight in range(1,6)]
Below is the loop where I am doing a gridsearch
# Track the best mean test AUC seen so far and the parameters producing it.
max_auc = 0.0
best_params = ''
print(gc.collect())
for max_depth, min_child_weight in gridsearch_params:
    print(gc.collect())
    # The format string was split across two lines in the posted snippet,
    # which is a syntax error for a single-quoted literal; it is rejoined here.
    print("CV with max_depth = {}, min_child_weight = {}".format(max_depth,min_child_weight))
    # Overwrite the two parameters under search, keeping the rest as-is.
    initial_params['max_depth'] = max_depth
    initial_params['min_child_weight'] = min_child_weight
    cv_results = xgb.cv(initial_params,
                        xgbmat_train,
                        num_boost_round = 200,
                        seed = 42,
                        stratified = True,
                        shuffle=True,
                        nfold=3,
                        metrics={'auc'},
                        early_stopping_rounds = 50)
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    # NOTE(review): this appends the frame to itself, so the rows printed
    # below appear twice -- confirm whether accumulating results across
    # iterations was intended instead.
    cv_results = cv_results.append(cv_results)
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth,min_child_weight)
    print(gc.collect())
    print(cv_results)
    print(mean_auc)
    print(boost_rounds)
print("Best param: {}, {}, aucpr: {}".format(best_params[0],best_params[1],max_auc))
This is the error I am getting while running the above code
KeyError Traceback (most recent call
last)
<ipython-input-15-f546ef27594f> in <module>
15 nfold=3,
16 metrics={'auc'},
---> 17 early_stopping_rounds = 50)
18 mean_auc = cv_results['test-auc-mean'].max()
19 boost_rounds = cv_results['test-auc-mean'].argmax()
~/anaconda3/lib/python3.7/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
461 end_iteration=num_boost_round,
462 rank=0,
--> 463 evaluation_result_list=res))
464 except EarlyStopException as e:
465 for k in results:
~/anaconda3/lib/python3.7/site-packages/xgboost/callback.py in callback(env)
243 best_msg=state['best_msg'])
244 elif env.iteration - best_iteration >= stopping_rounds:
--> 245 best_msg = state['best_msg']
246 if verbose and env.rank == 0:
247 msg = "Stopping. Best iteration:\n{}\n\n"
KeyError: 'best_msg'
I tried filling the NAs with -9999.0 and specifying the same value in the missing argument of DMatrix, but it throws the same error. I am working against a hard deadline, so any help will be deeply appreciated.

XGBoost: AttributeError: 'DataFrame' object has no attribute 'feature_names'

I've trained an XGBoost Classifier for binary classification. While training the model on train data using CV and predicting on the test data, I face the error AttributeError: 'DataFrame' object has no attribute 'feature_names'.
My code is as follows:
folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000)
# Out-of-fold predictions for the training rows, and the running average of
# the per-fold predictions for the test rows.
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold {}".format(fold_+1))

    # Wrap this fold's train / validation rows for the native xgboost API.
    trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train.iloc[val_idx])

    # Both sets are watched during training, with early stopping enabled.
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    clf = xgb.train(params = best_params,
                    dtrain = trn_data,
                    num_boost_round = 2000,
                    evals = watchlist,
                    maximize = False,
                    early_stopping_rounds = 100,
                    verbose_eval=100)

    # The two predict calls are kept exactly as posted; the first one is the
    # statement the traceback in the question points at.
    oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
    # Each fold contributes an equal share of the averaged test prediction.
    predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit)/folds.n_splits
How to deal with it?
Here is the complete error trace:
Fold 1
[0] train-auc:0.919667 valid-auc:0.822968
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.
Will train until valid-auc hasn't improved in 100 rounds.
[100] train-auc:1 valid-auc:0.974659
[200] train-auc:1 valid-auc:0.97668
[300] train-auc:1 valid-auc:0.977696
[400] train-auc:1 valid-auc:0.977704
Stopping. Best iteration:
[376] train-auc:1 valid-auc:0.977862
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f3d9c285550>>
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/xgboost/core.py", line 368, in __del__
if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-d52b20cc0183> in <module>()
19 verbose_eval=100)
20
---> 21 oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
22
23 predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit)/folds.n_splits
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in predict(self, data, output_margin, ntree_limit, pred_leaf, pred_contribs, approx_contribs)
1042 option_mask |= 0x08
1043
-> 1044 self._validate_features(data)
1045
1046 length = c_bst_ulong()
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _validate_features(self, data)
1271 else:
1272 # Booster can't accept data with different feature names
-> 1273 if self.feature_names != data.feature_names:
1274 dat_missing = set(self.feature_names) - set(data.feature_names)
1275 my_missing = set(data.feature_names) - set(self.feature_names)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
3612 if name in self._info_axis:
3613 return self[name]
-> 3614 return object.__getattribute__(self, name)
3615
3616 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'feature_names'
The problem has been solved. The problem was that I hadn't converted X_train.iloc[val_idx] to an xgb.DMatrix. After converting X_train.iloc[val_idx] and X_test to xgb.DMatrix, the problem was gone!
Updated the following two lines:
oof[val_idx] = clf.predict(xgb.DMatrix(X_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)/folds.n_splits

Tensorflow DNNClassifier: error while training (numpy.ndarray has no attribute index)

I am trying to train a DNNClassifier in tensorflow
Here is my code
train_input_fn = tf.estimator.inputs.pandas_input_fn(
x=X_train,
y=y_train,
batch_size=1000,
shuffle = True
)
nn_classifier = tf.estimator.DNNClassifier(hidden_units=[1300,1300,1300], feature_columns=X_train, n_classes=200)
nn_classifier.train(input_fn = train_input_fn, steps=2000)
Here is how y_train looks
[450 450 450 ... 327 327 327]
type : numpy.ndarray
And here is how X_train looks
[[ 9.79285 11.659035 1.279528 ... 1.258979 1.063923 -2.45522 ]
[ 8.711333 13.92955 1.117603 ... 3.588921 1.231256 -3.180302]
[ 5.159803 14.059619 1.740708 ... 0.28172 -0.506701 -1.326669]
...
[ 2.418473 0.542642 -3.658447 ... 4.631474 4.544892 -4.595605]
[ 6.51176 4.321688 -1.483697 ... 3.13299 5.476103 -2.833903]
[ 6.894113 5.986267 -1.178247 ... 2.305603 7.217919 -2.152574]]
type : numpy.ndarray
Error :
in pandas_input_fn(x, y, batch_size, num_epochs, shuffle, queue_capacity, num_threads, target_column)
85 'Cannot use name %s for target column: DataFrame already has a '
86 'column with that name: %s' % (target_column, x.columns))
---> 87 if not np.array_equal(x.index, y.index):
88 raise ValueError('Index for x and y are mismatched.\nIndex for x: %s\n'
89 'Index for y: %s\n' % (x.index, y.index))
Update 1: Using numpy_input_fn
train_input_fn= tf.estimator.inputs.numpy_input_fn(
x=X_train,
y=y_train,
batch_size=1000,
shuffle = True
)
Error:
INFO:tensorflow:Calling model_fn.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-23-3b7c6b879e38> in <module>()
10 start_time = time.time()
11 nn_classifier = tf.estimator.DNNClassifier(hidden_units=[1300,1300,1300], feature_columns=X_train, n_classes=200)
---> 12 nn_classifier.train(input_fn = train_input_fn, steps=2000)
13 total_time = start_time - time.time()
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
353
354 saving_listeners = _check_listeners_type(saving_listeners)
--> 355 loss = self._train_model(input_fn, hooks, saving_listeners)
356 logging.info('Loss for final step: %s.', loss)
357 return self
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
822 worker_hooks.extend(input_hooks)
823 estimator_spec = self._call_model_fn(
--> 824 features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
825
826 if self._warm_start_settings:
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode, config)
803
804 logging.info('Calling model_fn.')
--> 805 model_fn_results = self._model_fn(features=features, **kwargs)
806 logging.info('Done calling model_fn.')
807
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
347 head=head,
348 hidden_units=hidden_units,
--> 349 feature_columns=tuple(feature_columns or []),
350 optimizer=optimizer,
351 activation_fn=activation_fn,
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Any clue what I am doing wrong?
The problem is with feature_columns argument on the estimator. Take a look at tf.estimator.DNNClassifier documentation:
feature_columns: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from _FeatureColumn.
There is also an example usage in the doc. Your X_train looks like a number of numeric columns, in this case you can simply create a list like this:
feature_columns = [tf.feature_column.numeric_column(i) for i in range(...)]
I came across this error today and thought it would be great if I provided a solution.
The problem is brought about by tf.estimator.inputs.pandas_input_fn. According to the TensorFlow docs, x must be a pandas.DataFrame instance and y must be a pandas.Series or a pandas.DataFrame instance. The type() function can help determine the data types of your X_train and y_train values. Changing X_train and y_train to the appropriate data types solves the problem.

XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'

I am training a XGBoostClassifier for my training set.
My training features are in the shape of (45001, 10338) which is a numpy array and my training labels are in the shape of (45001,) [I have 1161 unique labels so I have done a label encoding for the labels] which is also a numpy array.
From the documentation, it clearly says that I can create DMatrix from numpy array. So I am using the above mentioned training features and labels as numpy arrays straightaway. But I am getting the following error
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-30-3de36245534e> in <module>()
13 scale_pos_weight=1,
14 seed=27)
---> 15 modelfit(xgb1, train_x, train_y)
<ipython-input-27-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
6 xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
7 cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8 metrics='auc',early_stopping_rounds=early_stopping_rounds)
9 alg.set_params(n_estimators=cvresult.shape[0])
10
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
399 for fold in cvfolds:
400 fold.update(i, obj)
--> 401 res = aggcv([f.eval(i, feval) for f in cvfolds])
402
403 for key, mean, std in res:
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in <listcomp>(.0)
399 for fold in cvfolds:
400 fold.update(i, obj)
--> 401 res = aggcv([f.eval(i, feval) for f in cvfolds])
402
403 for key, mean, std in res:
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in eval(self, iteration, feval)
221 def eval(self, iteration, feval):
222 """"Evaluate the CVPack for one iteration."""
--> 223 return self.bst.eval_set(self.watchlist, iteration, feval)
224
225
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in eval_set(self, evals, iteration, feval)
865 _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
866 dmats, evnames, len(evals),
--> 867 ctypes.byref(msg)))
868 return msg.value
869 else:
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
125 """
126 if ret != 0:
--> 127 raise XGBoostError(_LIB.XGBGetLastError())
128
129
XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'
Please find my model Code below:
def modelfit(alg, train_data_features, train_labels,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally size ``n_estimators`` with ``xgb.cv``, then fit and report accuracy.

    Args:
        alg: Estimator providing ``get_xgb_params``, ``get_params``,
            ``set_params``, ``fit``, ``predict`` and ``predict_proba``.
        train_data_features: Training feature matrix.
        train_labels: Training labels.
        useTrainCV: When true, cross-validate first and set ``n_estimators``
            to the number of rows of the CV result.
        cv_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # Number of classes is fixed to 1161 in the native parameter dict.
        xgb_param['num_class'] = 1161
        xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
        # Kept exactly as posted: this is the call shown in the traceback.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc',early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Train the estimator on the full training data.
    alg.fit(train_data_features, train_labels, eval_metric='auc')

    # Score the training data itself.
    dtrain_predictions = alg.predict(train_data_features)
    dtrain_predprob = alg.predict_proba(train_data_features)[:,1]

    # Summary output.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_labels, dtrain_predictions))
Where am I going wrong in the above place ?
My classifier as follows :
xgb1 = xgb.XGBClassifier(
learning_rate =0.1,
n_estimators=50,
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective='multi:softmax',
nthread=4,
scale_pos_weight=1,
seed=27)
EDIT - 2
After changing evaluation metric,
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-9-30c62a886c2e> in <module>()
13 scale_pos_weight=1,
14 seed=27)
---> 15 modelfit(xgb1, train_x_trail, train_y_trail)
<ipython-input-8-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
6 xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
7 cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8 metrics='auc',early_stopping_rounds=early_stopping_rounds)
9 alg.set_params(n_estimators=cvresult.shape[0])
10
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
398 evaluation_result_list=None))
399 for fold in cvfolds:
--> 400 fold.update(i, obj)
401 res = aggcv([f.eval(i, feval) for f in cvfolds])
402
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in update(self, iteration, fobj)
217 def update(self, iteration, fobj):
218 """"Update the boosters for one iteration"""
--> 219 self.bst.update(self.dtrain, iteration, fobj)
220
221 def eval(self, iteration, feval):
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
804
805 if fobj is None:
--> 806 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
807 else:
808 pred = self.predict(dtrain)
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
125 """
126 if ret != 0:
--> 127 raise XGBoostError(_LIB.XGBGetLastError())
128
129
XGBoostError: b'[03:43:03] src/objective/multiclass_obj.cc:42: Check failed: (info.labels.size()) != (0) label set cannot be empty'
The original error that you get is because this metric was not designed for multi-class classification (see here).
You could use the scikit-learn wrapper of xgboost to overcome this issue. I modified your code with this wrapper to produce a similar function. I am not sure why you are doing a grid search, though, as you are not enumerating over parameters; instead, you are using the parameters you specified in xgb1. Here is the modified code:
import xgboost as xgb
import sklearn
import numpy as np
from sklearn.model_selection import GridSearchCV
def modelfit(alg, train_data_features, train_labels,useTrainCV=True, cv_folds=5):
    """Fit ``alg`` (optionally via a one-point grid search) and print train accuracy.

    Args:
        alg: XGBoost scikit-learn estimator providing ``get_xgb_params``.
        train_data_features: Training feature matrix.
        train_labels: Training labels.
        useTrainCV: When true, cross-validate a fresh ``XGBClassifier`` on
            ``alg``'s parameters with ``GridSearchCV`` and continue with the
            best estimator it returns.
        cv_folds: Number of cross-validation folds.
    """
    if useTrainCV:
        # GridSearchCV takes a list of candidates per parameter, so each of
        # the estimator's current values is wrapped in a one-element list.
        xgb_param = {key: [value] for key, value in alg.get_xgb_params().items()}
        boost = xgb.sklearn.XGBClassifier()
        cvresult = GridSearchCV(boost, xgb_param, cv=cv_folds)
        # Search on the data passed to this function rather than on the
        # module-level X / y, which only worked by coincidence in the demo.
        cvresult.fit(train_data_features, train_labels)
        alg = cvresult.best_estimator_

    # Fit the algorithm on the data.
    alg.fit(train_data_features, train_labels)

    # Predict on the training set.
    dtrain_predictions = alg.predict(train_data_features)

    # Print the model report.
    print("\nModel Report")
    print("Accuracy : %.4g" % sklearn.metrics.accuracy_score(train_labels, dtrain_predictions))
xgb1 = xgb.sklearn.XGBClassifier(
learning_rate =0.1,
n_estimators=50,
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective='multi:softmax',
nthread=4,
scale_pos_weight=1,
seed=27)
X=np.random.normal(size=(200,30))
y=np.random.randint(0,5,200)
modelfit(xgb1, X, y)
The output that I get is
Model Report
Accuracy : 1
Note that I used much smaller size for the data. With the size that you mentioned, the algorithm may be very slow.
The error occurs because you are trying to use the AUC evaluation metric for multiclass classification, but AUC is only applicable to two-class problems. In the xgboost implementation, "auc" expects the prediction size to be the same as the label size, while your multiclass prediction size would be 45001*1161. Use either the "mlogloss" or the "merror" multiclass metric.
P.S.: currently, xgboost would be rather slow with so many classes, as there is some inefficiency with predictions caching during training.

Categories

Resources