I am working on a multi-class classification problem using xgboost.
The shape of my data is
print(train_ohe.shape, test_ohe.shape)
# (43266, 190) (18543, 190)
Custom F1 eval function and model training code
def f1_eval(y_pred, dtrain):
    """Custom evaluation metric: one minus the weighted F1 score.

    Args:
        y_pred: Predictions to score; they are rounded before being compared
            with the labels.
        dtrain: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        A ``(name, value)`` tuple: the metric name ``'f1_err'`` and the error.
    """
    labels = dtrain.get_label()
    weighted_f1 = f1_score(labels, np.round(y_pred), average='weighted')
    return 'f1_err', 1 - weighted_f1
def train_model(algo, train, test, predictors, useTrainCV=True,
                cv_folds=5, early_stopping_rounds=50):
    """Optionally pick ``n_estimators`` via ``xgb.cv``, then fit ``algo`` and print F1 scores.

    Args:
        algo: Estimator providing ``get_params``/``set_params``/``fit``/
            ``predict``/``predict_proba``.
        train: Training frame; indexed with ``predictors`` and the module-level
            ``target`` column name.
        test: Test frame, indexed the same way as ``train``.
        predictors: Column names used as features.
        useTrainCV: When true, run cross-validation first and set
            ``n_estimators`` to the number of rows of the CV result.
        cv_folds: Number of folds handed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience handed to ``xgb.cv``.
    """
    if useTrainCV:
        xgb_param = algo.get_params()
        xgb_train = xgb.DMatrix(train[predictors].values,
                                label=train[target].values)
        xgb_test = xgb.DMatrix(test[predictors].values)
        print(xgb_train.num_row())
        print(xgb_test.num_row())
        # NOTE: the second argument is the `train` DataFrame, not `xgb_train`;
        # this is the call that raises the AttributeError reported in the question.
        cv_result = xgb.cv(
            xgb_param,
            train,
            num_boost_round=xgb_param['n_estimators'],
            nfold=cv_folds,
            metrics='f1_eval',
            early_stopping_rounds=early_stopping_rounds,
        )
        algo.set_params(n_estimators=cv_result.shape[0])

    # Fit the estimator on the full training data.
    algo.fit(train[predictors], train[target], eval_metric=f1_eval)

    # Predictions on the training data.
    train_predictions = algo.predict(train[predictors])
    train_pred_prob = algo.predict_proba(train[predictors])[:, 1]

    # Training-set report.
    print("Model performance")
    print("F1 Score Train {}".format(
        f1_score(train[target].values, train_predictions)))

    # Predictions and report on the test data.
    test_predictions = algo.predict(test[predictors])
    print("F1 Score Test {}".format(
        f1_score(test[target].values, test_predictions)))
Here is my XgbClassifier code. Trying to find the number of estimators for a high learning rate.
target = 'Complaint-Status'
# Compare column names for equality. `x not in target` would be a substring
# test against the target *string*, so any column whose name happens to be a
# substring of 'Complaint-Status' would be dropped from the predictors too.
predictors = [x for x in train_ohe.columns if x != target]

# High learning rate with a generous n_estimators budget; train_model() is
# used to find how many estimators are actually needed.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=8,
    scale_pos_weight=1,
    seed=145,
)
train_model(xgb1, train_ohe, test_ohe, predictors)
I am getting the following AttributeError, saying 'DataFrame' object has no attribute 'num_row', at the xgb.cv call in the train_model function.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-116-5933227c171d> in <module>
18 seed=145)
19 print(xgb1.get_params())
---> 20 train_model(xgb1, train_ohe, test_ohe, predictors)
21 # xgb_param = xgb1.get_params()
22 # cv_folds=5
<ipython-input-114-a9df39c19abf> in train_model(algo, train, test, predictors, useTrainCV, cv_folds, early_stopping_rounds)
19 nfold=cv_folds,
20 metrics='f1_eval',
---> 21 early_stopping_rounds=early_stopping_rounds)
22 algo.set_params(n_estimators=cv_result.shape[0])
23
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
413 results = {}
414 cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
--> 415 stratified, folds, shuffle)
416
417 # setup callbacks
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in mknfold(dall, nfold, param, seed, evals, fpreproc, stratified, folds, shuffle)
246 # Do standard k-fold cross validation
247 if shuffle is True:
--> 248 idx = np.random.permutation(dall.num_row())
249 else:
250 idx = np.arange(dall.num_row())
/opt/virtual_env/py3/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
4374 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4375 return self[name]
-> 4376 return object.__getattribute__(self, name)
4377
4378 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'num_row'
Saw your post when I was searching around for the same error.
The problem is the second argument, train, in this part of your code:
cv_result = xgb.cv(xgb_param,
                   train,  # the DataFrame passed to train_model; xgb.cv ends up calling .num_row() on it
                   num_boost_round=xgb_param['n_estimators'],
                   nfold=cv_folds,
                   metrics='f1_eval',
                   early_stopping_rounds=early_stopping_rounds)
algo.set_params(n_estimators=cv_result.shape[0])
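It should be an xgb.DMatrix rather than a pandas DataFrame (your function already builds one as xgb_train, so you can pass that instead), for example: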
# Wrap features and labels in a DMatrix before handing them to xgb.cv.
train = xgb.DMatrix(X_train, label=y_train)
hope this helps
Related
I am new to PyTorch. I was able to build an autoencoder model and train it using the MNIST dataset.
However, I need to train the model using a custom dataset.
I am getting the error 'ToTensor' object is not iterable when I try to train with the custom dataset.
Below is a code of my dataset class
class AutoEncoderDataSet(Dataset):
    """Dataset serving every entry of a directory as an ``(input, target)`` pair.

    Both elements of a pair are produced from the same image file, each passed
    through the transform supplied at construction time.
    """

    def __init__(self, in_dir, transform):
        """Store ``transform`` and collect the path of every entry in ``in_dir``."""
        self._transforms = transform
        self.img_paths = [
            os.path.join(in_dir, name) for name in os.listdir(in_dir)
        ]

    def __getitem__(self, index):
        """Return the transformed image at ``index`` as an ``(x, y)`` pair."""
        path = self.img_paths[index]
        img, img_trans = Image.open(path), Image.open(path)
        # Use the transform given to the constructor rather than depending on
        # a module-level variable called `transform` happening to exist.
        x, y = self._transforms(img), self._transforms(img_trans)
        return x, y

    def __len__(self):
        """Return the number of collected image paths."""
        return len(self.img_paths)
Here is how I am generating the dataloader
# Compose iterates over its argument, so the transforms are passed as a list
# even when there is only one of them.
transform = transforms.Compose([torchvision.transforms.ToTensor()])
train_dataset = AutoEncoderDataSet('./datasets/train/', transform)
batch_size = 512
# The closing parenthesis of this call was missing.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
When I try to train using the data generated with the custom dataset class, I am getting the error mentioned above.
Below is code for training the model
epochs = 2048
for epoch in range(epochs):
    loss = 0
    for batch_features, _ in train_loader:
        # flatten the mini-batch to an [N, 250*250] matrix and move it to the
        # active device
        batch_features = batch_features.view(-1, 250*250).to(device)

        # clear gradients left over from the previous step; PyTorch
        # accumulates them across backward passes
        optimizer.zero_grad()

        # forward pass through the autoencoder
        outputs = model(batch_features)

        # reconstruction loss: the output is compared with its own input
        train_loss = criterion(outputs, batch_features)

        # backpropagate, then update the parameters from the gradients
        train_loss.backward()
        optimizer.step()

        # accumulate the mini-batch loss into the epoch total
        loss += train_loss.item()

    # average over the number of mini-batches to get the epoch loss
    loss = loss / len(train_loader)

    # report progress for this epoch
    print("epoch : {}/{}, recon loss = {:.8f}".format(epoch + 1, epochs, loss))
And this is the error I am getting
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11164/1462449221.py in <module>
3 for epoch in range(epochs):
4 loss = 0
----> 5 for batch_features, _ in test_loader:
6 # reshape mini-batch data to [N, 784] matrix
7 # load it to the active device
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torchvision\datasets\folder.py in __getitem__(self, index)
232 sample = self.loader(path)
233 if self.transform is not None:
--> 234 sample = self.transform(sample)
235 if self.target_transform is not None:
236 target = self.target_transform(target)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\torchvision\transforms\transforms.py in __call__(self, img)
58
59 def __call__(self, img):
---> 60 for t in self.transforms:
61 img = t(img)
62 return img
TypeError: 'ToTensor' object is not iterable
Any suggestions would be greatly appreciated.
I am trying to train an XGBoost classification model, which I have done several times before. This time I am trying to do a hyperparameter grid search with cross-validation using xgboost.cv. Every time I run my code it raises a KeyError (shown below).
I also tried using just xgboost.train with some default parameters; when I use the resulting model to predict on the same DMatrix, it predicts everything as null.
Here is my DMatrix, where I have missing values in 4 features, for which I specified missing = np.nan in DMatrix
# Training and test matrices; NaN marks missing values and each row carries
# its own weight.
xgbmat_train = xgb.DMatrix(
    X_train.values,
    label=Y_train.values,
    missing=np.nan,
    weight=train_weights,
)
xgbmat_test = xgb.DMatrix(
    X_test.values,
    label=Y_test.values,
    missing=np.nan,
    weight=test_weights,
)
These are my initial parameters
# Starting parameters; max_depth and min_child_weight are overwritten by the
# grid search.
initial_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'reg_alpha': 0,
    'reg_lambda': 1,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 1,
    'missing': np.nan,
    'seed': 27,
    'eval_metric': 'auc',
    'n_jobs': 32,
    'silent': True,
}
These are my gridsearch parameters
# Every (max_depth, min_child_weight) pair with depth 4..9 and weight 1..5.
gridsearch_params = [
    (depth, child_weight)
    for depth in range(4, 10)
    for child_weight in range(1, 6)
]
Below is the loop where I am doing a gridsearch
# Grid search over (max_depth, min_child_weight): run 3-fold CV for each pair
# and remember the pair with the highest mean test AUC.
max_auc = 0.0
best_params = None  # stays None if no combination beats the initial max_auc
print(gc.collect())
for max_depth, min_child_weight in gridsearch_params:
    print(gc.collect())
    print("CV with max_depth = {}, min_child_weight={}".format(
        max_depth, min_child_weight))
    initial_params['max_depth'] = max_depth
    initial_params['min_child_weight'] = min_child_weight
    cv_results = xgb.cv(initial_params,
                        xgbmat_train,
                        num_boost_round=200,
                        seed=42,
                        stratified=True,
                        shuffle=True,
                        nfold=3,
                        metrics={'auc'},
                        early_stopping_rounds=50)
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth, min_child_weight)
    print(gc.collect())
    print(cv_results)
    print(mean_auc)
    print(boost_rounds)

if best_params is None:
    print("No parameter combination improved on auc: {}".format(max_auc))
else:
    # The tracked metric is AUC, so label it as such.
    print("Best param: {}, {}, auc: {}".format(
        best_params[0], best_params[1], max_auc))
This is the error I am getting while running the above code
KeyError Traceback (most recent call
last)
<ipython-input-15-f546ef27594f> in <module>
15 nfold=3,
16 metrics={'auc'},
---> 17 early_stopping_rounds = 50)
18 mean_auc = cv_results['test-auc-mean'].max()
19 boost_rounds = cv_results['test-auc-mean'].argmax()
~/anaconda3/lib/python3.7/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
461 end_iteration=num_boost_round,
462 rank=0,
--> 463 evaluation_result_list=res))
464 except EarlyStopException as e:
465 for k in results:
~/anaconda3/lib/python3.7/site-packages/xgboost/callback.py in callback(env)
243 best_msg=state['best_msg'])
244 elif env.iteration - best_iteration >= stopping_rounds:
--> 245 best_msg = state['best_msg']
246 if verbose and env.rank == 0:
247 msg = "Stopping. Best iteration:\n{}\n\n"
KeyError: 'best_msg'
I tried filling the NAs with -9999.0 and specifying the same value in the missing argument of DMatrix, but it throws the same error. I am working against a hard deadline, so any help will be deeply appreciated.
I've trained an XGBoost Classifier for binary classification. While training the model on train data using CV and predicting on the test data, I face the error AttributeError: 'DataFrame' object has no attribute 'feature_names'.
My code is as follows:
folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000)
oof = np.zeros(len(X_train))              # out-of-fold predictions
predictions = np.zeros(len(X_test))       # test predictions averaged over folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold {}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train.iloc[val_idx])

    clf = xgb.train(
        params=best_params,
        dtrain=trn_data,
        num_boost_round=2000,
        evals=[(trn_data, 'train'), (val_data, 'valid')],
        maximize=False,
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    # NOTE: predict() receives plain DataFrames here; this is where the
    # reported AttributeError ('feature_names') is raised.
    oof[val_idx] = clf.predict(X_train.iloc[val_idx],
                               ntree_limit=clf.best_ntree_limit)
    predictions += clf.predict(
        X_test, ntree_limit=clf.best_ntree_limit) / folds.n_splits
How to deal with it?
Here is the complete error trace:
Fold 1
[0] train-auc:0.919667 valid-auc:0.822968
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.
Will train until valid-auc hasn't improved in 100 rounds.
[100] train-auc:1 valid-auc:0.974659
[200] train-auc:1 valid-auc:0.97668
[300] train-auc:1 valid-auc:0.977696
[400] train-auc:1 valid-auc:0.977704
Stopping. Best iteration:
[376] train-auc:1 valid-auc:0.977862
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f3d9c285550>>
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/xgboost/core.py", line 368, in __del__
if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-d52b20cc0183> in <module>()
19 verbose_eval=100)
20
---> 21 oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
22
23 predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit)/folds.n_splits
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in predict(self, data, output_margin, ntree_limit, pred_leaf, pred_contribs, approx_contribs)
1042 option_mask |= 0x08
1043
-> 1044 self._validate_features(data)
1045
1046 length = c_bst_ulong()
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _validate_features(self, data)
1271 else:
1272 # Booster can't accept data with different feature names
-> 1273 if self.feature_names != data.feature_names:
1274 dat_missing = set(self.feature_names) - set(data.feature_names)
1275 my_missing = set(data.feature_names) - set(self.feature_names)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
3612 if name in self._info_axis:
3613 return self[name]
-> 3614 return object.__getattribute__(self, name)
3615
3616 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'feature_names'
The problem has been solved. The cause was that I had not converted X_train.iloc[val_idx] to an xgb.DMatrix. After converting X_train.iloc[val_idx] and X_test to xgb.DMatrix, the problem was gone!
Updated the following two lines:
# The booster's predict() is given DMatrix objects, so both the validation
# slice and the test frame are wrapped before predicting.
oof[val_idx] = clf.predict(xgb.DMatrix(X_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)/folds.n_splits
I am trying to train a DNNClassifier in tensorflow
Here is my code
# Input function for training, built from X_train / y_train.
# NOTE(review): both are numpy arrays (see below), while the traceback shows
# pandas_input_fn comparing x.index with y.index.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=1000,
    shuffle = True
)
# NOTE(review): feature_columns is given the raw X_train array here rather
# than a collection of feature-column objects.
nn_classifier = tf.estimator.DNNClassifier(hidden_units=[1300,1300,1300], feature_columns=X_train, n_classes=200)
nn_classifier.train(input_fn = train_input_fn, steps=2000)
Here is how y_train looks
[450 450 450 ... 327 327 327]
type : numpy.ndarray
And here is how X_train looks
[[ 9.79285 11.659035 1.279528 ... 1.258979 1.063923 -2.45522 ]
[ 8.711333 13.92955 1.117603 ... 3.588921 1.231256 -3.180302]
[ 5.159803 14.059619 1.740708 ... 0.28172 -0.506701 -1.326669]
...
[ 2.418473 0.542642 -3.658447 ... 4.631474 4.544892 -4.595605]
[ 6.51176 4.321688 -1.483697 ... 3.13299 5.476103 -2.833903]
[ 6.894113 5.986267 -1.178247 ... 2.305603 7.217919 -2.152574]]
type : numpy.ndarray
Error :
in pandas_input_fn(x, y, batch_size, num_epochs, shuffle, queue_capacity, num_threads, target_column)
85 'Cannot use name %s for target column: DataFrame already has a '
86 'column with that name: %s' % (target_column, x.columns))
---> 87 if not np.array_equal(x.index, y.index):
88 raise ValueError('Index for x and y are mismatched.\nIndex for x: %s\n'
89 'Index for y: %s\n' % (x.index, y.index))
Update 1: Using numpy_input_fn
# Second attempt: the same arguments, fed through numpy_input_fn instead of
# pandas_input_fn.
train_input_fn= tf.estimator.inputs.numpy_input_fn(
    x=X_train,
    y=y_train,
    batch_size=1000,
    shuffle = True
)
Error:
INFO:tensorflow:Calling model_fn.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-23-3b7c6b879e38> in <module>()
10 start_time = time.time()
11 nn_classifier = tf.estimator.DNNClassifier(hidden_units=[1300,1300,1300], feature_columns=X_train, n_classes=200)
---> 12 nn_classifier.train(input_fn = train_input_fn, steps=2000)
13 total_time = start_time - time.time()
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
353
354 saving_listeners = _check_listeners_type(saving_listeners)
--> 355 loss = self._train_model(input_fn, hooks, saving_listeners)
356 logging.info('Loss for final step: %s.', loss)
357 return self
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
822 worker_hooks.extend(input_hooks)
823 estimator_spec = self._call_model_fn(
--> 824 features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
825
826 if self._warm_start_settings:
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode, config)
803
804 logging.info('Calling model_fn.')
--> 805 model_fn_results = self._model_fn(features=features, **kwargs)
806 logging.info('Done calling model_fn.')
807
c:\users\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
347 head=head,
348 hidden_units=hidden_units,
--> 349 feature_columns=tuple(feature_columns or []),
350 optimizer=optimizer,
351 activation_fn=activation_fn,
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Any clue what I am doing wrong?
The problem is with feature_columns argument on the estimator. Take a look at tf.estimator.DNNClassifier documentation:
feature_columns: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from _FeatureColumn.
There is also an example usage in the doc. Your X_train looks like a number of numeric columns, in this case you can simply create a list like this:
# numeric_column needs a *string* key. For a plain 2-D numpy array the usual
# pattern is a single column whose shape covers every feature of a row; the
# same key is then used when feeding the data, e.g.
#   tf.estimator.inputs.numpy_input_fn(x={"x": X_train}, y=y_train, ...)
feature_columns = [
    tf.feature_column.numeric_column("x", shape=[X_train.shape[1]])
]
I came across this error today and thought it would be great if I provided a solution.
The first error is brought about by tf.estimator.inputs.pandas_input_fn. According to the TensorFlow docs, x must be a pandas.DataFrame instance and y must be a pandas.Series or a pandas.DataFrame instance, but here both are numpy arrays. The type() function can help determine the data types of your X_train and y_train values. Changing X_train and y_train to the appropriate data types (or using numpy_input_fn for numpy arrays) solves that problem.
I am training a XGBoostClassifier for my training set.
My training features are in the shape of (45001, 10338) which is a numpy array and my training labels are in the shape of (45001,) [I have 1161 unique labels so I have done a label encoding for the labels] which is also a numpy array.
From the documentation, it clearly says that I can create DMatrix from numpy array. So I am using the above mentioned training features and labels as numpy arrays straightaway. But I am getting the following error
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-30-3de36245534e> in <module>()
13 scale_pos_weight=1,
14 seed=27)
---> 15 modelfit(xgb1, train_x, train_y)
<ipython-input-27-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
6 xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
7 cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8 metrics='auc',early_stopping_rounds=early_stopping_rounds)
9 alg.set_params(n_estimators=cvresult.shape[0])
10
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
399 for fold in cvfolds:
400 fold.update(i, obj)
--> 401 res = aggcv([f.eval(i, feval) for f in cvfolds])
402
403 for key, mean, std in res:
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in <listcomp>(.0)
399 for fold in cvfolds:
400 fold.update(i, obj)
--> 401 res = aggcv([f.eval(i, feval) for f in cvfolds])
402
403 for key, mean, std in res:
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in eval(self, iteration, feval)
221 def eval(self, iteration, feval):
222 """"Evaluate the CVPack for one iteration."""
--> 223 return self.bst.eval_set(self.watchlist, iteration, feval)
224
225
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in eval_set(self, evals, iteration, feval)
865 _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
866 dmats, evnames, len(evals),
--> 867 ctypes.byref(msg)))
868 return msg.value
869 else:
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
125 """
126 if ret != 0:
--> 127 raise XGBoostError(_LIB.XGBGetLastError())
128
129
XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'
Please find my model Code below:
def modelfit(alg, train_data_features, train_labels, useTrainCV=True,
             cv_folds=5, early_stopping_rounds=50):
    """Optionally tune ``n_estimators`` with ``xgb.cv``, then fit ``alg`` and print accuracy.

    Args:
        alg: Estimator providing ``get_xgb_params``/``get_params``/
            ``set_params``/``fit``/``predict``/``predict_proba``.
        train_data_features: Training features.
        train_labels: Training labels.
        useTrainCV: When true, run cross-validation first and set
            ``n_estimators`` to the number of rows of the CV result.
        cv_folds: Number of folds handed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience handed to ``xgb.cv``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = 1161
        xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
        # NOTE: 'auc' is requested here for a multi-class objective; this is
        # the xgb.cv call the reported XGBoostError comes from.
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit on the full training data.
    alg.fit(train_data_features, train_labels, eval_metric='auc')

    # Predictions on the training set.
    dtrain_predictions = alg.predict(train_data_features)
    dtrain_predprob = alg.predict_proba(train_data_features)[:, 1]

    # Report.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_labels, dtrain_predictions))
Where am I going wrong in the above place ?
My classifier as follows :
# Multi-class classifier configuration handed to modelfit().
xgb1 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
EDIT - 2
After changing evaluation metric,
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-9-30c62a886c2e> in <module>()
13 scale_pos_weight=1,
14 seed=27)
---> 15 modelfit(xgb1, train_x_trail, train_y_trail)
<ipython-input-8-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
6 xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
7 cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8 metrics='auc',early_stopping_rounds=early_stopping_rounds)
9 alg.set_params(n_estimators=cvresult.shape[0])
10
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
398 evaluation_result_list=None))
399 for fold in cvfolds:
--> 400 fold.update(i, obj)
401 res = aggcv([f.eval(i, feval) for f in cvfolds])
402
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in update(self, iteration, fobj)
217 def update(self, iteration, fobj):
218 """"Update the boosters for one iteration"""
--> 219 self.bst.update(self.dtrain, iteration, fobj)
220
221 def eval(self, iteration, feval):
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
804
805 if fobj is None:
--> 806 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
807 else:
808 pred = self.predict(dtrain)
/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
125 """
126 if ret != 0:
--> 127 raise XGBoostError(_LIB.XGBGetLastError())
128
129
XGBoostError: b'[03:43:03] src/objective/multiclass_obj.cc:42: Check failed: (info.labels.size()) != (0) label set cannot be empty'
The original error that you get is because this metric was not designed for multi-class classification (see here).
You could use the scikit-learn wrapper of xgboost to overcome this issue. I modified your code to use this wrapper and produce a similar function. I am not sure why you are doing a grid search, though, as you are not enumerating over parameters; instead, you are using the parameters you specified in xgb1. Here is the modified code:
import xgboost as xgb
import sklearn
import numpy as np
from sklearn.model_selection import GridSearchCV
def modelfit(alg, train_data_features, train_labels,useTrainCV=True, cv_folds=5):
    """Fit a classifier on the given data and print its training accuracy.

    Args:
        alg: Estimator providing ``get_xgb_params``/``fit``/``predict``.
        train_data_features: Training features.
        train_labels: Training labels.
        useTrainCV: When true, run a ``GridSearchCV`` over ``alg``'s current
            parameters (one candidate per parameter) and continue with its
            best estimator.
        cv_folds: Number of cross-validation folds for the grid search.
    """
    if useTrainCV:
        params = alg.get_xgb_params()
        # GridSearchCV expects a list of candidates per parameter, so each
        # current value is wrapped in a one-element list.
        xgb_param = {key: [value] for key, value in params.items()}
        boost = xgb.sklearn.XGBClassifier()
        cvresult = GridSearchCV(boost, xgb_param, cv=cv_folds)
        # Search on the data passed to this function, not on module-level X / y.
        cvresult.fit(train_data_features, train_labels)
        alg = cvresult.best_estimator_

    # Fit the algorithm on the data.
    alg.fit(train_data_features, train_labels)

    # Predict the training set.
    dtrain_predictions = alg.predict(train_data_features)

    # Print the model report.
    print("\nModel Report")
    print("Accuracy : %.4g" % sklearn.metrics.accuracy_score(train_labels, dtrain_predictions))
xgb1 = xgb.sklearn.XGBClassifier(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# Small synthetic problem: 200 samples, 30 features, integer labels 0..4.
X = np.random.normal(size=(200, 30))
y = np.random.randint(0, 5, 200)
modelfit(xgb1, X, y)
The output that I get is
Model Report
Accuracy : 1
Note that I used much smaller size for the data. With the size that you mentioned, the algorithm may be very slow.
The error occurs because you are trying to use the AUC evaluation metric for multiclass classification, but AUC is only applicable to two-class problems. In the xgboost implementation, "auc" expects the prediction size to be the same as the label size, while your multiclass prediction size would be 45001*1161. Use either the "mlogloss" or the "merror" multiclass metric.
P.S.: currently, xgboost would be rather slow with so many classes, as there is some inefficiency with predictions caching during training.