I've trained an XGBoost Classifier for binary classification. While training the model on train data using CV and predicting on the test data, I face the error AttributeError: 'DataFrame' object has no attribute 'feature_names'.
My code is as follows:
folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000)
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
print("Fold {}".format(fold_+1))
trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train.iloc[val_idx])
clf = xgb.train(params = best_params,
dtrain = trn_data,
num_boost_round = 2000,
evals = [(trn_data, 'train'), (val_data, 'valid')],
maximize = False,
early_stopping_rounds = 100,
verbose_eval=100)
oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit)/folds.n_splits
How to deal with it?
Here is the complete error trace:
Fold 1
[0] train-auc:0.919667 valid-auc:0.822968
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.
Will train until valid-auc hasn't improved in 100 rounds.
[100] train-auc:1 valid-auc:0.974659
[200] train-auc:1 valid-auc:0.97668
[300] train-auc:1 valid-auc:0.977696
[400] train-auc:1 valid-auc:0.977704
Stopping. Best iteration:
[376] train-auc:1 valid-auc:0.977862
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f3d9c285550>>
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/xgboost/core.py", line 368, in __del__
if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-d52b20cc0183> in <module>()
19 verbose_eval=100)
20
---> 21 oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
22
23 predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit)/folds.n_splits
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in predict(self, data, output_margin, ntree_limit, pred_leaf, pred_contribs, approx_contribs)
1042 option_mask |= 0x08
1043
-> 1044 self._validate_features(data)
1045
1046 length = c_bst_ulong()
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _validate_features(self, data)
1271 else:
1272 # Booster can't accept data with different feature names
-> 1273 if self.feature_names != data.feature_names:
1274 dat_missing = set(self.feature_names) - set(data.feature_names)
1275 my_missing = set(data.feature_names) - set(self.feature_names)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
3612 if name in self._info_axis:
3613 return self[name]
-> 3614 return object.__getattribute__(self, name)
3615
3616 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'feature_names'
The problem has been solved. The problem is, I didn't converted the X_train.iloc[val_idx] to xgb.DMatrix. After converting X_train.iloc[val_idx] and X_test to xgb.DMatrix the plroblem was gone!
Updated the following two lines:
oof[val_idx] = clf.predict(xgb.DMatrix(X_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)/folds.n_splits
Related
so I'm running into the error that my class ESC50Data does not have any length.
from torch.utils.data import Dataset, DataLoader
class ESC50Data(Dataset):
def __init__(self, base, df, in_col, out_col):
self.df = df
self.data = []
self.labels = []
self.c2i={}
self.i2c={}
self.categories = sorted(df[out_col].unique())
for i, category in enumerate(self.categories):
self.c2i[category]=i
self.i2c[i]=category
for ind in tqdm(range(len(df))):
row = df.iloc[ind]
file_path = os.path.join(base,row[in_col])
self.data.append(spec_to_image(get_melspectrogram(file_path))[np.newaxis,...])
self.labels.append(self.c2i[row['category']])
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx], self.labels[idx]
train_data = ESC50Data('audio', train, 'filename', 'category')
valid_data = ESC50Data('audio', valid, 'filename', 'category')
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=True)
This is the point at which I get my error. Using Jypter Notebooks as a sidenote.
TypeError Traceback (most recent call last)
Input In [47], in <cell line: 1>()
----> 1 train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
2 valid_loader = DataLoader(valid_data, batch_size=16, shuffle=True)
File ~/opt/anaconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py:353, in DataLoader.__init__(self, dataset, batch_size, shuffle, sampler, batch_sampler, num_workers, collate_fn, pin_memory, drop_last, timeout, worker_init_fn, multiprocessing_context, generator, prefetch_factor, persistent_workers, pin_memory_device)
351 else: # map-style
352 if shuffle:
--> 353 sampler = RandomSampler(dataset, generator=generator) # type: ignore[arg-type]
354 else:
355 sampler = SequentialSampler(dataset) # type: ignore[arg-type]
File ~/opt/anaconda3/lib/python3.9/site-packages/torch/utils/data/sampler.py:106, in RandomSampler.__init__(self, data_source, replacement, num_samples, generator)
102 if not isinstance(self.replacement, bool):
103 raise TypeError("replacement should be a boolean value, but got "
104 "replacement={}".format(self.replacement))
--> 106 if not isinstance(self.num_samples, int) or self.num_samples <= 0:
107 raise ValueError("num_samples should be a positive integer "
108 "value, but got num_samples={}".format(self.num_samples))
File ~/opt/anaconda3/lib/python3.9/site-packages/torch/utils/data/sampler.py:114, in RandomSampler.num_samples(self)
110 #property
111 def num_samples(self) -> int:
112 # dataset size might change at runtime
113 if self._num_samples is None:
--> 114 return len(self.data_source)
115 return self._num_samples
TypeError: object of type 'ESC50Data' has no len()
Any ideas as to what could be happening? I created the class ESC50Data and then I gave it the child class called Dataset that will inherent the properties of ESC50Data. I also loaded the data into pytorch with train and valid data.
Check the indentation of __len__(self) and __getitem__(self, idx) methods in your class ESC50Data code. Right now, it seems like these methods are defined inside the __init__ method, and not under the class itself.
See, e.g., this answer.
from sklearn.tree import DecisionTreeRegressor
#loading my train dataset into python
train = df[msk]
test = df[~msk]
#factors that will predict the fault
desired_factors = ['Burning rate','Air network','Number of
users','Circuit Queue','POWER']
#set my model to DecisionTree
model = DecisionTreeRegressor()
#set prediction data to factors that will predict, and set target to
fault
train_data = train[desired_factors]
test_data = test[desired_factors]
target = train.fault
#fitting model with prediction data and telling it my target
model.fit(train_data, target)
model.predict(test_data.head())
AttributeError Traceback (most recent call
last)
Cell In [68], line 15
13 train_data = train[desired_factors]
14 test_data = test[desired_factors]
---> 15 target = train.fault
18 #fitting model with prediction data and telling it my target
19 model.fit(train_data, target)
packages\pandas\core\generic.py:5575, in NDFrame.__getattr__(self,
name)
5568 if (
5569 name not in self._internal_names_set
5570 and name not in self._metadata
5571 and name not in self._accessors
5572 and
self._info_axis._can_hold_identifiers_and_holds_name(name)
5573 ): 5574 return self[name]
-> 5575 return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'fault'
As metioned in the title, I am getting this TyperError for the following code
I am using google collab and is set to GPU runtime type.
%%time
history = [evaluate(model, valid_dl)]
history
%%time
history += fit_OneCycle(epochs, max_lr, model, train_dl, valid_dl,
grad_clip=grad_clip,
weight_decay=1e-4,
opt_func=opt_func)
def plot_losses(history):
train_losses = [x.get('train_loss') for x in history]
val_losses = [x['val_loss'] for x in history]
plt.plot(train_losses, '-bx')
plt.plot(val_losses, '-rx')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['Training', 'Validation'])
plt.title('Loss vs. No. of epochs')
for the following line i am getting the error :-
plot_losses(history)
And i am getting the following error message:
AttributeError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py in index_of(y)
1626 try:
-> 1627 return y.index.values, y.values
1628 except AttributeError:
AttributeError: 'builtin_function_or_method' object has no attribute 'values'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
----------------------------------9 frames------------------------------------------
<__array_function__ internals> in atleast_1d(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/torch/_tensor.py in __array__(self, dtype)
676 return handle_torch_function(Tensor.__array__, (self,), self, dtype=dtype)
677 if dtype is None:
--> 678 return self.numpy()
679 else:
680 return self.numpy().astype(dtype, copy=False)
TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
The line of error is not shown here, however, you can replace the array (if it is an array) which you have with:
Thearray.cpu().numpy()
I have written the following pandas/sklearn algorithm to predict the movie genre based on words occuring in the movie. Find the dataset here
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
data=pd.read_csv('movies.csv',sep=',');
df1 = data[["marri", "huh", "hous", "mother", "nice", "home", "miss", "play", "happi", "write","wouldnt","power", "captain", "ship", "weve", "move", "ship", "system", "world", "command", "three"]]
predicted_genre=data[["Genre"]]
X = np.c_[np.ones((df1.shape[0], 1)), df1]
predicted_genre = predicted_genre[:, np.newaxis]
Here, the error occurs but I included the code below since it is relevant for the second error:
theta = np.zeros((X.shape[1], 1))
model = LogisticRegression()
model.fit(X, predicted_genre.values.ravel())
parameters = model.coef_
predicted_classes = model.predict(X)
accuracy = accuracy_score(predicted_genre.flatten(),predicted_classes)
This is the error I observe:
TypeError Traceback (most recent call last)
<ipython-input-48-1b8359fe335e> in <module>
12 #model.fit(df1, predicted_genre)
13 X = np.c_[np.ones((df1.shape[0], 1)), df1]
---> 14 predicted_genre = predicted_genre[:, np.newaxis]
15 theta = np.zeros((X.shape[1], 1))
16 model = LogisticRegression()
/srv/app/venv/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
/srv/app/venv/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
/srv/app/venv/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
2485 """Return the cached item, item represents a label indexer."""
2486 cache = self._item_cache
-> 2487 res = cache.get(item)
2488 if res is None:
2489 values = self._data.get(item)
TypeError: unhashable type: 'slice'
When not including this line: predicted_genre = predicted_genre[:, np.newaxis] I get this error:
--------------------------------------------------------------------------- AttributeError
Traceback (most recent call last) <ipython-input-49-0fede056d645>
in <module>
18 parameters = model.coef_
19 predicted_classes = model.predict(X) --->
20 accuracy = accuracy_score(predicted_genre.flatten(),predicted_classes)
21 parameters /srv/app/venv/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name) 4374 if self._info_axis._can_hold_identifiers_and_holds_name(name): 4375 return self[name] -> 4376 return object.__getattribute__(self, name) 4377 4378 def __setattr__(self, name, value): AttributeError: 'DataFrame' object has no attribute 'flatten'
Thank you for your help!
I am working on a multi-class classification problem using xgboost.
The shape of my data is
print(train_ohe.shape, test_ohe.shape)
# (43266, 190) (18543, 190)
Custom F1 eval function and model training code
def f1_eval(y_pred, dtrain):
y_true = dtrain.get_label()
err = 1-f1_score(y_true, np.round(y_pred),average='weighted')
return 'f1_err', err
def train_model(algo,train,test,predictors,useTrainCV=True,
cv_folds=5,early_stopping_rounds=50):
if useTrainCV:
xgb_param = algo.get_params()
xgb_train = xgb.DMatrix(train[predictors].values,label=train[target].values)
xgb_test = xgb.DMatrix(test[predictors].values)
print(xgb_train.num_row())
print(xgb_test.num_row())
cv_result = xgb.cv(xgb_param,
train,
num_boost_round=xgb_param['n_estimators'],
nfold=cv_folds,
metrics='f1_eval',
early_stopping_rounds=early_stopping_rounds)
algo.set_params(n_estimators=cv_result.shape[0])
# Fit algorithm on data
algo.fit(train[predictors],train[target],eval_metric=f1_eval)
# Predict train data
train_predictions = algo.predict(train[predictors])
train_pred_prob = algo.predict_proba(train[predictors])[:,1]
# Report model performance
print("Model performance")
print("F1 Score Train {}".format(f1_score(train[target].values,train_predictions)))
# Predict test data
test_predictions = algo.predict(test[predictors])
# Performance
print("F1 Score Test {}".format(f1_score(test[target].values,test_predictions)))
Here is my XgbClassifier code. Trying to find the number of estimators for a high learning rate.
target = 'Complaint-Status'
predictors = [x for x in train_ohe.columns if x not in target]
xgb1 = XGBClassifier(learning_rate=0.1,
n_estimators=1000,
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective='multi:softmax',
nthread=8,
scale_pos_weight=1,
seed=145)
train_model(xgb1, train_ohe, test_ohe, predictors)
I am getting following Attribute error saying 'DataFrame' object has no attribute 'num_row'in the xgb.cv line in train_model function.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-116-5933227c171d> in <module>
18 seed=145)
19 print(xgb1.get_params())
---> 20 train_model(xgb1, train_ohe, test_ohe, predictors)
21 # xgb_param = xgb1.get_params()
22 # cv_folds=5
<ipython-input-114-a9df39c19abf> in train_model(algo, train, test, predictors, useTrainCV, cv_folds, early_stopping_rounds)
19 nfold=cv_folds,
20 metrics='f1_eval',
---> 21 early_stopping_rounds=early_stopping_rounds)
22 algo.set_params(n_estimators=cv_result.shape[0])
23
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
413 results = {}
414 cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
--> 415 stratified, folds, shuffle)
416
417 # setup callbacks
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in mknfold(dall, nfold, param, seed, evals, fpreproc, stratified, folds, shuffle)
246 # Do standard k-fold cross validation
247 if shuffle is True:
--> 248 idx = np.random.permutation(dall.num_row())
249 else:
250 idx = np.arange(dall.num_row())
/opt/virtual_env/py3/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
4374 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4375 return self[name]
-> 4376 return object.__getattribute__(self, name)
4377
4378 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'num_row'
Saw your post when I was searching around for the same error.
Your second parameter train of the code:
cv_result = xgb.cv(xgb_param,
train,
num_boost_round=xgb_param['n_estimators'],
nfold=cv_folds,
metrics='f1_eval',
early_stopping_rounds=early_stopping_rounds)
algo.set_params(n_estimators=cv_result.shape[0])
should be a matrix such as
train = xgb.DMatrix(X_train, y_train)
hope this helps