'DataFrame' object has no attribute 'fault' - python

from sklearn.tree import DecisionTreeRegressor
#loading my train dataset into python
train = df[msk]
test = df[~msk]
#factors that will predict the fault
desired_factors = ['Burning rate','Air network','Number of users','Circuit Queue','POWER']
#set my model to DecisionTree
model = DecisionTreeRegressor()
#set prediction data to factors that will predict, and set target to fault
train_data = train[desired_factors]
test_data = test[desired_factors]
target = train.fault
#fitting model with prediction data and telling it my target
model.fit(train_data, target)
model.predict(test_data.head())
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [68], line 15
     13 train_data = train[desired_factors]
     14 test_data = test[desired_factors]
---> 15 target = train.fault
     18 #fitting model with prediction data and telling it my target
     19 model.fit(train_data, target)

packages\pandas\core\generic.py:5575, in NDFrame.__getattr__(self, name)
   5568 if (
   5569     name not in self._internal_names_set
   5570     and name not in self._metadata
   5571     and name not in self._accessors
   5572     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5573 ):
   5574     return self[name]
-> 5575 return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'fault'
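The traceback simply means the DataFrame has no column named fault: train.fault is attribute access, which only works when a column with exactly that name exists. A minimal diagnostic sketch (the whitespace cleanup is an assumption about a common CSV header issue, not something confirmed by the question):

print(df.columns.tolist())           # inspect the actual column names
df.columns = df.columns.str.strip()  # in case a header was read as e.g. 'fault '
target = train['fault']              # bracket access raises a clearer KeyError if the column is truly absent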

Related

Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 15])), multi-class classification using hugging face Roberta

I am using Hugging Face Roberta to classify a multi-class dataset, but I got the error “Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 15]))”.
I am not sure what I should do now. Could anyone provide some suggestions?
Below is my code, and you can also find the error message at the bottom:
from datasets import load_dataset
from transformers import RobertaTokenizerFast, Trainer, DataCollatorWithPadding
dataset = load_dataset('csv', data_files=data_path,split = 'train')
train_testvalid = dataset.train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.2)
checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint)
def tokenization(example):
    return tokenizer(example['text'], truncation=True, max_length=256, padding=True)
train_data = test_valid['train']
test_data= test_valid['test']
train_data = train_data.map(tokenization, batched = True)
test_data = test_data.map(tokenization, batched = True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_data = train_data.remove_columns(["Unnamed: 0", "text"])
test_data = test_data.remove_columns(["Unnamed: 0", "text"])
train_data.set_format("torch")
test_data.set_format("torch")
train_data.column_names
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")
from transformers import RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=15)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-3435b262f1ae> in <module>
----> 1 trainer.train()

8 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in binary_cross_entropy_with_logits(input, target, weight, size_average, reduce, reduction, pos_weight)
   3146
   3147     if not (target.size() == input.size()):
-> 3148         raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
   3149
   3150     return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 15]))
I got the same error. For me the issue was that my "label" column was a float type; converting it to int solved the problem for me:
df.label = df.label.astype(int)
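The dtype matters because RobertaForSequenceClassification infers its problem_type from the labels: integer labels select single-label classification with CrossEntropyLoss, while float labels select multi-label classification with BCEWithLogitsLoss, which expects targets shaped like the [8, 15] logits. A sketch of the same cast applied inside the datasets pipeline (assuming the label column is named "label"):

train_data = train_data.map(lambda x: {"label": int(x["label"])})  # cast float labels to int
test_data = test_data.map(lambda x: {"label": int(x["label"])})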

Python Scikit, Loaded Pickle Vectorizer Data: "NotFittedError: Vocabulary not fitted or provided"

I'm building a sentiment analyzer. I built a model that successfully predicts the sentiment of texts, but I can't figure out how to save my entire model with pickle. I can save clf, but I can't save the vectorizer correctly.
In the function trainModel, I return featuresTrain and save it after training my model. After loading both files I run predict(), which gives the error mentioned in the title after it runs vectorizer.transform(). I thought featuresTrain contained the fitted vocabulary, so I'm confused. Any insights?
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
def trainModel(df, category, quantity):
    df = pd.read_csv('/Users/NOT/Desktop/VSA/datasets/cleanedData.csv')
    train = df.sample(frac=0.8)
    test = pd.concat([df, train]).drop_duplicates(keep=False)
    featuresTrain = vectorizer.fit_transform(train[category].values.astype('U'))
    featuresTest = vectorizer.transform(test[category].values.astype('U'))
    trainLabels = [2 if sentiment==4 else 1 if sentiment==2 else 0 for sentiment in train[quantity]]
    testLabels = [2 if sentiment==4 else 1 if sentiment==2 else 0 for sentiment in test[quantity]]
    clf = sklearn.linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=100000)
    clf.fit(featuresTrain, trainLabels)
    return clf, featuresTrain
model, vector = trainModel(data, 'tweet', 'sentiment')
def predict(modelName, text):
    vec = vectorizer.transform([text])
    prediction = list(modelName.predict(vec))[0]
    probs = modelName.predict_proba(vec)
    if probs[0][0] <= .6 and probs[0][2] <= .6:
        prediction = 1
    return prediction
filenameP = '/Users/NOT/Desktop/VSA/SMmodel/sentimentAnalysisModel_pkl'
filenameVP = '/Users/NOT/Desktop/VSA/SMmodel/sentimentAnalysisVectorizer_pkl'
pickle.dump(model, open(filenameP, 'wb'))
pickle.dump(vector, open(filenameVP, "wb"))
LM = pickle.load(open(filenameP, 'rb'))
LVM = pickle.load(open(filenameVP, 'rb'))
sentiment = predict(LM, transcript)
Error Traceback:
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
<ipython-input-33-f35d352629a9> in <module>
1 LM = pickle.load(open(filenameP, 'rb'))
2 LVM = pickle.load(open(filenameVP, 'rb'))
----> 3 sentiment = predict(LM, transcript)
<ipython-input-30-f97b97e7bbd5> in predict(modelName, text)
8 def predict(modelName, text):
9
---> 10 vec = vectorizer.transform([text])
11 prediction = list(modelName.predict(vec))[0]
12 probs = modelName.predict_proba(vec)
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in transform(self, raw_documents)
1250 "Iterable over raw text documents expected, "
1251 "string object received.")
-> 1252 self._check_vocabulary()
1253
1254 # use the same matrix-building strategy as fit_transform
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _check_vocabulary(self)
470 self._validate_vocabulary()
471 if not self.fixed_vocabulary_:
--> 472 raise NotFittedError("Vocabulary not fitted or provided")
473
474 if len(self.vocabulary_) == 0:
NotFittedError: Vocabulary not fitted or provided
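featuresTrain is the transformed document-term matrix, not the fitted vectorizer, so pickling it does not preserve the vocabulary; predict() then calls the module-level vectorizer, which is unfitted in a fresh session. A minimal sketch of persisting and reusing the fitted vectorizer object itself (same file names as above; the flow is illustrative):

# pickle the fitted CountVectorizer object, not the matrix it produced
pickle.dump(vectorizer, open(filenameVP, 'wb'))

# in the new session, load it and transform with the loaded object
LVM = pickle.load(open(filenameVP, 'rb'))
vec = LVM.transform(['some text to score'])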

Subset of K-Fold Dataset with PyTorch

I have spent all day reading everything I can find about implementing K-Fold cross-validation in PyTorch, but I can't get it to work. I have a custom subset written for my dataset, but that does not work either... The code having issues is this:
for epoch in range(training_configuration.epochs_count):
    mean = [0.5768, 0.4622, 0.3460]
    std = [0.2383, 0.2464, 0.2465]
    if data_augmentation:
        train_transforms = data_augmentation_transforms(mean, std)
        print('Data Augmentation: On')
    else:
        train_transforms = image_common_transforms(mean, std)
        print('Data Augmentation: Off')
    test_transforms = image_common_transforms(mean, std)
    dataset = KenyanFood13Dataset(
        root_dir=train_config.root_dir,
        img_path=train_config.img_path,
        csv_file=train_config.csv_path,
        transform=train_transforms)
    kf = KFold(n_splits=5, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(dataset)):
        train = torch.utils.data.Subset(dataset, train_index)
        test = torch.utils.data.Subset(dataset, test_index)
        train_loader = DataLoader(train, batch_size=train_config.batch_size, shuffle=True, num_workers=train_config.num_workers)
        test_loader = DataLoader(test, batch_size=train_config.batch_size, shuffle=False, num_workers=train_config.num_workers)
        print('Data: New fold successfully')
        init_val_loss, init_val_accuracy = validate(training_configuration, model, test_loader)
        print(f'Initial Validation Loss: {init_val_loss:.6f}, Initial Validation Accuracy: {init_val_accuracy*100:.3f}%\n')
        # --------- THIS IS WHERE THE ERROR IS -------------
        train_loss, train_acc = train(
            train_config=training_configuration,
            model=model,
            optimizer=optimizer,
            train_loader=train_loader,
            epoch_idx=epoch)
Traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-c606f47fba53> in <module>
34 tb_writer,
35 scheduler=scheduler,
---> 36 data_augmentation=True)
<ipython-input-18-05bc5c268921> in main(model, optimizer, tb_writer, scheduler, training_configuration, data_augmentation)
65 optimizer=optimizer,
66 train_loader=train_loader,
---> 67 epoch_idx=epoch)
68
69 epoch_train_loss = np.append(epoch_train_loss, [train_loss])
TypeError: 'Subset' object is not callable
Any thoughts on how I can implement this?
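The TypeError is a name-shadowing problem rather than a K-Fold problem: train = torch.utils.data.Subset(dataset, train_index) rebinds the name train, so the later call train(...) invokes the Subset object instead of the training function. A minimal sketch of the fix, renaming the fold subsets (the new names are illustrative):

train_subset = torch.utils.data.Subset(dataset, train_index)
test_subset = torch.utils.data.Subset(dataset, test_index)
train_loader = DataLoader(train_subset, batch_size=train_config.batch_size, shuffle=True, num_workers=train_config.num_workers)
test_loader = DataLoader(test_subset, batch_size=train_config.batch_size, shuffle=False, num_workers=train_config.num_workers)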

XGBoost: AttributeError: 'DataFrame' object has no attribute 'feature_names'

I've trained an XGBoost Classifier for binary classification. While training the model on train data using CV and predicting on the test data, I face the error AttributeError: 'DataFrame' object has no attribute 'feature_names'.
My code is as follows:
folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000)
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold {}".format(fold_ + 1))
    trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train.iloc[val_idx])
    clf = xgb.train(params=best_params,
                    dtrain=trn_data,
                    num_boost_round=2000,
                    evals=[(trn_data, 'train'), (val_data, 'valid')],
                    maximize=False,
                    early_stopping_rounds=100,
                    verbose_eval=100)
    oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
    predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit) / folds.n_splits
How do I deal with it?
Here is the complete error trace:
Fold 1
[0] train-auc:0.919667 valid-auc:0.822968
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.
Will train until valid-auc hasn't improved in 100 rounds.
[100] train-auc:1 valid-auc:0.974659
[200] train-auc:1 valid-auc:0.97668
[300] train-auc:1 valid-auc:0.977696
[400] train-auc:1 valid-auc:0.977704
Stopping. Best iteration:
[376] train-auc:1 valid-auc:0.977862
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f3d9c285550>>
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/xgboost/core.py", line 368, in __del__
if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-d52b20cc0183> in <module>()
19 verbose_eval=100)
20
---> 21 oof[val_idx] = clf.predict(X_train.iloc[val_idx], ntree_limit=clf.best_ntree_limit)
22
23 predictions += clf.predict(X_test, ntree_limit=clf.best_ntree_limit)/folds.n_splits
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in predict(self, data, output_margin, ntree_limit, pred_leaf, pred_contribs, approx_contribs)
1042 option_mask |= 0x08
1043
-> 1044 self._validate_features(data)
1045
1046 length = c_bst_ulong()
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _validate_features(self, data)
1271 else:
1272 # Booster can't accept data with different feature names
-> 1273 if self.feature_names != data.feature_names:
1274 dat_missing = set(self.feature_names) - set(data.feature_names)
1275 my_missing = set(data.feature_names) - set(self.feature_names)
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
3612 if name in self._info_axis:
3613 return self[name]
-> 3614 return object.__getattribute__(self, name)
3615
3616 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'feature_names'
The problem has been solved: I didn't convert X_train.iloc[val_idx] to an xgb.DMatrix. After converting X_train.iloc[val_idx] and X_test to xgb.DMatrix, the problem was gone!
Updated the following two lines:
oof[val_idx] = clf.predict(xgb.DMatrix(X_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit)/folds.n_splits
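The underlying cause: xgb.train returns a native Booster, and Booster.predict only accepts xgb.DMatrix inputs. Handed a pandas DataFrame, the feature-name validation probes data.feature_names, which DataFrames do not have, hence the AttributeError. The sklearn-style XGBClassifier, by contrast, accepts DataFrames directly.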

Xgboost 'DataFrame' object has no attribute 'num_row'

I am working on a multi-class classification problem using xgboost.
The shape of my data is
print(train_ohe.shape, test_ohe.shape)
# (43266, 190) (18543, 190)
Custom F1 eval function and model training code
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1 - f1_score(y_true, np.round(y_pred), average='weighted')
    return 'f1_err', err

def train_model(algo, train, test, predictors, useTrainCV=True,
                cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = algo.get_params()
        xgb_train = xgb.DMatrix(train[predictors].values, label=train[target].values)
        xgb_test = xgb.DMatrix(test[predictors].values)
        print(xgb_train.num_row())
        print(xgb_test.num_row())
        cv_result = xgb.cv(xgb_param,
                           train,
                           num_boost_round=xgb_param['n_estimators'],
                           nfold=cv_folds,
                           metrics='f1_eval',
                           early_stopping_rounds=early_stopping_rounds)
        algo.set_params(n_estimators=cv_result.shape[0])
    # Fit algorithm on data
    algo.fit(train[predictors], train[target], eval_metric=f1_eval)
    # Predict train data
    train_predictions = algo.predict(train[predictors])
    train_pred_prob = algo.predict_proba(train[predictors])[:, 1]
    # Report model performance
    print("Model performance")
    print("F1 Score Train {}".format(f1_score(train[target].values, train_predictions)))
    # Predict test data
    test_predictions = algo.predict(test[predictors])
    # Performance
    print("F1 Score Test {}".format(f1_score(test[target].values, test_predictions)))
Here is my XGBClassifier code. I am trying to find the number of estimators for a high learning rate.
target = 'Complaint-Status'
predictors = [x for x in train_ohe.columns if x not in target]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='multi:softmax',
                     nthread=8,
                     scale_pos_weight=1,
                     seed=145)
train_model(xgb1, train_ohe, test_ohe, predictors)
I am getting the following AttributeError, 'DataFrame' object has no attribute 'num_row', on the xgb.cv line in the train_model function.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-116-5933227c171d> in <module>
18 seed=145)
19 print(xgb1.get_params())
---> 20 train_model(xgb1, train_ohe, test_ohe, predictors)
21 # xgb_param = xgb1.get_params()
22 # cv_folds=5
<ipython-input-114-a9df39c19abf> in train_model(algo, train, test, predictors, useTrainCV, cv_folds, early_stopping_rounds)
19 nfold=cv_folds,
20 metrics='f1_eval',
---> 21 early_stopping_rounds=early_stopping_rounds)
22 algo.set_params(n_estimators=cv_result.shape[0])
23
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)
413 results = {}
414 cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
--> 415 stratified, folds, shuffle)
416
417 # setup callbacks
/opt/virtual_env/py3/lib/python3.6/site-packages/xgboost/training.py in mknfold(dall, nfold, param, seed, evals, fpreproc, stratified, folds, shuffle)
246 # Do standard k-fold cross validation
247 if shuffle is True:
--> 248 idx = np.random.permutation(dall.num_row())
249 else:
250 idx = np.arange(dall.num_row())
/opt/virtual_env/py3/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
4374 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4375 return self[name]
-> 4376 return object.__getattribute__(self, name)
4377
4378 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'num_row'
Saw your post when I was searching around for the same error.
The second parameter, train, in this code:
cv_result = xgb.cv(xgb_param,
                   train,
                   num_boost_round=xgb_param['n_estimators'],
                   nfold=cv_folds,
                   metrics='f1_eval',
                   early_stopping_rounds=early_stopping_rounds)
algo.set_params(n_estimators=cv_result.shape[0])
should be a DMatrix, such as:
train = xgb.DMatrix(X_train, y_train)
Hope this helps.
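In the posted train_model function that DMatrix already exists as xgb_train, so the minimal fix is to pass it instead of the raw DataFrame. A sketch (note that the xgb.cv signature shown in the traceback routes a custom metric through feval, so metrics='f1_eval' is swapped for feval=f1_eval here; treat that swap as a suggestion rather than part of the original fix):

cv_result = xgb.cv(xgb_param,
                   xgb_train,                              # the DMatrix built a few lines above
                   num_boost_round=xgb_param['n_estimators'],
                   nfold=cv_folds,
                   feval=f1_eval,                          # custom metrics go through feval
                   early_stopping_rounds=early_stopping_rounds)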
