I am trying to predict item prices using DNNRegressor, and I can't figure out the error that keeps coming up. I created TensorFlow numeric and categorical columns from a pandas DataFrame and fed them into the DNNRegressor. There is not much help online regarding this particular error.
Please help me fix this error. Thanks.
AttributeError Traceback (most recent call last)
<ipython-input-27-790ecef8c709> in <module>()
92
93 if __name__ == '__main__':
---> 94 main()
<ipython-input-27-790ecef8c709> in main()
81 # learning_rate=0.1, l1_regularization_strength=0.001))
82 est = tf.estimator.DNNRegressor(feature_columns = feature_columns, hidden_units = [10, 10], model_dir = 'data')
---> 83 est.train(input_fn = get_train_input_fn(Xtrain, ytrain), steps = 500)
84 scores = est.evaluate(input_fn = get_test_input_fn(Xtest, ytest))
85 print('Loss Score: {0:f}' .format(scores['average_loss']))
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps)
239 hooks.append(training.StopAtStepHook(steps, max_steps))
240
--> 241 loss = self._train_model(input_fn=input_fn, hooks=hooks)
242 logging.info('Loss for final step: %s.', loss)
243 return self
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks)
628 input_fn, model_fn_lib.ModeKeys.TRAIN)
629 estimator_spec = self._call_model_fn(features, labels,
--> 630 model_fn_lib.ModeKeys.TRAIN)
631 ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
632 all_hooks.extend(hooks)
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode)
613 if 'config' in model_fn_args:
614 kwargs['config'] = self.config
--> 615 model_fn_results = self._model_fn(features=features, **kwargs)
616
617 if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
389 dropout=dropout,
390 input_layer_partitioner=input_layer_partitioner,
--> 391 config=config)
392 super(DNNRegressor, self).__init__(
393 model_fn=_model_fn, model_dir=model_dir, config=config)
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _dnn_model_fn(features, labels, mode, head, hidden_units, feature_columns, optimizer, activation_fn, dropout, input_layer_partitioner, config)
100 net = feature_column_lib.input_layer(
101 features=features,
--> 102 feature_columns=feature_columns)
103
104 for layer_id, num_hidden_units in enumerate(hidden_units):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in input_layer(features, feature_columns, weight_collections, trainable)
205 ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
206 """
--> 207 _check_feature_columns(feature_columns)
208 for column in feature_columns:
209 if not isinstance(column, _DenseColumn):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _check_feature_columns(feature_columns)
1660 name_to_column = dict()
1661 for column in feature_columns:
-> 1662 if column.name in name_to_column:
1663 raise ValueError('Duplicate feature column name found for columns: {} '
1664 'and {}. This usually means that these columns refer to '
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in name(self)
2451 @property
2452 def name(self):
-> 2453 return '{}_indicator'.format(self.categorical_column.name)
2454
2455 def _transform_feature(self, inputs):
AttributeError: 'str' object has no attribute 'name'
And below is the code:
def get_train_input_fn(Xtrain, ytrain):
    return tf.estimator.inputs.pandas_input_fn(
        x = Xtrain,
        y = ytrain,
        batch_size = 30,
        num_epochs = None,
        shuffle = True)

def get_test_input_fn(Xtest, ytest):
    return tf.estimator.inputs.pandas_input_fn(
        x = Xtest,
        y = ytest,
        batch_size = 32,
        num_epochs = 1,
        shuffle = False)

def main():
    Xtrain, Xtest, ytrain, ytest = train_test_split(merc, ytr, test_size = 0.4, random_state = 42)

    feature_columns = []
    brand_rating = tf.feature_column.numeric_column('brand_rating')
    feature_columns.append(brand_rating)
    sentiment = tf.feature_column.numeric_column('description_polarity')
    feature_columns.append(sentiment)
    item_condition = tf.feature_column.numeric_column('item_condition_id')
    feature_columns.append(item_condition)
    shipping = tf.feature_column.indicator_column('shipping')
    feature_columns.append(shipping)
    name = tf.feature_column.embedding_column('item_name', 34)  # (column name, dimension (no. of unique values ** 0.25))
    feature_columns.append(name)
    general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
    feature_columns.append(general)
    sc1 = tf.feature_column.categorical_column_with_hash_bucket('SC1', 120)
    feature_columns.append(sc1)
    sc2 = tf.feature_column.categorical_column_with_hash_bucket('SC2', 900)
    feature_columns.append(sc2)

    print(feature_columns)

    #est = tf.estimator.DNNRegressor(feature_columns, hidden_units = [10, 10], optimizer=tf.train.ProximalAdagradOptimizer(
    #    learning_rate=0.1, l1_regularization_strength=0.001))
    est = tf.estimator.DNNRegressor(feature_columns = feature_columns, hidden_units = [10, 10], model_dir = 'data')
    est.train(input_fn = get_train_input_fn(Xtrain, ytrain), steps = 500)
The first argument to tf.feature_column.embedding_column must be a categorical column, not a string; see the API spec.
The offending line in your code is:
tf.feature_column.embedding_column('item_name', 34)
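You first need to build a categorical column and then embed that. A sketch of what this could look like (the hash-bucket size of 1000 is an assumption on my part, since the real vocabulary size of item_name isn't shown):

# Sketch: wrap the raw column name in a categorical column first, then embed it.
item_name_cat = tf.feature_column.categorical_column_with_hash_bucket('item_name', 1000)  # bucket size: assumption
name = tf.feature_column.embedding_column(item_name_cat, dimension=34)
feature_columns.append(name)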
After using
general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
and other feature_column.categorical_column_with..., you should use
general_indicator = tf.feature_column.indicator_column(general)
and then append it to your feature_columns list.
feature_columns.append(general_indicator)
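Note that tf.feature_column.indicator_column('shipping') in your code has the same string problem; the traceback's self.categorical_column.name failing on a str comes from exactly that kind of column. A sketch of the remaining corrected columns, reusing the bucket sizes from the question (treating shipping as a 0/1 identity column is my assumption):

# Sketch: DNNRegressor only accepts dense columns (numeric, indicator, embedding),
# so each categorical column is wrapped before it is appended.
shipping_cat = tf.feature_column.categorical_column_with_identity('shipping', num_buckets=2)  # assumes a 0/1 flag
feature_columns.append(tf.feature_column.indicator_column(shipping_cat))
for col_name, buckets in [('General', 12), ('SC1', 120), ('SC2', 900)]:
    cat = tf.feature_column.categorical_column_with_hash_bucket(col_name, buckets)
    feature_columns.append(tf.feature_column.indicator_column(cat))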
Related
Goal: implement bidirectionality in LSTM.
I'm new to deep learning and chose pytorch-lightning for minimal coding. Progress has been made, thanks to responses from prior posts.
forward() now needs to facilitate nn.LSTM(... bidirectional=True).
I'm basing my latest amendments on this discuss.pytorch.org response.
Error
The error is caused by a shape mismatch.
Which data needs to be reshaped, and for which layers?
I'm out of my depth.
RuntimeError: shape '[-1, 38]' is invalid for input of size 1
Code
from argparse import ArgumentParser

import torchmetrics
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTMClassifier(nn.Module):
    def __init__(self,
                 num_classes,
                 batch_size=10,
                 embedding_dim=100,
                 hidden_dim=50,
                 vocab_size=128):
        super(LSTMClassifier, self).__init__()
        initrange = 0.1
        self.num_labels = num_classes
        n = len(self.num_labels)
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = 1

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight.data.uniform_(-initrange, initrange)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=self.num_layers, batch_first=True, bidirectional=True)  # !
        # self.classifier = nn.Linear(hidden_dim, self.num_labels[0])
        self.classifier = nn.Linear(2 * hidden_dim, self.num_labels[0])  # !

    def repackage_hidden(h):
        """Wraps hidden states in new Tensors, to detach them from their history."""
        if isinstance(h, torch.Tensor):
            return h.detach()
        else:
            return tuple(repackage_hidden(v) for v in h)

    def forward(self, sentence, labels=None):
        embeds = self.word_embeddings(sentence)
        # lstm_out, _ = self.lstm(embeds)  # lstm_out - 2 tensors, _ - hidden layer
        lstm_out, hidden = self.lstm(embeds)

        # Calculate number of directions
        self.num_directions = 2 if self.lstm.bidirectional == True else 1

        # Extract last hidden state
        # final_state = hidden.view(self.num_layers, self.num_directions, self.batch_size, self.hidden_dim)[-1]
        final_state = hidden[0].view(self.num_layers, self.num_directions, self.batch_size, self.hidden_dim)[-1]

        # Handle directions
        final_hidden_state = None
        if self.num_directions == 1:
            final_hidden_state = final_state.squeeze(0)
        elif self.num_directions == 2:
            h_1, h_2 = final_state[0], final_state[1]
            # final_hidden_state = h_1 + h_2               # Add both states (requires changes to the input size of first linear layer + attention layer)
            final_hidden_state = torch.cat((h_1, h_2), 1)  # Concatenate both states

        print("len(final_hidden_state)", len(final_hidden_state))
        print("len(labels)", len(labels))
        print("final_hidden_state.shape", final_hidden_state.shape)
        print("labels", labels)

        self.linear_dims = [0]

        # Define set of fully connected layers (Linear Layer + Activation Layer) * #layers
        self.linears = nn.ModuleList()
        for i in range(0, len(self.linear_dims) - 1):
            linear_layer = nn.Linear(self.linear_dims[i], self.linear_dims[i + 1])
            self.init_weights(linear_layer)
            self.linears.append(linear_layer)
            if i == len(self.linear_dims) - 1:
                break  # no activation after output layer!!!
            self.linears.append(nn.ReLU())

        X = final_hidden_state

        # Push through linear layers
        for l in self.linears:
            X = l(X)

        # tag_space = self.classifier(hidden[:,0,:] + hidden[:,-1,:])  # !  # torch.flip(lstm_out[:,-1,:], [0, 1]) - 1 tensor
        # logits = F.log_softmax(final_hidden_state, dim=1)
        logits = F.cross_entropy(final_hidden_state, labels[0].view(-1))

        loss = None
        if labels:
            # print("len(logits.view(-1, self.num_labels[0]))", len(logits.view(-1, self.num_labels[0])))
            print("len(self.num_labels)", len(self.num_labels))
            print("self.num_labels[0]", self.num_labels[0])
            print("len(labels[0].view(-1))", len(labels[0].view(-1)))
            loss = F.cross_entropy(logits.view(-1, self.num_labels[0]), labels[0].view(-1))
        return loss, logits


class LSTMTaggerModel(pl.LightningModule):
    def __init__(
            self,
            num_classes,
            class_map,
            from_checkpoint=False,
            model_name='last.ckpt',
            learning_rate=3e-6,
            **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        self.model = LSTMClassifier(num_classes=num_classes)
        # self.model.load_state_dict(torch.load(model_name), strict=False)  # !
        self.class_map = class_map
        self.num_classes = num_classes
        self.valid_acc = torchmetrics.Accuracy()
        self.valid_f1 = torchmetrics.F1()

    def forward(self, *input, **kwargs):
        return self.model(*input, **kwargs)

    def training_step(self, batch, batch_idx):
        x, y_true = batch
        loss, _ = self(x, labels=y_true)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y_true = batch
        _, y_pred = self(x, labels=y_true)
        preds = torch.argmax(y_pred, axis=1)
        self.valid_acc(preds, y_true[0])
        self.log('val_acc', self.valid_acc, prog_bar=True)
        self.valid_f1(preds, y_true[0])
        self.log('f1', self.valid_f1, prog_bar=True)

    def configure_optimizers(self):
        'Prepare optimizer and schedule (linear warmup and decay)'
        opt = torch.optim.Adam(params=self.parameters(), lr=self.learning_rate)
        sch = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)
        return [opt], [sch]

    def training_epoch_end(self, training_step_outputs):
        avg_loss = torch.tensor([x['loss']
                                 for x in training_step_outputs]).mean()
        self.log('train_loss', avg_loss)
        print(f'###score: train_loss### {avg_loss}')

    def validation_epoch_end(self, val_step_outputs):
        acc = self.valid_acc.compute()
        f1 = self.valid_f1.compute()
        self.log('val_score', acc)
        self.log('f1', f1)
        print(f'###score: val_score### {acc}')

    def add_model_specific_args(parent_parser):
        parser = parent_parser.add_argument_group("OntologyTaggerModel")
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument("--learning_rate", default=2e-3, type=float)
        return parent_parser
Traceback:
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
| Name | Type | Params
---------------------------------------------
0 | model | LSTMClassifier | 77.4 K
1 | valid_acc | Accuracy | 0
2 | valid_f1 | F1 | 0
---------------------------------------------
77.4 K Trainable params
0 Non-trainable params
77.4 K Total params
0.310 Total estimated model params size (MB)
Validation sanity check: 0it [00:00, ?it/s]
len(final_hidden_state) 10
len(labels) 1
final_hidden_state.shape torch.Size([10, 100])
labels [tensor([ 2, 31, 26, 37, 22, 5, 31, 36, 5, 10])]
len(self.num_labels) 1
self.num_labels[0] 38
len(labels[0].view(-1)) 10
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-16-3f817f701f20> in <module>
11 """.split()
12
---> 13 run_training(args)
<ipython-input-5-bb0d8b014e32> in run_training(input)
66 shutil.copyfile(labels_file_orig, labels_file_cp)
67 trainer = pl.Trainer.from_argparse_args(args, callbacks=[checkpoint_callback], logger=loggers)
---> 68 trainer.fit(model, dm)
69 model_file = os.path.join(args.modeldir, 'last.ckpt')
70 trainer.save_checkpoint(model_file, weights_only=True)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
497
498 # dispath `start_training` or `start_testing` or `start_predicting`
--> 499 self.dispatch()
500
501 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
544
545 else:
--> 546 self.accelerator.start_training(self)
547
548 def train_or_test_or_predict(self):
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
71
72 def start_training(self, trainer):
---> 73 self.training_type_plugin.start_training(trainer)
74
75 def start_testing(self, trainer):
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
112 def start_training(self, trainer: 'Trainer') -> None:
113 # double dispatch to initiate the training loop
--> 114 self._results = trainer.run_train()
115
116 def start_testing(self, trainer: 'Trainer') -> None:
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
605 self.progress_bar_callback.disable()
606
--> 607 self.run_sanity_check(self.lightning_module)
608
609 # set stage for logging
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in run_sanity_check(self, ref_model)
858
859 # run eval step
--> 860 _, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches)
861
862 self.on_sanity_check_end()
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in run_evaluation(self, max_batches, on_epoch)
723 # lightning module methods
724 with self.profiler.profile("evaluation_step_and_end"):
--> 725 output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx)
726 output = self.evaluation_loop.evaluation_step_end(output)
727
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/trainer/evaluation_loop.py in evaluation_step(self, batch, batch_idx, dataloader_idx)
164 model_ref._current_fx_name = "validation_step"
165 with self.trainer.profiler.profile("validation_step"):
--> 166 output = self.trainer.accelerator.validation_step(args)
167
168 # capture any logged information
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in validation_step(self, args)
175
176 with self.precision_plugin.val_step_context(), self.training_type_plugin.val_step_context():
--> 177 return self.training_type_plugin.validation_step(*args)
178
179 def test_step(self, args):
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in validation_step(self, *args, **kwargs)
129
130 def validation_step(self, *args, **kwargs):
--> 131 return self.lightning_module.validation_step(*args, **kwargs)
132
133 def test_step(self, *args, **kwargs):
<ipython-input-15-6ef4e0993417> in validation_step(self, batch, batch_idx)
130 def validation_step(self, batch, batch_idx):
131 x, y_true = batch
--> 132 _, y_pred = self(x, labels=y_true)
133 preds = torch.argmax(y_pred, axis=1)
134 self.valid_acc(preds, y_true[0])
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
<ipython-input-15-6ef4e0993417> in forward(self, *input, **kwargs)
120
121 def forward(self, *input, **kwargs):
--> 122 return self.model(*input, **kwargs)
123
124 def training_step(self, batch, batch_idx):
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
<ipython-input-15-6ef4e0993417> in forward(self, sentence, labels)
93 print("self.num_labels[0]", self.num_labels[0])
94 print("len(labels[0].view(-1))", len(labels[0].view(-1)))
---> 95 loss = F.cross_entropy(logits.view(-1, self.num_labels[0]), labels[0].view(-1))
96 return loss, logits
97
RuntimeError: shape '[-1, 38]' is invalid for input of size 1
My problem was two things.
One, I had to run classifier() before calculating cross_entropy().
Secondly, I had to pass X, i.e. final_hidden_state after it has been pushed through the linear layers.
X = final_hidden_state

# Push through linear layers
for l in self.linears:
    X = l(X)

logits = self.classifier(X)
This achieves a working model. However, the first epoch's validation score is 0%.
This will require further work.
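For completeness, a minimal sketch of how the tail of forward() looks after these two fixes (my reconstruction; guarding with labels is not None is an assumption of mine, since truth-testing a list of tensors can be fragile):

logits = self.classifier(X)  # shape: (batch_size, num_labels)
loss = None
if labels is not None:
    # cross_entropy takes raw logits of shape (N, C) and class indices of shape (N,)
    loss = F.cross_entropy(logits, labels[0].view(-1))
return loss, logits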
As written in the title, I'm getting the PyTorch error "IndexError: index out of range in self". The error occurs as soon as a dataset of more than 500 rows is used, and also when I reload the model and try to run a second dataset. I have tried everything I could find online to manually set the embedding sizes, but nothing has worked. Attached are the model, optimizer, and training loop; I would be very grateful for your help.
class Model(nn.Module):
    def __init__(self, embedding_size, num_numerical_cols, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols + num_numerical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical, x_numerical):
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)

        x_numerical = self.batch_norm_num(x_numerical)
        x = torch.cat([x, x_numerical], 1)
        x = self.layers(x)
        return x

model = Model(categorical_embedding_sizes, numerical_data.shape[1], 5, [400, 100, 50], p=0.4)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 100
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(categorical_train_data, numerical_train_data)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)
    print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
Here is the error as described:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-157-202810d3193a> in <module>
4 for i in range(epochs):
5 i += 1
----> 6 y_pred = model(categorical_train_data, numerical_train_data)
7 single_loss = loss_function(y_pred, train_outputs)
8 aggregated_losses.append(single_loss)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
<ipython-input-117-fd6404aba4b5> in forward(self, x_categorical, x_numerical)
25 embeddings = []
26 for i,e in enumerate(self.all_embeddings):
---> 27 embeddings.append(e(x_categorical[:,i]))
28 x = torch.cat(embeddings, 1)
29 x = self.embedding_dropout(x)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py in forward(self, input)
112 return F.embedding(
113 input, self.weight, self.padding_idx, self.max_norm,
--> 114 self.norm_type, self.scale_grad_by_freq, self.sparse)
115
116 def extra_repr(self):
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1722 # remove once script supports set_grad_enabled
1723 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1724 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1725
1726
IndexError: index out of range in self
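A quick diagnostic that often pinpoints this error is comparing the maximum category index in each column against the corresponding embedding size: nn.Embedding(ni, nf) only accepts indices in [0, ni - 1]. This is a hypothetical check, not part of the original code, using the names from the question:

# Hypothetical check: find columns whose indices exceed their embedding size.
for col, (ni, nf) in enumerate(categorical_embedding_sizes):
    max_idx = categorical_train_data[:, col].max().item()
    if max_idx >= ni:
        print(f'column {col}: max index {max_idx} >= embedding size {ni}')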
I'm brand new to Python and machine learning and I'm surely missing something.
I'm training a RandomForest model with nested CV for hyperparameter tuning and RFECV, using a pipeline. I retrieved best_estimator_.n_features and it still shows me the 17 original features, before RFECV narrows them down to 3.
X
1182 rows × 17 columns
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
clf = RandomForestClassifier(random_state=42, n_jobs=-1, criterion='entropy', bootstrap=False)
space = {'n_estimators': [900, 1000, 1100],
         'max_depth': [25, 50, 100],
         'min_samples_split': [500, 750, 1000],
         'min_samples_leaf': [32, 64]
         }
search = GridSearchCV(clf, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)
rfe = RFECV(estimator=RandomForestClassifier())
ppln = Pipeline(steps=[('rfe',rfe),('grid',search)])
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(ppln, X, y.ravel(), scoring='accuracy', cv=cv_outer, n_jobs=-1)
ppln.fit(X, y.ravel())
After I fitted the pipeline, I tried to predict new data (fixt) with the original 17 features. However, the error message shown was: "ValueError: Number of features of the model must match the input. Model n_features is 17 and input n_features is 3."
fixtureXLS = pd.read_excel('aaafixtures.xlsx')
fixtureXLS.to_csv('bbbfixtures.csv', encoding='utf-8')
fixt = pd.read_csv('bbbfixtures.csv')
fixt = fixt.loc[:, ~fixt.columns.str.contains('^Unnamed')]
if 'Result' in fixt.columns:
    fixt = fixt.drop(['Result'], axis=1)
fixt
287 rows × 17 columns
fixt['Predicted'] = ppln.predict(fixt)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-164-e54f4c6f6e05> in <module>
----> 1 temp = ppln.predict(fixt)
~\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
117
118 # lambda, but not partial, allows help() to work with update_wrapper
--> 119 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
120 # update the docstring of the returned function
121 update_wrapper(out, self.fn)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X, **predict_params)
406 for _, name, transform in self._iter(with_final=False):
407 Xt = transform.transform(Xt)
--> 408 return self.steps[-1][-1].predict(Xt, **predict_params)
409
410 @if_delegate_has_method(delegate='_final_estimator')
~\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
117
118 # lambda, but not partial, allows help() to work with update_wrapper
--> 119 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
120 # update the docstring of the returned function
121 update_wrapper(out, self.fn)
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in predict(self, X)
485 """
486 self._check_is_fitted('predict')
--> 487 return self.best_estimator_.predict(X)
488
489 @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in predict(self, X)
627 The predicted classes.
628 """
--> 629 proba = self.predict_proba(X)
630
631 if self.n_outputs_ == 1:
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in predict_proba(self, X)
671 check_is_fitted(self)
672 # Check data
--> 673 X = self._validate_X_predict(X)
674
675 # Assign chunk of trees to jobs
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in _validate_X_predict(self, X)
419 check_is_fitted(self)
420
--> 421 return self.estimators_[0]._validate_X_predict(X, check_input=True)
422
423 @property
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
394 n_features = X.shape[1]
395 if self.n_features_ != n_features:
--> 396 raise ValueError("Number of features of the model must "
397 "match the input. Model n_features is %s and "
398 "input n_features is %s "
ValueError: Number of features of the model must match the input. Model n_features is 17 and input n_features is 3
I then transformed fixt to 3 features and called the pipeline's predict:
X_new = rfe.transform(fixt)
print(X_new.shape[1])
fixt['Predicted'] = ppln.predict(X_new)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-161-02280f45be5a> in <module>
----> 1 fixt['Predicted'] = ppln.predict(X_new)
~\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
117
118 # lambda, but not partial, allows help() to work with update_wrapper
--> 119 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
120 # update the docstring of the returned function
121 update_wrapper(out, self.fn)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X, **predict_params)
405 Xt = X
406 for _, name, transform in self._iter(with_final=False):
--> 407 Xt = transform.transform(Xt)
408 return self.steps[-1][-1].predict(Xt, **predict_params)
409
~\anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in transform(self, X)
82 return np.empty(0).reshape((X.shape[0], 0))
83 if len(mask) != X.shape[1]:
---> 84 raise ValueError("X has a different shape than during fitting.")
85 return X[:, safe_mask(X, mask)]
86
ValueError: X has a different shape than during fitting.
Can you help shed some light on this, please?
I don't know if there is an automated way to do it, but I created a new pipeline with a RandomForestClassifier taken from the best estimator of the previous pipeline, fitted it, and then predicted. I had to run it through RFE first, though.
Instead of ppln.fit(X, y.ravel()), the final code was:
params = search.best_estimator_.get_params()
rfc = RandomForestClassifier(**params)
ppln_new = Pipeline(steps=[('rfe',rfe),('pred',rfc)])
ppln_new.fit(X, y.ravel())
fixt['Predicted'] = ppln_new.predict(fixt)
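As a quick sanity check (a sketch, assuming the new pipeline above has been fitted), the feature counts of the fitted steps can be compared:

# Sketch: confirm the selector and the classifier agree on the feature count.
print(ppln_new.named_steps['rfe'].n_features_)   # features RFECV kept (3 here)
print(ppln_new.named_steps['pred'].n_features_)  # features the forest was fitted on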
I got the error "ValueError: Input tensors to a Functional must come from tf.keras.Input. Received: 0 (missing previous layer metadata)" and I can't find the cause.
Here are my error trace and my code:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-15-8058f3a2fd50> in <module>()
6 test_loss, test_accuracy = eg.test(dg.user_test)
7 print('Test set: Loss=%.4f ; Accuracy=%.1f%%' % (test_loss, test_accuracy * 100))
----> 8 eg.save_embeddings('embeddings.csv')
7 frames
<ipython-input-5-54ff9897b1c3> in save_embeddings(self, file_name)
66 inp = self.m.input # input placeholder
67 outputs = [layer.output for layer in self.m.layers] # all layer outputs
---> 68 functor = K.function([inp, K.learning_phase()], outputs ) # evaluation function
69
70 #append embeddings to vectors
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py in function(inputs, outputs, updates, name, **kwargs)
3934 from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top
3935 from tensorflow.python.keras.utils import tf_utils # pylint: disable=g-import-not-at-top
-> 3936 model = models.Model(inputs=inputs, outputs=outputs)
3937
3938 wrap_outputs = isinstance(outputs, list) and len(outputs) == 1
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in __new__(cls, *args, **kwargs)
240 # Functional model
241 from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top
--> 242 return functional.Functional(*args, **kwargs)
243 else:
244 return super(Model, cls).__new__(cls, *args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/tracking/base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/functional.py in __init__(self, inputs, outputs, name, trainable)
113 # 'arguments during initialization. Got an unexpected argument:')
114 super(Functional, self).__init__(name=name, trainable=trainable)
--> 115 self._init_graph_network(inputs, outputs)
116
117 @trackable.no_automatic_dependency_tracking
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/tracking/base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/functional.py in _init_graph_network(self, inputs, outputs)
142 base_layer_utils.create_keras_history(self._nested_outputs)
143
--> 144 self._validate_graph_inputs_and_outputs()
145
146 # A Network does not create weights of its own, thus it is already
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/functional.py in _validate_graph_inputs_and_outputs(self)
637 'must come from `tf.keras.Input`. '
638 'Received: ' + str(x) +
--> 639 ' (missing previous layer metadata).')
640 # Check that x is an input tensor.
641 # pylint: disable=protected-access
ValueError: Input tensors to a Functional must come from `tf.keras.Input`. Received: 0 (missing previous layer metadata).
And this is my code snippet:
class EmbeddingsGenerator:
    def __init__(self, train_users, data):
        self.train_users = train_users

        # preprocess
        self.data = data.sort_values(by=['timestamp'])
        # make them start at 0
        self.data['userId'] = self.data['userId'] - 1
        self.data['itemId'] = self.data['itemId'] - 1

        self.user_count = self.data['userId'].max() + 1
        self.movie_count = self.data['itemId'].max() + 1
        self.user_movies = {}  # list of rated movies by each user
        for userId in range(self.user_count):
            self.user_movies[userId] = self.data[self.data.userId == userId]['itemId'].tolist()
        self.m = self.model()

    def model(self, hidden_layer_size=100):
        m = Sequential()
        m.add(Dense(hidden_layer_size, input_shape=(1, self.movie_count)))
        m.add(Dropout(0.2))
        m.add(Dense(self.movie_count, activation='softmax'))
        m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return m

    def generate_input(self, user_id):
        '''
        Returns a context and a target for the user_id
        context: user's history with one random movie removed
        target: id of random removed movie
        '''
        user_movies_count = len(self.user_movies[user_id])
        # picking random movie
        random_index = np.random.randint(0, user_movies_count - 1)  # -1 avoids taking the last movie

        # setting target
        target = np.zeros((1, self.movie_count))
        target[0][self.user_movies[user_id][random_index]] = 1

        # setting context
        context = np.zeros((1, self.movie_count))
        context[0][self.user_movies[user_id][:random_index] + self.user_movies[user_id][random_index + 1:]] = 1
        return context, target

    def train(self, nb_epochs=300, batch_size=10000):
        '''
        Trains the model from train_users's history
        '''
        for i in range(nb_epochs):
            print('%d/%d' % (i + 1, nb_epochs))
            batch = [self.generate_input(user_id=np.random.choice(self.train_users) - 1) for _ in range(batch_size)]
            X_train = np.array([b[0] for b in batch])
            y_train = np.array([b[1] for b in batch])
            self.m.fit(X_train, y_train, epochs=1, validation_split=0.5)

    def test(self, test_users, batch_size=100000):
        '''
        Returns [loss, accuracy] on the test set
        '''
        batch_test = [self.generate_input(user_id=np.random.choice(test_users) - 1) for _ in range(batch_size)]
        X_test = np.array([b[0] for b in batch_test])
        y_test = np.array([b[1] for b in batch_test])
        return self.m.evaluate(X_test, y_test)

    def save_embeddings(self, file_name):
        '''
        Generates a csv file containing the vector embedding for each movie.
        '''
        inp = self.m.input                                        # input placeholder
        outputs = [layer.output for layer in self.m.layers]       # all layer outputs
        functor = K.function([inp, K.learning_phase()], outputs)  # evaluation function

        # append embeddings to vectors
        vectors = []
        for movie_id in range(self.movie_count):
            movie = np.zeros((1, 1, self.movie_count))
            movie[0][0][movie_id] = 1
            layer_outs = functor([movie])
            vector = [str(v) for v in layer_outs[0][0][0]]
            vector = '|'.join(vector)
            vectors.append([movie_id, vector])

        # saves as a csv file
        embeddings = pd.DataFrame(vectors, columns=['item_id', 'vectors']).astype({'item_id': 'int32'})
        embeddings.to_csv(file_name, sep=';', index=False)
        files.download(file_name)
This is the part of the code which calls the save_embeddings method:
if True:  # Generate embeddings?
    eg = EmbeddingsGenerator(dg.user_train, pd.read_csv('ml-100k/u.data', sep='\t', names=['userId', 'itemId', 'rating', 'timestamp']))
    eg.train(nb_epochs=300)
    train_loss, train_accuracy = eg.test(dg.user_train)
    print('Train set: Loss=%.4f ; Accuracy=%.1f%%' % (train_loss, train_accuracy * 100))
    test_loss, test_accuracy = eg.test(dg.user_test)
    print('Test set: Loss=%.4f ; Accuracy=%.1f%%' % (test_loss, test_accuracy * 100))
    eg.save_embeddings('embeddings.csv')
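For context, the failing K.function call asks Keras to build a functional model whose inputs include K.learning_phase(), which is not a tf.keras.Input, hence the "missing previous layer metadata" complaint under TF2. A sketch of extracting the same per-layer outputs without K.learning_phase() (an assumption on my part, not the original code):

import numpy as np
import tensorflow as tf

# Sketch: a feature-extractor Model over the built Sequential model (eg.m) that
# avoids passing K.learning_phase() as an input.
extractor = tf.keras.Model(inputs=eg.m.input,
                           outputs=[layer.output for layer in eg.m.layers])
movie = np.zeros((1, 1, eg.movie_count))  # one-hot input for movie id 0
movie[0][0][0] = 1
layer_outs = extractor(movie)             # plays the role of functor([movie]) above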
I am trying to use DynamicRnnEstimator, but I am getting a "'list' object has no attribute 'key'" error.
Code:
feature_names = [
    'FeatureA',
    'FeatureB',
    'FeatureC',
    'FeatureD',
    'FeatureE',
    'FeatureF']

...

feature_columns = [tf.feature_column.numeric_column(k) for k in feature_names]

print (feature_columns)

estimator = tf.contrib.learn.DynamicRnnEstimator(problem_type = constants.ProblemType.CLASSIFICATION,
                                                 prediction_type = rnn_common.PredictionType.SINGLE_VALUE,
                                                 sequence_feature_columns = [feature_columns],
                                                 context_feature_columns = None,
                                                 num_units = 5,
                                                 num_classes = 11,
                                                 cell_type = 'lstm',
                                                 optimizer = 'SGD',
                                                 model_dir = "model",
                                                 learning_rate = 0.1)

estimator.fit(input_fn=lambda: input_fn("train.csv"), steps=STEPS)
Here's the output:
[_NumericColumn(key='FeatureA', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='FeatureB', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='FeatureC', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='FeatureD', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='FeatureE', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='FeatureF', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
...
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-83-bea117372333> in <module>()
26 learning_rate = 0.1)
27
---> 28 estimator.fit(input_fn=lambda: input_fn("train.csv"), steps=STEPS)
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/python/util/deprecation.pyc in new_func(*args, **kwargs)
314 'in a future version' if date is None else ('after %s' % date),
315 instructions)
--> 316 return func(*args, **kwargs)
317 return tf_decorator.make_decorator(func, new_func, 'deprecated',
318 _add_deprecated_arg_notice_to_docstring(
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.pyc in fit(self, x, y, input_fn, steps, batch_size, monitors, max_steps)
478 hooks.append(basic_session_run_hooks.StopAtStepHook(steps, max_steps))
479
--> 480 loss = self._train_model(input_fn=input_fn, hooks=hooks)
481 logging.info('Loss for final step: %s.', loss)
482 return self
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.pyc in _train_model(self, input_fn, hooks)
984 global_step_read_tensor = training_util._get_or_create_global_step_read() # pylint: disable=protected-access
985 with ops.control_dependencies([global_step_read_tensor]):
--> 986 model_fn_ops = self._get_train_ops(features, labels)
987 ops.add_to_collection(ops.GraphKeys.LOSSES, model_fn_ops.loss)
988 all_hooks.extend(hooks)
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.pyc in _get_train_ops(self, features, labels)
1200 `ModelFnOps` object.
1201 """
-> 1202 return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN)
1203
1204 def _get_eval_ops(self, features, labels, metrics):
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.pyc in _call_model_fn(self, features, labels, mode, metrics)
1164 if 'model_dir' in model_fn_args:
1165 kwargs['model_dir'] = self.model_dir
-> 1166 model_fn_results = self._model_fn(features, labels, **kwargs)
1167
1168 if isinstance(model_fn_results, model_fn_lib.ModelFnOps):
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.pyc in _dynamic_rnn_model_fn(features, labels, mode)
478 sequence_input = build_sequence_input(features,
479 sequence_feature_columns,
--> 480 context_feature_columns)
481 dropout = (dropout_keep_probabilities
482 if mode == model_fn.ModeKeys.TRAIN
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.pyc in build_sequence_input(features, sequence_feature_columns, context_feature_columns, weight_collections, scope)
190 features.update(layers.transform_features(
191 features,
--> 192 list(sequence_feature_columns) + list(context_feature_columns or [])))
193 sequence_input = layers.sequence_input_from_feature_columns(
194 columns_to_tensors=features,
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/layers/python/layers/feature_column_ops.pyc in transform_features(features, feature_columns)
642 """
643 columns_to_tensor = features.copy()
--> 644 check_feature_columns(feature_columns)
645 transformer = _Transformer(columns_to_tensor)
646 for column in sorted(set(feature_columns), key=lambda x: x.key):
/home/judge/anaconda3/envs/ipykernel_py2/lib/python2.7/site-packages/tensorflow/contrib/layers/python/layers/feature_column_ops.pyc in check_feature_columns(feature_columns)
765 seen_keys = set()
766 for f in feature_columns:
--> 767 key = f.key
768 if key in seen_keys:
769 raise ValueError('Duplicate feature column key found for column: {}. '
AttributeError: 'list' object has no attribute 'key'
Looking at the trace, it concatenates the sequence_feature_columns and context_feature_columns, then iterates over the result but doesn't find a key attribute. I have printed out the feature columns and they do have keys.
It looks like you've wrapped feature_columns in a list a second time (giving [[...]] rather than [...]):
sequence_feature_columns = [feature_columns],
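Dropping the extra brackets and passing the list directly should fix it (a sketch, with the other arguments exactly as in the question):

# feature_columns is already a list of _NumericColumn objects,
# so pass it directly instead of nesting it in another list.
estimator = tf.contrib.learn.DynamicRnnEstimator(problem_type = constants.ProblemType.CLASSIFICATION,
                                                 prediction_type = rnn_common.PredictionType.SINGLE_VALUE,
                                                 sequence_feature_columns = feature_columns,  # not [feature_columns]
                                                 context_feature_columns = None,
                                                 num_units = 5,
                                                 num_classes = 11,
                                                 cell_type = 'lstm',
                                                 optimizer = 'SGD',
                                                 model_dir = "model",
                                                 learning_rate = 0.1)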