Adding 'decoder_start_token_id' with SimpleTransformers - python

I am training MBART in Seq2Seq with SimpleTransformers, but I am getting an error that I do not see with BART:
TypeError: shift_tokens_right() missing 1 required positional argument: 'decoder_start_token_id'
So far I have tried various combinations of
model.decoder_tokenizer.add_special_tokens({"bos_token": "<s>"})
even though bos_token is already set beforehand; using something other than bos_token just indicates that the token is not a special token.
That leaves the following code:
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
# Model Config
model_args = Seq2SeqArgs()
model_args.do_sample = True
model_args.eval_batch_size = 4 # 64
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False # False
model_args.learning_rate = 5e-5
model_args.max_length = 128
model_args.max_seq_length = 128
model_args.num_beams = 10 # 0
model_args.num_return_sequences = 3
model_args.num_train_epochs = 2
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
model_args.train_batch_size = 4 # 8
model_args.use_multiprocessing = False
model_ru = Seq2SeqModel(
    encoder_decoder_type="mbart",
    encoder_decoder_name="IlyaGusev/mbart_ru_sum_gazeta",
    args=model_args,
    use_cuda=True,
)
# Add custom tokens
model_ru.encoder_tokenizer.add_tokens(["token1", "token2"])
# already set, as seen from: model_ru.decoder_tokenizer.bos_token
model_ru.decoder_tokenizer.add_special_tokens({"bos_token": "<s>"})
model_ru.model.resize_token_embeddings(len(model_ru.encoder_tokenizer))
model_ru.train_model(train, eval_data=dev)
Which throws the following error:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:3407: FutureWarning:
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.
Here is a short example:
model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]
See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
warnings.warn(formatted_warning, FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_1538/3709317111.py in <module>
15 model_ru.model.resize_token_embeddings(len(model_ru.encoder_tokenizer))
16
---> 17 model_ru.train_model(train_tydiqa_ru, eval_data=dev_tydiqa_ru)
18
19 # Evaluation and training loss can also be found WandB
5 frames
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_model.py in train_model(self, train_data, output_dir, show_running_loss, args, eval_data, verbose, **kwargs)
433 self._move_model_to_device()
434
--> 435 train_dataset = self.load_and_cache_examples(train_data, verbose=verbose)
436
437 os.makedirs(output_dir, exist_ok=True)
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_model.py in load_and_cache_examples(self, data, evaluate, no_cache, verbose, silent)
1489 if args.model_type in ["bart", "mbart", "marian"]:
1490 return SimpleSummarizationDataset(
-> 1491 encoder_tokenizer, self.args, data, mode
1492 )
1493 else:
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in __init__(self, tokenizer, args, data, mode)
423 else:
424 self.examples = [
--> 425 preprocess_fn(d) for d in tqdm(data, disable=args.silent)
426 ]
427
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in <listcomp>(.0)
423 else:
424 self.examples = [
--> 425 preprocess_fn(d) for d in tqdm(data, disable=args.silent)
426 ]
427
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in preprocess_data_mbart(data)
359 decoder_input_ids,
360 tokenizer.pad_token_id,
--> 361 tokenizer.lang_code_to_id[args.tgt_lang],
362 )
363
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in <lambda>(input_ids, pad_token_id, decoder_start_token_id)
30 shift_tokens_right = (
31 lambda input_ids, pad_token_id, decoder_start_token_id: _shift_tokens_right(
---> 32 input_ids, pad_token_id
33 )
34 )
TypeError: shift_tokens_right() missing 1 required positional argument: 'decoder_start_token_id'
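A hedged reading of the traceback: the two-argument lambda in simpletransformers' seq2seq_utils.py wraps a transformers helper that, in recent transformers releases, takes a third positional argument, so the installed simpletransformers and transformers versions appear to be out of step. A minimal sketch of the current BART helper's signature, with made-up token ids:
import torch
from transformers.models.bart.modeling_bart import shift_tokens_right

# shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id) in transformers 4.x;
# the ids below are hypothetical (1 = pad, 250021 = a target-language code).
input_ids = torch.tensor([[250021, 47, 58, 2, 1, 1]])
shifted = shift_tokens_right(input_ids, pad_token_id=1, decoder_start_token_id=250021)
print(shifted)  # the first column now holds the decoder start token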

Related

"ValueError: Unable to create tensor" when trying to train a hugging face transformer

I am trying to use the "visheratin/t5-efficient-mini-grammar-correction" pre-trained model for grammar correction and I would like to add my own training examples.
I've loaded the model:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
import torch

model = AutoModelForSeq2SeqLM.from_pretrained("visheratin/t5-efficient-mini-grammar-correction")
tokenizer = AutoTokenizer.from_pretrained("visheratin/t5-efficient-mini-grammar-correction")
set the training arguments:
training_args = TrainingArguments(
    output_dir="./models",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
)
and created the training data:
training_examples = [('input text 1', 'output text 1'), ('input text 2', 'output text 2')]
train_data = []
for input_text, target_text in training_examples:
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, padding=True)
    target_ids = tokenizer.encode(target_text, return_tensors="pt", truncation=True, padding=True)
    train_data.append({
        'input_ids': input_ids,
        'attention_mask': torch.ones_like(input_ids),
        'labels': target_ids,
    })
but when I go to train:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
)
trainer.train()
I get this error:
ValueError: Unable to create tensor, you should probably activate truncation and/or
padding with 'padding=True' 'truncation=True' to have batched tensors with the same
length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs
type `list` where type `int` is expected).
I already pass 'padding=True' and 'truncation=True' to tokenizer.encode(), and as far as I can tell I do not have any excessive nesting in my features.
This is the full Traceback:
ValueError Traceback (most recent call last)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:715, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
714 if not is_tensor(value):
--> 715 tensor = as_tensor(value)
717 # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
718 # # at-least2d
719 # if tensor.ndim > 2:
720 # tensor = tensor.squeeze(0)
721 # elif tensor.ndim < 2:
722 # tensor = tensor[None, :]
ValueError: expected sequence of length 37 at dim 2 (got 44)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[104], line 1
----> 1 trainer.train()
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/trainer.py:1501, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1496 self.model_wrapped = self.model
1498 inner_training_loop = find_executable_batch_size(
1499 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1500 )
-> 1501 return inner_training_loop(
1502 args=args,
1503 resume_from_checkpoint=resume_from_checkpoint,
1504 trial=trial,
1505 ignore_keys_for_eval=ignore_keys_for_eval,
1506 )
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/trainer.py:1723, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1720 self._load_rng_state(resume_from_checkpoint)
1722 step = -1
-> 1723 for step, inputs in enumerate(epoch_iterator):
1724
1725 # Skip past any already trained steps if resuming training
1726 if steps_trained_in_current_epoch > 0:
1727 steps_trained_in_current_epoch -= 1
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:628, in _BaseDataLoaderIter.__next__(self)
625 if self._sampler_iter is None:
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \
631 self._IterableDataset_len_called is not None and \
632 self._num_yielded > self._IterableDataset_len_called:
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:671, in _SingleProcessDataLoaderIter._next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:61, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
59 else:
60 data = self.dataset[possibly_batched_index]
---> 61 return self.collate_fn(data)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/trainer_utils.py:696, in RemoveColumnsCollator.__call__(self, features)
694 def __call__(self, features: List[dict]):
695 features = [self._remove_columns(feature) for feature in features]
--> 696 return self.data_collator(features)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/data/data_collator.py:249, in DataCollatorWithPadding.__call__(self, features)
248 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 249 batch = self.tokenizer.pad(
250 features,
251 padding=self.padding,
252 max_length=self.max_length,
253 pad_to_multiple_of=self.pad_to_multiple_of,
254 return_tensors=self.return_tensors,
255 )
256 if "label" in batch:
257 batch["labels"] = batch["label"]
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2985, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
2982 batch_outputs[key] = []
2983 batch_outputs[key].append(value)
-> 2985 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:210, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
206 n_sequences = encoding[0].n_sequences
208 self._n_sequences = n_sequences
--> 210 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:731, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
726 if key == "overflowing_tokens":
727 raise ValueError(
728 "Unable to create tensor returning overflowing tokens of different lengths. "
729 "Please see if a fast version of this tokenizer is available to have this feature available."
730 )
--> 731 raise ValueError(
732 "Unable to create tensor, you should probably activate truncation and/or padding with"
733 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
734 f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
735 " expected)."
736 )
738 return self
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Could someone please help me understand what could be causing this error?
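One hedged reading of the error, sketched below rather than a confirmed fix: tokenizer.encode(..., return_tensors="pt") returns a 2-D tensor of shape (1, sequence_length), so every feature in train_data carries an extra batch dimension, which is the "excessive nesting" the message refers to. A minimal sketch that stores flat lists of ids instead and uses transformers' DataCollatorForSeq2Seq (which also pads the labels), reusing the question's model and arguments:
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments)

model_name = "visheratin/t5-efficient-mini-grammar-correction"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

training_examples = [('input text 1', 'output text 1'), ('input text 2', 'output text 2')]

train_data = []
for input_text, target_text in training_examples:
    enc = tokenizer(input_text, truncation=True)                   # plain lists, no batch dimension
    labels = tokenizer(target_text, truncation=True)["input_ids"]
    train_data.append({
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": labels,
    })

training_args = TrainingArguments(
    output_dir="./models",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),  # pads input_ids and labels per batch
)
trainer.train()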

KeyError while fine-tuning T5 for summarization with HuggingFace

I am trying to fine-tune the T5 transformer for summarization, but I am receiving a key error message:
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
The code I am using is basically this:
model_name = '...'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
(...)
df_dataset = df_dataset[['summary','document']]
df_dataset.document = 'summarize: ' + df_dataset.document
X = list(df_dataset['document'])
y = list(df_dataset['summary'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
y_train_tokenized = tokenizer(y_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
y_val_tokenized = tokenizer(y_val, padding=True, truncation=True, max_length=512)
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
training_set = Dataset(X_train_tokenized, y_train_tokenized)
validation_set = Dataset(X_val_tokenized, y_val_tokenized)
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
    save_steps=3000,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=validation_set,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
And the full error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-29-f31e4c5cde21> in <module>
1 # Train pre-trained model
----> 2 trainer.train()
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1099 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
1100
-> 1101 for step, inputs in enumerate(epoch_iterator):
1102
1103 # Skip past any already trained steps if resuming training
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
515 if self._sampler_iter is None:
516 self._reset()
--> 517 data = self._next_data()
518 self._num_yielded += 1
519 if self._dataset_kind == _DatasetKind.Iterable and \
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
555 def _next_data(self):
556 index = self._next_index() # may raise StopIteration
--> 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
558 if self._pin_memory:
559 data = _utils.pin_memory.pin_memory(data)
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-24-67979e648b75> in __getitem__(self, idx)
7 def __getitem__(self, idx):
8 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
----> 9 item['labels'] = torch.tensor(self.labels[idx])
10 return item
11
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\tokenization_utils_base.py in __getitem__(self, item)
232 return self._encodings[item]
233 else:
--> 234 raise KeyError(
235 "Indexing with integers (to access backend Encoding for a given batch index) "
236 "is not available when using Python based tokenizers"
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
And if I change the line:
tokenizer = T5Tokenizer.from_pretrained(model_name)
To:
tokenizer = T5TokenizerFast.from_pretrained(model_name)
the error changes to:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-f31e4c5cde21> in <module>
1 # Train pre-trained model
----> 2 trainer.train()
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1099 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
1100
-> 1101 for step, inputs in enumerate(epoch_iterator):
1102
1103 # Skip past any already trained steps if resuming training
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
515 if self._sampler_iter is None:
516 self._reset()
--> 517 data = self._next_data()
518 self._num_yielded += 1
519 if self._dataset_kind == _DatasetKind.Iterable and \
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
555 def _next_data(self):
556 index = self._next_index() # may raise StopIteration
--> 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
558 if self._pin_memory:
559 data = _utils.pin_memory.pin_memory(data)
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-23-67979e648b75> in __getitem__(self, idx)
7 def __getitem__(self, idx):
8 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
----> 9 item['labels'] = torch.tensor(self.labels[idx])
10 return item
11
RuntimeError: Could not infer dtype of tokenizers.Encoding
Any idea of what is wrong?
This is because the tokenizer returns a BatchEncoding object: indexing it with an integer tries to fetch the backend Encoding for that batch index (which only fast tokenizers provide, and which torch.tensor cannot convert) instead of the token ids, so you have to index into its input_ids lists yourself.
You have to amend the __getitem__ method of your dataset class along the lines of:
class ForT5Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.targets["input_ids"])

    def __getitem__(self, index):
        input_ids = torch.tensor(self.inputs["input_ids"][index]).squeeze()
        target_ids = torch.tensor(self.targets["input_ids"][index]).squeeze()
        return {"input_ids": input_ids, "labels": target_ids}
and pass the data property when initializing it, like:
train_ds = ForT5Dataset(train_in.data, train_out.data)
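A short sketch of how this can be wired into the rest of the question's code (train_in and train_out are illustrative names for the tokenizer outputs on the documents and summaries, not variables from the original post):
train_in = tokenizer(X_train, padding=True, truncation=True, max_length=512)
train_out = tokenizer(y_train, padding=True, truncation=True, max_length=512)
val_in = tokenizer(X_val, padding=True, truncation=True, max_length=512)
val_out = tokenizer(y_val, padding=True, truncation=True, max_length=512)

training_set = ForT5Dataset(train_in.data, train_out.data)
validation_set = ForT5Dataset(val_in.data, val_out.data)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=validation_set,
)
trainer.train()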

Custom Multiple Input Primitive Bug returns "TypeError: issubclass() arg 1 must be a class"

I am using the Featuretools library to generate custom features involving customer transactions. I tested the function on its own and it returns the expected result, so I am not sure why I am getting this error.
I tried using the following link:
https://featuretools.alteryx.com/en/stable/getting_started/primitives.html
Thank you!
import featuretools as ft
import numpy as np
import pandas as pd
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import DatetimeTimeIndex, Numeric, Categorical

def test_fun(categorical, datetimeindex):
    x = pd.DataFrame({'store_name': categorical, 'session_start_time': datetimeindex})
    x_mode = list(x['store_name'].mode())[0]
    x = x[x['store_name'] == x_mode]
    y = x.session_start_time.diff().fillna(pd.Timedelta(seconds=0)) / np.timedelta64(1, 's')
    return y.median()

Test_Fun = make_agg_primitive(function=test_fun,
                              input_types=[Categorical, DatetimeTimeIndex],
                              return_type=[Numeric])

fm, fd = ft.dfs(
    entityset=es,
    target_entity='customers',
    agg_primitives=[Test_Fun],
    cutoff_time=lt,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=True,
)
Results in the following error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-492-358f980bb6b0> in <module>
20 return_type = [Numeric])
21
---> 22 fm, fd = ft.dfs(
23 entityset = es,
24 target_entity = 'customers',
~\Anaconda3\lib\site-packages\featuretools\utils\entry_point.py in function_wrapper(*args, **kwargs)
38 ep.on_error(error=e,
39 runtime=runtime)
---> 40 raise e
41
42 # send return value
~\Anaconda3\lib\site-packages\featuretools\utils\entry_point.py in function_wrapper(*args, **kwargs)
30 # call function
31 start = time.time()
---> 32 return_value = func(*args, **kwargs)
33 runtime = time.time() - start
34 except Exception as e:
~\Anaconda3\lib\site-packages\featuretools\synthesis\dfs.py in dfs(entities, relationships, entityset, target_entity, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_entities, ignore_variables, primitive_options, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_variable_types, progress_callback, include_cutoff_time)
259 seed_features=seed_features)
260
--> 261 features = dfs_object.build_features(
262 verbose=verbose, return_variable_types=return_variable_types)
263
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in build_features(self, return_variable_types, verbose)
287 assert isinstance(return_variable_types, list), msg
288
--> 289 self._run_dfs(self.es[self.target_entity_id], RelationshipPath([]),
290 all_features, max_depth=self.max_depth)
291
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _run_dfs(self, entity, relationship_path, all_features, max_depth)
412 """
413
--> 414 self._build_transform_features(all_features, entity, max_depth=max_depth)
415
416 """
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _build_transform_features(self, all_features, entity, max_depth, require_direct_input)
576 input_types = input_types[0]
577
--> 578 matching_inputs = self._get_matching_inputs(all_features,
579 entity,
580 new_max_depth,
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _get_matching_inputs(self, all_features, entity, max_depth, input_types, primitive, primitive_options, require_direct_input, feature_filter)
793 primitive, primitive_options, require_direct_input=False,
794 feature_filter=None):
--> 795 features = self._features_by_type(all_features=all_features,
796 entity=entity,
797 max_depth=max_depth,
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _features_by_type(self, all_features, entity, max_depth, variable_type)
768 if (variable_type == variable_types.PandasTypes._all or
769 f.variable_type == variable_type or
--> 770 any(issubclass(f.variable_type, vt) for vt in variable_type)):
771 if max_depth is None or f.get_depth(stop_at=self.seed_features) <= max_depth:
772 selected_features.append(f)
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in <genexpr>(.0)
768 if (variable_type == variable_types.PandasTypes._all or
769 f.variable_type == variable_type or
--> 770 any(issubclass(f.variable_type, vt) for vt in variable_type)):
771 if max_depth is None or f.get_depth(stop_at=self.seed_features) <= max_depth:
772 selected_features.append(f)
TypeError: issubclass() arg 1 must be a class
I think I figured it out. If there exists a better way, please let me know!
I'm not sure why the approach in the documentation didn't work (it uses functions instead of classes and makes no mention of classes).
I was able to leverage the solution from this question to solve the problem:
How to get an item's group mean but exclude the item itself?
from featuretools.primitives import AggregationPrimitive

class Test_Fun(AggregationPrimitive):
    name = "test_fun"
    input_types = [Categorical, DatetimeTimeIndex]
    return_type = Numeric
    stack_on_self = False

    def get_function(self):
        def mean_excluding_value(categorical, datetimeindex):
            x = pd.DataFrame({'store_name': categorical, 'session_start_time': datetimeindex})
            x_mode = list(x['store_name'].mode())[0]
            x = x[x['store_name'] == x_mode]
            y = x.session_start_time.diff().fillna(pd.Timedelta(seconds=0)) / np.timedelta64(1, 's')
            return y.median()
        return mean_excluding_value

fm, fd = ft.dfs(
    entityset=es,
    target_entity='customers',
    agg_primitives=[Test_Fun],
    cutoff_time=lt,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=True,
)
In this section of the code:
Test_Fun = make_agg_primitive(function=test_fun,
                              input_types=[Categorical, DatetimeTimeIndex],
                              return_type=[Numeric])
return_type should be set to Numeric instead of [Numeric].
This code worked for me:
Test_Fun = make_agg_primitive(function=test_fun,
                              input_types=[Categorical, DatetimeTimeIndex],
                              return_type=Numeric)

torchtext Field with values converted to ids gives "an integer is required" error

I followed this tutorial
http://www.programmersought.com/article/2609385756/
to create a TabularDataset with data that is already tokenized and converted to ids. I do not want to use or build a vocab because the data is already numerical, so I defined my field as:
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True)
train,val, test = data.TabularDataset.splits(path='./', train=train_path, validation=valid_path, test=test_path ,format='csv', fields=data_fields, skip_header=True)
The train output:
print(vars(train[0])['src'])
# outputs [101, 3177, 3702, 11293, 1116, 102]
and I used a BucketIterator:
train_iter = BucketIterator(train,
                            batch_size=BATCH_SIZE,
                            device=DEVICE,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            train=True,
                            batch_size_fn=batch_size_fn,
                            repeat=False)
when I run this code:
batch = next(iter(train_iter))
I got
TypeError: an integer is required (got type list)
TypeError Traceback (most recent call last)
in ()
----> 1 batch = next(iter(train_iter))
3 frames
/usr/local/lib/python3.6/dist-packages/torchtext/data/iterator.py in __iter__(self)
155 else:
156 minibatch.sort(key=self.sort_key, reverse=True)
--> 157 yield Batch(minibatch, self.dataset, self.device)
158 if not self.repeat:
159 return
/usr/local/lib/python3.6/dist-packages/torchtext/data/batch.py in __init__(self, data, dataset, device)
32 if field is not None:
33 batch = [getattr(x, name) for x in data]
---> 34 setattr(self, name, field.process(batch, device=device))
35
36 @classmethod
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in process(self, batch, device)
199 """
200 padded = self.pad(batch)
--> 201 tensor = self.numericalize(padded, device=device)
202 return tensor
203
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in numericalize(self, arr, device)
321 arr = self.postprocessing(arr, None)
322
--> 323 var = torch.tensor(arr, dtype=self.dtype, device=device)
324
325 if self.sequential and not self.batch_first:
TypeError: an integer is required (got type list)
You have to provide an integer pad_token when declaring the Field; with use_vocab=False there is no vocab to map the default "<pad>" string to an id.
Change this:
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True)
to
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True, pad_token=0)
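As a small illustration, a sketch against the legacy torchtext.data API shown in the traceback (the toy ids and the tokenize function are made up): with an integer pad_token, both padding and numericalization stay numeric.
from torchtext.data import Field  # torchtext.legacy.data on torchtext >= 0.9

ids_field = Field(tokenize=lambda s: [int(t) for t in s.split()],  # hypothetical tokenizer
                  use_vocab=False, sequential=True, pad_token=0)

batch = ids_field.pad([[101, 3177, 3702, 102], [101, 102]])
# [[101, 3177, 3702, 102], [101, 102, 0, 0]]
tensor = ids_field.numericalize(batch)  # LongTensor of shape (max_len, batch_size)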

CountVectorizer throws error on fit_transform after adding stop words

I have two sections of code. One works, and one does not.
The following code runs as expected without error: (Note: postrain, negtrain, postest, and negtest are lists of strings defined earlier.)
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
train_vector = vector.fit_transform(postrain+negtrain)
test_vector = vector.transform(postest+negtest)
print test_vector.shape
However, this code throws an error:
import re
stop = [re.split('\n|\t', open('stop_words.txt').read())]
vector2 = CountVectorizer(stop_words=stop)
train_vector = vector2.fit_transform(postrain+negtrain) # <-- Error occurs here
test_vector = vector2.transform(postest+negtest)
print test_vector.shape
the error:
TypeError Traceback (most recent call last)
<ipython-input-43-cf5f4754d58c> in <module>()
7
8 vector2 = CountVectorizer(stop_words=stop)
----> 9 train_vector = vector2.fit_transform(postrain+negtrain)
10 test_vector = vector2.transform(postest+negtest)
11
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in fit_transform(self, raw_documents, y)
815
816 vocabulary, X = self._count_vocab(raw_documents,
--> 817 self.fixed_vocabulary_)
818
819 if self.binary:
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
745 vocabulary.default_factory = vocabulary.__len__
746
--> 747 analyze = self.build_analyzer()
748 j_indices = _make_int_array()
749 indptr = _make_int_array()
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in build_analyzer(self)
232
233 elif self.analyzer == 'word':
--> 234 stop_words = self.get_stop_words()
235 tokenize = self.build_tokenizer()
236
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in get_stop_words(self)
215 def get_stop_words(self):
216 """Build or fetch the effective stop words list"""
--> 217 return _check_stop_list(self.stop_words)
218
219 def build_analyzer(self):
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in _check_stop_list(stop)
92 return None
93 else: # assume it's a collection
---> 94 return frozenset(stop)
95
96
TypeError: unhashable type: 'list'
How did adding stop words cause the error?
I'm dumb. It should have been:
stop = re.split('\n|\t', open('stop_words.txt').read())
without the brackets. As for why the error only shows up on the line after it was defined: the brackets make stop a list containing a single inner list, and CountVectorizer only validates stop_words lazily when it builds its analyzer inside fit_transform, where frozenset(stop) fails because a list is unhashable.
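A tiny illustration of the difference (standard Python behaviour, using the question's stop_words.txt):
import re

nested = [re.split('\n|\t', open('stop_words.txt').read())]  # a list containing one inner list
flat = re.split('\n|\t', open('stop_words.txt').read())      # a flat list of words

frozenset(flat)    # fine: strings are hashable
frozenset(nested)  # TypeError: unhashable type: 'list', which is what _check_stop_list hits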
