I am trying to use a custom CNN to classify spectrogram images generated from 3-second audio segments. I am using GroupShuffleSplit to divide the training dataset into a training set and a validation set while ensuring that each participant appears in only one of them (to prevent data leakage). I use a custom Dataset class to load the audio files and augment the training set.
If I generate a SoundDS object using the training set, randomly split the object into a training and validation subset, and pass these subsets into two respective data loaders, my model trains without any issues.
However, if I first use GroupShuffleSplit on the dataframe train_df (grouping by the column adressfname), then build two SoundDS objects and pass them into two respective data loaders, I encounter the error message shown below when I try to run for i, data in enumerate(train_dl) in my training function. Does anyone know where the issue lies?
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
    def __init__(self, df):
        self.df = df
        self.duration = 3000
        self.sr = 44100
        self.channel = 2
        self.shift_pct = 0.4

    # ----------------------------
    # Number of items in dataset
    # ----------------------------
    def __len__(self):
        return len(self.df)

    # ----------------------------
    # Get i'th item in dataset
    # ----------------------------
    def __getitem__(self, idx):
        audio_file = self.df.loc[idx, "relative_path"]
        class_id = self.df.loc[idx, "dx"]
        aud = AudioUtil.open(audio_file)
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)
        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
        shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
        return aug_sgram, class_id
This approach, which randomly splits the training set into training and validation subsets, works:
from torch.utils.data import random_split
myds = SoundDS(train_df)
# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])
# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)
This approach, which uses GroupShuffleSplit to split the training set into training and validation subsets and then builds two SoundDS objects, does not work:
from sklearn.model_selection import GroupShuffleSplit
splitter = GroupShuffleSplit(test_size=0.15, n_splits=1, random_state = 7)
split = splitter.split(train_df, groups=train_df['adressfname'])
train_inds, valid_inds = next(split)
train_data_df = train_df.iloc[train_inds]
valid_data_df = train_df.iloc[valid_inds]
train_dataset_DS = SoundDS(train_data_df)
valid_dataset_DS = SoundDS(valid_data_df)
train_dl = torch.utils.data.DataLoader(train_dataset_DS, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(valid_dataset_DS, batch_size=16, shuffle=False)
Function to train the model
def training(model, train_dl, num_epochs):
    # Loss Function, Optimizer and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                    steps_per_epoch=int(len(train_dl)),
                                                    epochs=num_epochs,
                                                    anneal_strategy='linear')

    # Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for i, data in enumerate(train_dl):  # ---- This is where the issue occurs
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
This is the error message:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 3170
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-201-b60077dcefb4> in <module>
54
55 num_epochs=2 # Just for demo, adjust this higher.
---> 56 training(myModel, train_dl, num_epochs)
<ipython-input-201-b60077dcefb4> in training(model, train_dl, num_epochs)
15
16 # Repeat for each batch in the training set
---> 17 for i, data in enumerate(train_dl):
18 # Get the input features and target labels, and put them on the GPU
19 inputs, labels = data[0].to(device), data[1].to(device)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py in __next__(self)
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
<ipython-input-191-4c84224a9983> in __getitem__(self, idx)
27
28 # print(self.df.loc[idx, 'relative_path'])
---> 29 audio_file = self.df.loc[idx, "relative_path"]
30 class_id = self.df.loc[idx, "dx"]
31 # participant_id = self.df.loc[idx, 'adressfname']
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1098 def _getitem_tuple(self, tup: tuple):
1099 with suppress(IndexingError):
-> 1100 return self._getitem_lowerdim(tup)
1101
1102 # no multi-index, so validate all of the indexers
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_lowerdim(self, tup)
836 # We don't need to check for tuples here because those are
837 # caught by the _is_nested_tuple_indexer check above.
--> 838 section = self._getitem_axis(key, axis=i)
839
840 # We should never have a scalar section here, because
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1162 # fall thru to straight lookup
1163 self._validate_key(key, axis)
-> 1164 return self._get_label(key, axis=axis)
1165
1166 def _get_slice_axis(self, slice_obj: slice, axis: int):
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _get_label(self, label, axis)
1111 def _get_label(self, label, axis: int):
1112 # GH#5667 this will fail if the label is not present in the axis.
-> 1113 return self.obj.xs(label, axis=axis)
1114
1115 def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
3774 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3775 else:
-> 3776 loc = index.get_loc(key)
3777
3778 if isinstance(loc, np.ndarray):
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 3170
Does anyone have any idea why one approach works without any issues, but the other does not?
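Note: one difference I can see between the two approaches is that train_df.iloc[train_inds] keeps the original index labels, so the split dataframes are no longer indexed 0..len-1, while __getitem__ looks rows up with self.df.loc[idx, ...]. A minimal sketch of resetting the index before building the SoundDS objects (just an idea, not a verified fix):
# Sketch: make the row labels match the positional indices the DataLoader produces
train_data_df = train_df.iloc[train_inds].reset_index(drop=True)
valid_data_df = train_df.iloc[valid_inds].reset_index(drop=True)
train_dataset_DS = SoundDS(train_data_df)
valid_dataset_DS = SoundDS(valid_data_df)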
Related
Getting error while predicting for the test data:
CODE:
from pycaret.anomaly import *

anom_exp = setup(train, session_id=125,
                 categorical_features=['date', 'hours', 'weekNumber', 'DayName', 'isWeekday'],
                 numeric_features=['cpu_avg'],
                 ignore_features=['Timestamp', 'time'])

sod = create_model('sod', fraction=0.1)
sod_test = predict_model(model=sod, data=test)
ERROR:
KeyError Traceback (most recent call last)
File ~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:3629, in Index.get_loc(self, key, method, tolerance)
3628 try:
-> 3629 return self._engine.get_loc(casted_key)
3630 except KeyError as err:
File ~/.local/lib/python3.10/site-packages/pandas/_libs/index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()
File ~/.local/lib/python3.10/site-packages/pandas/_libs/index.pyx:163, in pandas._libs.index.IndexEngine.get_loc()
File pandas/_libs/hashtable_class_helper.pxi:5198, in pandas._libs.hashtable.PyObjectHashTable.get_item()
File pandas/_libs/hashtable_class_helper.pxi:5206, in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[20], line 1
----> 1 sod_test= predict_model(model = sod, data = test)
File ~/.local/lib/python3.10/site-packages/pycaret/anomaly/functional.py:941, in predict_model(model, data)
938 if experiment is None:
939 experiment = _EXPERIMENT_CLASS()
--> 941 return experiment.predict_model(estimator=model, data=data)
File ~/.local/lib/python3.10/site-packages/pycaret/anomaly/oop.py:87, in AnomalyExperiment.predict_model(self, estimator, data, ml_usecase)
48 def predict_model(
49 self, estimator, data: pd.DataFrame, ml_usecase: Optional[MLUsecase] = None
50 ) -> pd.DataFrame:
51 """
52 This function generates anomaly labels on using a trained model.
53
(...)
85
86 """
---> 87 return super().predict_model(estimator, data, ml_usecase)
File ~/.local/lib/python3.10/site-packages/pycaret/internal/pycaret_experiment/unsupervised_experiment.py:1354, in _UnsupervisedExperiment.predict_model(self, estimator, data, ml_usecase)
1351 else:
1352 raise TypeError("Model doesn't support predict parameter.")
-> 1354 pred = estimator.predict(data_transformed)
1355 if ml_usecase == MLUsecase.CLUSTERING:
1356 data_transformed["Cluster"] = [f"Cluster {i}" for i in pred]
File ~/.local/lib/python3.10/site-packages/pyod/models/base.py:165, in BaseDetector.predict(self, X, return_confidence)
144 """Predict if a particular sample is an outlier or not.
145
146 Parameters
(...)
161 Only if return_confidence is set to True.
162 """
164 check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
--> 165 pred_score = self.decision_function(X)
166 prediction = (pred_score > self.threshold_).astype('int').ravel()
168 if return_confidence:
File ~/.local/lib/python3.10/site-packages/pyod/models/sod.py:157, in SOD.decision_function(self, X)
140 def decision_function(self, X):
141 """Predict raw anomaly score of X using the fitted detector.
142 The anomaly score of an input sample is computed based on different
143 detector algorithms. For consistency, outliers are assigned with
(...)
155 The anomaly score of the input samples.
156 """
--> 157 return self._sod(X)
File ~/.local/lib/python3.10/site-packages/pyod/models/sod.py:187, in SOD._sod(self, X)
185 anomaly_scores = np.zeros(shape=(X.shape[0],))
186 for i in range(X.shape[0]):
--> 187 obs = X[i]
188 ref = X[ref_inds[i,],]
189 means = np.mean(ref, axis=0) # mean of each column
File ~/.local/lib/python3.10/site-packages/pandas/core/frame.py:3505, in DataFrame.__getitem__(self, key)
3503 if self.columns.nlevels > 1:
3504 return self._getitem_multilevel(key)
-> 3505 indexer = self.columns.get_loc(key)
3506 if is_integer(indexer):
3507 indexer = [indexer]
File ~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:3631, in Index.get_loc(self, key, method, tolerance)
3629 return self._engine.get_loc(casted_key)
3630 except KeyError as err:
-> 3631 raise KeyError(key) from err
3632 except TypeError:
3633 # If we have a listlike key, _check_indexing_error will raise
3634 # InvalidIndexError. Otherwise we fall through and re-raise
3635 # the TypeError.
3636 self._check_indexing_error(key)
KeyError: 0
I looked at the source code and I think I know where the problem is, but editing it did not make the predictions come out correctly.
source code: https://pyod.readthedocs.io/en/latest/_modules/pyod/models/sod.html
There is a decision_function which is defined as:
def decision_function(self, X):
    return self._sod(X)
What I think the problem is: the DataFrame X should be converted to an array with check_array(X) before it is passed to the _sod function.
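A minimal sketch of what that patch could look like (assuming pyod's SOD class as shown in the linked source; not a verified fix):
from sklearn.utils import check_array

def decision_function(self, X):
    # Convert a DataFrame to a plain ndarray so that X[i] inside _sod selects
    # row i instead of looking up a column named i (which raises KeyError: 0).
    X = check_array(X)
    return self._sod(X)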
I am trying to use the "visheratin/t5-efficient-mini-grammar-correction" pre-trained model for grammar correction and I would like to add my own training examples.
I've loaded the model:
model = AutoModelForSeq2SeqLM.from_pretrained("visheratin/t5-efficient-mini-grammar-correction")
tokenizer = AutoTokenizer.from_pretrained("visheratin/t5-efficient-mini-grammar-correction")
set the training arguments:
training_args = TrainingArguments(
    output_dir="./models",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
)
and created the training data:
training_examples = [ ('input text 1', 'output text 1'), ('input text 2', 'output text 2') ]
train_data = []
for input_text, target_text in training_examples:
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, padding=True)
    target_ids = tokenizer.encode(target_text, return_tensors="pt", truncation=True, padding=True)
    train_data.append({
        'input_ids': input_ids,
        'attention_mask': torch.ones_like(input_ids),
        'labels': target_ids,
    })
but when I go to train:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
)
trainer.train()
I get this error:
ValueError: Unable to create tensor, you should probably activate truncation and/or
padding with 'padding=True' 'truncation=True' to have batched tensors with the same
length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs
type `list` where type `int` is expected).
I already pass 'padding=True' and 'truncation=True' to tokenizer.encode(), and as far as I can tell I do not have any excessive nesting in my features.
This is the full Traceback:
ValueError Traceback (most recent call last)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:715, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
714 if not is_tensor(value):
--> 715 tensor = as_tensor(value)
717 # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
718 # # at-least2d
719 # if tensor.ndim > 2:
720 # tensor = tensor.squeeze(0)
721 # elif tensor.ndim < 2:
722 # tensor = tensor[None, :]
ValueError: expected sequence of length 37 at dim 2 (got 44)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[104], line 1
----> 1 trainer.train()
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/trainer.py:1501, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1496 self.model_wrapped = self.model
1498 inner_training_loop = find_executable_batch_size(
1499 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1500 )
-> 1501 return inner_training_loop(
1502 args=args,
1503 resume_from_checkpoint=resume_from_checkpoint,
1504 trial=trial,
1505 ignore_keys_for_eval=ignore_keys_for_eval,
1506 )
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/trainer.py:1723, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1720 self._load_rng_state(resume_from_checkpoint)
1722 step = -1
-> 1723 for step, inputs in enumerate(epoch_iterator):
1724
1725 # Skip past any already trained steps if resuming training
1726 if steps_trained_in_current_epoch > 0:
1727 steps_trained_in_current_epoch -= 1
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:628, in _BaseDataLoaderIter.__next__(self)
625 if self._sampler_iter is None:
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \
631 self._IterableDataset_len_called is not None and \
632 self._num_yielded > self._IterableDataset_len_called:
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:671, in _SingleProcessDataLoaderIter._next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:61, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
59 else:
60 data = self.dataset[possibly_batched_index]
---> 61 return self.collate_fn(data)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/trainer_utils.py:696, in RemoveColumnsCollator.__call__(self, features)
694 def __call__(self, features: List[dict]):
695 features = [self._remove_columns(feature) for feature in features]
--> 696 return self.data_collator(features)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/data/data_collator.py:249, in DataCollatorWithPadding.__call__(self, features)
248 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 249 batch = self.tokenizer.pad(
250 features,
251 padding=self.padding,
252 max_length=self.max_length,
253 pad_to_multiple_of=self.pad_to_multiple_of,
254 return_tensors=self.return_tensors,
255 )
256 if "label" in batch:
257 batch["labels"] = batch["label"]
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2985, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
2982 batch_outputs[key] = []
2983 batch_outputs[key].append(value)
-> 2985 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:210, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
206 n_sequences = encoding[0].n_sequences
208 self._n_sequences = n_sequences
--> 210 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
File ~/opt/miniconda3/envs/ub_nbdev2/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:731, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
726 if key == "overflowing_tokens":
727 raise ValueError(
728 "Unable to create tensor returning overflowing tokens of different lengths. "
729 "Please see if a fast version of this tokenizer is available to have this feature available."
730 )
--> 731 raise ValueError(
732 "Unable to create tensor, you should probably activate truncation and/or padding with"
733 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
734 f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
735 " expected)."
736 )
738 return self
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Could someone please help me understand what could be causing this error?
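For what it's worth, a minimal sketch of one way to avoid the mismatched shapes (an assumption on my part, not a confirmed fix): tokenize without return_tensors so each feature is a flat list of ints, and let a seq2seq collator pad each batch.
from transformers import DataCollatorForSeq2Seq

train_data = []
for input_text, target_text in training_examples:
    enc = tokenizer(input_text, truncation=True)
    lab = tokenizer(target_text, truncation=True)
    train_data.append({
        "input_ids": enc["input_ids"],            # flat list of ints, no batch dimension
        "attention_mask": enc["attention_mask"],
        "labels": lab["input_ids"],
    })

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),  # pads each batch to a common length
)
trainer.train()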
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
SequentialSampler)
def data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50):
    """
    Convert train and validation sets to torch.Tensors and load them to DataLoader.
    """
    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels = \
        tuple(torch.tensor(data) for data in
              [train_inputs, val_inputs, train_labels, val_labels])

    # Specify batch_size
    batch_size = 50

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader
The code works fine when train_inputs and val_inputs are of type int64, but it doesn't when the type is int32.
Can someone tell me what's wrong here?
ERROR:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance)
3620 try:
-> 3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
File ~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()
File ~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:163, in pandas._libs.index.IndexEngine.get_loc()
File pandas\_libs\hashtable_class_helper.pxi:2131, in pandas._libs.hashtable.Int64HashTable.get_item()
File pandas\_libs\hashtable_class_helper.pxi:2140, in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 8
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Input In [31], in <cell line: 6>()
2 train_inputs, val_inputs, train_labels, val_labels = train_test_split(
3 input_ids, labels, test_size=0.1, random_state=42)
5 # Load data to PyTorch DataLoader
----> 6 train_dataloader, val_dataloader = data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)
Input In [28], in data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size)
6 """Convert train and validation sets to torch.Tensors and load them to
7 DataLoader.
8 """
10 # Convert data type to torch.Tensor
11 train_inputs, val_inputs, train_labels, val_labels =\
---> 12 tuple(torch.tensor(data) for data in
13 [train_inputs, val_inputs, train_labels, val_labels])
15 # Specify batch_size
16 batch_size = 50
Input In [28], in <genexpr>(.0)
6 """Convert train and validation sets to torch.Tensors and load them to
7 DataLoader.
8 """
10 # Convert data type to torch.Tensor
11 train_inputs, val_inputs, train_labels, val_labels =\
---> 12 tuple(torch.tensor(data) for data in
13 [train_inputs, val_inputs, train_labels, val_labels])
15 # Specify batch_size
16 batch_size = 50
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:958, in Series.__getitem__(self, key)
955 return self._values[key]
957 elif key_is_scalar:
--> 958 return self._get_value(key)
960 if is_hashable(key):
961 # Otherwise index.get_value will raise InvalidIndexError
962 try:
963 # For labels that don't resolve as scalars like tuples and frozensets
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:1069, in Series._get_value(self, label, takeable)
1066 return self._values[label]
1068 # Similar to Index.get_value, but we do not fall back to positional
-> 1069 loc = self.index.get_loc(label)
1070 return self.index._get_values_for_loc(self, loc, label)
File ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3623, in Index.get_loc(self, key, method, tolerance)
3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
-> 3623 raise KeyError(key) from err
3624 except TypeError:
3625 # If we have a listlike key, _check_indexing_error will raise
3626 # InvalidIndexError. Otherwise we fall through and re-raise
3627 # the TypeError.
3628 self._check_indexing_error(key)
KeyError: 8
I was using the same code on my data set and had the same issue. I did two things: I changed the random_state so it was no longer 42 (which probably wasn't what fixed it), and I also converted my labels to an np.array, and now it works.
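A minimal sketch of what that conversion could look like (assuming the inputs and labels come out of train_test_split as pandas objects; hypothetical, not the exact code above):
import numpy as np
import torch

# Go through NumPy so torch.tensor reads the values positionally instead of
# triggering pandas' label-based lookup on a shuffled index (KeyError: 8).
train_inputs, val_inputs, train_labels, val_labels = tuple(
    torch.tensor(np.asarray(data))
    for data in [train_inputs, val_inputs, train_labels, val_labels]
)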
I am working with the SketchRNN dataset.
I would like to do the following:
Parse examples in TFRecord to get labels.
Filter rows by label. There are 345 labels, and I would like to keep only data whose label number is below 10.
Batch multiple observations for training.
Below is my failed attempt:
from pathlib import Path
quickdraw_dir = Path("/home/long/.keras/./datasets/quickdraw/quickdraw_tutorial_dataset_v1.tar.gz").parent
train_files = sorted([str(path) for path in quickdraw_dir.glob("training.tfrecord-*")])
eval_files = sorted([str(path) for path in quickdraw_dir.glob("eval.tfrecord-*")])
def parse_single_example(data):
    feature_descriptions = {
        "ink": tf.io.VarLenFeature(dtype=tf.float32),
        "shape": tf.io.FixedLenFeature([2], dtype=tf.int64),
        "class_index": tf.io.FixedLenFeature([1], dtype=tf.int64)
    }
    example = tf.io.parse_single_example(data, feature_descriptions)
    flat_sketch = tf.sparse.to_dense(example["ink"])
    sketch = tf.reshape(flat_sketch, shape=[-1, 3])
    length = example["shape"][0]
    label = example["class_index"][0]
    return sketch, length, label

def filter_cond(dataset):
    return dataset.filter(lambda x, y, z: tf.math.less(z, 10))

def get_dataset(filepaths, batch_size=32, shuffle_buffer_size=None, n_parse_threads=5, n_read_threads=5, cache=False):
    dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
    if cache:
        dataset.cache()
    if shuffle_buffer_size:
        dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_single_example, num_parallel_calls=n_parse_threads)
    dataset = dataset.apply(filter_cond)
    dataset = dataset.batch(batch_size=batch_size)
    return dataset.prefetch(1)
train_dataset = get_dataset(train_files, shuffle_buffer_size=10000)
train_dataset.take(1)
Below is the error:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
Input In [31], in <cell line: 1>()
----> 1 for sketches, lengths, labels in train_dataset.take(1):
2 print("sketches =", sketches)
3 print("sketches shape=", sketches.shape)
File ~/anaconda3/envs/experiment/lib/python3.9/site-packages/tensorflow/python/data/ops/iterator_ops.py:800, in OwnedIterator.__next__(self)
798 def __next__(self):
799 try:
--> 800 return self._next_internal()
801 except errors.OutOfRangeError:
802 raise StopIteration
File ~/anaconda3/envs/experiment/lib/python3.9/site-packages/tensorflow/python/data/ops/iterator_ops.py:783, in OwnedIterator._next_internal(self)
780 # TODO(b/77291417): This runs in sync mode as iterators use an error status
781 # to communicate that there is no more data to iterate over.
782 with context.execution_mode(context.SYNC):
--> 783 ret = gen_dataset_ops.iterator_get_next(
784 self._iterator_resource,
785 output_types=self._flat_output_types,
786 output_shapes=self._flat_output_shapes)
788 try:
789 # Fast path for the case `self._structure` is not a nested structure.
790 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
File ~/anaconda3/envs/experiment/lib/python3.9/site-packages/tensorflow/python/ops/gen_dataset_ops.py:2845, in iterator_get_next(iterator, output_types, output_shapes, name)
2843 return _result
2844 except _core._NotOkStatusException as e:
-> 2845 _ops.raise_from_not_ok_status(e, name)
2846 except _core._FallbackException:
2847 pass
File ~/anaconda3/envs/experiment/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:7107, in raise_from_not_ok_status(e, name)
7105 def raise_from_not_ok_status(e, name):
7106 e.message += (" name: " + name if name is not None else "")
-> 7107 raise core._status_to_exception(e) from None
InvalidArgumentError: Cannot batch tensors with different shapes in component 0. First element had shape [55,3] and element 1 had shape [42,3]. [Op:IteratorGetNext]
From what I understand, this is because TensorFlow cannot batch together sequences of variable length.
The code below handles parsing and batching, but I am unable to filter my data (to reduce the number of classes to predict):
def parse_example(data_batch):
    feature_descriptions = {
        "ink": tf.io.VarLenFeature(dtype=tf.float32),
        "shape": tf.io.FixedLenFeature([2], dtype=tf.int64),
        "class_index": tf.io.FixedLenFeature([1], dtype=tf.int64)
    }
    examples = tf.io.parse_example(data_batch, feature_descriptions)
    flat_sketches = tf.sparse.to_dense(examples["ink"])
    sketches = tf.reshape(flat_sketches, shape=[tf.size(data_batch), -1, 3])
    lengths = examples["shape"][:, 0]
    labels = examples["class_index"][:, 0]
    return sketches, lengths, labels

def get_dataset(filepaths, batch_size=32, shuffle_buffer_size=None, n_parse_threads=5, n_read_threads=5, cache=False):
    dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
    if cache:
        dataset.cache()
    if shuffle_buffer_size:
        dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.map(parse_example, num_parallel_calls=n_parse_threads)
    # dataset = dataset.apply(filter_cond)
    return dataset.prefetch(1)
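A minimal sketch that combines the two attempts (an assumption, not tested against the actual files): parse and filter per example, then use padded_batch so variable-length sketches can still be batched together.
def get_filtered_dataset(filepaths, batch_size=32, shuffle_buffer_size=None,
                         n_parse_threads=5, n_read_threads=5):
    dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
    if shuffle_buffer_size:
        dataset = dataset.shuffle(shuffle_buffer_size)  # reassign: shuffle is not in-place
    dataset = dataset.map(parse_single_example, num_parallel_calls=n_parse_threads)
    dataset = dataset.filter(lambda sketch, length, label: label < 10)
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=([None, 3], [], []))              # pad only the sketch axis
    return dataset.prefetch(1)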
I am trying to fine-tune the T5 transformer for summarization, but I am receiving a KeyError:
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
The code I am using is basically this:
model_name = '...'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
(...)
df_dataset = df_dataset[['summary','document']]
df_dataset.document = 'summarize: ' + df_dataset.document
X = list(df_dataset['document'])
y = list(df_dataset['summary'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
y_train_tokenized = tokenizer(y_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
y_val_tokenized = tokenizer(y_val, padding=True, truncation=True, max_length=512)
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
training_set = Dataset(X_train_tokenized, y_train_tokenized)
validation_set = Dataset(X_val_tokenized, y_val_tokenized)
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
    save_steps=3000,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=validation_set,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
And the full error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-29-f31e4c5cde21> in <module>
1 # Train pre-trained model
----> 2 trainer.train()
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1099 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
1100
-> 1101 for step, inputs in enumerate(epoch_iterator):
1102
1103 # Skip past any already trained steps if resuming training
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
515 if self._sampler_iter is None:
516 self._reset()
--> 517 data = self._next_data()
518 self._num_yielded += 1
519 if self._dataset_kind == _DatasetKind.Iterable and \
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
555 def _next_data(self):
556 index = self._next_index() # may raise StopIteration
--> 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
558 if self._pin_memory:
559 data = _utils.pin_memory.pin_memory(data)
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-24-67979e648b75> in __getitem__(self, idx)
7 def __getitem__(self, idx):
8 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
----> 9 item['labels'] = torch.tensor(self.labels[idx])
10 return item
11
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\tokenization_utils_base.py in __getitem__(self, item)
232 return self._encodings[item]
233 else:
--> 234 raise KeyError(
235 "Indexing with integers (to access backend Encoding for a given batch index) "
236 "is not available when using Python based tokenizers"
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
And if I change the line:
tokenizer = T5Tokenizer.from_pretrained(model_name)
To:
tokenizer = T5TokenizerFast.from_pretrained(model_name)
the error changes to:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-f31e4c5cde21> in <module>
1 # Train pre-trained model
----> 2 trainer.train()
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1099 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
1100
-> 1101 for step, inputs in enumerate(epoch_iterator):
1102
1103 # Skip past any already trained steps if resuming training
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
515 if self._sampler_iter is None:
516 self._reset()
--> 517 data = self._next_data()
518 self._num_yielded += 1
519 if self._dataset_kind == _DatasetKind.Iterable and \
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
555 def _next_data(self):
556 index = self._next_index() # may raise StopIteration
--> 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
558 if self._pin_memory:
559 data = _utils.pin_memory.pin_memory(data)
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-23-67979e648b75> in __getitem__(self, idx)
7 def __getitem__(self, idx):
8 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
----> 9 item['labels'] = torch.tensor(self.labels[idx])
10 return item
11
RuntimeError: Could not infer dtype of tokenizers.Encoding
Any idea what is wrong?
This is because the tokenizer returns a BatchEncoding object rather than a plain dict of lists: with the slow (Python-based) tokenizer, indexing that object with an integer is not supported, and with the fast tokenizer it returns a tokenizers.Encoding that torch.tensor cannot convert.
You have to amend the __getitem__ method of your dataset class along the lines of:
class ForT5Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        input_ids = torch.tensor(self.inputs["input_ids"][index]).squeeze()
        target_ids = torch.tensor(self.targets["input_ids"][index]).squeeze()
        return {"input_ids": input_ids, "labels": target_ids}
and pass the .data property when initializing, like:
train_ds = ForT5Dataset(train_in.data, train_out.data)
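For context, here is a guess at how train_in and train_out map onto the variables in the question (hypothetical names for the tokenized splits):
# Hypothetical wiring, assuming the question's X_train / y_train / X_val / y_val
train_in = tokenizer(X_train, padding=True, truncation=True, max_length=512)
train_out = tokenizer(y_train, padding=True, truncation=True, max_length=512)
val_ds = ForT5Dataset(
    tokenizer(X_val, padding=True, truncation=True, max_length=512).data,
    tokenizer(y_val, padding=True, truncation=True, max_length=512).data,
)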