I am trying to fine tune the T5 transformer for summarization but I am receiving a key error message:
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
The code I am using is basically this:
model_name = '...'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
(...)
df_dataset = df_dataset[['summary','document']]
df_dataset.document = 'summarize: ' + df_dataset.document
X = list(df_dataset['document'])
y = list(df_dataset['summary'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
y_train_tokenized = tokenizer(y_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
y_val_tokenized = tokenizer(y_val, padding=True, truncation=True, max_length=512)
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
training_set = Dataset(X_train_tokenized, y_train_tokenized)
validation_set = Dataset(X_val_tokenized, y_val_tokenized)
# Define Trainer
args = TrainingArguments(
output_dir="output",
evaluation_strategy="steps",
eval_steps=500,
per_device_train_batch_size=TRAIN_BATCH_SIZE,
per_device_eval_batch_size=VALID_BATCH_SIZE,
num_train_epochs=TRAIN_EPOCHS,
save_steps=3000,
load_best_model_at_end = True,
)
trainer = Trainer(
model=model,
args=args,
train_dataset=training_set,
eval_dataset=validation_set,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
And the full error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-29-f31e4c5cde21> in <module>
1 # Train pre-trained model
----> 2 trainer.train()
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1099 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
1100
-> 1101 for step, inputs in enumerate(epoch_iterator):
1102
1103 # Skip past any already trained steps if resuming training
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
515 if self._sampler_iter is None:
516 self._reset()
--> 517 data = self._next_data()
518 self._num_yielded += 1
519 if self._dataset_kind == _DatasetKind.Iterable and \
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
555 def _next_data(self):
556 index = self._next_index() # may raise StopIteration
--> 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
558 if self._pin_memory:
559 data = _utils.pin_memory.pin_memory(data)
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-24-67979e648b75> in __getitem__(self, idx)
7 def __getitem__(self, idx):
8 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
----> 9 item['labels'] = torch.tensor(self.labels[idx])
10 return item
11
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\tokenization_utils_base.py in __getitem__(self, item)
232 return self._encodings[item]
233 else:
--> 234 raise KeyError(
235 "Indexing with integers (to access backend Encoding for a given batch index) "
236 "is not available when using Python based tokenizers"
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
And if change the line:
tokenizer = T5Tokenizer.from_pretrained(model_name)
To:
tokenizer = T5TokenizerFast.from_pretrained(model_name)
the error changes to:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-f31e4c5cde21> in <module>
1 # Train pre-trained model
----> 2 trainer.train()
c:\programdata\anaconda3\envs\summa\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1099 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
1100
-> 1101 for step, inputs in enumerate(epoch_iterator):
1102
1103 # Skip past any already trained steps if resuming training
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
515 if self._sampler_iter is None:
516 self._reset()
--> 517 data = self._next_data()
518 self._num_yielded += 1
519 if self._dataset_kind == _DatasetKind.Iterable and \
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
555 def _next_data(self):
556 index = self._next_index() # may raise StopIteration
--> 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
558 if self._pin_memory:
559 data = _utils.pin_memory.pin_memory(data)
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\programdata\anaconda3\envs\summa\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-23-67979e648b75> in __getitem__(self, idx)
7 def __getitem__(self, idx):
8 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
----> 9 item['labels'] = torch.tensor(self.labels[idx])
10 return item
11
RuntimeError: Could not infer dtype of tokenizers.Encoding
Any idea of what is wrong?
This is because this tokenizer returns an object with the following structure
You have to amend the __getitem__ method of your dataset class along the lines of
class ForT5Dataset(torch.utils.data.Dataset):
def __init__(self, inputs, targets):
self.inputs = inputs
self.targets = targets
def __len__(self):
return len(self.targets)
def __getitem__(self, index):
input_ids = torch.tensor(self.inputs["input_ids"][index]).squeeze()
target_ids = torch.tensor(self.targets["input_ids"][index]).squeeze()
return {"input_ids": input_ids, "labels": target_ids}
and pass data prop when initializing, like:
train_ds = ForT5Dataset(train_in.data, train_out.data).
Related
I am trying to use a custom CNN to classify spectrogram images generated for 3s audio segments. I am using GroupShuffleSplit to divide the training dataset into a training set and a validation set and to ensure that each participant is only included in one set (to prevent data leakage). I am using a Custom Class, to load the audio files and augment the training set.
If I generate a SoundDS object using the training set, randomly split the object into a training and validation subset, and pass these subsets into two respective data loaders, my model trains without any issues.
However, if I initially use GroupShuffleSplit on the dataframe train_df (grouping by the column addressfname), then generate two SoundDS objects and pass the train_DS and validation_DS into two respective data loaders, I encounter the error message shown at the bottom of this post when I try to run for i, data in enumerate(train_dl) in my train function. Does anyone know where the issue lies?
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
def __init__(self, df):
self.df = df
self.duration = 3000
self.sr = 44100
self.channel = 2
self.shift_pct = 0.4
# ----------------------------
# Number of items in dataset
# ----------------------------
def __len__(self):
return len(self.df)
# ----------------------------
# Get i'th item in dataset
# ----------------------------
def __getitem__(self, idx):
audio_file = self.df.loc[idx, "relative_path"]
class_id = self.df.loc[idx, "dx"]
aud = AudioUtil.open(audio_file)
reaud = AudioUtil.resample(aud, self.sr)
rechan = AudioUtil.rechannel(reaud, self.channel)
dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
return aug_sgram, class_id
This method which randomly splits the training set into a training and validation set works.
from torch.utils.data import random_split
myds = SoundDS(train_df)
# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])
# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)
This method which uses GroupShuffleSplit to splits the training set into a training and validation set and then generates two SoundDS objects does not work.
from sklearn.model_selection import GroupShuffleSplit
splitter = GroupShuffleSplit(test_size=0.15, n_splits=1, random_state = 7)
split = splitter.split(train_df, groups=train_df['adressfname'])
train_inds, valid_inds = next(split)
train_data_df = train_df.iloc[train_inds]
valid_data_df = train_df.iloc[valid_inds]
train_dataset_DS = SoundDS(train_data_df)
valid_dataset_DS = SoundDS(valid_data_df)
train_dl = torch.utils.data.DataLoader(train_dataset_DS, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(valid_dataset_DS, batch_size=16, shuffle=False)
Function to train the model
def training(model, train_dl, num_epochs):
# Loss Function, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
steps_per_epoch=int(len(train_dl)),
epochs=num_epochs,
anneal_strategy='linear')
# Repeat for each epoch
for epoch in range(num_epochs):
running_loss = 0.0
correct_prediction = 0
total_prediction = 0
# Repeat for each batch in the training set
for i, data in enumerate(train_dl): ---- This is where the issue occurs
# Get the input features and target labels, and put them on the GPU
inputs, labels = data[0].to(device), data[1].to(device)
# Normalize the inputs
inputs_m, inputs_s = inputs.mean(), inputs.std()
inputs = (inputs - inputs_m) / inputs_s
This is the error message:
KeyError
Traceback (most recent call last)
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
15 frames
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 3170
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-201-b60077dcefb4> in <module>
54
55 num_epochs=2 # Just for demo, adjust this higher.
---> 56 training(myModel, train_dl, num_epochs)
<ipython-input-201-b60077dcefb4> in training(model, train_dl, num_epochs)
15
16 # Repeat for each batch in the training set
---> 17 for i, data in enumerate(train_dl):
18 # Get the input features and target labels, and put them on the GPU
19 inputs, labels = data[0].to(device), data[1].to(device)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py in __next__(self)
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
<ipython-input-191-4c84224a9983> in __getitem__(self, idx)
27
28 # print(self.df.loc[idx, 'relative_path'])
---> 29 audio_file = self.df.loc[idx, "relative_path"]
30 class_id = self.df.loc[idx, "dx"]
31 # participant_id = self.df.loc[idx, 'adressfname']
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1098 def _getitem_tuple(self, tup: tuple):
1099 with suppress(IndexingError):
-> 1100 return self._getitem_lowerdim(tup)
1101
1102 # no multi-index, so validate all of the indexers
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_lowerdim(self, tup)
836 # We don't need to check for tuples here because those are
837 # caught by the _is_nested_tuple_indexer check above.
--> 838 section = self._getitem_axis(key, axis=i)
839
840 # We should never have a scalar section here, because
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1162 # fall thru to straight lookup
1163 self._validate_key(key, axis)
-> 1164 return self._get_label(key, axis=axis)
1165
1166 def _get_slice_axis(self, slice_obj: slice, axis: int):
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _get_label(self, label, axis)
1111 def _get_label(self, label, axis: int):
1112 # GH#5667 this will fail if the label is not present in the axis.
-> 1113 return self.obj.xs(label, axis=axis)
1114
1115 def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
3774 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3775 else:
-> 3776 loc = index.get_loc(key)
3777
3778 if isinstance(loc, np.ndarray):
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 3170
Does anyone have any idea why one approach works without any issues, but the other does not?
I am trying to create a custom transformation to part of the CIFAR10 data set which superimposing of an image over the dataset. I was able to download the data and divide it into subsets. Using the following code:
transform_train = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
traindata = datasets.CIFAR10('./data', train=True, download=True,
transform= transform_train)
partitions = 5
traindata_split = torch.utils.data.random_split(traindata, [int(traindata.data.shape[0] / partitions) for _ in range(partitions)])
then I wanted to modify part of the splits so I created the following class and functions to use as as follows:
class MyDataset(Dataset): # https://discuss.pytorch.org/t/torch-utils-data-dataset-random-split/32209/3
def __init__(self, subset, transform=None):
self.subset = subset
self.transform = transform
def __getitem__(self, index):
x, y = self.subset[index]
if self.transform:
x = self.transform(x)
return x, y
def __len__(self):
return len(self.subset)
and
class ImageSuperImpose(object):
""" Image input as PIL and output as PIL
To be used as part of torchvision.transforms
Args: p, a threshold value to control image thinning
"""
def __init__(self, p=0):
self.p = p
def __call__(self, image):
img = cv2.imread('img.jpg')
img = img('float32')/255
imgSm = cv2.resize(img,(32,32))
np_arr = image.cpu().detach().numpy().T
sample = cv2.addWeighted(np_arr, 1, imgSm, 1, 0)
sample = sample.T
t = torch.from_numpy(sample)
return sample
transform_train2 = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
ImagePoisoning(),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
datasetA = MyDataset(
traindata_split[0], transform= transform_train2
)
test_loader = torch.utils.data.DataLoader(datasetA, batch_size=128, shuffle=True)
But when I tried to train the model on the subset I got the following error:
RuntimeError: The size of tensor a (32) must match the size of tensor b (3) at non-singleton dimension 0
** UPDATE**
Here is the full given error
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-20-7428084b03be> in <module>()
----> 1 train(model, opt, test_loader, 3)
9 frames
<ipython-input-14-fcb03e1d7685> in client_update(client_model, optimizer, train_loader, epoch)
5 client_model.train()
6 for e in range(epoch):
----> 7 for batch_idx, (data, target) in enumerate(train_loader):
8 data, target = data.to(device), target.to(device)
9 optimizer.zero_grad()
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
433 if self._sampler_iter is None:
434 self._reset()
--> 435 data = self._next_data()
436 self._num_yielded += 1
437 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
473 def _next_data(self):
474 index = self._next_index() # may raise StopIteration
--> 475 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
476 if self._pin_memory:
477 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
<ipython-input-7-1bde43acaff0> in __getitem__(self, index)
7 x, y = self.subset[index]
8 if self.transform:
----> 9 x = self.transform(x)
10 return x, y
11
/usr/local/lib/python3.6/dist-packages/torchvision/transforms/transforms.py in __call__(self, img)
65 def __call__(self, img):
66 for t in self.transforms:
---> 67 img = t(img)
68 return img
69
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
/usr/local/lib/python3.6/dist-packages/torchvision/transforms/transforms.py in forward(self, tensor)
224 Tensor: Normalized Tensor image.
225 """
--> 226 return F.normalize(tensor, self.mean, self.std, self.inplace)
227
228 def __repr__(self):
/usr/local/lib/python3.6/dist-packages/torchvision/transforms/functional.py in normalize(tensor, mean, std, inplace)
282 if std.ndim == 1:
283 std = std.view(-1, 1, 1)
--> 284 tensor.sub_(mean).div_(std)
285 return tensor
286
RuntimeError: The size of tensor a (32) must match the size of tensor b (3) at non-singleton dimension 0
I apologize in advance in this was asked before. I genuinely did not understand the solution.
MAX_LEN = 160
BATCH_SIZE = 16
EPOCHS = 10
class GPReviewDataset(data.Dataset):
def __init__(self, review, target, tokenizer, max_len):
self.review = review
self.target = target
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.review)
def __getitem__(self, item):
review = str(self.review[item])
encoding = tokenizer.encode_plus(text=review,
max_length=self.max_len,
add_special_tokens=True, padding='max_length',
return_attention_mask=True,
return_token_type_ids=False, return_tensors='pt')
return {'review': review,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(self.target[item], dtype=torch.long)}
free_df_train, free_df_test = train_test_split(free_df, test_size=0.2)
free_df_val, free_df_test = train_test_split(free_df_test, test_size=0.5)
def create_data_loader(df, tokenizer, max_len, batch_size):
ds = GPReviewDataset(review=df.content.to_numpy(),
target=df['score'].to_numpy(),
tokenizer=tokenizer,
max_len=max_len)
return data.DataLoader(ds, batch_size=batch_size,
num_workers=0)
train_data_loader = create_data_loader(free_df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(free_df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(free_df_test, tokenizer, MAX_LEN, BATCH_SIZE)
data = next(iter(train_data_loader))
I wrote a function later on to take in train_data_loader when training the data, but it was giving me the Runtime error. It seems like the proper solution is to use some sort of collate_fn; however I am confused on how exactly to apply that function.
My error below:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<timed exec> in <module>
<ipython-input-26-8ba1e19dd195> in train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples)
4 correct_predictions = 0
5
----> 6 for i in data_loader:
7 input_ids = i['input_ids'].to(device)
8 attention_mask = i['attention_mask'].to(device)
~\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \
~\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)
~\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
45 else:
46 data = self.dataset[possibly_batched_index]
---> 47 return self.collate_fn(data)
~\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py in default_collate(batch)
72 return batch
73 elif isinstance(elem, container_abcs.Mapping):
---> 74 return {key: default_collate([d[key] for d in batch]) for key in elem}
75 elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
76 return elem_type(*(default_collate(samples) for samples in zip(*batch)))
~\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py in <dictcomp>(.0)
72 return batch
73 elif isinstance(elem, container_abcs.Mapping):
---> 74 return {key: default_collate([d[key] for d in batch]) for key in elem}
75 elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
76 return elem_type(*(default_collate(samples) for samples in zip(*batch)))
~\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py in default_collate(batch)
53 storage = elem.storage()._new_shared(numel)
54 out = elem.new(storage)
---> 55 return torch.stack(batch, 0, out=out)
56 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
57 and elem_type.__name__ != 'string_':
RuntimeError: stack expects each tensor to be equal size, but got [160] at entry 0 and [376] at entry 5
try this instead add (padding = 'max_length') in encode_plus
I got an error of ValueError: Input tensors to a Functional must come from tf.keras.Input. Received: 0 (missing previous layer metadata) and i cant find the cause
this is my error trace and my code
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-15-8058f3a2fd50> in <module>()
6 test_loss, test_accuracy = eg.test(dg.user_test)
7 print('Test set: Loss=%.4f ; Accuracy=%.1f%%' % (test_loss, test_accuracy * 100))
----> 8 eg.save_embeddings('embeddings.csv')
7 frames
<ipython-input-5-54ff9897b1c3> in save_embeddings(self, file_name)
66 inp = self.m.input # input placeholder
67 outputs = [layer.output for layer in self.m.layers] # all layer outputs
---> 68 functor = K.function([inp, K.learning_phase()], outputs ) # evaluation function
69
70 #append embeddings to vectors
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py in function(inputs, outputs, updates, name, **kwargs)
3934 from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top
3935 from tensorflow.python.keras.utils import tf_utils # pylint: disable=g-import-not-at-top
-> 3936 model = models.Model(inputs=inputs, outputs=outputs)
3937
3938 wrap_outputs = isinstance(outputs, list) and len(outputs) == 1
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in __new__(cls, *args, **kwargs)
240 # Functional model
241 from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top
--> 242 return functional.Functional(*args, **kwargs)
243 else:
244 return super(Model, cls).__new__(cls, *args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/tracking/base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/functional.py in __init__(self, inputs, outputs, name, trainable)
113 # 'arguments during initialization. Got an unexpected argument:')
114 super(Functional, self).__init__(name=name, trainable=trainable)
--> 115 self._init_graph_network(inputs, outputs)
116
117 #trackable.no_automatic_dependency_tracking
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/tracking/base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/functional.py in _init_graph_network(self, inputs, outputs)
142 base_layer_utils.create_keras_history(self._nested_outputs)
143
--> 144 self._validate_graph_inputs_and_outputs()
145
146 # A Network does not create weights of its own, thus it is already
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/functional.py in _validate_graph_inputs_and_outputs(self)
637 'must come from `tf.keras.Input`. '
638 'Received: ' + str(x) +
--> 639 ' (missing previous layer metadata).')
640 # Check that x is an input tensor.
641 # pylint: disable=protected-access
ValueError: Input tensors to a Functional must come from `tf.keras.Input`. Received: 0 (missing previous layer metadata).
and this is my snipped code:
class EmbeddingsGenerator:
def __init__(self, train_users, data):
self.train_users = train_users
#preprocess
self.data = data.sort_values(by=['timestamp'])
#make them start at 0
self.data['userId'] = self.data['userId'] - 1
self.data['itemId'] = self.data['itemId'] - 1
self.user_count = self.data['userId'].max() + 1
self.movie_count = self.data['itemId'].max() + 1
self.user_movies = {} #list of rated movies by each user
for userId in range(self.user_count):
self.user_movies[userId] = self.data[self.data.userId == userId]['itemId'].tolist()
self.m = self.model()
def model(self, hidden_layer_size=100):
m = Sequential()
m.add(Dense(hidden_layer_size, input_shape=(1, self.movie_count)))
m.add(Dropout(0.2))
m.add(Dense(self.movie_count, activation='softmax'))
m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
return m
def generate_input(self, user_id):
'''
Returns a context and a target for the user_id
context: user's history with one random movie removed
target: id of random removed movie
'''
user_movies_count = len(self.user_movies[user_id])
#picking random movie
random_index = np.random.randint(0, user_movies_count-1) # -1 avoids taking the last movie
#setting target
target = np.zeros((1, self.movie_count))
target[0][self.user_movies[user_id][random_index]] = 1
#setting context
context = np.zeros((1, self.movie_count))
context[0][self.user_movies[user_id][:random_index] + self.user_movies[user_id][random_index+1:]] = 1
return context, target
def train(self, nb_epochs = 300, batch_size = 10000):
'''
Trains the model from train_users's history
'''
for i in range(nb_epochs):
print('%d/%d' % (i+1, nb_epochs))
batch = [self.generate_input(user_id=np.random.choice(self.train_users) - 1) for _ in range(batch_size)]
X_train = np.array([b[0] for b in batch])
y_train = np.array([b[1] for b in batch])
self.m.fit(X_train, y_train, epochs=1, validation_split=0.5)
def test(self, test_users, batch_size = 100000):
'''
Returns [loss, accuracy] on the test set
'''
batch_test = [self.generate_input(user_id=np.random.choice(test_users) - 1) for _ in range(batch_size)]
X_test = np.array([b[0] for b in batch_test])
y_test = np.array([b[1] for b in batch_test])
return self.m.evaluate(X_test, y_test)
def save_embeddings(self, file_name):
'''
Generates a csv file containg the vector embedding for each movie.
'''
inp = self.m.input # input placeholder
outputs = [layer.output for layer in self.m.layers] # all layer outputs
functor = K.function([inp, K.learning_phase()], outputs ) # evaluation function
#append embeddings to vectors
vectors = []
for movie_id in range(self.movie_count):
movie = np.zeros((1, 1, self.movie_count))
movie[0][0][movie_id] = 1
layer_outs = functor([movie])
vector = [str(v) for v in layer_outs[0][0][0]]
vector = '|'.join(vector)
vectors.append([movie_id, vector])
#saves as a csv file
embeddings = pd.DataFrame(vectors, columns=['item_id', 'vectors']).astype({'item_id': 'int32'})
embeddings.to_csv(file_name, sep=';', index=False)
files.download(file_name)
this is the part of code which call the save_embeddings method
if True: # Generate embeddings?
eg = EmbeddingsGenerator(dg.user_train, pd.read_csv('ml-100k/u.data', sep='\t', names=['userId', 'itemId', 'rating', 'timestamp']))
eg.train(nb_epochs=300)
train_loss, train_accuracy = eg.test(dg.user_train)
print('Train set: Loss=%.4f ; Accuracy=%.1f%%' % (train_loss, train_accuracy * 100))
test_loss, test_accuracy = eg.test(dg.user_test)
print('Test set: Loss=%.4f ; Accuracy=%.1f%%' % (test_loss, test_accuracy * 100))
eg.save_embeddings('embeddings.csv')
I am loading NumPy arrays as images in PyTorch while training the model, it’s giving me this error, I tried everything but couldn’t figure out pls help…, I am training a classifier model.......................................................................................................................................................................................
ValueError Traceback (most recent call last)
<ipython-input-12-b4d3f7be01c1> in <module>
1 # training
----> 2 trained_model = train(n_epochs, np.Inf, loaders, model, optimizer, criterion)
<ipython-input-10-b4d180a2c041> in train(n_epochs, valid_loss_min_input, loaders, model, optimizer, criterion, device, checkpoint_path, best_model_path)
29 ###################
30 model.train()
---> 31 for batch_idx, (data, target) in enumerate(loaders['train']):
32 # move to gpu
33 data, target = data.to(device), target.to(device)
G:\anaconda3\envs\data_env\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
343
344 def __next__(self):
--> 345 data = self._next_data()
346 self._num_yielded += 1
347 if self._dataset_kind == _DatasetKind.Iterable and \
G:\anaconda3\envs\data_env\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
383 def _next_data(self):
384 index = self._next_index() # may raise StopIteration
--> 385 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
386 if self._pin_memory:
387 data = _utils.pin_memory.pin_memory(data)
G:\anaconda3\envs\data_env\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
G:\anaconda3\envs\data_env\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
G:\anaconda3\envs\data_env\lib\site-packages\torchvision\datasets\folder.py in __getitem__(self, index)
135 sample = self.loader(path)
136 if self.transform is not None:
--> 137 sample = self.transform(sample)
138 if self.target_transform is not None:
139 target = self.target_transform(target)
G:\anaconda3\envs\data_env\lib\site-packages\torchvision\transforms\transforms.py in __call__(self, img)
59 def __call__(self, img):
60 for t in self.transforms:
---> 61 img = t(img)
62 return img
63
<ipython-input-3-88cdee8f0d6c> in __call__(self, img)
7 im = np.asarray(img)
8 im = detect(im)
----> 9 img = Image.fromarray(im)
10 img = img.resize(size=(128, 128))
11 return img
G:\anaconda3\envs\data_env\lib\site-packages\PIL\Image.py in fromarray(obj, mode)
2768 obj = obj.tostring()
2769
-> 2770 return frombuffer(mode, size, obj, "raw", rawmode, 0, 1)
2771
2772
G:\anaconda3\envs\data_env\lib\site-packages\PIL\Image.py in frombuffer(mode, size, data, decoder_name, *args)
2708 return im
2709
-> 2710 return frombytes(mode, size, data, decoder_name, args)
2711
2712
G:\anaconda3\envs\data_env\lib\site-packages\PIL\Image.py in frombytes(mode, size, data, decoder_name, *args)
2648
2649 im = new(mode, size)
-> 2650 im.frombytes(data, decoder_name, args)
2651 return im
2652
G:\anaconda3\envs\data_env\lib\site-packages\PIL\Image.py in frombytes(self, data, decoder_name, *args)
795 # unpack data
796 d = _getdecoder(self.mode, decoder_name, args)
--> 797 d.setimage(self.im)
798 s = d.decode(data)
799
ValueError: tile cannot extend outside the image
In the custom function, I am trying to preprocess the image and then calling it
It works fine when I test out the code loading a small batch for display