import torch
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)

def data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50):
    """
    Convert train and validation sets to torch.Tensors and load them into DataLoaders.
    """
    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels = \
        tuple(torch.tensor(data) for data in
              [train_inputs, val_inputs, train_labels, val_labels])

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader
The code works fine when train_inputs and val_inputs are of type int64, but fails when the type is int32.
Can someone tell me what's wrong here?
ERROR:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance)
3620 try:
-> 3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
File ~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()
File ~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:163, in pandas._libs.index.IndexEngine.get_loc()
File pandas\_libs\hashtable_class_helper.pxi:2131, in pandas._libs.hashtable.Int64HashTable.get_item()
File pandas\_libs\hashtable_class_helper.pxi:2140, in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 8
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Input In [31], in <cell line: 6>()
2 train_inputs, val_inputs, train_labels, val_labels = train_test_split(
3 input_ids, labels, test_size=0.1, random_state=42)
5 # Load data to PyTorch DataLoader
----> 6 train_dataloader, val_dataloader = data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)
Input In [28], in data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size)
6 """Convert train and validation sets to torch.Tensors and load them to
7 DataLoader.
8 """
10 # Convert data type to torch.Tensor
11 train_inputs, val_inputs, train_labels, val_labels =\
---> 12 tuple(torch.tensor(data) for data in
13 [train_inputs, val_inputs, train_labels, val_labels])
15 # Specify batch_size
16 batch_size = 50
Input In [28], in <genexpr>(.0)
6 """Convert train and validation sets to torch.Tensors and load them to
7 DataLoader.
8 """
10 # Convert data type to torch.Tensor
11 train_inputs, val_inputs, train_labels, val_labels =\
---> 12 tuple(torch.tensor(data) for data in
13 [train_inputs, val_inputs, train_labels, val_labels])
15 # Specify batch_size
16 batch_size = 50
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:958, in Series.__getitem__(self, key)
955 return self._values[key]
957 elif key_is_scalar:
--> 958 return self._get_value(key)
960 if is_hashable(key):
961 # Otherwise index.get_value will raise InvalidIndexError
962 try:
963 # For labels that don't resolve as scalars like tuples and frozensets
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:1069, in Series._get_value(self, label, takeable)
1066 return self._values[label]
1068 # Similar to Index.get_value, but we do not fall back to positional
-> 1069 loc = self.index.get_loc(label)
1070 return self.index._get_values_for_loc(self, loc, label)
File ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3623, in Index.get_loc(self, key, method, tolerance)
3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
-> 3623 raise KeyError(key) from err
3624 except TypeError:
3625 # If we have a listlike key, _check_indexing_error will raise
3626 # InvalidIndexError. Otherwise we fall through and re-raise
3627 # the TypeError.
3628 self._check_indexing_error(key)
KeyError: 8
I was using the same code on my data set and had the same issue. I did two things: I changed the random_state to something other than 42 (which probably wasn't what fixed it), and I also converted my labels to an np.array, and now it works.
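A minimal sketch of that fix, assuming the splits come back as pandas Series (which the Series.__getitem__ frames in the traceback suggest): torch.tensor on a Series can fall back to element-wise indexing, and after a shuffled split the label 8 no longer exists in the index, hence the KeyError. Converting to NumPy first sidesteps the label-based lookup entirely:

import numpy as np
import torch

# Convert pandas objects to plain NumPy arrays before building tensors,
# so torch.tensor never goes through label-based Series indexing.
train_inputs, val_inputs, train_labels, val_labels = \
    tuple(torch.tensor(np.asarray(data)) for data in
          [train_inputs, val_inputs, train_labels, val_labels])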
Related
I've been working on my model.
I came to a point at which I have to use training, test, and validation generators, like this:
from keras.preprocessing.image import ImageDataGenerator

def get_train_generator(df, image_dir, x_col, y_cols, shuffle=True, batch_size=8, seed=1, target_w=320, target_h=320):
    """
    Return generator for training set, normalizing using batch
    statistics.

    Args:
      df (dataframe): dataframe specifying training data.
      image_dir (str): directory where image files are held.
      x_col (str): name of column in df that holds filenames.
      y_cols (list): list of strings that hold y labels for images.
      shuffle (bool): whether to shuffle the images.
      batch_size (int): images per batch to be fed into model during training.
      seed (int): random seed.
      target_w (int): final width of input images.
      target_h (int): final height of input images.

    Returns:
      train_generator (DataFrameIterator): iterator over training set
    """
    print("getting train generator...")
    # Normalize images using per-sample statistics
    image_generator = ImageDataGenerator(
        samplewise_center=True,
        samplewise_std_normalization=True)

    # Flow from dataframe with specified batch size and target image size
    generator = image_generator.flow_from_dataframe(
        dataframe=df,
        directory=image_dir,
        x_col=x_col,
        y_col=y_cols,
        class_mode="raw",
        batch_size=batch_size,
        shuffle=shuffle,
        seed=seed,
        target_size=(target_w, target_h))
    return generator
This works quite well!
But when I came to the test and validation generators:
def get_test_and_valid_generator(valid_df, test_df, train_df, image_dir, x_col, y_cols, sample_size=100, batch_size=8, seed=1, target_w=320, target_h=320):
    """
    Return generator for validation set and test set using
    normalization statistics from training set.

    Args:
      valid_df (dataframe): dataframe specifying validation data.
      test_df (dataframe): dataframe specifying test data.
      train_df (dataframe): dataframe specifying training data.
      image_dir (str): directory where image files are held.
      x_col (str): name of column in df that holds filenames.
      y_cols (list): list of strings that hold y labels for images.
      sample_size (int): size of sample to use for normalization statistics.
      batch_size (int): images per batch to be fed into model during training.
      seed (int): random seed.
      target_w (int): final width of input images.
      target_h (int): final height of input images.

    Returns:
      valid_generator and test_generator (DataFrameIterator): iterators over validation set and test set respectively
    """
    print("getting test and valid generators...")
    # Get generator to sample the training set (using the function's own
    # arguments here rather than the IMAGE_DIR/labels globals)
    raw_train_generator = ImageDataGenerator().flow_from_dataframe(
        dataframe=train_df,
        directory=image_dir,
        x_col=x_col,
        y_col=y_cols,
        class_mode="raw",
        batch_size=sample_size,
        shuffle=True,
        target_size=(target_w, target_h))

    # Get a data sample to compute normalization statistics
    batch = next(raw_train_generator)
    data_sample = batch[0]

    # Use the sample to fit mean and std for the validation/test generators
    image_generator = ImageDataGenerator(
        featurewise_center=True,
        featurewise_std_normalization=True)
    image_generator.fit(data_sample)

    # Get validation generator
    valid_generator = image_generator.flow_from_dataframe(
        dataframe=valid_df,
        directory=image_dir,
        x_col=x_col,
        y_col=y_cols,
        class_mode="raw",
        batch_size=batch_size,
        shuffle=False,
        seed=seed,
        target_size=(target_w, target_h))

    # Get test generator
    test_generator = image_generator.flow_from_dataframe(
        dataframe=test_df,
        directory=image_dir,
        x_col=x_col,
        y_col=y_cols,
        class_mode="raw",
        batch_size=batch_size,
        shuffle=False,
        seed=seed,
        target_size=(target_w, target_h))
    return valid_generator, test_generator
So when I enter my data as:
IMAGE_DIR = "/Users/awabe/Desktop/Project/PapilaDB/Image"
train_generator = get_train_generator(train_df, IMAGE_DIR, "Image", labels)
it gives me:
getting train generator...
Found 488 validated image filenames.
and this is also good.
But when I run the test and validation section:
IMAGE_DIR = "/Users/awabe/Desktop/Project/PapilaDB/Image"
train_generator = get_train_generator(train_df, IMAGE_DIR, "Image", labels)
valid_generator, test_generator= get_test_and_valid_generator(valid_df, test_df, train_df, IMAGE_DIR, "Image", labels)
The error is:
KeyError Traceback (most recent call last)
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/pandas/core/indexes/base.py:3803, in Index.get_loc(self, key, method, tolerance)
3802 try:
-> 3803 return self._engine.get_loc(casted_key)
3804 except KeyError as err:
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/pandas/_libs/index.pyx:138, in pandas._libs.index.IndexEngine.get_loc()
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/pandas/_libs/index.pyx:146, in pandas._libs.index.IndexEngine.get_loc()
File pandas/_libs/index_class_helper.pxi:49, in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Image'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[201], line 4
1 IMAGE_DIR = "/Users/awabe/Desktop/Project/PapilaDB/Image"
3 train_generator = get_train_generator(train_df, IMAGE_DIR, "Image", labels)
----> 4 valid_generator, test_generator= get_test_and_valid_generator(valid_df, test_df, train_df, IMAGE_DIR, "Image", labels)
Cell In[194], line 47, in get_test_and_valid_generator(valid_df, test_df, train_df, image_dir, x_col, y_cols, sample_size, batch_size, seed, target_w, target_h)
44 image_generator.fit(data_sample)
46 # get test generator
---> 47 valid_generator = image_generator.flow_from_dataframe(
48 dataframe=valid_df,
49 directory=image_dir,
50 x_col=x_col,
51 y_col=y_cols,
52 class_mode="raw",
53 batch_size=batch_size,
54 shuffle=False,
55 seed=seed,
56 target_size=(target_w,target_h))
58 test_generator = image_generator.flow_from_dataframe(
59 dataframe=test_df,
60 directory=image_dir,
(...)
66 seed=seed,
67 target_size=(target_w,target_h))
68 return valid_generator, test_generator
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/preprocessing/image.py:1808, in ImageDataGenerator.flow_from_dataframe(self, dataframe, directory, x_col, y_col, weight_col, target_size, color_mode, classes, class_mode, batch_size, shuffle, seed, save_to_dir, save_prefix, save_format, subset, interpolation, validate_filenames, **kwargs)
1801 if "drop_duplicates" in kwargs:
1802 warnings.warn(
1803 "drop_duplicates is deprecated, you can drop duplicates "
1804 "by using the pandas.DataFrame.drop_duplicates method.",
1805 DeprecationWarning,
1806 )
-> 1808 return DataFrameIterator(
1809 dataframe,
1810 directory,
1811 self,
1812 x_col=x_col,
1813 y_col=y_col,
1814 weight_col=weight_col,
1815 target_size=target_size,
1816 color_mode=color_mode,
1817 classes=classes,
1818 class_mode=class_mode,
1819 data_format=self.data_format,
1820 batch_size=batch_size,
1821 shuffle=shuffle,
1822 seed=seed,
1823 save_to_dir=save_to_dir,
1824 save_prefix=save_prefix,
1825 save_format=save_format,
1826 subset=subset,
1827 interpolation=interpolation,
1828 validate_filenames=validate_filenames,
1829 dtype=self.dtype,
1830 )
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/preprocessing/image.py:968, in DataFrameIterator.__init__(self, dataframe, directory, image_data_generator, x_col, y_col, weight_col, target_size, color_mode, classes, class_mode, batch_size, shuffle, seed, data_format, save_to_dir, save_prefix, save_format, subset, interpolation, keep_aspect_ratio, dtype, validate_filenames)
966 self.dtype = dtype
967 # check that inputs match the required class_mode
--> 968 self._check_params(df, x_col, y_col, weight_col, classes)
969 if (
970 validate_filenames
971 ): # check which image files are valid and keep them
972 df = self._filter_valid_filepaths(df, x_col)
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/preprocessing/image.py:1029, in DataFrameIterator._check_params(self, df, x_col, y_col, weight_col, classes)
1023 raise TypeError(
1024 'If class_mode="{}", y_col must be a list. Received {}.'.format(
1025 self.class_mode, type(y_col).__name__
1026 )
1027 )
1028 # check that filenames/filepaths column values are all strings
-> 1029 if not all(df[x_col].apply(lambda x: isinstance(x, str))):
1030 raise TypeError(
1031 "All values in column x_col={} must be strings.".format(x_col)
1032 )
1033 # check labels are string if class_mode is binary or sparse
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/pandas/core/frame.py:3805, in DataFrame.__getitem__(self, key)
3803 if self.columns.nlevels > 1:
3804 return self._getitem_multilevel(key)
-> 3805 indexer = self.columns.get_loc(key)
3806 if is_integer(indexer):
3807 indexer = [indexer]
File /opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key, method, tolerance)
3803 return self._engine.get_loc(casted_key)
3804 except KeyError as err:
-> 3805 raise KeyError(key) from err
3806 except TypeError:
3807 # If we have a listlike key, _check_indexing_error will raise
3808 # InvalidIndexError. Otherwise we fall through and re-raise
3809 # the TypeError.
3810 self._check_indexing_error(key)
KeyError: 'Image'
So what can I do? I've tried different approaches, but nothing has worked.
Is there any way to generate it in another form?
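One diagnostic worth trying (a hypothetical check, assuming valid_df and test_df are already loaded): the traceback fails on df[x_col] inside _check_params, and the Int64Engine frame suggests the columns of valid_df are integers rather than names, i.e. the validation dataframe probably has no 'Image' column even though train_df does. A quick sanity check:

# Hypothetical sanity check: confirm each dataframe actually has the
# 'Image' column (a dataframe read without a header will show 0, 1, 2, ...).
print(train_df.columns.tolist())
print(valid_df.columns.tolist())
print(test_df.columns.tolist())

# If the column exists under a different name, renaming should suffice:
# valid_df = valid_df.rename(columns={"image": "Image"})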
I am trying to use a custom CNN to classify spectrogram images generated for 3s audio segments. I am using GroupShuffleSplit to divide the training dataset into a training set and a validation set, and to ensure that each participant is only included in one set (to prevent data leakage). I am using a custom Dataset class to load the audio files and augment the training set.
If I generate a SoundDS object using the training set, randomly split the object into a training and validation subset, and pass these subsets into two respective data loaders, my model trains without any issues.
However, if I initially use GroupShuffleSplit on the dataframe train_df (grouping by the column adressfname), then generate two SoundDS objects and pass them into two respective data loaders, I encounter the error message shown at the bottom of this post when I try to run for i, data in enumerate(train_dl) in my training function. Does anyone know where the issue lies?
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
    def __init__(self, df):
        self.df = df
        self.duration = 3000
        self.sr = 44100
        self.channel = 2
        self.shift_pct = 0.4

    # ----------------------------
    # Number of items in dataset
    # ----------------------------
    def __len__(self):
        return len(self.df)

    # ----------------------------
    # Get i'th item in dataset
    # ----------------------------
    def __getitem__(self, idx):
        audio_file = self.df.loc[idx, "relative_path"]
        class_id = self.df.loc[idx, "dx"]
        aud = AudioUtil.open(audio_file)
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)
        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
        shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
        return aug_sgram, class_id
This method, which randomly splits the training set into a training and validation set, works.
from torch.utils.data import random_split
myds = SoundDS(train_df)
# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])
# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)
This method, which uses GroupShuffleSplit to split the training set into a training and validation set and then generates two SoundDS objects, does not work.
from sklearn.model_selection import GroupShuffleSplit
splitter = GroupShuffleSplit(test_size=0.15, n_splits=1, random_state = 7)
split = splitter.split(train_df, groups=train_df['adressfname'])
train_inds, valid_inds = next(split)
train_data_df = train_df.iloc[train_inds]
valid_data_df = train_df.iloc[valid_inds]
train_dataset_DS = SoundDS(train_data_df)
valid_dataset_DS = SoundDS(valid_data_df)
train_dl = torch.utils.data.DataLoader(train_dataset_DS, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(valid_dataset_DS, batch_size=16, shuffle=False)
Function to train the model:
import torch
from torch import nn

def training(model, train_dl, num_epochs):
    # Loss function, optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                    steps_per_epoch=int(len(train_dl)),
                                                    epochs=num_epochs,
                                                    anneal_strategy='linear')

    # Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for i, data in enumerate(train_dl):  # <-- this is where the issue occurs
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
This is the error message:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 3170
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-201-b60077dcefb4> in <module>
54
55 num_epochs=2 # Just for demo, adjust this higher.
---> 56 training(myModel, train_dl, num_epochs)
<ipython-input-201-b60077dcefb4> in training(model, train_dl, num_epochs)
15
16 # Repeat for each batch in the training set
---> 17 for i, data in enumerate(train_dl):
18 # Get the input features and target labels, and put them on the GPU
19 inputs, labels = data[0].to(device), data[1].to(device)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py in __next__(self)
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
<ipython-input-191-4c84224a9983> in __getitem__(self, idx)
27
28 # print(self.df.loc[idx, 'relative_path'])
---> 29 audio_file = self.df.loc[idx, "relative_path"]
30 class_id = self.df.loc[idx, "dx"]
31 # participant_id = self.df.loc[idx, 'adressfname']
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1098 def _getitem_tuple(self, tup: tuple):
1099 with suppress(IndexingError):
-> 1100 return self._getitem_lowerdim(tup)
1101
1102 # no multi-index, so validate all of the indexers
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_lowerdim(self, tup)
836 # We don't need to check for tuples here because those are
837 # caught by the _is_nested_tuple_indexer check above.
--> 838 section = self._getitem_axis(key, axis=i)
839
840 # We should never have a scalar section here, because
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1162 # fall thru to straight lookup
1163 self._validate_key(key, axis)
-> 1164 return self._get_label(key, axis=axis)
1165
1166 def _get_slice_axis(self, slice_obj: slice, axis: int):
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _get_label(self, label, axis)
1111 def _get_label(self, label, axis: int):
1112 # GH#5667 this will fail if the label is not present in the axis.
-> 1113 return self.obj.xs(label, axis=axis)
1114
1115 def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
3774 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3775 else:
-> 3776 loc = index.get_loc(key)
3777
3778 if isinstance(loc, np.ndarray):
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 3170
Does anyone have any idea why one approach works without any issues, but the other does not?
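A likely cause, based on the traceback: train_df.iloc[train_inds] keeps the original row labels, so the resulting subset has a gappy index, while the DataLoader asks SoundDS for positions 0..len-1 and __getitem__ resolves them with label-based .loc, so any missing label (such as 3170) raises a KeyError. The random_split approach works because it indexes the full dataframe, whose default RangeIndex still matches positions. A sketch of one fix, assuming the rest of the pipeline stays the same:

# Sketch: reset the index of each subset so the positional indices coming
# from the DataLoader line up with the labels that .loc expects.
train_data_df = train_df.iloc[train_inds].reset_index(drop=True)
valid_data_df = train_df.iloc[valid_inds].reset_index(drop=True)

# Alternatively, make __getitem__ positional instead:
# row = self.df.iloc[idx]
# audio_file, class_id = row["relative_path"], row["dx"]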
I tried speech emotion recognition (SER) from a site called DataFlair, and for the following code,
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split

# DataFlair - Load the data and extract features for each sound file
def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob("D:\archive\Actor_01\03-01-01-01-01-01-01.wav"):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

# DataFlair - Split the dataset
x_train, x_test, y_train, y_test = load_data(test_size=0.25)
I got this error:
ValueError Traceback (most recent call last)
Input In [10], in <cell line: 2>()
1 #DataFlair - Split the dataset
----> 2 x_train,x_test,y_train,y_test=load_data(test_size=0.25)
Input In [9], in load_data(test_size)
10 x.append(feature)
11 y.append(emotion)
---> 12 return train_test_split(np.array(x), y, test_size=test_size, random_state=9)
File ~\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2420, in train_test_split(test_size, train_size, random_state, shuffle, stratify, *arrays)
2417 arrays = indexable(*arrays)
2419 n_samples = _num_samples(arrays[0])
-> 2420 n_train, n_test = _validate_shuffle_split(
2421 n_samples, test_size, train_size, default_test_size=0.25
2422 )
2424 if shuffle is False:
2425 if stratify is not None:
File ~\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2098, in _validate_shuffle_split(n_samples, test_size, train_size, default_test_size)
2095 n_train, n_test = int(n_train), int(n_test)
2097 if n_train == 0:
-> 2098 raise ValueError(
2099 "With n_samples={}, test_size={} and train_size={}, the "
2100 "resulting train set will be empty. Adjust any of the "
2101 "aforementioned parameters.".format(n_samples, test_size, train_size)
2102 )
2104 return n_train, n_test
ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
What are these errors and how do I overcome them? It works just fine on the website: https://data-flair.training/blogs/python-mini-project-speech-emotion-recognition/
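The ValueError itself points at the cause: n_samples=0 means the glob matched nothing, so x is empty by the time train_test_split runs. In a non-raw Python string, sequences like \a and \03 in that Windows path are escape characters, and the pattern also names a single file rather than a directory of recordings. A sketch of the usual fix (the Actor_* wildcard is an assumption based on the RAVDESS layout the tutorial uses):

import glob

# Use a raw string so backslashes stay literal, and a wildcard so the loop
# iterates over every actor's recordings (adjust the path to your machine).
for file in glob.glob(r"D:\archive\Actor_*\*.wav"):
    ...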
Hi, I am using a virtual env and training a model for my project, with Keras 2.3.1 and TensorFlow 2.2.0. All my code works, but when I run the last line an exception occurs. The code is here:
from Lib.data_loader import DataLoader
from Lib.resnet_model import Resnet3DBuilder
from Lib.HistoryGraph import HistoryGraph
import Lib.image as img
from Lib.utils import mkdirs
import os
from math import ceil
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
target_size = (64,96)
nb_frames = 16 # here this will get the number of pictures from the dataset folders
skip = 1 # using resnet we skip different layers
nb_classes = 27
batch_size = 64
input_shape = (nb_frames,) + target_size + (3,)
workers = 8
use_multiprocessing = False
max_queue_size = 20
data_root = r"D:\FYP\DataSet"
csv_labels = r"D:\FYP\DataSet\jester-v1-labels.csv"
csv_train = r"D:\FYP\DataSet\jester-v1-train.csv"
csv_val = r"D:\FYP\DataSet\jester-v1-validation.csv"
csv_test = r"D:\FYP\DataSet\jester-v1-test.csv "
data_vid = r"D:\FYP\DataSet\videos"
model_name = 'resent_3d_model'
data_model = r"D:\FYP\DataSet\Model"
path_model = os.path.join(data_root, data_model, model_name)
path_vid = os.path.join(data_root, data_vid)
path_labels = os.path.join(data_root, csv_labels)
path_train = os.path.join(data_root, csv_train)
path_val = os.path.join(data_root, csv_val)
path_test = os.path.join(data_root, csv_test)
data = DataLoader(path_vid, path_labels, path_train, path_val)
mkdirs(path_model, 0o755)
mkdirs(os.path.join(path_model, "graphs"), 0o755)
gen = img.ImageDataGenerator()
gen_train = gen.flow_video_from_dataframe(data.train_df, path_vid, path_classes=path_labels, x_col='video_id', y_col="labels", target_size=target_size, batch_size=batch_size, nb_frames=nb_frames, skip=skip, has_ext=True)
gen_val = gen.flow_video_from_dataframe(data.val_df, path_vid, path_classes=path_labels, x_col='video_id', y_col="labels", target_size=target_size, batch_size=batch_size, nb_frames=nb_frames, skip=skip, has_ext=True)
resnet_model = Resnet3DBuilder.build_resnet_101(input_shape, nb_classes, drop_rate = 0.5)
optimizer = SGD(lr=0.01, momentum=0.9, decay=0.0001, nesterov=False)
resnet_model.compile(optimizer = optimizer, loss= "categorical_crossentropy" , metrics=["accuracy"])
model_file = os.path.join(path_model, 'resnetmodel.hdf5')
model_checkpointer = ModelCheckpoint(model_file, monitor='val_acc',verbose=1, save_best_only=True, mode='max')
history_graph = HistoryGraph(model_path_name = os.path.join(path_model, "graphs"))
nb_sample_train = data.train_df["video_id"].size
nb_sample_val = data.val_df["video_id"].size
resnet_model.fit_generator(
    generator=gen_train,
    steps_per_epoch=ceil(nb_sample_train/batch_size),
    epochs=100,
    validation_data=gen_val,
    validation_steps=30,
    shuffle=True,
    verbose=1,
    workers=workers,
    max_queue_size=max_queue_size,
    use_multiprocessing=use_multiprocessing,
    callbacks=[model_checkpointer, history_graph])
And the error below appears when I run that last line:
Epoch 1/100
C:\Users\Virus\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\utils\data_utils.py:613: UserWarning: The input 80 could not be retrieved. It could be because a worker has died.
warnings.warn(
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\utils\data_utils.py in get(self)
609 future = self.queue.get(block=True)
--> 610 inputs = future.get(timeout=30)
611 except mp.TimeoutError:
~\anaconda3\envs\HandGestureRecognitionSystem\lib\multiprocessing\pool.py in get(self, timeout)
766 if not self.ready():
--> 767 raise TimeoutError
768 if self._success:
TimeoutError:
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'labels'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-15-6810853f4b54> in <module>
----> 1 resnet_model.fit_generator(
2 generator = gen_train,
3 steps_per_epoch = ceil(nb_sample_train/batch_size),
4 epochs=100,
5 validation_data=gen_val,
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\legacy\interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1716 ```
1717 """
-> 1718 return training_generator.fit_generator(
1719 self, generator,
1720 steps_per_epoch=steps_per_epoch,
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\engine\training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
183 batch_index = 0
184 while steps_done < steps_per_epoch:
--> 185 generator_output = next(output_generator)
186
187 if not hasattr(generator_output, '__len__'):
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\utils\data_utils.py in get(self)
623 except Exception:
624 self.stop()
--> 625 six.reraise(*sys.exc_info())
626
627
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\six.py in reraise(tp, value, tb)
701 if value.__traceback__ is not tb:
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
705 value = None
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\keras\utils\data_utils.py in get(self)
615 ' It could be because a worker has died.'.format(idx),
616 UserWarning)
--> 617 inputs = self.sequence[idx]
618 finally:
619 self.queue.task_done()
D:\HandGesturesProject\Lib\image.py in __getitem__(self, idx)
1534 index_array = self.index_array[self.batch_size * idx:
1535 self.batch_size * (idx + 1)]
-> 1536 return self._get_batches_of_transformed_samples(index_array)
1537
1538 def common_init(self, image_data_generator,
D:\HandGesturesProject\Lib\image.py in _get_batches_of_transformed_samples(self, index_array)
2243 dtype=self.dtype)
2244
-> 2245 for i, label in enumerate(self.df.iloc[index_array][self.y_col].values):
2246 batch_y[i, self.classes_indices[label]] = 1
2247
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
3022 if self.columns.nlevels > 1:
3023 return self._getitem_multilevel(key)
-> 3024 indexer = self.columns.get_loc(key)
3025 if is_integer(indexer):
3026 indexer = [indexer]
~\anaconda3\envs\HandGestureRecognitionSystem\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
3084 if tolerance is not None:
KeyError: 'labels'
Can anyone help me figure out how to deal with this? Thank you.
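A diagnostic worth trying first (a hypothetical check, assuming data.train_df and data.val_df are the dataframes handed to flow_video_from_dataframe): the traceback ends in self.df.iloc[index_array][self.y_col] with KeyError: 'labels', which means the dataframe has no column named labels, e.g. because the Jester CSV was read with a different header or no header at all.

# Hypothetical check: confirm the column y_col="labels" actually exists in
# the dataframes produced by DataLoader.
print(data.train_df.columns.tolist())
print(data.val_df.columns.tolist())

# If the column exists under another name, either rename it or pass that
# name as y_col to flow_video_from_dataframe.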
I am trying to feed data with multiple inputs (2 inputs) to Keras (TensorFlow) for TPU training, but I get this error:
ValueError: The dataset returned a non-Tensor type
((<class 'tensorflow.python.framework.ops.Tensor'>,
<class 'tensorflow.python.framework.ops.Tensor'>)) at index 0
I tried this link: tf.data with multiple inputs / outputs in Keras
import tensorflow as tf

def train_input_fn(batch_size=1024):
    dataset_features = tf.data.Dataset.from_tensor_slices((x_train_h, x_train_l))
    dataset_label = tf.data.Dataset.from_tensor_slices(Y_train)
    dataset = tf.data.Dataset.zip((dataset_features, dataset_label)).batch(batch_size, drop_remainder=True)
    return dataset

history = tpu_model.fit(train_input_fn,
                        steps_per_epoch=30,
                        epochs=100,
                        validation_data=test_input_fn,
                        validation_steps=1,
                        callbacks=[tensorboard])
Model diagram: https://take.ms/jO4P5
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-242-65c6d5a98fb7> in <module>()
12 validation_data = test_input_fn,
13 validation_steps = 1,
---> 14 callbacks = [tensorboard,
15 #checkpointer
16 ]
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1486 'be None')
1487 infeed_manager = TPUDatasetInfeedManager(
-> 1488 dataset, self._tpu_assignment, model_fn_lib.ModeKeys.TRAIN)
1489 # Use dummy numpy inputs for the rest of Keras' shape checking. We
1490 # intercept them when building the model.
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py in __init__(self, dataset, tpu_assignment, mode)
722 mode: ModeKeys enum.
723 """
--> 724 self._verify_dataset_shape(dataset)
725
726 self._dataset = dataset
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py in _verify_dataset_shape(self, dataset)
783 if cls != ops.Tensor:
784 raise ValueError('The dataset returned a non-Tensor type (%s) at '
--> 785 'index %d.' % (cls, i))
786 for i, shape in enumerate(dataset.output_shapes):
787 if not shape:
ValueError: The dataset returned a non-Tensor type ((<class 'tensorflow.python.framework.ops.Tensor'>, <class 'tensorflow.python.framework.ops.Tensor'>)) at index 0.
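The check in keras_support.py requires every dataset component to be a plain Tensor, so the nested tuple (x_train_h, x_train_l) fails at index 0. One workaround that satisfies this constraint is to feed a single combined tensor and split it back into the two branches inside the model. A sketch under the assumption that x_train_h and x_train_l are 2-D and can be concatenated on the last axis (h_dim, the width of the first input, is a hypothetical name):

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Lambda

# Pack both inputs into one tensor so the dataset yields (features, label)
# with a plain Tensor at every index.
x_train_combined = np.concatenate([x_train_h, x_train_l], axis=-1)

def train_input_fn(batch_size=1024):
    dataset = tf.data.Dataset.from_tensor_slices((x_train_combined, Y_train))
    return dataset.batch(batch_size, drop_remainder=True)

# Inside the model, split the combined tensor back into its two parts
# (h_dim is a hypothetical placeholder for the first input's width).
combined = Input(shape=(x_train_combined.shape[1],))
x_h = Lambda(lambda t: t[:, :h_dim])(combined)
x_l = Lambda(lambda t: t[:, h_dim:])(combined)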