Too much RAM is required for loading dataset - python

I’m working on a neural network and my dataset has 42000 images, all of which I have to load. I’m using Google Colab for that, but every time I load the dataset there is not enough RAM.
I am putting everything in a NumPy array because I tried to use the ImageDataGenerator method and it didn’t work. I’m using the following code to load the data:
# Collect every file of the class folder. `class` is a reserved keyword in
# Python and cannot be used as a variable name, hence `class_paths`.
class_paths = glob.glob(r"/content/drive/MyDrive/DATASET/class/*.*")
data = []
labels = []
for image_path in class_paths:
    # Decode the image and resize it to 336x336 RGB.
    image = tf.keras.preprocessing.image.load_img(
        image_path, color_mode='rgb', target_size=(336, 336))
    # Every decoded image stays in memory for the whole run, which is what
    # exhausts the RAM when the folder holds tens of thousands of files.
    data.append(np.array(image))
    # All files of this folder get label 0.
    labels.append(0)
# Stacking the list builds one more full copy of the data.
data = np.array(data)
labels = np.array(labels)

As ImageDataGenerator is deprecated, you can use a custom Keras Sequence class to load images when needed.
The strategy here is to create a Pandas DataFrame with the path and class of every image, then transform the class to a numeric label with pd.factorize. Once you have X (paths) and y (labels), you can use train_test_split to extract 3 subsets: train, test and validation. The last step is to convert these collections to datasets compatible with TensorFlow.
Each time TensorFlow processes a batch, the Sequence loads only that batch of images into memory, and so on.
Step 0: Imports and constants
import tensorflow as tf
import pandas as pd
import numpy as np
import pathlib
from sklearn.model_selection import train_test_split
# Shape of one input image; the first two entries are used as the target
# size when loading images, the last one matches the 3 RGB channels.
INPUT_SHAPE = (336, 336, 3)
# Number of samples loaded per batch.
BATCH_SIZE = 32
# Root folder that is searched recursively for images.
DATA_DIR = pathlib.Path('/content/drive/MyDrive/DATASET/')
Step 1: Load all image paths to a Pandas DataFrame:
# Find images of dataset: the parent folder name is used as the class name
data = [{'class': file.parent.name, 'path': file}
        for file in DATA_DIR.glob('**/*.jpg')]
if not data:
    # Fail early with a clear message instead of a KeyError on df['class']
    raise FileNotFoundError(f'No .jpg image found under {DATA_DIR}')

# Create dataframe and select columns
df = pd.DataFrame(data)
# Encode each class name as an integer label
df['label'] = pd.factorize(df['class'])[0]
X = df['path']
y = df['label']

# Split into 3 balanced datasets: `stratify` keeps the class proportions
# identical in every subset (a plain random split does not guarantee it)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2023, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2023, stratify=y_train)
Step 2: Create a custom data Sequence
class ImgDataSequence(tf.keras.utils.Sequence):
    """Serve (images, labels) batches, loading images from disk on demand.

    Only the paths and labels are kept in memory; the images of a batch are
    decoded when that batch is requested through ``sequence[index]``.

    Check documentation here: https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence

    Args:
        image_set: collection of image paths.
        label_set: collection of labels, aligned with ``image_set``.
        batch_size: number of samples per batch.
        image_size: target size every image is resized to when loaded.
        **kwargs: forwarded to the base class constructor.
    """

    def __init__(self, image_set, label_set, batch_size=32, image_size=(256, 256), **kwargs):
        # Newer Keras versions expect the base class to be initialised.
        super().__init__(**kwargs)
        # Plain arrays give positional slicing whatever the input index was.
        self.image_set = np.array(image_set)
        self.label_set = np.array(label_set)
        self.batch_size = batch_size
        self.image_size = image_size

    def _load_image(self, image):
        """Load one image as RGB, resize it and return it as an array."""
        image = tf.keras.preprocessing.image.load_img(
            image, color_mode='rgb', target_size=self.image_size)
        return tf.keras.preprocessing.image.img_to_array(image)

    def _load_batch(self, images, labels):
        """Decode the given paths and return (image_batch, label_batch)."""
        image_batch = np.asarray([self._load_image(img) for img in images])
        label_batch = np.asarray(labels)
        return image_batch, label_batch

    def __getitem__(self, index):
        """Return batch number ``index``; the last batch may be smaller."""
        start = index * self.batch_size
        stop = start + self.batch_size
        return self._load_batch(self.image_set[start:stop],
                                self.label_set[start:stop])

    def __len__(self):
        """Return the number of batches, counting a trailing partial one."""
        # Integer ceiling division.
        return -(-len(self.image_set) // self.batch_size)
Step 3: Create datasets
# Options shared by the three subsets: target image size and batch size.
sequence_options = {'image_size': INPUT_SHAPE[:2], 'batch_size': BATCH_SIZE}
train_ds = ImgDataSequence(X_train, y_train, **sequence_options)
valid_ds = ImgDataSequence(X_valid, y_valid, **sequence_options)
test_ds = ImgDataSequence(X_test, y_test, **sequence_options)
Test the new datasets:
# Take the first batch of our train dataset
>>> imgs, labels = train_ds[0]
# Check the length (BATCH_SIZE)
>>> len(labels)
32
# Check the dimension of one image
>>> imgs[0].shape
(336, 336, 3)
How to use it with Tensorflow?
# train_ds & valid_ds to fit: train for 10 epochs on train_ds, using
# valid_ds as validation data
history = model.fit(train_ds, epochs=10, validation_data=valid_ds)
# test_ds to evaluate: the first returned value is unpacked into `loss`,
# the remaining ones into the `metrics` list
# NOTE(review): assumes the model was compiled with at least one metric so
# that evaluate() returns a list - confirm
loss, *metrics = model.evaluate(test_ds)

Related

LSTM model has poor prediction in simple example

I am trying to build an LSTM model using Keras. I created a simple sine wave example which contains more than 1000 points, and the model should predict the next point. But the result is not as good as I expected: when I fit the model, the prediction moves between 0 and 1 instead of following the sine wave. I have tried changing parameters such as the number of epochs, the batch size and the learning rate, but it does not get better.
model predict image
What am I doing wrong?
import joblib
import numpy as np
import matplotlib.pyplot as plt
import copy
import gc
import os
import sys
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from keras.callbacks import Callback
learning_rate = 0.001  # learning rate given to the Adam optimizer in lstm_reg
len_train = 30  # number of consecutive points in one input window
total_predict = 300  # number of points forecast in Callback_Class.model_predict
len_test = 400  # number of trailing points kept out of the training series
epoch = 100  # number of epochs passed to model.fit
batch_size = 32  # batch size passed to model.fit
workers = -1  # forwarded as `workers` to model.fit
class Callback_Class(Callback):
    """Keras callback that plots a multi-step forecast against the real values.

    On every epoch whose number is a multiple of 20 (epoch 0 included),
    ``on_epoch_end`` runs ``model_predict`` on the stored test window and
    saves the resulting plot as a PNG file.

    NOTE(review): ``model_predict`` reads the module-level ``model``, ``sc``,
    ``len_train`` and ``total_predict`` rather than instance attributes.
    """

    def load_data(self, x_test, y_test):
        """Store the seed window and the real values used for the plot."""
        self.x_test = x_test
        self.y_test = np.array(y_test)

    def model_predict(self, data_close):
        """Forecast ``total_predict`` points by feeding predictions back in.

        Starts from ``data_close`` (reshaped to ``(-1, len_train, 1)``) and
        returns the inverse-scaled predictions as one array.
        """
        output_predict = []
        for i in range(total_predict):
            if (i==0):
                # First step: use the seed window as it is.
                data_close_ = data_close.reshape(-1, len_train, 1)
            else:
                # Slide the window: np.delete without an axis flattens the
                # array and removes its first element, then the previous
                # prediction is appended at the end.
                data_close_ = np.delete(data_close_, 0)
                data_close_ = np.append(data_close_, pred_close)
                data_close_ = data_close_.reshape(-1, len_train, 1)
            pred_close = model.predict(data_close_)
            pred_close = pred_close.ravel()
            # Column vector, the layout handed to the scaler.
            pred_close = np.array(pred_close).reshape(len(pred_close), 1)
            # Map the prediction back with the scaler `sc`.
            pred_cl = sc.inverse_transform(pred_close)
            output_predict.append(pred_cl)
        output_predict = np.array(output_predict)
        return output_predict

    def on_epoch_end(self, epoch, logs=None):
        """Every 20th epoch, plot the forecast against ``self.y_test``."""
        if (epoch % 20 == 0):
            output_predict = self.model_predict(self.x_test)
            fig, ax = plt.subplots(figsize=(12,6))
            ax.grid(True)
            plt.title(f"Model predict")
            plt.plot(output_predict.ravel(), color="red", label='Predict')
            plt.plot(self.y_test.ravel(), color="blue", label='REAL')
            fig.tight_layout()
            plt.legend(loc='lower left')
            # One file per plotted epoch.
            plt.savefig(f'Demo_lstm_epoch_{epoch}.png')
            # Clear and close the figure once it has been saved.
            plt.clf()
            plt.close()
def lstm_reg(input_shape=(60, 1), unit=40, clustering_params=None):
    """Build and compile a two-layer bidirectional LSTM regressor.

    Args:
        input_shape: shape of one input sample, ``(timesteps, features)``.
        unit: accepted for backward compatibility; the layer width is
            fixed to 32 units and this value is not used.
        clustering_params: accepted for backward compatibility; not used.

    Returns:
        The compiled Keras model with one linear output unit.
    """
    inputs = Input(input_shape)
    # The first layer returns the whole sequence so it can feed a second LSTM.
    hidden = Bidirectional(LSTM(units=32, return_sequences=True))(inputs)
    hidden = Bidirectional(LSTM(units=32, return_sequences=False))(hidden)
    outputs = Dense(units=1, activation='linear')(hidden)
    model = Model(inputs=inputs, outputs=outputs)
    # "accuracy" compares exact values and is meaningless for a regression;
    # the mean absolute error is reported next to the MSE loss instead.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=["mae"],
    )
    return model
def create_data_train(data_time_series, window=None):
    """Cut a series into sliding input windows and next-value targets.

    Sample ``k`` is ``series[k:k + window]`` and its target is
    ``series[k + window]``.

    Args:
        data_time_series: array-like series; it is flattened first.
        window: number of points per input window. Defaults to the
            module-level ``len_train`` when omitted.

    Returns:
        ``(X_train, y_train)`` where ``X_train`` has shape
        ``(n_samples, window, 1)`` and ``y_train`` has shape ``(n_samples,)``.
        Both are empty when the series is not longer than ``window``.
    """
    if window is None:
        window = len_train
    series = np.array(data_time_series).ravel()
    n_samples = max(len(series) - window, 0)
    windows = [series[start:start + window] for start in range(n_samples)]
    # An explicit target shape keeps the empty case valid instead of failing
    # on a missing second axis.
    X_train = np.array(windows).reshape(n_samples, window, 1)
    y_train = np.array(series[window:window + n_samples])
    return X_train, y_train
# 2001 samples of a sine wave between -20*pi and 20*pi
x = np.linspace(-20*np.pi, 20*np.pi, 2001)
sin_alpha = np.sin(x).ravel()

# Index where the held-out tail (the last `len_test` points) starts
split = len(sin_alpha) - len_test

# Fit the scaler on the training part only
sin_alpha_train = sin_alpha[:split].reshape(-1, 1)
sc = MinMaxScaler(feature_range=(0, 1))
sin_alpha_train = sc.fit_transform(sin_alpha_train)
X_train, y_train = create_data_train(sin_alpha_train)

# Persist the fitted scaler and load it back
joblib.dump(sc, 'Demo_MinMaxScaler.gz')
sc = joblib.load("Demo_MinMaxScaler.gz")

# Seed window for the forecast: the first `len_train` held-out points.
# It must be scaled with the scaler fitted on the training data
# (`transform`); re-fitting on these 30 points would give the test window a
# different scaling and also change what `sc.inverse_transform` returns.
X_test = sin_alpha[split:split + len_train].reshape(-1, 1)
X_test = sc.transform(X_test)

# Real (unscaled) values that follow the seed window
y_test = sin_alpha[split + len_train:split + len_train + total_predict]

model = lstm_reg(input_shape=(len_train, 1), unit=int(2*(len_train+len(y_train))/3))
model.summary()
callback_class = Callback_Class()
callback_class.load_data(X_test, y_test)
model.fit(X_train, y_train, epochs=epoch, use_multiprocessing=True, verbose=1,
          callbacks=[callback_class], workers=workers, batch_size=batch_size)
It seems like you are normalizing both your features and your labels in these lines:
# The whole training series is scaled to the range (0, 1) ...
sc = MinMaxScaler(feature_range=(0, 1))
sin_alpha_train = sc.fit_transform(sin_alpha_train)
# ... and both the inputs and the labels are built from that scaled series
X_train, y_train = create_data_train(sin_alpha_train)
Try it without scaling your label set. Because your output layer uses the linear activation function, which is correct as you're working on a regression problem, the model should be able to handle unscaled labels. As it stands, the model only learns your data in a range of 0 to 1, while your sine wave goes from -1 to 1.

DataGenerator generates indexes out of bounds

I am trying to fine-tune a BERT model. For this purpose I use a data generator:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-encoded sentence pairs.

    Args:
        sentence_pairs: array-like of sentence pairs.
        labels: array-like of labels aligned with ``sentence_pairs``; may be
            ``None`` when ``include_targets`` is false.
        batch_size: number of samples per batch.
        shuffle: whether the sample order is reshuffled on every epoch end.
        include_targets: whether ``__getitem__`` also returns the labels.

    The tokenizer length limit is read from the module-level ``max_length``.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        super().__init__()
        # Store plain NumPy arrays so that ``[indexes]`` below is always a
        # positional lookup. With a pandas object whose index is not 0..n-1
        # (e.g. after train_test_split) the same expression is label based
        # and raises "KeyError: ... not in index".
        self.sentence_pairs = np.asarray(sentence_pairs)
        self.labels = None if labels is None else np.asarray(labels)
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch. Floor division: a trailing
        # partial batch is dropped.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size: (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]
        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding=True,
            return_tensors="tf",
        )
        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Reshuffle the sample order when requested; without this method the
        # ``shuffle`` flag was stored but never had any effect.
        if self.shuffle:
            np.random.shuffle(self.indexes)
The size of train is 27051 / test is 9017, and the batch size is 32. When I try to generate the labels with labels = np.array(self.labels[indexes], dtype="int32"), the generator requests labels that are out of bounds and I get an error:
KeyError: '[7326, 2726, 23864, 4084, 3394, 19934, 22494] not in index'
There is probably something wrong in the index calculation indexes = self.indexes[idx * self.batch_size: (idx + 1) * self.batch_size], but I can't figure it out.
When I was splitting the dataframe into train / val / test
# First split: separate a validation part, stratified on the labels
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y)
# Second split: take 5% of the remaining training part as the test set.
# The returned pandas objects keep the index labels of the original frame.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.05, random_state=1)
I forgot to reset the indexes, so the subsets kept the index labels of the original dataframe. Resetting them fixed the issue:
# Replace the inherited index labels of every subset, in place, by a fresh
# 0..n-1 index (the old index is dropped instead of kept as a column)
for df in (X_train, X_val, X_test, y_train, y_val, y_test):
    df.reset_index(drop=True, inplace=True)

Reading images without rigid folder structure

I am using Tensorflow 2 (Tensorflow 2.2 in particular)
The function below allows us to read in images from folders
# Training generator: rescales pixel values by 1/255 and is configured
# with shear, zoom and horizontal-flip augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)
# Test generator: rescaling only, no augmentation arguments
test_datagen = ImageDataGenerator(rescale=1./255)
# Batches of 32 images read from 'data/train' with a 150x150 target size
# and class_mode='binary'
train_generator = train_datagen.flow_from_directory(
    'data/train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary')
but it requires the folder to be rigidly structured according to the classes to be classified, say cat and dog, as
data/train/cat and data/train/dog
Say that we now have all the training images in the single folder data/train/ (e.g. data/train/1.jpg etc.) and that I have the training set X and the labels y as follows:
X=['1.jpg','2.jpg',...]
y=[0,1,...]
where 0 denotes dog and 1 denotes cat in y. I want to achieve the same effect as the code above (e.g. image augmentation such as horizontal flipping, with a specified batch size). How should I do that?
An approach I have tried:
I use the following code
def preprocess(image):
    """Convert an image to a float32 tensor scaled to the range [-1, 1].

    Args:
        image: array-like image with pixel values between 0 and 255.

    Returns:
        A float32 tensor computed as ``pixels / 127.5 - 1``.
    """
    # The unused `img_shape` local and its extra array conversion were removed.
    pixels = tf.cast(np.array(image), tf.float32)
    return (pixels / 127.5) - 1
# Build the path from an instance: `pathlib.Path.joinpath("train", "data")`
# calls the method on the class with a str as `self` and fails.
image_path = pathlib.Path("train", "data")
# One class per sub-folder; names are compared case-insensitively
class_names = [x.name.lower() for x in image_path.glob('*') if x.is_dir()]
image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
X = []
y = []
for path in image_path.glob('**/*'):
    if not path.is_file() or not path.name.lower().endswith(image_suffixes):
        continue
    # Close the file as soon as the resized copy has been made
    with Image.open(path) as img:
        resized = img.resize((224, 224), resample=Image.BICUBIC)
    # NOTE: every preprocessed image is kept in memory here, which is what
    # leads to the out-of-memory error on a large dataset.
    X.append(preprocess(resized))
    y.append(class_names.index(path.parent.name.lower()))

# Stratified train / validation / test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 - train_ratio, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    stratify=y_test)
train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
validation_data = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)
test_data = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
I get an out-of-memory error (as I store all images in X). How should I resolve that?
For this I strongly recommend that you use tf.data.Dataset to read and ingest your data.
In fact, it is officially the recommended way to prepare the ETL process (extract, transform, load) in TensorFlow.
You can have a look here: https://www.tensorflow.org/api_docs/python/tf/data/Dataset
For example, in your particular case (it will make more sense once you have read the documentation), you could use a .map() function in which you retrieve/generate the label 0 or 1 depending on the string in the description of your image.
Or you could also implement it in the way you described above, using tf.data.Dataset.from_tensor_slices().
In addition, you can use another mapping function for augmentation; the available image preprocessing techniques are listed here: https://www.tensorflow.org/api_docs/python/tf/image
From my own work (adapted from a tutorial some time ago), I attach an example here:
def load_filenames(csv_data, datapath):
    """Return the path of every image listed in the 'id' column.

    Each id is joined to ``datapath`` with ``os.path.join``.
    """
    image_ids = csv_data['id'].tolist()
    paths = []
    for image_id in image_ids:
        paths.append(os.path.join(datapath, image_id))
    return paths
def load_labels(csv_data):
    """Return the 'has_cactus' column as a plain Python list."""
    label_column = csv_data['has_cactus']
    return label_column.tolist()
def parse_fn(filename, label):
    """Decode a file name to ``str``, print it and return it with its label.

    ``filename`` must provide ``.numpy()`` returning UTF-8 encoded bytes.
    """
    decoded_name = filename.numpy().decode('utf-8')
    print(decoded_name)
    return decoded_name, label
def process_function(filename, label):
    """Read one JPEG file and return ``(image, label)``.

    The image is decoded, cast to float32, scaled with ``x / 127.5 - 1`` and
    resized to 96x96; the label is passed through unchanged.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.cast(tf.image.decode_jpeg(raw_bytes), tf.float32)
    scaled = (pixels / 127.5) - 1
    return tf.image.resize(scaled, (96, 96)), label
# Read the CSV that lists the image ids and their labels
train_csv = pd.read_csv(filepath_or_buffer='data/aerial-cactus-identification/train.csv')
filenames = load_filenames(csv_data=train_csv, datapath='data/aerial-cactus-identification/train')
labels = load_labels(csv_data=train_csv)

# 90% of the files for training, the rest for validation
train_filenames, val_filenames, train_labels, val_labels = train_test_split(
    filenames,
    labels,
    train_size=0.9,
    random_state=42)
num_train = len(train_filenames)
num_val = len(val_filenames)

# Datasets of (file name, label) pairs: only strings and labels are held here
train_data = tf.data.Dataset.from_tensor_slices(
    (tf.constant(train_filenames), tf.constant(train_labels))
)
val_data = tf.data.Dataset.from_tensor_slices(
    (tf.constant(val_filenames), tf.constant(val_labels))
)

# Shuffle BEFORE decoding: the shuffle buffer then stores file names instead
# of decoded images. Shuffling after `map` with a buffer as large as the
# dataset would keep every decoded image in memory.
train_data = (train_data
              .shuffle(buffer_size=num_train)
              .map(process_function,
                   num_parallel_calls=tf.data.experimental.AUTOTUNE)
              .batch(BATCH_SIZE)
              .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
              )
val_data = (val_data
            .shuffle(buffer_size=num_val)
            .map(process_function,
                 num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
            )

My training Data and labels have different numpy array shapes. It is disrupting my training

I have an image-based dataset that I am working with, and I am attempting to convert it to a NumPy array, which I would then use as input for a cGAN. I have tried several pieces of code and they all give me dimensionality issues. I am not sure what to do.
# Filled by create_training_data() with [resized image, class index] pairs
training_data = []
# Width and height every image is resized to
IMG_SIZE = 32
# Folder that contains one sub-folder per category
datadir = 'drive/My Drive/dummyDS'
# Category (sub-folder) names; the position in this list is the class index
CATEGORIES = ['HTC-1-M7', 'IPhone-4s', 'iPhone-6', 'LG-Nexus-5x',
              'Motorola-Droid-Max', 'Motorola-Nexus-6', 'Motorola-X',
              'Samsung-Galaxy-Note3', 'Samsung-Galaxy-S4', 'Sony-Nex-7']
def create_training_data():
    """Fill ``training_data`` with ``[resized image, class index]`` pairs.

    Every file of every category folder below ``datadir`` is read with
    OpenCV and resized to ``IMG_SIZE`` x ``IMG_SIZE``. Files that OpenCV
    cannot decode are skipped.
    """
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(datadir, category)
        for img in os.listdir(path):
            img_array = cv2.imread(os.path.join(path, img))
            if img_array is None:
                # cv2.imread returns None for unreadable or non-image files;
                # cv2.resize would fail on it.
                continue
            new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            training_data.append([new_array, class_num])
            # NOTE(review): the original indentation was lost; the preview is
            # assumed to be shown for each image - confirm.
            plt.imshow(img_array, cmap="gray")
            plt.imshow(new_array, cmap="gray")
            plt.show()
create_training_data()

# Shuffle the samples, then separate features and labels
X = []
y = []
random.shuffle(training_data)
for features, label in training_data:
    X.append(features)
    y.append(label)

# One array of shape (n_samples, IMG_SIZE, IMG_SIZE, 3)
X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
# `with` closes the file even when pickling fails
with open("X.pickle", "wb") as pickle_out:
    pickle.dump(X, pickle_out)

# Integer class labels are pickled before the one-hot conversion
y = np.array(y)
with open("y.pickle", "wb") as pickle_out:
    pickle.dump(y, pickle_out)

y = to_categorical(y)
# saving the y_labels_one_hot array as a .npy file
np.save('y_labels_one_hot.npy', y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2./11)
After the split, X_train.shape is (32, 32, 32, 3) while y_train.shape is (32, 4, 2).
Now, during training, I am getting the following error:
real_labels=to_categorical(Y_train[i*batch_size:(i+1)*batch_size].reshape(-1,1),num_classes=10)
d_loss_real = discriminator.train_on_batch(x=[X_batch, real_labels],
y=real * (1 - smooth))
ValueError: All input arrays (x) should have the same number of samples. Got array shapes: [(32, 32, 32, 3), (256, 10)]
tf.keras.preprocessing.image.ImageDataGenerator.flow_from_directory should simplify your task.
It does almost everything that the code you posted does, in a simpler way, including splitting the data.
The code below demonstrates how to use it, with an explanation of each line:
# One generator for both subsets: pixel values are rescaled by 1/255 and
# 20% of the data is set aside as validation data.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Arguments shared by the training and the validation flow.
flow_options = dict(
    target_size=(img_height, img_width),  # image size
    batch_size=batch_size,                # batches of `batch_size` samples
    class_mode='categorical',             # labels are treated as categorical
    shuffle=True,                         # shuffle the data
)

# Both flows walk the category sub-folders of the same directory; `subset`
# picks the 80% training part or the 20% validation part. The same
# generator object is used for both so that the data is partitioned.
train_generator = train_datagen.flow_from_directory(
    datadir, subset='training', **flow_options)
validation_generator = train_datagen.flow_from_directory(
    datadir, subset='validation', **flow_options)

# Train the model; the step counts only cover complete batches.
model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // batch_size,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // batch_size,
    epochs=nb_epochs)
I hope this resolves your issue with the differing shapes, as it ensures that the features and labels have matching shapes. Please share more information if this approach results in an error.
Happy Learning!

cifar10.load_data() takes long time to download data

Hi I downloaded the cifar-10 dataset.
In my code, it loads the data set as below.
import cv2
import numpy as np
from keras.datasets import cifar10
from keras import backend as K
from keras.utils import np_utils
nb_train_samples = 3000 # 3000 training samples
nb_valid_samples = 100 # 100 validation samples
num_classes = 10


def load_cifar10_data(img_rows, img_cols):
    """Load a resized subset of CIFAR-10 with one-hot encoded targets.

    Args:
        img_rows: target image height.
        img_cols: target image width.

    Returns:
        ``(X_train, Y_train, X_valid, Y_valid)`` limited to the first
        ``nb_train_samples`` training and ``nb_valid_samples`` validation
        samples.
    """
    # Load cifar10 training and validation sets
    (X_train, Y_train), (X_valid, Y_valid) = cifar10.load_data()

    channels_first = K.image_dim_ordering() == 'th'
    # cv2.resize takes the target size as (width, height), i.e. (cols, rows);
    # passing (rows, cols) swaps the two for non-square targets.
    dsize = (img_cols, img_rows)

    def _resize(images):
        """Resize every image, keeping the backend's channel ordering."""
        if channels_first:
            # OpenCV works on channels-last images: transpose, resize and
            # transpose back.
            return np.array([
                cv2.resize(img.transpose(1, 2, 0), dsize).transpose(2, 0, 1)
                for img in images
            ])
        return np.array([cv2.resize(img, dsize) for img in images])

    # Resize training images
    X_train = _resize(X_train[:nb_train_samples])
    X_valid = _resize(X_valid[:nb_valid_samples])

    # Transform targets to keras compatible format
    Y_train = np_utils.to_categorical(Y_train[:nb_train_samples], num_classes)
    Y_valid = np_utils.to_categorical(Y_valid[:nb_valid_samples], num_classes)
    return X_train, Y_train, X_valid, Y_valid
But this takes a long time to download the dataset. Instead, I downloaded 'cifar-10-python.tar.gz' manually. How can I load that into the variables (X_train, Y_train), (X_valid, Y_valid) instead of using cifar10.load_data()?
Excuse my English. I am trying to load the CIFAR-10 dataset manually as well. In the following code I unpack cifar-10-python.tar.gz to a folder and load the file data_batch_1 from that folder into 4 arrays: x_train, y_train, x_test, y_test. 20% of data_batch_1 is used for validation as x_test and y_test, and the remainder is used for training as x_train and y_train.
import os
import pickle
import numpy

# load data
# NOTE: unpickling can execute arbitrary code; only load batch files that
# come from a trusted source.
# os.path.join keeps the path valid on every operating system instead of
# hard-coding the Windows separator.
batch_path = os.path.join('cifar-10-batches-py', 'data_batch_1')
with open(batch_path, 'rb') as f:
    dict1 = pickle.load(f, encoding='bytes')

# Reshape the flat rows to (n, 3, 32, 32) float32 images
x = dict1[b'data']
x = x.reshape(len(x), 3, 32, 32).astype('float32')
y = numpy.asarray(dict1[b'labels'])

# The first 20% of the batch is used for validation, the rest for training
n_test = int(0.2 * x.shape[0])
x_test = x[:n_test]
y_test = y[:n_test]
x_train = x[n_test:]
y_train = y[n_test:]
The code below reads the training and test images from their respective batch files, as described on the dataset website. It is a modification of the code from this post, which comes with a nice explanation.
import pickle
import numpy as np

# Read the five training batches and join them afterwards.
image_parts = []
label_parts = []
for i in range(1, 6):
    path = 'data_batch_' + str(i)
    with open(path, mode='rb') as file:
        # note the encoding type is 'latin1'
        batch = pickle.load(file, encoding='latin1')
    # Flat rows -> (n, 3, 32, 32) -> channels-last (n, 32, 32, 3) float32
    flat_images = batch['data']
    images = flat_images.reshape((len(flat_images), 3, 32, 32))
    image_parts.append(images.transpose(0, 2, 3, 1).astype('float32'))
    label_parts.append(batch['labels'])
x_train = np.concatenate(image_parts, axis=0)
y_train = np.concatenate(label_parts, axis=0)

# The test batch goes through the same conversion; its labels stay a list.
path = 'test_batch'
with open(path, 'rb') as file:
    # note the encoding type is 'latin1'
    batch = pickle.load(file, encoding='latin1')
flat_images = batch['data']
x_test = flat_images.reshape((len(flat_images), 3, 32, 32))
x_test = x_test.transpose(0, 2, 3, 1).astype('float32')
y_test = batch['labels']
We can visualise the read data as follows:
import matplotlib.pyplot as plt

# Images as 8-bit integers for display; labels as a column so that each
# label is itself a one-element array.
x_train = x_train.astype(np.uint8)
y_train = np.expand_dims(y_train, axis=1)
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

# Show the first 25 training images on a 5x5 grid without ticks or grid.
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    picture = np.squeeze(x_train[i])
    plt.imshow(picture, cmap=plt.cm.binary)
    # The CIFAR labels happen to be arrays,
    # which is why you need the extra index
    label_index = y_train[i][0]
    plt.xlabel(class_names[label_index])
plt.show()
Also see here in case download time is your only problem, you can still use load_data().

Categories

Resources