Error preprocessing the input data when using Tensorflow Dataset API - python

I have images of [64,512,5] stored in *.npy files which I convert into *.tfrecords files.
I have verified that the reading of said records corresponds correctly with what is present in the *.npy files. However, when I perform some operation on the parser, like adding 1 to each pixel of the image, the result is not the expected one. The result should be 65*512*5 = 163840 but it is 163839.99980013957 (not always the same)
I have tried to perform different operations like tf.subtract, but the results are the same.
Could someone tell me what is wrong?
import re
import ast
import sys, select
import random as rn
from glob import glob
from tqdm import tqdm
from datetime import datetime
from configparser import SafeConfigParser
import numpy as np
import numpy.ma as ma
import scipy.misc
import os.path
from os import mkdir, stat
from os.path import exists, dirname, abspath
from os.path import join as dir_join
import tensorflow as tf
''' File hierarchy
'''
_code_dir = dirname(abspath(__file__))
_python_dir = dirname(_code_dir)
_model_dir = dirname(_python_dir)
_project_dir = dirname(_model_dir)
_ml_dir = dirname(_project_dir)
_srv_dir = dirname(_ml_dir)
_root_datasets_dir = dir_join(_srv_dir,'machine_learning','data_sets/ssd_prepared')
_config_dir = dir_join(_python_dir, 'config')
'''Data sets directories
'''
THIS_DATA_SET_DIR = 'Sph_50m' #WARNING: Global variable also used in helper.py
_data_dir = dir_join(_root_datasets_dir, THIS_DATA_SET_DIR)
_data_set_dir = dir_join(_data_dir,'ImageSet')
_data_npy_dir = dir_join(_data_dir,'data')
_data_tfRecord_dir = dir_join(_data_dir,'tfRecord')
''' Configuration parser
'''
cfg_parser = SafeConfigParser()
cfg_parser.read(dir_join(_config_dir,'cfg_model.ini'))
''' Private variables
'''
_batch_size = cfg_parser.getint(section='train', option='batch_size')
_max_epoch = cfg_parser.getint(section='train', option='max_epoch')
_standarize = cfg_parser.getboolean(section='train', option='standarize_input')
_input_shape = ast.literal_eval(cfg_parser.get(section='data_shape', option='input_shape'))
_label_channel = cfg_parser.getint(section='data_shape', option='label_channel')
_track_channel = cfg_parser.getint(section='data_shape', option='track_channel')
_mask_channel = cfg_parser.getint(section='data_shape', option='mask_channel')
_data_train = cfg_parser.get(section='data_set', option='data_train')
_data_val = cfg_parser.get(section='data_set', option='data_val')
_data_test = cfg_parser.get(section='data_set', option='data_test')
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=value.reshape(-1)))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _floats_feature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=value.reshape(-1)))
def numpy_to_TFRecord():
if not exists(_data_tfRecord_dir): mkdir(_data_tfRecord_dir)
for dataset in [_data_train, _data_val, _data_test]:
tfRecord_folder = dir_join(_data_tfRecord_dir, dataset)
if not exists(tfRecord_folder): mkdir(tfRecord_folder)
#Retrieve list of files
projections_dir=[]
file_ = open(dir_join(_data_set_dir, dataset+'.txt'), 'r')
for x in file_.readlines():
file_nat = x.strip()+'.npy'
filename = dir_join(_data_npy_dir, file_nat)
assert exists(filename), "{} doesn't exist".format(filename)
projections_dir.append(filename)
file_.close()
totaltfRecordSize = 0
numFile = 0
for projection_dir in tqdm(projections_dir, ncols= 100, desc = 'TFRecord {}'.format(dataset)):
scanName = projection_dir.split('/')[-1].split('.')[0]
if totaltfRecordSize > 100*(10**6) or totaltfRecordSize == 0:
# address to save the TFRecords file
train_filename = dir_join(tfRecord_folder, \
str(numFile) + '_' + dataset +'.tfrecords')
# open the TFRecords file
writer = tf.python_io.TFRecordWriter(train_filename)
numFile += 1
totaltfRecordSize = 0
# Load the image
projection = np.load(projection_dir)
image = projection[:,:,:_label_channel]
label = projection[:,:,_label_channel].astype(int)
mask = projection[:,:,_mask_channel].astype(int)
track = projection[:,:,_track_channel].astype(int)
# Create a feature
feature = {'image': _floats_feature(image),
'label': _int64_feature(label),
'mask' : _int64_feature(mask),
'track': _int64_feature(track),
'scanName': _bytes_feature(tf.compat.as_bytes(scanName))}
# Create an example protocol buffer
example = tf.train.Example(features=tf.train.Features(feature=feature))
# Serialize to string and write on the file
writer.write(example.SerializeToString())
fileSize = stat(train_filename).st_size
totaltfRecordSize += fileSize
writer.close()
sys.stdout.flush()
def readTFRecord():
# Transforms a scalar string `example_proto` into a pair of a scalar string and
# a scalar integer, representing an image and its label, respectively.
image_dim = _input_shape[0] * _input_shape[1] * _label_channel
label_dim = _input_shape[0] * _input_shape[1]
mean = np.load(dir_join(_data_dir,'mean.npy'))
std = np.load(dir_join(_data_dir,'std.npy'))
mean_tf = tf.convert_to_tensor(mean, dtype=tf.float32, name='mean')
std_tf = tf.convert_to_tensor(std, dtype=tf.float32, name='std')
with tf.variable_scope('TFRecord'):
def _parse_function(example_proto):
with tf.variable_scope('parser'):
features = {'image': tf.FixedLenFeature([image_dim], tf.float32),
'label': tf.FixedLenFeature([label_dim], tf.int64),
'mask' : tf.FixedLenFeature([label_dim], tf.int64),
'track': tf.FixedLenFeature([label_dim], tf.int64),
'scanName': tf.FixedLenFeature([], tf.string)}
parsed_features = tf.parse_single_example(example_proto, features)
# Reshape image data into the original shape
image = tf.reshape(parsed_features['image'], [_input_shape[0], _input_shape[1], _label_channel], name='image')
label = tf.reshape(parsed_features['label'], _input_shape, name='lable_reshape')
mask = tf.reshape(parsed_features['mask'], _input_shape, name='mask_reshape')
track = tf.reshape(parsed_features['track'], _input_shape, name='track_reshape')
scanName = parsed_features['scanName']
image = image + tf.constant(1., dtype=tf.float32)
return image, label, mask, track, scanName
training_filenames = glob(dir_join(_data_tfRecord_dir, _data_train, '*.tfrecords'))
validation_filenames = glob(dir_join(_data_tfRecord_dir, _data_val, '*.tfrecords'))
filenames = tf.placeholder(tf.string, shape=[None], name='filenames')
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(_parse_function, num_parallel_calls=20) # Parse the record into tensors.
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(_batch_size, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=10)
iterator = dataset.make_initializable_iterator()
next = iterator.get_next()
sess = tf.Session()
while True:
sess.run(iterator.initializer, feed_dict={filenames: training_filenames})
try:
img, _, _, _, scanX = sess.run(next)
for i, scan in enumerate(scanX):
print(scan.decode("utf-8"))
projection = np.load(dir_join(_data_npy_dir, scan.decode("utf-8") + '.npy'))
imagenp = projection[:,:,:_label_channel]
if np.abs(np.sum(img[i,...] - imagenp)) > 0.:
print(np.sum(img[i,...] - imagenp))
except tf.errors.OutOfRangeError:
break
return training_filenames, validation_filenames, filenames, iterator
if __name__ == '__main__':
numpy_to_TFRecord()
readTFRecord()
The test I'm doing in the previous code is to convert the *.npy files to *.tfrecords. Then, I compare the *.trecords with the *.npy. The value should be 0 if both images were identical.
img, _, _, _, scanX = sess.run(next)
for i, scan in enumerate(scanX):
print(scan.decode("utf-8"))
projection = np.load(dir_join(_data_npy_dir, scan.decode("utf-8") + '.npy'))
imagenp = projection[:,:,:_label_channel]
print(np.sum(img[i,...] - imagenp))
If the data is not preprocessed, these images are the same, however, if we perform some kind of transformation, the results do not match what was expected. In this case we are adding 1 to each pixel of the image, so the total difference should be 64 * 512 * 5.
image = image + tf.constant(1., dtype=tf.float32)
I would like to solve this error, since so far I have not been able to obtain the results obtained by my neural network using feed_dict instead of Tensorflow Dataset API, and this is the only point where I can observe a difference in the input data.

Related

torch dataset error -- 'numpy.int32' is not callable

I'm preparing a set of medical imaging volumes and segmentation masks to be input into a multi-label segmentation neural network for training. I am recieving the following error message when I attempt to load my 5D tensors into a torch TensorDataset:
Traceback (most recent call last):
File (path/project.py), line 122, in <module>
train_dataset = torch.utils.data.TensorDataset(timg, tmask)
File (path/dataset.py), line 365, in __init__
assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors"
File (path/dataset.py)", line 365, in <genexpr>
assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors"
TypeError: 'numpy.int32' object is not callable
My original assumption was that the size mismatch was due to the difference in the dimensions of my tensors -- the feature tensor had dimensions 60x128x128x64x1 and the label tensor had dimensions 60x128x128x64x5. However, making the shape and size of these tensors equal has not resolved the issue. My other theory was that the issue was related to this line of code:
def transt(list):
array = np.asarray(list, ->np.int32<-)
changing the dtype did not seem to have an effect. I also tried casting the tensor into different dtypes, again to seemingly no effect on the problem.
Attached is the code. Unsure of how to proceed and any advice would be very appreciated.
import numpy as np
import os
import tensorflow as tf
import nibabel as nib
import matplotlib.pyplot as plt
from VNet import VNet
import Layers
import torchvision
from torchvision.transforms import ToTensor
import torch
from torch.utils.data import TensorDataset, DataLoader
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
# Setting path
dirname = os.path.dirname(__file__)
path = os.path.join(dirname, 'Liver_MR_Dataset')
# Loading images/masks
img_list = []
gall_list = []
IVC_list = []
kidney_list = []
liver_list = []
rib_list = []
os.chdir(path + '/Image')
image_path = sorted(os.listdir(path + '/Image'))
for image in image_path:
img = nib.load(image).get_fdata()
img_list.append(img)
print(len(img_list))
train_img = img_list[:60]
print(len(train_img))
val_img = img_list[60:]
print(len(val_img))
os.chdir(path + '/Gall')
gall_path = sorted(os.listdir(path + '/Gall'))
for image in gall_path:
mask = nib.load(image).get_fdata()
gall_list.append(mask)
train_gall = gall_list[:60]
val_gall = gall_list[60:]
os.chdir(path + '/IVC')
IVC_path = sorted(os.listdir(path + '/IVC'))
for image in IVC_path:
mask = nib.load(image).get_fdata()
IVC_list.append(mask)
train_IVC = IVC_list[:60]
val_IVC = IVC_list[60:]
os.chdir(path + '/Kidney')
kidney_path = sorted(os.listdir(path + '/Kidney'))
for image in kidney_path:
mask = nib.load(image).get_fdata()
kidney_list.append(mask)
train_kidney = kidney_list[:60]
val_kidney = kidney_list[60:]
os.chdir(path + '/Liver')
liver_path = sorted(os.listdir(path + '/Liver'))
for image in liver_path:
mask = nib.load(image).get_fdata()
liver_list.append(mask)
train_liver = liver_list[:60]
val_liver = liver_list[60:]
os.chdir(path + '/Rib')
rib_path = sorted(os.listdir(path + '/Rib'))
for image in rib_path:
mask = nib.load(image).get_fdata()
rib_list.append(mask)
train_rib = rib_list[:60]
val_rib = rib_list[60:]
os.chdir(path)
# Transformations
def transt(list):
array = np.asarray(list, np.int32)
resize = np.resize(array, [60,128,128,64])
tensor = tf.convert_to_tensor(resize)
return tensor
def transv(list):
array = np.asarray(list, np.int32)
resize = np.resize(array, [7,128,128,64])
tensor = tf.convert_to_tensor(resize)
return tensor
tgall = transt(train_gall)
vgall = transv(val_gall)
tIVC = transt(train_IVC)
vIVC = transv(val_IVC)
tkidney = transt(train_kidney)
vkidney = transv(val_kidney)
tliver = transt(train_liver)
vliver = transv(val_liver)
trib = transt(train_rib)
vrib = transv(val_rib)
timg4d = transt(train_img)
vimg4d = transv(val_img)
timg = tf.stack([timg4d, timg4d, timg4d, timg4d, timg4d], axis=4)
print(timg.shape)
print(timg.size)
vimg = tf.stack([timg4d, timg4d, timg4d, timg4d, timg4d], axis=4)
tmask = tf.stack([tgall, tIVC, tkidney, tliver, trib], axis=4)
print(tmask.shape)
print(tmask.size)
vmask = tf.stack([vgall, vIVC, vkidney, vliver, vrib], axis=4)
# Create Datasets
train_dataset = torch.utils.data.TensorDataset(timg, tmask)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=60)
#val_dataset = torch.utils.data.TensorDataset(vimg, vmask)
#val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=60)
size is a tuple, all(tensors[0].size[0] == tensor.size[0] for tensor in tensors),

Skip image during tensorflow input pipeline

I have a Tensorflow input pipeline that reads in two png files (example, label) from disk. I want to tell tensorflow to skip an example/label pair based on a value in the label. Anyone know how to do this?
Here is a simplified example of the input pipeline and with a comment where I want to do the filtering:
import tensorflow as tf
import glob2 as glob
def preprocess_images(impath, labpath):
image = tf.io.read_file(impath)
label = tf.io.read_file(labpath)
image = tf.image.decode_png(image, channels=3)
label = tf.image.decode_png(label, channels=1)
if tf.reduce_sum(label) == 0:
#skip the image and move on to the next, don't include this in the batch
else:
return (image, label)
im_files = glob.glob(impath + '*.png')
lab_files = glob.glob(labpath + '*.png')
files = (im_files, lab_files)
path = tf.data.Dataset.from_tensor_slices(files)
pair = path.map(preprocess_images)
ds = tf.data.Dataset.zip(pair)
ds = ds.batch(64)
The easiest way seems to be to use filter method on your tf.data.Dataset object.
Here I am going to load the label only and filter out the entries with a sum of 0:
def load_label_only(impath, labpath):
label = tf.io.read_file(labpath)
label = tf.image.decode_png(label, channels=1)
return impath, label
# Create the dataset as in your example:
im_files = glob.glob(impath + '*.png')
lab_files = glob.glob(labpath + '*.png')
files = (im_files, lab_files)
ds = tf.data.Dataset.from_tensor_slices(files)
ds = ds.map(load_label_only)
# Here, I am going to keep only non-zero labels:
filtered_ds = ds.filter(lambda image_path, label_map: tf.reduce_sum(label_map) != 0)
# Load the rest of the images...

My python/sklearn/SVM is running endlessly

Hi I'm trying to train MNIST classifier with SVM(SVC), sci-kit learn(sklearn).
But my training runs endlessly.. What should I do?
I tried changing parameters of SVC but I'm not sure about what I'm doing
And It doesn't work...
The number of training data is 60,000
Please help me
import os
import struct
import numpy as np
import matplotlib .pyplot as plt
from sklearn.svm import SVC
from google.colab import drive
drive.mount('/content/gdrive')
def read(dataset = "training", path="."):
if dataset is "training":
fname_img = os.path.join(path, 'train-images-idx3-ubyte.idx3-ubyte')
fname_lbl = os.path.join(path, 'train-labels-idx1-ubyte.idx1-ubyte')
elif dataset is "testing":
fname_img = os.path.join(path, 't10k-images-idx3-ubyte.idx3-ubyte')
fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte.idx1-ubyte')
else:
raise Exception("dataset must be 'testing' or 'training'")
with open(fname_lbl, 'rb') as flbl:
magic, num = struct.unpack(">II", flbl.read(8))
lbl = np.fromfile(flbl, dtype=np.int8)
with open(fname_img, 'rb') as fimg:
magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)
get_img = lambda idx: (lbl[idx], img[idx])
# Create an iterator which returns each image in turn
for i in range(len(lbl)):
yield get_img(i)
tr = list(read("training", "/content/gdrive/My Drive/ColabNotebooks/MNIST"))
tst = list(read("testing", "/content/gdrive/My Drive/ColabNotebooks/MNIST"))
def seperate(data):
labels =[]
images =[]
for i in data:
labels.append(int(i[0]))
images.append(i[1])
return {"labels":labels, "images":images}
train = seperate(tr)
test = seperate(tst)
clf = SVC(kernel = 'linear', cache_size = 6000, gamma = 0.001, C = 100)
train_len = len(tr)
train_Array = np.array(train["images"]).reshape(train_len, -1)
clf.fit(train_Array, train["labels"])

I'm getting a Memory Error while processing my dataset in python ? What could be the reason?

I'm trying a deep learning code for processing my dataset which consists 1,12,120 images. What my code does is the following:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import imageio
from os import listdir
import skimage.transform
import pickle
import sys, os
from sklearn.preprocessing import MultiLabelBinarizer
def get_labels(pic_id):
labels = meta_data.loc[meta_data["Image Index"]==pic_id,"Finding Labels"]
return labels.tolist()[0].split("|")
#Loading Data
meta_data = pd.read_csv(data_entry_path)
bbox_list = pd.read_csv(bbox_list_path)
with open(train_txt_path, "r") as f:
train_list = [ i.strip() for i in f.readlines()]
with open(valid_txt_path, "r") as f:
valid_list = [ i.strip() for i in f.readlines()]
label_eight = list(np.unique(bbox_list["Finding Label"])) + ["No Finding"]
# transform training images
print("training example:",len(train_list))
print("take care of your RAM here !!!")
train_X = []
for i in range(len(train_list)):
image_path = os.path.join(image_folder_path,train_list[i])
img = imageio.imread(image_path)
if img.shape != (1024,1024): # there some image with shape (1024,1024,4) in training set
img = img[:,:,0]
img_resized = skimage.transform.resize(img,(256,256)) # or use img[::4] here
train_X.append((np.array(img_resized)/255).reshape(256,256,1))
if i % 3000==0:
print(i)
train_X = np.array(train_X)
np.save(os.path.join(data_path,"train_X_small.npy"), train_X)
# transform validation images
print("validation example:",len(valid_list))
valid_X = []
for i in range(len(valid_list)):
image_path = os.path.join(image_folder_path,valid_list[i])
img = imageio.imread(image_path)
if img.shape != (1024,1024):
img = img[:,:,0]
img_resized = skimage.transform.resize(img,(256,256))
# if img.shape != (1024,1024):
# train_X.append(img[:,:,0])
# else:
valid_X.append((np.array(img_resized)/255).reshape(256,256,1))
if i % 3000==0:
print(i)
valid_X = np.array(valid_X)
np.save(os.path.join(data_path,"valid_X_small.npy"), valid_X)
# process label
print("label preprocessing")
train_y = []
for train_id in train_list:
train_y.append(get_labels(train_id))
valid_y = []
for valid_id in valid_list:
valid_y.append(get_labels(valid_id))
encoder = MultiLabelBinarizer()
encoder.fit(train_y+valid_y)
train_y_onehot = encoder.transform(train_y)
valid_y_onehot = encoder.transform(valid_y)
train_y_onehot = np.delete(train_y_onehot, [2,3,5,6,7,10,12],1) # delete out 8 and "No Finding" column
valid_y_onehot = np.delete(valid_y_onehot, [2,3,5,6,7,10,12],1) # delete out 8 and "No Finding" column
with open(data_path + "/train_y_onehot.pkl","wb") as f:
pickle.dump(train_y_onehot, f)
with open(data_path + "/valid_y_onehot.pkl","wb") as f:
pickle.dump(valid_y_onehot, f)
with open(data_path + "/label_encoder.pkl","wb") as f:
pickle.dump(encoder, f)
So this is my code My system configration:Intel i7-7700HQ,16GB Ram,256GB ssd,GTX 1050 4GB
Is there a way to split my dataset so and write to the same file again? I'm also posting the error which i got as a screenshot Error From Powershell after executing the code for 30mins
I'm also using python3 in my system 64bit version
Does spliting the 1,12,120 images and taking them as batches will it work here? If yes how?

Filtering Python Numpy ndarray

I'm trying to filter my CIFAR-100 ndarray by class index, here is my code:
def get_cifar100(folder, class_idx):
train_fname = os.path.join(folder, 'train')
test_fname = os.path.join(folder, 'test')
data_dict = unpickle(train_fname)
train_data = data_dict['data']
train_fine_labels = data_dict['fine_labels']
train_coarse_labels = data_dict['coarse_labels']
# Filtering process
filt_tdata = numpy.empty((0))
for i, v in enumerate(train_coarse_labels):
if v == class_idx:
filt_tdata = numpy.append(filt_tdata, train_data[i])
data_dict = unpickle(test_fname)
test_data = data_dict['data']
test_fine_labels = data_dict['fine_labels']
test_coarse_labels = data_dict['coarse_labels']
bm = unpickle(os.path.join(folder, 'meta'))
clabel_names = bm['coarse_label_names']
flabel_names = bm['fine_label_names']
return data_dict, filt_tdata, numpy.array(train_coarse_labels), numpy.array(train_fine_labels), test_data, numpy.array(test_coarse_labels), numpy.array(test_fine_labels), clabel_names, flabel_names
datapath = "./data/cifar-100-python"
data_dict, tr_data100, tr_clabels100, tr_flabels100, te_data100, te_clabels100, te_flabels100, clabel_names100, flabel_names100 = get_cifar100(datapath, 4)
print(len(tr_data100))
I want to filter train_data based on class_idx = 4 (train_coarse_labels). The size of original array is 50000 and it should be 5000 on filtered. But, I got more than its original size (7 million ++). What's wrong with my function? Thanks.

Categories

Resources