I was in the middle of training my GAN when an unexpected error came up, and I have no idea how to fix it. The error doesn't appear right away; it happens about 2-3 minutes into training. Here is the error:
Traceback (most recent call last):
File "gan.py", line 103, in <module>
train(X_train_dataset,200)
File "gan.py", line 80, in train
train_step(images) # takes images and improves both the generator and the discriminator
File "gan.py", line 91, in train_step
discriminator_loss = get_discriminator_loss(real_output,fake_output)
File "gan.py", line 48, in get_discriminator_loss
return fake_loss+real_loss
File "/home/jake/.local/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1125, in binary_op_wrapper
return func(x, y, name=name)
File "/home/jake/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper
return target(*args, **kwargs)
File "/home/jake/.local/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1447, in _add_dispatch
return gen_math_ops.add_v2(x, y, name=name)
File "/home/jake/.local/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 486, in add_v2
_ops.raise_from_not_ok_status(e, name)
File "/home/jake/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 6843, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [100] vs. [13] [Op:AddV2]
From what I can tell from this traceback, the error occurs during my get_discriminator_loss(), so here is that code:
def get_discriminator_loss(real_predictions, fake_predictions):
    real_predictions = tf.sigmoid(real_predictions)
    fake_predictions = tf.sigmoid(fake_predictions)
    real_loss = tf.losses.binary_crossentropy(tf.ones_like(real_predictions), real_predictions)
    fake_loss = tf.losses.binary_crossentropy(tf.zeros_like(fake_predictions), fake_predictions)
    return fake_loss + real_loss
Does anyone have any ideas? Remember, this happens after the training has run successfully for about 2-3 minutes; the error doesn't occur in the first many passes.
I've found the source of my error, but I don't know why it's occurring: at one of the passes, my real loss has only 13 values instead of the usual 100. How can this be?
Here is my full code:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import time
import pickle
pickle_in_X = open("X.pickle","rb")
pickle_in_y = open("y.pickle","rb")
X=pickle.load(pickle_in_X)
y = pickle.load(pickle_in_y)
y = np.array(y)
X_train = X[int(len(X) * .3):]
y_train = y[int(len(y) * .3):]
X_test = X[:int(len(X) * .3)]
y_test = y[:int(len(y) * .3)]
X_train = (X_train-127.5)/127.5
BATCH_SIZE = 100
X_train_dataset = tf.data.Dataset.from_tensor_slices(X_train).batch(BATCH_SIZE)
# Creates a discriminator model.
# The discriminator will output 0-1, which represents the probability that the image is real.
def make_discriminator():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Conv2D(7, (3, 3), padding="same", input_shape=(40, 40, 1)))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.LeakyReLU())
    model.add(tf.keras.layers.Dense(50, activation="relu"))
    model.add(tf.keras.layers.Dense(1))
    return model
model_discriminator = make_discriminator()
discriminator_optimizer = tf.optimizers.Adam(1e-3)
# real_loss is the error when guessing that the real images are in fact real,
# i.e. loss will be zero if our discriminator guesses there is a 100% chance that a real image is real.
# fake_loss is the error when guessing that the fake images are in fact fake,
# i.e. loss will be zero if our discriminator guesses there is a 0% chance that a fake image is real.
# Returns the total loss.
def get_discriminator_loss(real_predictions, fake_predictions):
    real_predictions = tf.sigmoid(real_predictions)
    fake_predictions = tf.sigmoid(fake_predictions)
    real_loss = tf.losses.binary_crossentropy(tf.ones_like(real_predictions), real_predictions)
    fake_loss = tf.losses.binary_crossentropy(tf.zeros_like(fake_predictions), fake_predictions)
    return fake_loss + real_loss
# Takes a random vector of numbers as input and outputs either a dog or a cat.
def make_generator():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(10 * 10 * 256, input_shape=(100,)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Reshape((10, 10, 256)))
    model.add(tf.keras.layers.Conv2DTranspose(128, (3, 3), padding="same"))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Conv2DTranspose(64, (3, 3), strides=(2, 2), padding="same"))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Conv2DTranspose(1, (3, 3), strides=(2, 2), padding="same"))
    return model
model_generator = make_generator()
# The generator gets rewarded when it fools the discriminator.
def get_generator_loss(fake_predictions):
    fake_predictions = tf.sigmoid(fake_predictions)
    fake_loss = tf.losses.binary_crossentropy(tf.ones_like(fake_predictions), fake_predictions)
    return fake_loss
generator_optimizer = tf.optimizers.Adam(1e-3)
# Training
def train(X_train_dataset, epochs):
    for _ in range(epochs):
        for images in X_train_dataset:
            images = tf.cast(images, tf.dtypes.float32)
            train_step(images)  # takes images and improves both the generator and the discriminator

def train_step(images):
    fake_image_noise = np.random.randn(BATCH_SIZE, 100)  # 100 random numbers that will be converted to images
    with tf.GradientTape() as generator_gradient, tf.GradientTape() as discriminator_gradient:
        generated_images = model_generator(fake_image_noise)
        real_output = model_discriminator(images)
        fake_output = model_discriminator(generated_images)
        generator_loss = get_generator_loss(fake_output)
        discriminator_loss = get_discriminator_loss(real_output, fake_output)
    gradients_of_generator = generator_gradient.gradient(generator_loss, model_generator.trainable_variables)  # gradient of generator loss w.r.t. trainable variables
    gradients_of_discriminator = discriminator_gradient.gradient(discriminator_loss, model_discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, model_discriminator.trainable_variables))
    generator_optimizer.apply_gradients(zip(gradients_of_generator, model_generator.trainable_variables))
    print("generator loss: ", np.mean(generator_loss))
    print("discriminator loss: ", np.mean(discriminator_loss))
train(X_train_dataset,200)
model_generator.save('genModel')
model_discriminator.save('discModel')
If the size of your dataset is not a multiple of your batch size, then your last batch will have a smaller number of samples than the other batches. To avoid this, you can force a tf.data.Dataset to drop the last batch if it is smaller than the batch size. See the documentation for more information.
tf.data.Dataset.from_tensor_slices(X_train).batch(BATCH_SIZE, drop_remainder=True)
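As a quick sanity check, here is a minimal sketch (the 1013-sample zero array is a hypothetical stand-in for X_train, sized so the leftover batch has 13 samples, matching the [100] vs. [13] shapes in the error above):

import numpy as np
import tensorflow as tf

# Hypothetical dataset: 1013 % 100 == 13, so the last batch has 13 samples.
X_train = np.zeros((1013, 40, 40, 1), dtype=np.float32)
BATCH_SIZE = 100

ds = tf.data.Dataset.from_tensor_slices(X_train).batch(BATCH_SIZE)
ds_full = tf.data.Dataset.from_tensor_slices(X_train).batch(BATCH_SIZE, drop_remainder=True)

# Without drop_remainder the batch sizes are mixed; with it, every batch is full.
print(sorted({int(batch.shape[0]) for batch in ds}))       # [13, 100]
print(sorted({int(batch.shape[0]) for batch in ds_full}))  # [100]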
I am running a visual question answering task. The model takes as input image features (which I have saved in an h5py file) and question tokens (which I have pickled), and the outputs are the answers (a whole answer is considered one target, so there are 3129 answers, one word or more, and 3129 labels).
I am using the Keras Sequence utility to create the generator.
I am getting a dimension error in the output layer when the model is training, and when I change the __len__ function, the training process breaks down depending on the value it returns.
I have copied the __getitem__ function of my generator and a sample of my model below.
Do I need to change my generator configuration or my model?
Epoch 1/1
Traceback (most recent call last):
File "<ipython-input-45-e55a5853e499>", line 32, in <module>
validation_data=valid_generator)
File "C:\python\envs\tf2-keras\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "C:\python\envs\tf2-keras\lib\site-packages\keras\engine\training.py", line 1732, in fit_generator
initial_epoch=initial_epoch)
File "C:\python\envs\tf2-keras\lib\site-packages\keras\engine\training_generator.py", line 220, in fit_generator
reset_metrics=False)
File "C:\python\envs\tf2-keras\lib\site-packages\keras\engine\training.py", line 1508, in train_on_batch
class_weight=class_weight)
File "C:\python\envs\tf2-keras\lib\site-packages\keras\engine\training.py", line 621, in _standardize_user_data
exception_prefix='target')
File "C:\python\envs\tf2-keras\lib\site-packages\keras\engine\training_utils.py", line 145, in standardize_input_data
str(data_shape))
ValueError: Error when checking target: expected output to have shape (3129,) but got array with shape (1,)
def __len__(self):
    'Denotes the number of batches per epoch'
    # return int(np.floor(len(self.list_IDs) / self.batch_size))
    return 512*866
The __getitem__ function of my generator looks like this:
def __getitem__(self, index):
    'Generate one batch of data'
    imfeatures = np.empty((self.batch_size, 2048))
    question_tokens = np.empty((self.batch_size, 14))
    answers = np.empty((self.batch_size, 3129))
    # Generate indexes of the batch
    indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
    # self.T.append(indexes)
    list_IDs_temp = [self.list_IDs[k] for k in indexes]
    # Generate data
    for i, k in enumerate(list_IDs_temp):
        temp = self.Features['image_features'][k]
        imfeatures[i,] = temp[0, :]
        question_tokens[i,] = self.Questions[indexes[i]]
        answers = self.Answer[indexes[i]]
    return [imfeatures, question_tokens], answers
# This is where I instantiate the generators.
# train_features is an h5py file;
# entries is where questions, answers, and ids are saved.
batch_size = 512
train_generator = DataGenerator(entries['train'].images,
                                train_fetures,
                                entries['train'].q_token,
                                entries['train'].target,
                                batch_size=batch_size,
                                shuffle=False)
valid_generator = DataGenerator(entries['val'].images,
                                valid_features,
                                entries['val'].q_token,
                                entries['val'].target,
                                batch_size=batch_size,
                                shuffle=False)
# And this is what my model looks like:
ImInput = Input(shape=(2048,), name='image_input')
QInput = Input(shape=(14,), name='question')
# some dense layers and dropouts
# Then the layers are merged
M = Multiply()([ImInput, QInput])
# Some dense layers and dropouts
output = Dense(3129, activation='softmax', name='output')(M)
model = Model([ImInput, QInput], output)
model.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(train_generator,
                    epochs=1,
                    verbose=1,
                    validation_data=valid_generator)
I've built a custom Keras generator function. It yields an image and its associated ground truth (GT). It works well during the training phase.
To evaluate my model, I use it on a test set containing 592 images, calling it with the predict_generator() function.
So I get the right number of predictions (592). Every time __getitem__() is called, I append the GT to the self.gt list.
Then, after running predict_generator(), I compare the predictions with the stored GT.
My problem:
I want to store the ground truth arrays in a list every time the generator is called, but at the end I have more GT arrays than the 592 predictions, so I can't build my confusion matrix...
Here is the code of the generator:
class DataGenerator(Sequence):
    def __init__(self, data_folders_txt, gen_data_type, batchsize, shuffle=True, classes=None, selected_class=None):
        '''
        - data_folders_txt : txt file containing all the paths to the different data folders
        - gen_data_type : string : can be either "train", "val" or "test" (corresponds to a specific folder)
        - shuffle : shuffle the dataset at each epoch
        - classes : dict of classes with their associated number (the class number must match the class position on the class axis of the one-hot-encoded ground truth array)
        - selected_class : name of the selected class (128x128x1) in the 128x128x3 one-hot-encoded ground truth array
        '''
        self.gt = []
        self.shuffle = shuffle
        self.gen_data_type = gen_data_type
        self.batchsize = batchsize
        self.data_folders = open(data_folders_txt, "r").readlines()
        self.list_IDs = self.tiles_list_creation(self.data_folders)
        self.samples = len(self.list_IDs)
        self.classes = classes
        self.selected_class = selected_class
        self.index = 0
        self.on_epoch_end()

    def tiles_list_creation(self, list_folders):
        list_IDs = []
        for folder in list_folders:
            samples = glob.glob(folder.rstrip() + self.gen_data_type + '3/tile/*')
            list_IDs += samples
        random.shuffle(list_IDs)
        return list_IDs

    def __len__(self):
        if len(self.list_IDs) % self.batchsize == 0:
            return len(self.list_IDs) // self.batchsize
        else:
            return len(self.list_IDs) // self.batchsize + 1

    def __getitem__(self, index):
        self.index = index
        X = []
        y = []
        # min(..., ...) is for taking all the data without going out of range
        for i in range(index * self.batchsize, min(self.samples, (index + 1) * self.batchsize)):
            tile = np.load(self.list_IDs[i])
            # If a specific class is specified, just take the right channel of the GT array corresponding to the wanted class
            if self.classes:
                gt = np.load(self.list_IDs[i].replace("tile", "gt"))[:, :, self.classes[self.selected_class]]
                gt = np.expand_dims(gt, axis=-1)
            else:
                gt = np.load(self.list_IDs[i].replace("tile", "gt"))
            # Store the ground truth to compare it with the predictions after running predict_generator()
            self.gt.append(gt)
            X.append(tile)
            y.append(gt)
        return np.array(X), np.array(y)

    def on_epoch_end(self):
        if self.shuffle:
            random.shuffle(self.list_IDs)
And here is where I call it:
batchsize = 10
model = load_model(model_path, custom_objects={'jaccard_distance': jaccard_distance, 'auc': auc})
test_gen = DataGenerator("/path/to/data/path/written/in/file.txt",
                         gen_data_type='test',
                         batchsize=batchsize,
                         classes=None,
                         selected_class=None)
y_pred = model.predict_generator(test_gen, steps=None, verbose=1)
y_true = np.array(test_gen.gt)
plot_confusion_matrix(y_true, y_pred, ["Hedgerows", "No Hedgerows"])
Here is the error:
60/60 [==============================] - 4s 71ms/step
Traceback (most recent call last):
File "/work/stages/mathurin/sentinel_segmentation/unet/confusion_matrix.py", line 95, in <module>
plot_confusion_matrix(y_true, y_pred, ["Hedgrows", "No Hedgerows"], normalize=normalization, title=model_path.split('/')[-1].split('.')[0])
File "/work/stages/mathurin/sentinel_segmentation/unet/confusion_matrix.py", line 35, in plot_confusion_matrix
cm = confusion_matrix(y_true, y_pred)
File "/work/tools/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py", line 253, in confusion_matrix
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/work/tools/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py", line 71, in _check_targets
check_consistent_length(y_true, y_pred)
File "/work/tools/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py", line 235, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [702, 592]
When I look at the index number in the __getitem__() function, it is not the expected number: it should reach the value returned by __len__(), but it is always smaller. In this example, after making the predictions, the value of self.index is 8, as if it were exceeding the length and then restarting at 0, 1, 2, etc.
EDIT: even stranger!
I just re-ran it and got a different number of stored GT arrays...
60/60 [==============================] - 6s 100ms/step
Traceback (most recent call last):
File "/work/tools/pycharm-community-2019.1.1/helpers/pydev/pydevd.py", line 1741, in <module>
main()
File "/work/tools/pycharm-community-2019.1.1/helpers/pydev/pydevd.py", line 1735, in main
globals = debugger.run(setup['file'], None, None, is_module)
File "/work/tools/pycharm-community-2019.1.1/helpers/pydev/pydevd.py", line 1135, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "/work/tools/pycharm-community-2019.1.1/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/work/stages/mathurin/sentinel_segmentation/unet/confusion_matrix.py", line 95, in <module>
plot_confusion_matrix(y_true, y_pred, ["Hedgrows", "No Hedgerows"], normalize=normalization, title=model_path.split('/')[-1].split('.')[0])
File "/work/stages/mathurin/sentinel_segmentation/unet/confusion_matrix.py", line 35, in plot_confusion_matrix
cm = confusion_matrix(y_true, y_pred)
File "/work/tools/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py", line 253, in confusion_matrix
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/work/tools/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py", line 71, in _check_targets
check_consistent_length(y_true, y_pred)
File "/work/tools/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py", line 235, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [682, 592]
There is nothing strange about this: Keras runs generators in multiple processes/threads to improve performance, especially for training, which is why fit_generator and predict_generator have keyword arguments like workers, use_multiprocessing, and max_queue_size. So the solution is not to store any ground truth or other state in the generator instance.
For your specific case, you can use another kind of prediction loop by calling the generator manually:
labels = []
preds = []

for step in range(len(generator)):
    data, label = generator.__getitem__(step)
    pred = model.predict(data)
    preds.append(pred)
    labels.append(label)
Then use preds and labels to build the confusion matrix.
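For example, a minimal sketch of that last step (assuming a binary, single-channel sigmoid output, as when a selected_class is used; the 0.5 threshold and the per-pixel flattening are assumptions, not from the original code):

import numpy as np
from sklearn.metrics import confusion_matrix

# Stack the per-batch arrays (the last batch may be smaller), then flatten to per-pixel labels.
y_true = np.concatenate(labels, axis=0).ravel().astype(int)
y_scores = np.concatenate(preds, axis=0).ravel()
y_pred = (y_scores > 0.5).astype(int)  # 0.5 threshold is an assumption

cm = confusion_matrix(y_true, y_pred)
print(cm)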
I am new to deep learning and CNNs and am trying to get familiar with the field using the CIFAR10 tutorial code from the PyTorch website. In that code I was playing with removing/adding layers to better understand their effect, and I tried to connect the input (the initial data, with a batch of 4 images) directly to the output using only a single fully connected layer. I know that doesn't make much sense, but I am doing it only for the sake of experiment. When I tried it, I ran into some errors, which are as follows:
First, here is the code snippet:
########################################################################
# 2. Define a Convolution Neural Network
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Copy the neural network from the Neural Networks section before and modify it to
# take 3-channel images (instead of 1-channel images as it was defined).

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #self.conv1 = nn.Conv2d(3, 6, 5)
        #self.pool = nn.MaxPool2d(2, 2)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        #self.fc1 = nn.Linear(16 * 5 * 5, 120)
        #self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(768 * 4 * 4, 10)

    def forward(self, x):
        #x = self.pool(F.relu(self.conv1(x)))
        #x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 768 * 4 * 4)
        #x = F.relu(self.fc1(x))
        #x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()

########################################################################
# 3. Define a Loss function and optimizer
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Let's use a Classification Cross-Entropy loss and SGD with momentum.

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

########################################################################
# 4. Train the network
# ^^^^^^^^^^^^^^^^^^^^
#
# This is when things start to get interesting.
# We simply have to loop over our data iterator, and feed the inputs to the
# network and optimize.

for epoch in range(4):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        print(len(outputs))
        print(len(labels))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')
So, when I run the code, I get the following error:
Traceback (most recent call last):
File "C:\Users\Andrey\Desktop\Machine_learning_Danila\Homework 3\cifar10_tutorial1.py", line 180, in <module>
loss = criterion(outputs, labels)
File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\loss.py", line 862, in forward
ignore_index=self.ignore_index, reduction=self.reduction)
File "C:\Program Files\Python36\lib\site-packages\torch\nn\functional.py", line 1550, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "C:\Program Files\Python36\lib\site-packages\torch\nn\functional.py", line 1405, in nll_loss
.format(input.size(0), target.size(0)))
ValueError: Expected input batch_size (1) to match target batch_size (4).
I tried checking the length of x, and it turns out it is 4 initially but becomes 1 after the line
x = x.view(-1, 768 * 4 * 4)
I think my numbers are correct, but it seems I end up with only 1 tensor instead of the 4 I am supposed to have, and I feel that is what causes the error.
Why is that, and what is the best way to fix it?
Also, what would be the optimal output dimension for nn.Linear (the fully connected layer) in this case?
There are two obvious errors in your modified code (compared to the official one from the PyTorch webpage). First,
torch.nn.Linear(in_features, out_features)
is the correct syntax, but you're passing 768 * 4 * 4 as in_features. This is 4 times the actual number of neurons (pixels) in one CIFAR10 image (32*32*3 = 3072).
The second bug is related to the first one. When you prepare your inputs tensor,
# forward + backward + optimize;
# `inputs` should be a tensor of shape [batch_size, input_shape]
outputs = net(inputs)
you should pass it as a tensor of shape [batch_size, input_size], which according to your requirement is [4, 3072] since you want a batch size of 4. This is where you should provide the batch dimension, not in nn.Linear, which is what you're currently doing and what causes the error.
Finally, you should also fix the corresponding line in the forward method. Change
x = x.view(-1, 768 * 4 * 4)
to
x = x.view(-1, 32*32*3)
Fixing these bugs should fix your errors.
Having said that, I'm unsure whether this would actually work well in a conceptual sense, because it is a simple linear transformation (i.e. an affine transformation without any non-linearity). The data points (which correspond to images in CIFAR10) are most probably not linearly separable in this 3072-dimensional space (manifold), so the accuracy would be drastically poor. It is therefore advisable to add at least one hidden layer with a non-linearity such as ReLU.
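Putting those fixes together, a minimal sketch of the corrected network (the hidden width of 128 is an arbitrary choice for illustration, not from the tutorial):

import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # in_features is the pixel count of ONE image (32*32*3), not 4 images' worth
        self.fc1 = nn.Linear(32 * 32 * 3, 128)  # hidden width 128 is an arbitrary choice
        self.fc2 = nn.Linear(128, 10)           # one output per CIFAR10 class

    def forward(self, x):
        x = x.view(-1, 32 * 32 * 3)  # flatten each image; the batch dimension (4) is preserved
        x = F.relu(self.fc1(x))      # non-linearity, so the mapping is not purely affine
        return self.fc2(x)

net = Net()
print(net(torch.randn(4, 3, 32, 32)).shape)  # torch.Size([4, 10])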
I'm trying to use tf.contrib.training.stratified_sample in TensorFlow to balance classes. I made the quick example below to test it, drawing samples from two unbalanced classes in a balanced way and verifying the result, but I'm getting an error.
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes

batch_size = 10
data = ['a'] * 9990 + ['b'] * 10
labels = [1] * 9990 + [0] * 10
data_tensor = ops.convert_to_tensor(data, dtype=dtypes.string)
label_tensor = ops.convert_to_tensor(labels)
target_probs = [0.5, 0.5]
data_batch, label_batch = tf.contrib.training.stratified_sample(
    data_tensor, label_tensor, target_probs, batch_size,
    queue_capacity=2*batch_size)

with tf.Session() as sess:
    d, l = sess.run([data_batch, label_batch])
    print('percentage "a" = %.3f' % (np.sum(l) / len(l)))
The error I'm getting is:
Traceback (most recent call last):
File "/home/jason/code/scrap.py", line 56, in <module>
test_stratified_sample()
File "/home/jason/code/scrap.py", line 47, in test_stratified_sample
queue_capacity=2*batch_size)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/contrib/training/python/training/sampling_ops.py", line 191, in stratified_sample
with ops.name_scope(name, 'stratified_sample', tensors + [labels]):
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/math_ops.py", line 829, in binary_op_wrapper
y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 676, in convert_to_tensor
as_ref=False)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 741, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/constant_op.py", line 113, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/constant_op.py", line 102, in constant
tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/tensor_util.py", line 374, in make_tensor_proto
_AssertCompatible(values, dtype)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/tensor_util.py", line 302, in _AssertCompatible
(dtype.name, repr(mismatch), type(mismatch).__name__))
TypeError: Expected string, got list containing Tensors of type '_Message' instead.
The error doesn't explain what I'm doing wrong. I also tried putting the raw data and labels in (without converting to a tensor), as well as tried using tf.train.slice_input_producer to create an initial queue of the data and label tensors.
Has anyone gotten stratified_sample to work? I haven't been able to find any examples.
I've modified the code into something that works for me. Summary of the changes:
Use enqueue_many=True to enqueue a batch of examples with different labels. Otherwise it's expecting a single scalar label Tensor (which can be stochastic when evaluated by the queue runners).
The first argument is expected to be a list of Tensors. It should have a better error message (I think this is what you ran into); please send a pull request or open an issue on GitHub so it gets one.
Start queue runners. Otherwise code that uses queues will deadlock. Or use Estimators or MonitoredSession so you don't need to worry about this.
(Edit based on comments) stratified_sample does not shuffle the data, it just accepts/rejects! So if your data is not randomized, consider putting it through slice_input_producer (enqueue_many=False) or shuffle_batch (enqueue_many=True) before sampling if you want it to come out in a random order.
Modified code (improved based on Jason's comments):
import numpy
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes

with tf.Graph().as_default():
    batch_size = 100
    data = ['a'] * 9000 + ['b'] * 1000
    labels = [1] * 9000 + [0] * 1000
    data_tensor = ops.convert_to_tensor(data, dtype=dtypes.string)
    label_tensor = ops.convert_to_tensor(labels, dtype=dtypes.int32)
    shuffled_data, shuffled_labels = tf.train.slice_input_producer(
        [data_tensor, label_tensor], shuffle=True, capacity=3*batch_size)
    target_probs = numpy.array([0.5, 0.5])
    data_batch, label_batch = tf.contrib.training.stratified_sample(
        [shuffled_data], shuffled_labels, target_probs, batch_size,
        queue_capacity=2*batch_size)

    with tf.Session() as session:
        tf.local_variables_initializer().run()
        tf.global_variables_initializer().run()
        coordinator = tf.train.Coordinator()
        tf.train.start_queue_runners(session, coord=coordinator)
        num_iter = 10
        sum_ones = 0.
        for _ in range(num_iter):
            d, l = session.run([data_batch, label_batch])
            count_ones = l.sum()
            sum_ones += float(count_ones)
            print('percentage "a" = %.3f' % (float(count_ones) / len(l)))
        print('Overall: {}'.format(sum_ones / (num_iter * batch_size)))
        coordinator.request_stop()
        coordinator.join()
Outputs:
percentage "a" = 0.480
percentage "a" = 0.440
percentage "a" = 0.580
percentage "a" = 0.570
percentage "a" = 0.580
percentage "a" = 0.520
percentage "a" = 0.480
percentage "a" = 0.460
percentage "a" = 0.390
percentage "a" = 0.530
Overall: 0.503
I want to use notMNIST to train VGG16. The train_dataset shape is (batch_size, image_size, image_size, channels) and the train_labels shape is (batch_size, 10).
My code is:
import tensorflow as tf
import tensorflow.contrib.slim.nets as nets
import pickle
import numpy as np

slim = tf.contrib.slim

with open('notMNIST.pickle', 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']

batch_size = 16
num_labels = 10
image_size = 28
num_channnels = 1
train_log_dir = './variables/'

train_dataset = np.reshape(train_dataset, (-1, image_size, image_size, num_channnels))
train_labels = (np.arange(num_labels) == train_labels[:, None]).astype(np.float32)
tf_train_labels = tf.constant(train_labels)
tf_train_dataset = tf.constant(train_dataset)
tf_train_dataset = tf.image.resize_images(tf_train_dataset, [224, 224])

with tf.Graph().as_default():
    predictions, _ = nets.vgg.vgg_16(tf_train_dataset, 10, is_training=True)
    slim.losses.softmax_cross_entropy(predictions, tf_train_labels)
    total_loss = slim.losses.get_total_loss()
    tf.summary.scalar('losses/total_loss', total_loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_tensor = slim.learning.create_train_op(total_loss, optimizer)
    slim.learning.train(train_tensor, train_log_dir)
But this code cannot run; the error is:
Traceback (most recent call last):
File "/Users/szp/Documents/github/deeplearning_example/not_mnist_cnn/Inception_v1_cnn.py", line 33, in <module>
total_loss = slim.losses.get_total_loss()
File "/Users/szp/Documents/github/deeplearning_example/venv/lib/python3.5/site-packages/tensorflow/python/util/deprecation.py", line 117, in new_func
return func(*args, **kwargs)
File "/Users/szp/Documents/github/deeplearning_example/venv/lib/python3.5/site-packages/tensorflow/contrib/losses/python/losses/loss_ops.py", line 264, in get_total_loss
return math_ops.add_n(losses, name=name)
File "/Users/szp/Documents/github/deeplearning_example/venv/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 1861, in add_n
raise ValueError("inputs must be a list of at least one Tensor with the "
ValueError: inputs must be a list of at least one Tensor with the same dtype and shape
I traced the code: inside the get_total_loss function, get_losses() does not return any values.