Shap & PyTorch Lightning - Problem with Tensor size

Shap & PyTorch Lightning - Problem with Tensor size - python

I am trying to use shap to explain the outputs of a Pytorch (Lightning) model. Here is the code:
train_size = int(0.7 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset,[train_size,val_size,test_size])
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=256, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=True)
model = Model.load_from_checkpoint("path")
batch = next(iter(test_dataloader))
x, _, _ = batch
background = x[:100].to(model.device)
test_points = x[100:180].to(model.device)
# model(test_points) PLEASE NOTE THAT THIS LINE RUN WITH NO WARNING/ERROR
e = shap.DeepExplainer(model, background)
shap_values = e.shap_values(test_points)
the last line of the code, raise the following error:
Traceback (most recent call last):
File "shap_computation.py", line 40, in <module>
main()
File "shap_computation.py", line 35, in main
shap_values = e.shap_values(test_points)
File "virtualenv/lib/python3.9/site-packages/shap/explainers/_deep/__init__.py", line 124, in shap_values
return self.explainer.shap_values(X, ranked_outputs, output_rank_order, check_additivity=check_additivity)
File "virtualenv/lib/python3.9/site-packages/shap/explainers/_deep/deep_pytorch.py", line 185, in shap_values
sample_phis = self.gradient(feature_ind, joint_x)
File "virtualenv/lib/python3.9/site-packages/shap/explainers/_deep/deep_pytorch.py", line 121, in gradient
grad = torch.autograd.grad(selected, x,
File "virtualenv/lib/python3.9/site-packages/torch/autograd/__init__.py", line 300, in grad
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "virtualenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 62, in __call__
return self.hook(module, *args, **kwargs)
File "virtualenv/lib/python3.9/site-packages/shap/explainers/_deep/deep_pytorch.py", line 226, in deeplift_grad
return op_handler[module_type](module, grad_input, grad_output)
File "virtualenv/lib/python3.9/site-packages/shap/explainers/_deep/deep_pytorch.py", line 358, in nonlinear_1d
grad_output[0] * (delta_out / delta_in).repeat(dup0))
RuntimeError: The size of tensor a (50) must match the size of tensor b (25) at non-singleton dimension 1
is there anyone that can help?

The original model was something like
fc1 = nn.Linear(...)
fc2 = nn.Linear(...)
and so on. Inspired by a discussion on GitHub, I found out that by changing the model using nn.Sequential. The code posted in the question works without problems

Related

RuntimeError: each element in list of batch should be of equal size in BERT

Now i want to train bert model.
But runtimeError occured.
I don't know how to solve this error.
Please give me some help.
Here is my code.
I excute this code in
python = 3.8.10
pytorch = 1.8.0
I used IMDB datasets.
I tried change version and check data.
And the dataset is variable data.
How to handle variable data?
Please give me some tips.
def pretraining(
model: MLMandNSPmodel,
model_name: str,
train_dataset: PretrainDataset,
val_dataset: PretrainDataset,
):
# Below options are just our recommendation. You can choose different options if you want.
batch_size = 8
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
epochs = 200 # 200 if you want to feel the effect of pretraining
steps_per_a_epoch: int=2000
steps_for_val: int=200
### YOUR CODE HERE
# pretraining(model, model_name, train_dataset, val_dataset)
MLM_train_losses: List[float] = None
MLM_val_losses: List[float] = None
NSP_train_losses: List[float] = None
NSP_val_losses: List[float] = None
MLM_train_losses = []
MLM_val_losses = []
NSP_train_losses = []
NSP_val_losses = []
print('')
print(train_dataset)
print(val_dataset)
print('')
train_data_iterator = iter(
torch.utils.data.dataloader.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=False))
# train_data_iterator = torch.utils.data.dataloader.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=False)
eval_data_iterator = iter(
torch.utils.data.dataloader.DataLoader(val_dataset, batch_size=batch_size, num_workers=2, shuffle=False))
# eval_data_iterator = torch.utils.data.dataloader.DataLoader(val_dataset, batch_size=batch_size, num_workers=2, shuffle=False)
loss_log = tqdm(total=0, bar_format='{desc}')
i = 0
for epoch in trange(epochs, desc="Epoch", position=0):
i += 1
# Run batches for 'steps_per_a_epoch' times
MLM_loss = 0
NSP_loss = 0
model.train()
for step in trange(steps_per_a_epoch, desc="Training steps"):
optimizer.zero_grad()
src, mlm, mask, nsp = next(train_data_iterator)
mlm_loss, nsp_loss = calculate_losses(model, src, mlm, mask, nsp)
MLM_loss += mlm_loss
NSP_loss += nsp_loss
loss = mlm_loss + nsp_loss
loss.backward()
optimizer.step()
des = 'Loss: {:06.4f}'.format(loss.cpu())
loss_log.set_description_str(des)
# Calculate training loss
MLM_loss = MLM_loss / steps_per_a_epoch
NSP_loss = NSP_loss / steps_per_a_epoch
MLM_train_losses.append(float(MLM_loss.data))
NSP_train_losses.append(float(NSP_loss.data))
# Calculate valid loss
model.eval()
valid_mlm_loss = 0.
valid_nsp_loss = 0.
for step in trange(steps_for_val, desc="Evaluation steps"):
src, mlm, mask, nsp = next(eval_data_iterator)
mlm_loss, nsp_loss = calculate_losses(model, src, mlm, mask, nsp)
valid_mlm_loss += mlm_loss
valid_nsp_loss += nsp_loss
valid_mlm_loss = valid_mlm_loss / steps_for_val
valid_nsp_loss = valid_nsp_loss / steps_for_val
MLM_val_losses.append(float(valid_mlm_loss.data))
NSP_val_losses.append(float(valid_nsp_loss.data))
torch.save(model.state_dict(), os.path.join('/home/ml/Desktop/song/HW3/hw3/',model_name + str(i)+'.pth'))
### END YOUR CODE
assert len(MLM_train_losses) == len(MLM_val_losses) == epochs and \
len(NSP_train_losses) == len(NSP_val_losses) == epochs
assert all(isinstance(loss, float) for loss in MLM_train_losses) and \
all(isinstance(loss, float) for loss in MLM_val_losses) and \
all(isinstance(loss, float) for loss in NSP_train_losses) and \
all(isinstance(loss, float) for loss in NSP_val_losses)
return MLM_train_losses, MLM_val_losses, NSP_train_losses, NSP_val_losses
And this is error message
(hw3) ml#automl03:~/Desktop/song/HW3/hw3$ python pretrain.py
======MLM & NSP Pretraining======
<__main__.PretrainDataset object at 0x7fa72117eb80>
<__main__.PretrainDataset object at 0x7fa70d0a2a30>
<torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fa70c8b3640>
<torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fa70c8b3670>
Training steps: 0%| | 0/2000 [00:00<?, ?it/s]
Epoch: 0%| | 0/200 [00:00<?, ?it/s]
Traceback (most recent call last):
File "pretrain.py", line 527, in <module>
pretrain_model()
File "pretrain.py", line 505, in pretrain_model
= pretraining(model, model_name, train_dataset, val_dataset)
File "pretrain.py", line 316, in pretraining
src, mlm, mask, nsp = next(train_data_iterator)
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
data = self._next_data()
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
return self._process_data(data)
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
data.reraise()
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
data = fetcher.fetch(index)
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 35, in fetch
return self.collate_fn(data)
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 83, in default_collate
return [default_collate(samples) for samples in transposed]
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 83, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 81, in default_collate
raise RuntimeError('each element in list of batch should be of equal size')
RuntimeError: each element in list of batch should be of equal siz

InvalidArgumentError: Can not squeeze dim[2], expected a dimension of 1, got 10

I am doing Covid19 facemask detection project and when I train my image dataset I find a error which I can't understand. So, please help me to solve this problem. the error is given below.
Epoch 1/20
Traceback (most recent call last):
File "Mask_detection.py", line 108, in <module>
epochs=Epoch)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1098, in fit
tmp_logs = train_function(iterator)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 780, in __call__
result = self._call(*args, **kwds)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 840, in _call
return self._stateless_fn(*args, **kwds)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 2829, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1848, in _filtered_call
cancellation_manager=cancellation_manager)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1924, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 550, in call
ctx=ctx)
File "C:\Users\ABDEALIVORA\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Can not squeeze dim[2], expected a dimension of 1, got 10
[[node categorical_crossentropy/remove_squeezable_dimensions/Squeeze (defined at Mask_detection.py:108) ]] [Op:__inference_train_function_889]
Function call stack:
train_function
2020-09-28 12:37:31.761507: W tensorflow/core/kernels/data/generator_dataset_op.cc:103] Error occurred when finalizing GeneratorDataset iterator: Failed precondition: Python interpreter s
tate is not initialized. The process may be terminated.
[[{{node PyFunc}}]]
I provide my python code that helps you to understand the problem. My system is without GPU so this error is related with GPU.
DIRECTORY = 'images'
Categories = ["With_mask","Without_mask"]
batch_size= 10
num_class = 10
Epoch= 20
data = []
label =[]
for category in Categories:
path = os.path.join(DIRECTORY,category)
for img in os.listdir(path):
img_path = os.path.join(path,img)
image = load_img(img_path,target_size =(64,64))
image = img_to_array(image)
image = preprocess_input(image)
data.append(image)
label.append(category)
lb = LabelBinarizer()
label = lb.fit_transform(label)
label = to_categorical(label)
data = numpy.asarray(data,dtype = 'float32')
label = numpy.array(label)
print("////")
x_train,x_test,y_train,y_test = train_test_split(data,label,stratify=label,test_size=0.2,random_state=3)
y_train = utils.to_categorical(y_train, num_class)
y_test = utils.to_categorical(y_test, num_class)
mask_model = Sequential()
mask_model.add(Conv2D(32,kernel_size=(3,3),activation= 'linear',padding ="same",input_shape=(64,64,3)))
mask_model.add(LeakyReLU(alpha = 0.3))
mask_model.add(Conv2D(32,kernel_size=(3,3),activation= 'linear',padding ="same",input_shape=(64,64,3)))
mask_model.add(LeakyReLU(alpha=0.3))
mask_model.add(MaxPooling2D(pool_size =(2,2)))
mask_model.add(Conv2D(32,kernel_size=(3,3),activation= 'linear',padding ="same",input_shape=(64,64,3)))
mask_model.add(LeakyReLU(alpha=0.3))
mask_model.add(MaxPooling2D(pool_size =(2,2)))
mask_model.add(Flatten())
mask_model.add(Dense(128,activation = "linear"))
mask_model.add(LeakyReLU(alpha=0.3))
mask_model.add(Dense(10,activation= "softmax"))
mask_model.compile(optimizer ='adam',loss = 'categorical_crossentropy',metrics =['accuracy'] )
mask_model.summary()
datagen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
rotation_range=20,
width_shift_range=0.2,
height_shift_range=0.2,
horizontal_flip=True)
datagen.fit(x_train)
mask_model.fit(datagen.flow(x_train,y_train,batch_size=10),
steps_per_epoch=len(x_train),
validation_data=(x_test, y_test),
validation_steps=len(x_test) // batch_size,
workers=0,
epochs=Epoch)
print("//")
for e in range(Epoch):
print('Epoch', e)
batches = 0
for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
mask_model.fit(x_batch, y_batch)
batches += 1
if batches >= len(x_train) / 32:
break
mask_model.save("Mask_model/mask_model.h5")

ValueError: Variable <tf.Variable 'TensorGraph/base_params/trainable_float32_1:0' shape=(1,) dtype=float32> has `None` for gradient

I’m trying to implement a prediction of low birth rate using nengo and tensorflow with a SNN model.\
But, I got the following Value error (in Anaconda):
Traceback (most recent call last):
File "C:\Users\USER\NengoPRJ\nengo_lowbirth.py", line 95, in <module>
sim.fit(train_data, {out_p: train_labels}, epochs=epochs)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\nengo\utils\magic.py", line 181, in __call__
return self.wrapper(self.__wrapped__, self.instance, args, kwargs)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\nengo_dl\simulator.py", line 66, in require_open
return wrapped(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\nengo_dl\simulator.py", line 869, in fit
"fit", x=x, y=y, n_steps=n_steps, stateful=stateful, **kwargs
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\nengo\utils\magic.py", line 181, in __call__
return self.wrapper(self.__wrapped__, self.instance, args, kwargs)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\nengo_dl\simulator.py", line 50, in with_self
output = wrapped(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\nengo_dl\simulator.py", line 1032, in _call_keras
outputs = getattr(self.keras_model, func_type)(**func_args)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 819, in fit
use_multiprocessing=use_multiprocessing)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\engine\training_arrays.py", line 680, in fit
steps_name='steps_per_epoch')
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\engine\training_arrays.py", line 189, in model_iteration
f = _make_execution_function(model, mode)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\engine\training_arrays.py", line 571, in _make_execution_function
return model._make_execution_function(mode)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 2125, in _make_execution_function
self._make_train_function()
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 2057, in _make_train_function
params=self._collected_trainable_weights, loss=self.total_loss)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\optimizer_v2\optimizer_v2.py", line 503, in get_updates
grads = self.get_gradients(loss, params)
File "C:\ProgramData\Anaconda3\envs\tf210\lib\site-packages\tensorflow_core\python\keras\optimizer_v2\optimizer_v2.py", line 397, in get_gradients
"K.argmax, K.round, K.eval.".format(param))
ValueError: Variable <tf.Variable 'TensorGraph/base_params/trainable_float32_1:0' shape=(1,) dtype=float32> has `None` for gradient. Please make sure that all of your ops have a gradient defined (i.e. are differentiable). Common ops without gradient: K.argmax, K.round, K.eval.
(in Google Colab):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-205839bf5640> in <module>()
96 loss={out_p: tf.losses.SparseCategoricalCrossentropy(from_logits=True)})
97
---> 98 sim.fit(train_data, {out_p: train_labels}, epochs=epochs)
99
100
13 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py in get_gradients(self, loss, params)
467 "gradient defined (i.e. are differentiable). "
468 "Common ops without gradient: "
--> 469 "K.argmax, K.round, K.eval.".format(param))
470 grads = self._clip_gradients(grads)
471 return grads
ValueError: Variable <tf.Variable 'TensorGraph/base_params/trainable_float32_1:0' shape=(1,) dtype=float32> has `None` for gradient. Please make sure that all of your ops have a gradient defined (i.e. are differentiable). Common ops without gradient: K.argmax, K.round, K.eval.
I found many solution in github and stackoverflow.
https://github.com/tensorflow/tensorflow/issues/1511
https://github.com/huggingface/transformers/issues/5427
But it couldn't resolve my error.
My code are as follows:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import random
import nengo
import nengo_dl
import requests
seed = 1
amp =1
max_rates = 100
intercepts = 0
tau_rc = 0.02
noise_filter = 0.1 #noise_filter
train_data_rate = 0.85
learning_rate = 0.001
epochs = 5
np.random.seed(seed)
do_train = True
url = "https://github.com/nfmcclure/tensorflow_cookbook/raw/master/01_Introduction/07_Working_with_Data_Sources/birthweight_data/birthweight.dat"
birth_file = requests.get(url)
birth_all_data = birth_file.text.split('\r\n')
birth_header = [x for x in birth_all_data[0].split('\t') if len(x)>=1]
birth_data = [[float(x) for x in y.split('\t') if len(x)>=1] for y in birth_all_data[1:] if len(y)>=1]
data_size = len(birth_data)
x_data = np.array([x[1:8] for x in birth_data])
y_data = np.array([y[0] for y in birth_data])
train_samples = round(data_size*train_data_rate)
train_indices = np.random.choice(data_size, train_samples, replace=False)
testset = set(range(data_size)) - set(train_indices)
test_indices = np.array(list(testset))
x_train = x_data[train_indices]
y_train = np.transpose([y_data[train_indices]])
x_test = x_data[test_indices]
y_test = np.transpose([y_data[test_indices]])
def normalize_cols(m):
col_max = m.max(axis=0)
col_min = m.min(axis=0)
return (m - col_min) / (col_max - col_min)
x_train = np.nan_to_num(normalize_cols(x_train))
x_test = np.nan_to_num(normalize_cols(x_test))
##################################################
nfeatures = 7
#minibatch_size = 189 - train_samples
minibatch_size=1
with nengo.Network(seed=seed) as net:
net.config[nengo.Ensemble].max_rates = nengo.dists.Choice([max_rates])
net.config[nengo.Ensemble].intercepts = nengo.dists.Choice([intercepts])
neuron_type=nengo.LIF(amplitude=amp, tau_rc=tau_rc)
nengo_dl.configure_settings(stateful=False)
inp = nengo.Node([0] * nfeatures)
ens = nengo.Ensemble(1, 1, neuron_type=neuron_type)
x = nengo.Connection(inp, ens.neurons, transform=nengo_dl.dists.Glorot(), synapse=None)
inp_p = nengo.Probe(inp)
out_p = nengo.Probe(x, label="out_p")
out_p_filt = nengo.Probe(x, synapse=noise_filter, label="out_p_filt")
sim = nengo_dl.Simulator(net, minibatch_size=minibatch_size)
n_steps=20
train_data = np.reshape(x_train, (x_train.shape[0], 1, nfeatures))
train_labels = np.reshape(y_train, (y_train.shape[0], 1, 1))
test_data = np.tile(np.reshape(x_test, (x_test.shape[0], 1, nfeatures)), (1, n_steps, 1))
test_labels = np.tile(np.reshape(y_test, (y_test.shape[0], 1, 1)), (1, n_steps, 1))
def accuracy(outputs, targets):
return 100 * tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(outputs)), targets), tf.float32))
sim.compile(loss={out_p_filt: accuracy})
print("accuracy before training:", sim.evaluate(test_data, {out_p_filt: test_labels}, verbose=0)["loss"])
do_training = do_train;
if do_training:
sim.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
loss={out_p: tf.losses.SparseCategoricalCrossentropy(from_logits=True)})
sim.fit(train_data, {out_p: train_labels}, epochs=epochs)
System:
python:3.7.7
tensorflow:2.2.0 and 2.1.0
nengo:3.0.0
nengo-dl:3.2.0
How can I solve this problem?
Thanks in advance.

Expected object of device type cuda but got device type cpu

I am trying to switch the training of my network from cpu to gpu but keep getting the following error.
I am getting the following error
Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _thnn_conv2d_forward
Error occurs, No graph saved
Traceback (most recent call last):
File "<ipython-input-6-2720a5ea768d>", line 12, in <module>
tb.add_graph(network, images)
File "E:\Anaconda\lib\site-packages\torch\utils\tensorboard\writer.py", line 707, in add_graph
self._get_file_writer().add_graph(graph(model, input_to_model, verbose))
File "E:\Anaconda\lib\site-packages\torch\utils\tensorboard\_pytorch_graph.py", line 291, in graph
raise e
File "E:\Anaconda\lib\site-packages\torch\utils\tensorboard\_pytorch_graph.py", line 285, in graph
trace = torch.jit.trace(model, args)
File "E:\Anaconda\lib\site-packages\torch\jit\__init__.py", line 882, in trace
check_tolerance, _force_outplace, _module_class)
File "E:\Anaconda\lib\site-packages\torch\jit\__init__.py", line 1034, in trace_module
module._c._create_method_from_trace(method_name, func, example_inputs, var_lookup_fn, _force_outplace)
File "E:\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 530, in __call__
result = self._slow_forward(*input, **kwargs)
File "E:\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 516, in _slow_forward
result = self.forward(*input, **kwargs)
File "<ipython-input-5-cd44a4e4fb73>", line 52, in forward
t = F.relu(self.conv1(t))
File "E:\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 530, in __call__
result = self._slow_forward(*input, **kwargs)
File "E:\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 516, in _slow_forward
result = self.forward(*input, **kwargs)
File "E:\Anaconda\lib\site-packages\torch\nn\modules\conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File "E:\Anaconda\lib\site-packages\torch\nn\modules\conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _thnn_conv2d_forward```**
I think it says that argument is in type cpu but I changed it in the training part.
I have the following code
Conv-neural network
class Network(nn.Module):
def __init__(self):
super(Network, self).__init__()
self.conv1 = nn.Conv2d( in_channels= 1, out_channels= 6, kernel_size=5 )
self.conv2 = nn.Conv2d( in_channels= 6, out_channels= 12, kernel_size=5 )
self.fc1 = nn.Linear( in_features = 12*4*4, out_features = 120 )
self.fc2 = nn.Linear( in_features = 120, out_features = 60 )
self.out = nn.Linear( in_features = 60, out_features = 10 )
def forward(self, t):
t = F.relu(self.conv1(t))
t = F.max_pool2d(t, kernel_size=2, stride=2)
t = F.relu(self.conv2(t))
t = F.max_pool2d(t, kernel_size=2, stride=2)
t = F.relu(self.fc1(t.reshape(-1, 12*4*4)))
t = F.relu(self.fc2(t))
t = self.out(t)
return t
The training part
parameters = dict(
lr = [.01, .001]
, batch_size = [10, 100, 1000]
, shuffle = [True, False]
)
param_values = [v for v in parameters.values()]
param_values
for lr, batch_size, shuffle in product(*param_values):
network = Network()
network.to(device)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle = shuffle)
optimizer = optim.Adam(network.parameters(), lr=lr)
images, labels = next(iter(train_loader))
grid = torchvision.utils.make_grid(images)
comment = f' batch_size={batch_size} lr={lr} shuffle={shuffle}'
tb = SummaryWriter(comment = comment)
tb.add_image('images', grid)
tb.add_graph(network, images)
for epoch in range(10):
total_loss = 0
total_correct = 0
for batch in train_loader: # Get batch
images, labels = batch
images = images.to(device) # Changing data to gpu
preds = network(images)
loss = F.cross_entropy(preds, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item() * batch_size
total_correct += get_num_correct(preds, labels)
tb.add_scalar('Loss:', total_loss, epoch)
tb.add_scalar('Number Correct:', total_correct, epoch)
tb.add_scalar('Accuracy:', total_correct/len(train_set), epoch)
#tb.add_histogram('conv1.bias', network.conv1.bias, epoch)
#tb.add_histogram('conv1.weight', network.conv1.weight, epoch)
#tb.add_histogram('conv1.weight.grap', network.conv1.weight.grad, epoch)
for name, weight in network.named_parameters():
tb.add_histogram(name, weight, epoch)
tb.add_histogram(f'{name}.grad', weight.grad, epoch)
print("epoch:", epoch, "total_correct:", total_correct, "loss:",total_loss)
tb.close()
I am new to deep learning so any help will be highly appreciated. Thanks

You missed moving your labels to gpu i.e.
labels = labels.to(device)
You also need to move these to gpu:
images, labels = next(iter(train_loader))
images = images.to(device)
labels = labels.to(device)

How do I create a regression model with multiple outputs in tf.keras?

I'm attempting to train a regression model to predict attributes of music such as BPM. The model takes in spectrograms of audio snippets that are 256x128px png files and outputs a couple continuous values. I have the following code so far that I have developed based upon this guide on the tensorflow website:
import tensorflow as tf
import os
import random
import pathlib
AUTOTUNE = tf.data.experimental.AUTOTUNE
TRAINING_DATA_DIR = r'specgrams'
def gen_model():
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(256, 128, 3)),
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dense(2)
])
model.compile(optimizer=tf.keras.optimizers.RMSprop(0.001),
loss='mse',
metrics=['mse', 'mae'])
return model
def fetch_batch(batch_size=1000):
all_image_paths = []
all_image_labels = []
data_root = pathlib.Path(TRAINING_DATA_DIR)
files = data_root.iterdir()
for file in files:
file = str(file)
all_image_paths.append(os.path.abspath(file))
label = file[:-4].split('-')[2:]
label = float(label[0]) / 200, int(label[1]) / 1000.0
all_image_labels.append(label)
def preprocess_image(path):
img_raw = tf.io.read_file(path)
image = tf.image.decode_png(img_raw, channels=3)
image = tf.image.resize(image, [256, 128])
image /= 255.0
return image
def preprocess(path, label):
return preprocess_image(path), label
path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
image_ds = path_ds.map(preprocess_image, num_parallel_calls=AUTOTUNE)
label_ds = tf.data.Dataset.from_tensor_slices(all_image_labels)
ds = tf.data.Dataset.zip((image_ds, label_ds))
ds = ds.shuffle(buffer_size=len(os.listdir(TRAINING_DATA_DIR)))
ds = ds.repeat()
ds = ds.batch(batch_size)
ds = ds.prefetch(buffer_size=AUTOTUNE)
return ds
ds = fetch_batch()
model = gen_model()
model.fit(ds, epochs=1, steps_per_epoch=10)
However I believe I have made a mistake with the structure of my model or how I am preprocessing the training data because I get an error about incorrect dimensions but I'm struggling to narrow down exactly where the issue is. I understand that the guide I followed was for classification problem as opposed to regression and my "labels" are an array of 2 value which is what is causing the problem but I'm not sure how to resolve this.
For context the filenames are in the format xxx-xxx-A-B.png where A and B are the two desired output values of the model. A is a floating-point value somewhere between 70 and 180 and B is an integer value between 0-1000. As such the label variable for each image looks something like this: (0.64, 0.319).
This is the error I am seeing when I attempt to execute the above script:
Traceback (most recent call last):
File "C:\Users\cainy\Desktop\BeatNet\training.py", line 60, in <module>
model.fit(ds, epochs=1, steps_per_epoch=3)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 791, in fit
initial_epoch=initial_epoch)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1515, in fit_generator
steps_name='steps_per_epoch')
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training_generator.py", line 257, in model_iteration
batch_outs = batch_function(*batch_data)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1259, in train_on_batch
outputs = self._fit_function(ins) # pylint: disable=not-callable
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\backend.py", line 3217, in __call__
outputs = self._graph_fn(*converted_inputs)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 558, in __call__
return self._call_flat(args)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 627, in _call_flat
outputs = self._inference_function.call(ctx, args)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 415, in call
ctx=ctx)
File "C:\Users\cainy\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\execute.py", line 66, in quick_execute
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Can not squeeze dim[1], expected a dimension of 1, got 2
[[{{node metrics/accuracy/Squeeze}}]] [Op:__inference_keras_scratch_graph_734]
Edit: I have uploaded the source code to GitHub here.

You currently only have 1 output - a tensor with length 2 (per batch element). If you want to use/monitor separate losses you'll need to unstack it in both the model output and the labels.
I'm not sure if models.Sequential will be suitable, but you can definitely use the functional API:
def gen_model():
inputs = tf.keras.layers.Input(shape=(256, 128, 3), dtype=tf.float32)
x = inputs
x = tf.keras.layers.Dense(256, activation='relu')
x = tf.keras.layers.Dense(2)
a, b = tf.keras.layers.Lambda(tf.unstack, arguments=dict(axis=-1))(x)
model = tf.keras.models.Model(inputs=inputs, outputs=[a, b])
model.compile(optimizer=tf.keras.optimizers.RMSprop(0.001),
loss=['mse', 'mae'],
metrics=[['mse'], ['mae']])
return model
And in your preprocessing:
def preprocess(path, label):
return preprocess_image(path), tf.unstack(label, axis=-1)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Shap & PyTorch Lightning - Problem with Tensor size - python

The original model was something like fc1 = nn.Linear(...) fc2 = nn.Linear(...) and so on. Inspired by a discussion on GitHub, I found out that by changing the model using nn.Sequential. The code posted in the question works without problems

Related

RuntimeError: each element in list of batch should be of equal size in BERT

InvalidArgumentError: Can not squeeze dim[2], expected a dimension of 1, got 10

ValueError: Variable <tf.Variable 'TensorGraph/base_params/trainable_float32_1:0' shape=(1,) dtype=float32> has `None` for gradient

Expected object of device type cuda but got device type cpu

How do I create a regression model with multiple outputs in tf.keras?

Categories

Resources