Multi-GPU training of AllenNLP coreference resolution - python

I'm trying to replicate (or come close to) the results obtained by the End-to-end Neural Coreference Resolution paper on the CoNLL-2012 shared task. I intend to do some enhancements on top of it, so I decided to use AllenNLP's CoreferenceResolver. This is how I'm initialising and training the model:
import torch
from allennlp.common import Params
from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import ConllCorefReader
from allennlp.data.dataset_readers.dataset_utils import Ontonotes
from allennlp.data.iterators import BasicIterator, MultiprocessIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.models import CoreferenceResolver
from allennlp.modules import Embedding, FeedForward
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import TokenCharactersEncoder
from allennlp.training import Trainer
from allennlp.training.learning_rate_schedulers import LearningRateScheduler
from torch.nn import LSTM, ReLU
from torch.optim import Adam


def read_data(directory_path):
    data = []
    for file_path in Ontonotes().dataset_path_iterator(directory_path):
        data += dataset_reader.read(file_path)
    return data


INPUT_FILE_PATH_TEMPLATE = "data/CoNLL-2012/v4/data/%s"
embeddings_dimension = 300  # glove.840B.300d vectors are 300-dimensional

dataset_reader = ConllCorefReader(10, {"tokens": SingleIdTokenIndexer(),
                                       "token_characters": TokenCharactersIndexer()})
training_data = read_data(INPUT_FILE_PATH_TEMPLATE % "train")
validation_data = read_data(INPUT_FILE_PATH_TEMPLATE % "development")

vocabulary = Vocabulary.from_instances(training_data + validation_data)
model = CoreferenceResolver(vocab=vocabulary,
                            text_field_embedder=BasicTextFieldEmbedder({"tokens": Embedding.from_params(vocabulary, Params({"embedding_dim": embeddings_dimension, "pretrained_file": "glove.840B.300d.txt"})),
                                                                        "token_characters": TokenCharactersEncoder(embedding=Embedding(num_embeddings=vocabulary.get_vocab_size("token_characters"), embedding_dim=8, vocab_namespace="token_characters"),
                                                                                                                   encoder=CnnEncoder(embedding_dim=8, num_filters=50, ngram_filter_sizes=(3, 4, 5), output_dim=100))}),
                            context_layer=PytorchSeq2SeqWrapper(LSTM(input_size=400, hidden_size=200, num_layers=1, dropout=0.2, bidirectional=True, batch_first=True)),
                            mention_feedforward=FeedForward(input_dim=1220, num_layers=2, hidden_dims=[150, 150], activations=[ReLU(), ReLU()], dropout=[0.2, 0.2]),
                            antecedent_feedforward=FeedForward(input_dim=3680, num_layers=2, hidden_dims=[150, 150], activations=[ReLU(), ReLU()], dropout=[0.2, 0.2]),
                            feature_size=20,
                            max_span_width=10,
                            spans_per_word=0.4,
                            max_antecedents=250,
                            lexical_dropout=0.5)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

iterator = BasicIterator(batch_size=1)
iterator.index_with(vocabulary)

optimiser = Adam(model.parameters(), weight_decay=0.1)
Trainer(model=model,
        train_dataset=training_data,
        validation_dataset=validation_data,
        optimizer=optimiser,
        learning_rate_scheduler=LearningRateScheduler.from_params(optimiser, Params({"type": "step", "step_size": 100})),
        iterator=iterator,
        num_epochs=150,
        patience=1,
        cuda_device=cuda_device).train()
After reading the data, I trained the model but ran out of GPU memory: RuntimeError: CUDA out of memory. Tried to allocate 4.43 GiB (GPU 0; 11.17 GiB total capacity; 3.96 GiB already allocated; 3.40 GiB free; 3.47 GiB cached). Therefore, I attempted to train the model on multiple GPUs. I'm using Tesla K80s (which have 12 GiB of memory each).
I've tried making use of AllenNLP's MultiprocessIterator, by initialising the iterator as MultiprocessIterator(BasicIterator(batch_size=1), num_workers=torch.cuda.device_count()). However, only one GPU is used (verified by monitoring memory usage through the nvidia-smi command) and I got the error below. I also tried fiddling with its parameters (increasing num_workers and decreasing output_queue_size) and with the ulimit (as mentioned in this PyTorch issue), to no avail.
Process Process-3:
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/user/.local/lib/python3.6/site-packages/allennlp/data/iterators/multiprocess_iterator.py", line 32, in _create_tensor_dicts
output_queue.put(tensor_dict)
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/user/.local/lib/python3.6/site-packages/allennlp/data/iterators/multiprocess_iterator.py", line 32, in _create_tensor_dicts
output_queue.put(tensor_dict)
File "<string>", line 2, in put
File "<string>", line 2, in put
File "/usr/lib/python3.6/multiprocessing/managers.py", line 772, in _callmethod
raise convert_to_error(kind, result)
File "/usr/lib/python3.6/multiprocessing/managers.py", line 772, in _callmethod
raise convert_to_error(kind, result)
multiprocessing.managers.RemoteError:
---------------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/managers.py", line 228, in serve_client
request = recv()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
File "/home/user/.local/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 276, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/usr/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
return recvfds(s, 1)[0]
File "/usr/lib/python3.6/multiprocessing/reduction.py", line 161, in recvfds
len(ancdata))
RuntimeError: received 0 items of ancdata
---------------------------------------------------------------------------
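(For reference, the kind of workaround suggested in that PyTorch issue looks roughly like the snippet below; neither raising the file-descriptor limit nor switching the tensor-sharing strategy helped in my case.)
import resource
import torch.multiprocessing

# Raise the per-process open-file limit (the "ulimit" fix) up to the hard limit.
soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard_limit, hard_limit))

# Share tensors through the file system instead of file descriptors.
torch.multiprocessing.set_sharing_strategy("file_system")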
I also tried achieving this through PyTorch's DataParallel, by wrapping the model's context_layer, mention_feedforward and antecedent_feedforward with a custom DataParallelWrapper (to provide compatibility with the class functions AllenNLP expects). Still, only one GPU is used and it eventually runs out of memory as before.
class DataParallelWrapper(DataParallel):
    def __init__(self, module):
        super().__init__(module)

    def get_output_dim(self):
        return self.module.get_output_dim()

    def get_input_dim(self):
        return self.module.get_input_dim()

    def forward(self, *inputs):
        return self.module.forward(inputs)

After some digging through the code I found out that AllenNLP does this under the hood directly through its Trainer: the cuda_device argument can be either a single int (for single-device training) or a list of ints (for multi-GPU training):
cuda_device : Union[int, List[int]], optional (default = -1)
    An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
So all the required GPU devices should be passed instead:
if torch.cuda.is_available():
    cuda_device = list(range(torch.cuda.device_count()))
    model = model.cuda(cuda_device[0])
else:
    cuda_device = -1
Note that the model still has to be manually moved to the GPU (via model.cuda(...)), as it would otherwise try to use multiple CPUs instead.
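For completeness, the Trainer call itself is unchanged from the snippet above; only cuda_device now receives the full list (the other arguments are kept exactly as before and trimmed here for brevity):
Trainer(model=model,
        train_dataset=training_data,
        validation_dataset=validation_data,
        optimizer=optimiser,
        iterator=iterator,
        num_epochs=150,
        patience=1,
        cuda_device=cuda_device).train()  # cuda_device is now a list, e.g. [0, 1, 2, 3]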

Related

Facing issues while converting a pre-trained PyTorch model to CoreML

I'm trying to convert a pre-trained model from PyTorch to CoreML. I have created a script to achieve this. I'm able to load the model and convert it to TorchScript with both methods (i.e. tracing and scripting).
However, calling the coremltools.convert() method on the traced or scripted model throws an error.
I have included the scripts for both methods along with the errors thrown.
System Information
MacOS = 12.4
Python = 3.9
protobuf = 3.19.0
coremltools = 6.0b1
torch = 1.10.2
torchvision = 0.11.3
Note - I have tried multiple versions of the libraries mentioned above, but that did not help in any way.
Method 1 -> Tracing
Code -
import coremltools as coremltools
import numpy as np
import torch
import torchvision as torchvision


def do_trace(in_model, in_input):
    model_trace = torch.jit.trace(in_model, in_input)
    model_trace.eval()
    return model_trace


def dict_to_tuple(out_dict):
    if "masks" in out_dict.keys():
        return out_dict["boxes"], out_dict["scores"], out_dict["labels"], out_dict["masks"]
    return out_dict["boxes"], out_dict["scores"], out_dict["labels"]


class PredictionModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=True)

    def forward(self, in_input):
        output = self.model(in_input)
        return dict_to_tuple(output[0])


inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=(1, 3, 300, 300)))
model = PredictionModel().eval()
with torch.no_grad():
    output = model(inp)
trace_model = do_trace(model, inp)
ml_model = coremltools.convert(trace_model, inputs=[coremltools.TensorType(shape=(1, 3, 300, 300))])
print(ml_model)
Error -
Converting PyTorch Frontend ==> MIL Ops: 3%|▎ | 74/2627 [00:00<00:05, 436.01 ops/s]
Traceback (most recent call last):
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 91, in _perform_torch_convert
prog = converter.convert()
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 263, in convert
convert_nodes(self.context, self.graph)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 89, in convert_nodes
add_op(context, node)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 3973, in reciprocal
context.add(mb.inverse(x=inputs[0], name=node.name))
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/mil/ops/registry.py", line 63, in add_op
return cls._add_op(op_cls, **kwargs)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/mil/builder.py", line 191, in _add_op
new_op.type_value_inference()
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/mil/operation.py", line 244, in type_value_inference
output_vals = self._auto_val(output_types)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/mil/operation.py", line 354, in _auto_val
builtin_val.val = v
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/mil/types/type_tensor.py", line 93, in val
raise ValueError(
ValueError: tensor should have value of type ndarray, got <class 'numpy.float32'> instead
Method 2 -> Scripting
Code -
import coremltools as coremltools
import torch
import torchvision as torchvision
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=True)
script_model = torch.jit.script(model)
ml_model = coremltools.convert(script_model, inputs=[coremltools.TensorType(shape=(1, 3, 300, 300))])
print(ml_model)
Error -
WARNING:root:Support for converting Torch Script Models is experimental. If possible you should use a traced model for conversion.
Traceback (most recent call last):
File "/Applications/PyCharm CE.app/Contents/plugins/python-ce/helpers/pydev/pydevd.py", line 1491, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "/Applications/PyCharm CE.app/Contents/plugins/python-ce/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/techlead/PycharmProjects/conversion_demo/main.py", line 8, in
ml_model = coremltools.convert(script_model, inputs=[coremltools.TensorType(shape=(1, 3, 300, 300))])
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/_converters_entry.py", line 426, in convert
mlmodel = mil_convert(
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/converter.py", line 182, in mil_convert
return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/converter.py", line 209, in _mil_convert
proto, mil_program = mil_convert_to_proto(
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/converter.py", line 272, in mil_convert_to_proto
prog = frontend_converter(model, **kwargs)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/converter.py", line 104, in call
return load(*args, **kwargs)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 51, in load
converter = TorchConverter(torchscript, inputs, outputs, cut_at_symbols)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 158, in init
raw_graph, params_dict = self._expand_and_optimize_ir(self.torchscript)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 478, in _expand_and_optimize_ir
graph, params_dict = TorchConverter._jit_pass_lower_graph(graph, torchscript)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 423, in _jit_pass_lower_graph
_lower_graph_block(graph)
File "/Users/techlead/PycharmProjects/conversion_demo/venv/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 402, in _lower_graph_block
module = getattr(node_to_module_map[_input], attr_name)
KeyError: images.7 defined in (%images.7 : torch.torchvision.models.detection.image_list.ImageList, %targets.31 : Dict(str, Tensor)[]? = prim::TupleUnpack(%405)
)
If you try to run any of the above snippets you'll see that the model gets successfully converted to TorchScript (both traced and scripted), but the last step, i.e. converting the TorchScript model to CoreML, fails. Please have a look at this issue and let me know how I can move further with this. Also, if I'm doing something wrong (e.g. passing the inputs incorrectly), let me know as well. This is my first time doing this, so I'm kind of a noob. Any help is appreciated. Thank you!

I am using pytorch dataparallel with 2 GPUs. Why are my model's state_dicts' empty on one GPU and have missing keys on the other GPU?

I have a problem with this GitHub project: https://github.com/researchmm/TTSR
If I use it on one GPU only, everything runs smoothly. Once I turn on the second GPU and use torch.nn.DataParallel, this results in "Missing key(s) in state_dict":
[2021-08-03 09:01:00,829] - [trainer.py file line:70] - INFO: Current epoch learning rate: 1.000000e-04
Traceback (most recent call last):
File "/rwthfs/rz/cluster/home/ps815691/git/TTSR/main.py", line 53, in <module>
t.train(current_epoch=epoch, is_init=False)
File "/rwthfs/rz/cluster/home/ps815691/git/TTSR/trainer.py", line 126, in train
sr_lv1, sr_lv2, sr_lv3 = self.model(sr=sr)
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 167, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 177, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/rwthfs/rz/cluster/home/ps815691/git/TTSR/model/TTSR.py", line 32, in forward
self.LTE_copy.load_state_dict(self.LTE.state_dict())#, strict=False)
File "/home/ps815691/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1223, in load_state_dict
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for LTE:
Missing key(s) in state_dict: "slice1.0.weight", "slice1.0.bias", "slice2.2.weight", "slice2.2.bias", "slice2.5.weight", "slice2.5.bias", "slice3.7.weight", "slice3.7.bias", "slice3.10.weight", "slice3.10.bias".
I printed the state_dicts for the "LTE" and "LTE_copy":
LTE GPU1 odict_keys([])
LTE GPU0 odict_keys(['sub_mean.weight', 'sub_mean.bias'])
LTE_Copy GPU1 odict_keys([])
LTE_Copy GPU0 odict_keys(['slice1.0.weight', 'slice1.0.bias', 'slice2.2.weight', 'slice2.2.bias', 'slice2.5.weight', 'slice2.5.bias', 'slice3.7.weight', 'slice3.7.bias', 'slice3.10.weight', 'slice3.10.bias', 'sub_mean.weight', 'sub_mean.bias'])
I do not get why that happens. Let me give you a quick introduction to the code:
The code starts in main.py. First, the model gets initialized from model/ttsr.py. This TTSR model is composed of several submodels, two of which are "LTE" and "LTE_copy". The model is then wrapped in nn.DataParallel, the trainer (trainer.py) is initialized with it, and t.train starts the training:
_model = TTSR.TTSR(args).to(device)
_model = nn.DataParallel(_model, list(range(args.num_gpu)))
t = Trainer(args, _logger, _dataloader, _model, _loss_all)
t.train(current_epoch=epoch, is_init=True)
In the train function, after a batch has been fed through the model, the model's output is fed back into the model to compute parts of the loss function (trainer.py line 97). The model then executes this code in ttsr.py:
### used in transferal perceptual loss
self.LTE_copy.load_state_dict(self.LTE.state_dict())
sr_lv1, sr_lv2, sr_lv3 = self.LTE_copy((sr + 1.) / 2.)
return sr_lv1, sr_lv2, sr_lv3
Does anyone have a clue why the error message above gets thrown? It does not appear if I use load_state_dict(..., strict=False), but doesn't that just ignore the underlying problem? There does not seem to be any LTE state_dict in GPU1's memory, for example.

CUDA_OUT_OF_MEMORY in PyTorch head2head model

I am executing the head2head model presented in the GitHub repo here.
When I run the code using the following command:
./scripts/train/train_on_target.sh Obama head2headDataset
with the contents of the train_on_target.sh file being:
target_name=$1
dataset_name=$2
python train.py --checkpoints_dir checkpoints/$dataset_name \
--target_name $target_name \
--name head2head_$target_name \
--dataroot datasets/$dataset_name/dataset \
--serial_batches
Then I am getting the following error:
Traceback (most recent call last):
File "train.py", line 108, in <module>
flow_ref, conf_ref, t_scales, n_frames_D)
File "/home/nitin/head2head/util/util.py", line 48, in get_skipped_flows
flow_ref_skipped[s], conf_ref_skipped[s] = flowNet(real_B[s][:,1:], real_B[s][:,:-1])
File "/home/nitin/anaconda3/envs/head2head/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/nitin/anaconda3/envs/head2head/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 150, in forward
return self.module(*inputs[0], **kwargs[0])
File "/home/nitin/anaconda3/envs/head2head/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/nitin/head2head/models/flownet.py", line 38, in forward
flow, conf = self.compute_flow_and_conf(input_A, input_B)
File "/home/nitin/head2head/models/flownet.py", line 55, in compute_flow_and_conf
flow1 = self.flowNet(data1)
File "/home/nitin/anaconda3/envs/head2head/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/nitin/head2head/models/flownet2_pytorch/models.py", line 156, in forward
flownetfusion_flow = self.flownetfusion(concat3)
File "/home/nitin/anaconda3/envs/head2head/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/nitin/head2head/models/flownet2_pytorch/networks/FlowNetFusion.py", line 62, in forward
concat0 = torch.cat((out_conv0,out_deconv0,flow1_up),1)
RuntimeError: CUDA out of memory. Tried to allocate 82.00 MiB (GPU 0; 5.80 GiB total capacity; 4.77 GiB already allocated; 73.56 MiB free; 4.88 GiB reserved in total by PyTorch)
I have checked the batch size in the file options/base_options.py. It is already set to 1. How can I solve the above-mentioned exception? My system has a 6 GB NVIDIA GTX 1660 Super GPU.
Data management:
You can try reducing the dataset used for training to check whether it is a hardware limitation.
Moreover, if it is an image dataset, you can reduce the dimensions of the images by lowering their resolution (dpi), as sketched below.
Model parameters management:
Another approach is to reduce the number of parameters of your model. The first suggestion would be to reduce the Dense layer size, and then the other neural network hyperparameters.
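As a rough illustration of the image-size suggestion (this is a generic torchvision sketch, not code from the head2head repository; the target resolution is an arbitrary example):
from torchvision import transforms

# Downscale frames before they reach the network to reduce activation memory.
# 256x256 is only an example; use the smallest resolution your task tolerates.
preprocess = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])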

CPU version of "torch._C._nn.nll_loss" function

Is there a function for torch._C._nn.nll_loss that takes in a CPU input? I don't have enough GPU memory to run my function so I'm trying to run everything on CPU.
This is my specific error (look at the anaconda files)
Traceback (most recent call last):
File "plot_parametric_pytorch.py", line 395, in <module>
val_result = validate(val_loader, model, criterion, 0)
File "plot_parametric_pytorch.py", line 228, in validate
training=False, optimizer=None)
File "plot_parametric_pytorch.py", line 169, in forward
loss = criterion(output, target_var)
File "/home/klee/anaconda3/envs/sharpenv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/home/klee/anaconda3/envs/sharpenv/lib/python3.7/site-packages/torch/nn/modules/loss.py", line 932, in forward
ignore_index=self.ignore_index, reduction=self.reduction)
File "/home/klee/anaconda3/envs/sharpenv/lib/python3.7/site-packages/torch/nn/functional.py", line 2317, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "/home/klee/anaconda3/envs/sharpenv/lib/python3.7/site-packages/torch/nn/functional.py", line 2115, in nll_loss
ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _thnn_nll_loss_forward
nll_loss works for both CPU and GPU, but the input and the target need to be on the same device. Yours are on different devices, where the first one (output) is on the CPU, but the second (target_var) is on the GPU.
You need to put target_var onto the CPU.
loss = criterion(output, target_var.cpu())
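If the goal is to run everything on the CPU (as in the question), a minimal sketch is to keep the model and each batch on the CPU from the start; input_var below is only a placeholder for whatever tensor the script actually feeds to the model:
import torch

device = torch.device("cpu")

model = model.to(device)                         # all parameters on the CPU
output = model(input_var.to(device))             # input_var: placeholder for the script's input batch
loss = criterion(output, target_var.to(device))  # target on the same device as the output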

RuntimeError: dictionary changed size during iteration when I load sklearn model

When I attempt to load a sklearn logistic regression model from a pickle file using the following code:
def _feed(gpu):
    log_info("prediction process started")
    model = pickle.load(open(model_path, 'rb'))
    while True:
        X = pending_q.get()
        if not X:
            log_info("stop pending thread")
            return
        # first_elem_non_neg_one_count = (X[-1][0] == -1).sum()
        # log_info("non negative one inside X: %d" % first_elem_non_neg_one_count)
        if X[0] > WINDOW_SIZE:  # only begin prediction when we've got enough frames to compute means and stds
            predicted = model.predict(X[-1])
            # TODO: count consecutive and recent X # only
            # sum_count = (predicted.flatten()[MOST_X_FRAME:] >= THRESHOLD).sum()
            # log_info("***** got %d frames > %.1f in most recent %d frames" % (sum_count, THRESHOLD, abs(MOST_X_FRAME)))
            log_info("frame: %d, predicted: %d" % (X[0], predicted))
            predicted = X[:2] + predicted
        else:
            log_info("warming up, skip inference...")
            predicted = X[:2] + [-1]
        predicted_q.put(predicted)  # TODO: batch prediction
I got the following error:
Exception in thread Thread-6:
Traceback (most recent call last):
File "/home/support/.pyenv/versions/3.6.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/home/support/.pyenv/versions/3.6.6/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/app/xxxxx/04_predict_ps.py", line 106, in _feed
model = pickle.load(model_file)
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/sklearn/linear_model/__init__.py", line 12, in <module>
from .base import LinearRegression
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/sklearn/linear_model/base.py", line 38, in <module>
from ..preprocessing.data import normalize as f_normalize
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/sklearn/preprocessing/__init__.py", line 8, in <module>
from .data import Binarizer
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/sklearn/preprocessing/data.py", line 19, in <module>
from scipy import stats
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/scipy/stats/__init__.py", line 345, in <module>
from .stats import *
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/scipy/stats/stats.py", line 171, in <module>
from . import distributions
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/scipy/stats/distributions.py", line 13, in <module>
from . import _continuous_distns
File "/home/support/.pyenv/versions/va-worker-3.6.6/lib/python3.6/site-packages/scipy/stats/_continuous_distns.py", line 6692, in <module>
pairs = list(globals().items())
RuntimeError: dictionary changed size during iteration
I save the model using the below:
sk_logistic_regr = LogisticRegression()
sk_logistic_regr.fit(x_train, y_train)
with open(SK_MODEL_NAME, 'wb') as file:
    pickle.dump(sk_logistic_regr, file)
I am not sure whether it is because I am not closing the pickle file properly when I kill my script. Any idea what is causing this? It seems that the model-loading code only succeeds the first time.
P.S. the _feed function is run on a separate thread dedicated to inference.
Meanwhile, as a workaround, I used from joblib import dump, load, which is recommended in the sklearn docs and which fixes my problem.
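For reference, the joblib workaround looks roughly like this (reusing SK_MODEL_NAME and the training variables from the snippets above):
from joblib import dump, load
from sklearn.linear_model import LogisticRegression

# Saving (instead of pickle.dump):
sk_logistic_regr = LogisticRegression()
sk_logistic_regr.fit(x_train, y_train)
dump(sk_logistic_regr, SK_MODEL_NAME)

# Loading inside _feed (instead of pickle.load):
model = load(SK_MODEL_NAME)
predicted = model.predict(X[-1])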
