Change the imported libraries of a saved pickle - python

Some time ago I saved a class instance to a pickle file:
<library.module.Class at 0x1c926b2e520>
That class imports two libraries whose names have since changed.
Is it possible to edit the pickle file so I can update those two imports without regenerating the pickle all over again?
Thank you!
Regards
EDIT:
This is how I am loading the pickle:
import os
import pickle

model_path = os.getenv('MODELS_FOLDER') + 'model_20210130.pkl'
model = pickle.load(open(model_path, 'rb'))
This is the content of the pickled class. The two libraries I want to update are highlighted.
**import socceraction.spadl.config** as spadlconfig
**from socceraction.spadl.base import** SPADLSchema

class ExpectedThreat:
    """An implementation of the model."""

    def __init__(self):
        ...

    def __solve(self) -> None:
        ...

    def fit(self, actions: DataFrame[SPADLSchema]) -> 'ExpectedThreat':
        """Fits the xT model with the given actions."""

    def predict(
        self, actions: DataFrame[SPADLSchema], use_interpolation: bool = False
    ) -> np.ndarray:
        """Predicts the model values for the given actions."""

I don't think you can do that.
Pickled objects are serialized content; to be modified, they must first be de-serialized properly.
Why can't you just load it, change it, and overwrite it?
You can still overwrite methods, just like properties, with new ones:
# import the new libraries
import new_socceraction.spadl.config as new_spadlconfig
from new_socceraction.spadl.base import new_SPADLSchema

import os
import pickle

model_path = os.getenv('MODELS_FOLDER') + 'model_20210130.pkl'
with open(model_path, 'rb') as file:
    model = pickle.load(file)

# define new methods that use the renamed libraries
def fit(self, actions: DataFrame[new_SPADLSchema]) -> 'ExpectedThreat':
    """Fits the xT model with the given actions."""
    pass  # your new implementation

def predict(
    self, actions: DataFrame[new_SPADLSchema], use_interpolation: bool = False
) -> np.ndarray:
    """Predicts the model values for the given actions."""
    pass  # your new implementation

# overwrite the old methods with the new ones
model.fit = fit
model.predict = predict

# save it again
with open(model_path, 'wb') as file:
    pickle.dump(model, file)
...

Related

Is there a way to pickle a custom tensorflow.keras metric?

I defined the following custom metric to train my model in tensorflow:
import tensorflow as tf
from tensorflow import keras as ks
N_CLASSES = 15

class MulticlassMeanIoU(tf.keras.metrics.MeanIoU):
    def __init__(self,
                 y_true=None,
                 y_pred=None,
                 num_classes=None,
                 name="Multi_MeanIoU",
                 dtype=None):
        super(MulticlassMeanIoU, self).__init__(num_classes=num_classes,
                                                name=name, dtype=dtype)
        self.__name__ = name

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "num_classes": self.num_classes}

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.math.argmax(y_pred, axis=-1)
        return super().update_state(y_true, y_pred, sample_weight)

met = MulticlassMeanIoU(num_classes=N_CLASSES)
After training, I save the model, and I also tried to save the custom metric object as follows:
with open("/some/path/custom_metrics.pkl", "wb") as f:
pickle.dump(met, f)
However, when I try to load the metric like this:
with open(path_custom_metrics, "rb") as f:
    met = pickle.load(f)
I always get some errors, e.g. AttributeError: 'MulticlassMeanIoU' object has no attribute 'update_state_fn'.
Now I wonder whether it is possible to pickle a custom metric at all and, if so, how? It would come in handy if I could save custom metrics with the model, so that when I load the model in another Python session, I always have the metric that is required to load the model in the first place. It would be possible to define the metric anew by inserting the full code into the other script before loading the model; however, I think this would be bad style and could cause problems if I changed something about the metric in the training script and forgot to copy the code to the other script.
If you need to pickle a metric, one possible solution is to use the __getstate__() and __setstate__() methods. These two methods are called during the (de)serialization process, if they are available. Add them to your class and you will have what you need. I tried to make the implementation as general as possible, so that it works for any Metric:
from typing import Any, Dict

import tensorflow as tf

# Add these two methods to the MulticlassMeanIoU class:

def __getstate__(self):
    variables = {v.name: v.numpy() for v in self.variables}
    state = {
        name: variables[var.name]
        for name, var in self._unconditional_dependency_names.items()
        if isinstance(var, tf.Variable)}
    state['name'] = self.name
    state['num_classes'] = self.num_classes
    return state

def __setstate__(self, state: Dict[str, Any]):
    self.__init__(name=state.pop('name'), num_classes=state.pop('num_classes'))
    for name, value in state.items():
        self._unconditional_dependency_names[name].assign(value)
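For reference, a minimal round-trip sketch (the path is illustrative), assuming the two methods above have been added to the MulticlassMeanIoU class and the code runs in the same session where the class is defined:

import pickle

met = MulticlassMeanIoU(num_classes=N_CLASSES)

# serialize: pickle calls __getstate__ and stores plain numpy arrays
with open("custom_metrics.pkl", "wb") as f:
    pickle.dump(met, f)

# deserialize: pickle calls __setstate__, which re-runs __init__
# and restores the variable values
with open("custom_metrics.pkl", "rb") as f:
    met_restored = pickle.load(f)

print(met_restored.num_classes)  # 15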

How to call the get_data function in this code

I am new to working with Python and I want to run this code:
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader
class Preprocess():
    def __init__(self, config, d_name):
        self.root_dir = config.root_dir
        self.batch_size = config.batch_size
        self.cora = Planetoid(root='./data/cora', name='Cora')
        #self.citeseer = Planetoid(root='./data/citeseer', name='CiteSeer')
        #self.pubmed = Planetoid(root='./data/pubmed', name='PubMed')
        self.num_classes, self.num_node_features, self.data = self.get_data(d_name)

    def get_data(self, d_name):
        '''
        d_name = 'Cora', 'CiteSeer', 'PubMed'
        '''
        dataset = Planetoid(root=self.root_dir + d_name, name=d_name)
        return dataset.num_classes, dataset.num_node_features, \
            DataLoader(dataset, batch_size=self.batch_size)
f=Preprocess(config, Cora)
f.get_data(Cora)
but I get this error: name 'config' is not defined
So, it is not a bug.
Look at your constructor:
def __init__(self, config, d_name):
    self.root_dir = config.root_dir
    self.batch_size = config.batch_size
etc...
And your instantiation:
f = Preprocess(config, Cora)
f.get_data(Cora)
Note that you are passing a variable "config" that has not been declared before. Also, because of how the constructor uses the "config" argument, the object you pass to "f = Preprocess(...)" needs to have a "root_dir" attribute (and a "batch_size" attribute).
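For illustration, a minimal sketch of what such a config object could look like (SimpleNamespace and the attribute values here are only placeholders):

from types import SimpleNamespace

# any object exposing the attributes the constructor reads will do
config = SimpleNamespace(root_dir='./data/', batch_size=32)

f = Preprocess(config, 'Cora')  # note: d_name should be the string 'Cora'
num_classes, num_node_features, loader = f.get_data('Cora')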
Please check the PyTorch documentation for more examples of how to use this framework. Don't give up, you can do it.

predict() got an unexpected keyword argument 'stats'

I am trying to get predictions from a TensorFlow custom prediction routine served on AI Platform.
I have managed to serve it with the following settings: --runtime-version 2.3 --python-version 3.7 --machine-type mls1-c4-m2
But I keep getting this error when I try to make any predictions:
ERROR:root:Prediction failed: predict() got an unexpected keyword argument 'stats'
ERROR:root:Prediction failed: unknown error.
The routine has two steps:
1. Takes the input (a string) and transforms it into an embedding using a BOW model in .pkl format.
2. Uses the embedding to get predictions from a Keras model saved as an .h5 file.
This is my setup.py:
from setuptools import setup, find_packages

REQUIRED_PACKAGES = ['Keras==2.3.1', 'sklearn==0.0', 'h5py<3.0.0', 'numpy==1.16.0', 'scipy==1.4.1', 'pyyaml==5.2']

setup(
    name='my_custom_code',
    version='0.1',
    scripts=['predictor.py'],
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=False,
    description=''
)
And this is my predictor.py
import os
import pickle

import tensorflow as tf
import numpy as np

class MyPredictor(object):
    def __init__(self, model, bow_model):
        self._model = model
        self._bow_model = bow_model

    def predict(self, instances):
        outputs = []
        for x in instances:
            vector = self.embedding(x)
            output = self._model.predict(vector)
            outputs.append(output)
        return outputs

    def embedding(self, statement):
        vector = self._bow_model.transform(statement).toarray()
        vector = vector.to_list()
        return vector

    @classmethod
    def from_path(cls, model_dir):
        model_path = os.path.join(model_dir, 'model.h5')
        model = tf.keras.models.load_model(model_path, compile=False)
        preprocessor_path = os.path.join(model_dir, 'bow.pkl')
        with open(preprocessor_path, 'rb') as f:
            bow_model = pickle.load(f)
        return cls(model, bow_model)
The script I'm using for testing is:
import googleapiclient.discovery

instances = ['test', 'test']

service = googleapiclient.discovery.build('ml', 'v1')
name = 'projects/{}/models/{}/versions/{}'.format(PROJECT_ID, MODEL_NAME, VERSION_NAME)

response = service.projects().predict(
    name=name,
    body={'instances': instances}
).execute()

if 'error' in response:
    raise RuntimeError(response['error'])
else:
    print(response['predictions'])
According to the Custom prediction routine documentation, when you create the Predictor class, its predict() method should accept self, instances, and **kwargs in order to properly handle the prediction request:
instances: A list of prediction input instances.
**kwargs: A dictionary of keyword args provided as additional fields on the predict request body.
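For illustration, a minimal sketch of the adjusted predictor, assuming only the predict() signature needs to change and the rest mirrors the predictor.py above:

class MyPredictor(object):
    def __init__(self, model, bow_model):
        self._model = model
        self._bow_model = bow_model

    def predict(self, instances, **kwargs):
        # **kwargs absorbs extra fields on the request body (such as 'stats'),
        # which otherwise raise the "unexpected keyword argument" error
        outputs = []
        for x in instances:
            vector = self.embedding(x)
            outputs.append(self._model.predict(vector))
        return outputs

    # embedding() and from_path() stay exactly as in the original predictor.py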

Where in the code of pytorch or huggingface/transformers does label get "renamed" into labels?

My question concerns the example, available in the great huggingface/transformers library.
I am using a notebook provided by the library creators as a starting point for my pipeline. It presents a pipeline for fine-tuning BERT for sentence classification on the GLUE dataset.
When getting into the code, I noticed a very weird thing, which I cannot explain.
In the example, input data is introduced to the model as instances of the InputFeatures class from here:
This class has 4 attributes, including the label attribute:
class InputFeatures:
    ...
    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[Union[int, float]] = None
which are later passed as a dictionary of inputs to the forward() method of the model. This is done by the Trainer class, for example in the lines 573-576 here:
def _training_step(
    self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer
) -> float:
    model.train()
    for k, v in inputs.items():
        inputs[k] = v.to(self.args.device)

    outputs = model(**inputs)
However, the forward() method expects labels (note the plural form) input parameter (taken from here):
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
):
So my question is where does the label become labels in this pipeline?
To give some extra info on the issue: I created my own pipeline, which uses nothing related to the GLUE data and pipeline; basically it relies only on the Trainer class of transformers. I even use another model (Flaubert). I replicated the InputFeature class, and my code works for both cases below:
class InputFeature:
    def __init__(self, text, label):
        self.input_ids = text
        self.label = label

class InputFeaturePlural:
    def __init__(self, text, label):
        self.input_ids = text
        self.labels = label
But it does not work if I name the second attribute self.labe or any other name. Why is it possible to use both attribute names?
It's not like it is extremely important in my case, but I feel uncomfortable passing the data around in a variable that "changes name" somewhere along the way.
The rename happens in the collator. In the trainer init, when data_collator is None, a default one is used:
class Trainer:
    # ...
    def __init__(...):
        # ...
        self.data_collator = data_collator if data_collator is not None else default_data_collator
        # ...
FYI, the self.data_collator is later used when you get the dataloader:
data_loader = DataLoader(
    self.train_dataset,
    batch_size=self.args.train_batch_size,
    sampler=train_sampler,
    collate_fn=self.data_collator,  # <-- here
    drop_last=self.args.dataloader_drop_last,
)
The default collator has special handling for labels, which does this renaming, if needed:
# Special handling for labels.
# Ensure that tensor is created with the correct type
# (it should be automatically the case, but let's make sure of it.)
if hasattr(first, "label") and first.label is not None:
    if type(first.label) is int:
        labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
        labels = torch.tensor([f.label for f in features], dtype=torch.float)
    batch = {"labels": labels}  # <-- here is where it happens
elif hasattr(first, "label_ids") and first.label_ids is not None:
    if type(first.label_ids[0]) is int:
        labels = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    else:
        labels = torch.tensor([f.label_ids for f in features], dtype=torch.float)
    batch = {"labels": labels}
else:
    batch = {}

IPython cluster and PicklingError

My problem seems to be similar to this thread; however, while I think I am following the advised method, I still get a PicklingError. When I run my process locally, without sending it to an IPython cluster engine, the function works fine.
I am using zipline with IPython's notebook, so I first create a class based on zipline.TradingAlgorithm.
Cell [ 1 ]
from IPython.parallel import Client
rc = Client()
lview = rc.load_balanced_view()
Cell [ 2 ]
%%px --local  # This ensures that the class and modules exist on each engine
import zipline as zpl
import numpy as np

class Agent(zpl.TradingAlgorithm):  # must define initialize and handle_data methods
    def initialize(self):
        self.valueHistory = None
        pass

    def handle_data(self, data):
        for security in data.keys():
            ## Just randomly buy/sell/hold for each security
            coinflip = np.random.random()
            if coinflip < .25:
                self.order(security, 100)
            elif coinflip > .75:
                self.order(security, -100)
        pass
Cell [ 3 ]
from zipline.utils.factory import load_from_yahoo

start = '2013-04-01'
end = '2013-06-01'
sidList = ['SPY', 'GOOG']
data = load_from_yahoo(stocks=sidList, start=start, end=end)

agentList = []
for i in range(3):
    agentList.append(Agent())

def testSystem(agent, data):
    results = agent.run(data)  #-- This is how the zipline based class is executed
    #-- next I'm just storing the final value of the test so I can plot later
    agent.valueHistory.append(results['portfolio_value'][len(results['portfolio_value']) - 1])
    return agent

for i in range(10):
    tasks = []
    for agent in agentList:
        #agent = testSystem(agent,data) ## On its own, this works!
        #-- To test, uncomment the above line and comment out the next two
        tasks.append(lview.apply_async(testSystem, agent, data))
    agentList = [ar.get() for ar in tasks]

for agent in agentList:
    plot(agent.valueHistory)
Here is the Error produced:
PicklingError                             Traceback (most recent call last)
/Library/Python/2.7/site-packages/IPython/kernel/zmq/serialize.pyc in serialize_object(obj, buffer_threshold, item_threshold)
    100         buffers.extend(_extract_buffers(cobj, buffer_threshold))
    101
--> 102     buffers.insert(0, pickle.dumps(cobj, -1))
    103     return buffers
    104
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
If I override the run() method from zipline.TradingAlgorithm with something like:
def run(self, data):
    return 1
then the passing off to the engines works, but obviously the guts of the test are not performed.
Trying something like this...
def run(self, data):
    return zpl.TradingAlgorithm.run(self, data)
results in the same PicklingError.
As run is a method internal to zipline.TradingAlgorithm and I don't know everything that it does, how would I make sure it is passed through?
It looks like the zipline TradingAlgorithm object is not pickleable after it has been run:
import pickle

import zipline as zpl

class Agent(zpl.TradingAlgorithm):  # must define initialize and handle_data methods
    def handle_data(self, data):
        pass

agent = Agent()
pickle.dumps(agent)[:32]  # ok

agent.run(data)
pickle.dumps(agent)[:32]  # fails
But this suggests to me that you should be creating the Agents on the engines, and only passing data / results back and forth (ideally, not passing data across at all, or at most once).
Minimizing data transfers might look something like this:
define the class:
%%px
import zipline as zpl
import numpy as np

class Agent(zpl.TradingAlgorithm):  # must define initialize and handle_data methods
    def initialize(self):
        self.valueHistory = []

    def handle_data(self, data):
        for security in data.keys():
            ## Just randomly buy/sell/hold for each security
            coinflip = np.random.random()
            if coinflip < .25:
                self.order(security, 100)
            elif coinflip > .75:
                self.order(security, -100)
load the data
%%px
from zipline.utils.factory import load_from_yahoo
start = '2013-04-01'
end = '2013-06-01'
sidList = ['SPY','GOOG']
data = load_from_yahoo(stocks=sidList,start=start,end=end)
agent = Agent()
and run the code:
from IPython import parallel

def testSystem(agent, data):
    results = agent.run(data)  #-- This is how the zipline based class is executed
    #-- next I'm just storing the final value of the test so I can plot later
    agent.valueHistory.append(results['portfolio_value'][len(results['portfolio_value']) - 1])

# create references to the remote agent / data objects
agent_ref = parallel.Reference('agent')
data_ref = parallel.Reference('data')

tasks = []
for i in range(10):
    for j in range(len(rc)):
        tasks.append(lview.apply_async(testSystem, agent_ref, data_ref))

# wait for the tasks to complete
[t.get() for t in tasks]
And plot the results, never fetching the agents themselves
%matplotlib inline
import matplotlib.pyplot as plt

for history in rc[:].apply_async(lambda: agent.valueHistory):
    plt.plot(history)
This is not quite the same code you shared - there, three agents bounce back and forth across all your engines, whereas this has one agent per engine. I don't know enough about zipline to say whether that's useful to you or not.
