How to use mocking to compare real results - python

I have a sample class which reads a saved Tensorflow model and runs predictions
class Sample():
## all it does is creates a new column with predictions
def __init__(self, tf_model):
self.tf_model = tf_model
def tf_process(self, x):
##some other preprocessing
x["tf_predictions"] = self.tf_model.predict(x)
return x
def predict(self, x):
predictions = self.tf_process(x)
return predictions
Code for Unittesting without having to load the model:
import unittest
import pandas as pd
from unittest import TestCase, mock
from my_package.sample_model import Sample
class TestSample(unittest.TestCase):
def test_predict(self):
with mock.patch("Sample.tf_process") as process:
process.return_value = pd.DataFrame("hardcoded_value")
#to check: process.return_value = Output (Sample.predict())
Goal:
To compare process.return_value with the Output of predict method in Sample, but to do this I still have to load the model, I dont understand what is the use of mock here since i will have to anyway call the predict method to compare it with process.return_value. Any suggestions will be helpful

I think in your case it's better to use Mock(). You can create really good and simple tests without patch(). Just prepare all necessary mocked instances for initialization.
from unittest.mock import Mock
class TestSample(TestCase):
def test_predict(self):
# let's say predict() will return something... just an example
tf = Mock(predict=Mock(return_value=(10, 20, 30)))
df = pd.DataFrame({'test_col': (1, 2, 3)})
df = Sample(tf).predict(df)
# check column
self.assertTrue('tf_predictions' in df.columns)
# or check records
self.assertEqual(
df.to_dict('records'),
[
{'test_col': 1, 'tf_predictions': 10},
{'test_col': 2, 'tf_predictions': 20},
{'test_col': 3, 'tf_predictions': 30}
]
)
Also it's really helps when you need tests for complex services. Just an example:
class ClusterService:
def __init__(self, service_a, service_b, service_c) -> None:
self._service_a = service_a
self._service_b = service_b
self._service_c = service_c
# service_d, ... etc
def get_cluster_info(self, name: str):
self._service_a.send_something_to_somewhere(name)
data = {
'name': name,
'free_resources': self._service_b.get_free_resources(),
'current_price': self._service_c.get_price(name),
}
return ' ,'.join([
': '.join(['Cluster name', name]),
': '.join(['CPU', str(data['free_resources']['cpu'])]),
': '.join(['RAM', str(data['free_resources']['ram'])]),
': '.join(['Price', '{} $'.format(round(data['current_price']['usd'], 2))]),
])
class TestClusterService(TestCase):
def test_get_cluster_info(self):
cluster = ClusterService(
service_a=Mock(),
service_b=Mock(get_free_resources=Mock(return_value={'cpu': 100, 'ram': 200})),
service_c=Mock(get_price=Mock(return_value={'usd': 101.4999})),
)
self.assertEqual(
cluster.get_cluster_info('best name'),
'Cluster name: best name ,CPU: 100 ,RAM: 200 ,Price: 101.5 $'
)

Related

Overwrite superclass instance in Python

I can create a FasterRCNN object using
model = fasterrcnn_resnet50_fpn(...)
which I want to inherit from, as
class MyDetector(FasterRCNN):
...
but overwrite the superclass instance from the fasterrcnn_resnet50_fpn() factory. I have tried using __new__, as:
class MyDetector(FasterRCNN):
def __new__(cls):
return fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
def __init__(self):
num_features_in = self.roi_heads.box_predictor.cls_score.in_features
self.roi_heads.box_predictor = FastRCNNPredictor(num_features_in, num_classes=2)
def some_func(self):
pass
so that I can add custom methods to the child class and so forth. What is the correct way of doing this?
I guess you'd be better to make your own factory function.
import libraries
from typing import Optional, Any
import torch
from torch import nn
import torchvision
from torchvision.models.resnet import resnet50, ResNet50_Weights
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights, FasterRCNN
from torchvision.models._utils import _ovewrite_value_param
from torchvision.models.detection.backbone_utils import (
_validate_trainable_layers,
_resnet_fpn_extractor,
)
from torchvision.models.detection._utils import overwrite_eps
from torchvision.ops import misc as misc_nn_ops
class MyDetector
class MyDetector(FasterRCNN):
def __init__(self, backbone, num_classes=None, **kwarg):
super().__init__(backbone=backbone, num_classes=num_classes, **kwarg)
def some_func(self):
pass
MyDetector factory function
# https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py#L459
def mydetector_resnet50_fpn(
*,
weights: Optional[FasterRCNN_ResNet50_FPN_Weights] = None,
progress: bool = True,
num_classes: Optional[int] = None,
weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
trainable_backbone_layers: Optional[int] = None,
**kwargs: Any,
) -> MyDetector:
weights = FasterRCNN_ResNet50_FPN_Weights.verify(weights)
weights_backbone = ResNet50_Weights.verify(weights_backbone)
if weights is not None:
weights_backbone = None
num_classes = _ovewrite_value_param(
"num_classes", num_classes, len(weights.meta["categories"])
)
elif num_classes is None:
num_classes = 91
is_trained = weights is not None or weights_backbone is not None
trainable_backbone_layers = _validate_trainable_layers(
is_trained, trainable_backbone_layers, 5, 3
)
norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
backbone = resnet50(
weights=weights_backbone, progress=progress, norm_layer=norm_layer
)
backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
model = MyDetector(backbone, num_classes=num_classes, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
if weights == FasterRCNN_ResNet50_FPN_Weights.COCO_V1:
overwrite_eps(model, 0.0)
return model
utility for checking
# https://discuss.pytorch.org/t/check-if-models-have-same-weights/4351/6
def compare_models(model_1, model_2):
models_differ = 0
for key_item_1, key_item_2 in zip(
model_1.state_dict().items(), model_2.state_dict().items()
):
if torch.equal(key_item_1[1], key_item_2[1]):
pass
else:
models_differ += 1
if key_item_1[0] == key_item_2[0]:
print("Mismtach found at", key_item_1[0])
else:
raise Exception
if models_differ == 0:
print("Models match perfectly! :)")
test
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT
)
my_model = mydetector_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
compare_models(model, my_model)
output
Models match perfectly! :)
And I also tried to make hard coded version. But as you know, customizing settings of FPN is somewhat complicated.
from torchvision.models.resnet import resnet50, ResNet50_Weights
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights, FasterRCNN
from torchvision.models.detection.backbone_utils import _resnet_fpn_extractor
from torchvision.ops import misc as misc_nn_ops
class MyDetector(FasterRCNN):
def __init__(self, **kwarg):
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
backbone = resnet50(
weights=ResNet50_Weights.IMAGENET1K_V1,
norm_layer=misc_nn_ops.FrozenBatchNorm2d,
)
backbone = _resnet_fpn_extractor(backbone, trainable_layers=3)
# default of num_classes is 91
# this num_classes is used for setting FastRCNNPreditcor
# https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py#L257
num_classes = len(weights.meta["categories"])
super().__init__(backbone=backbone, num_classes=num_classes, **kwarg)
self.load_state_dict(weights.get_state_dict(progress=True))
def some_func(self):
pass
m = MyDetector()

How can I apply pruning on a BERT model?

I have trained a BERT model using ktrain (TensorFlow wrapper) to recognize emotion on text. It works, but it suffers from really slow inference. That makes my model not suitable for a production environment. I have done some research, and it seems pruning could help.
TensorFlow provides some options for pruning, e.g., tf.contrib.model_pruning. The problem is that it is not a not a widely used technique. What would be a simple enough example that could help me to understand how to use it?
I provide my working code below for reference.
import pandas as pd
import numpy as np
import preprocessor as p
import emoji
import re
import ktrain
from ktrain import text
from unidecode import unidecode
import nltk
# Text preprocessing class
class TextPreprocessing:
def __init__(self):
p.set_options(p.OPT.MENTION, p.OPT.URL)
def _punctuation(self, val):
val = re.sub(r'[^\w\s]', ' ', val)
val = re.sub('_', ' ', val)
return val
def _whitespace(self, val):
return " ".join(val.split())
def _removenumbers(self, val):
val = re.sub('[0-9] + ', '', val)
return val
def _remove_unicode(self, text):
text = unidecode(text).encode("ascii")
text = str(text, "ascii")
return text
def _split_to_sentences(self, body_text):
sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
return sentences
def _clean_text(self, val):
val = val.lower()
val = self._removenumbers(val)
val = p.clean(val)
val = ' '.join(self._punctuation(emoji.demojize(val)).split())
val = self._remove_unicode(val)
val = self._whitespace(val)
return val
def text_preprocessor(self, body_text):
body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])
sentence_split_df = body_text_df.copy()
sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
self._split_to_sentences)
lst_col = "body_text"
sentence_split_df = pd.DataFrame(
{
col: np.repeat(
sentence_split_df[col].values, sentence_split_df[lst_col].str.len(
)
)
for col in sentence_split_df.columns.drop(lst_col)
}
).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
sentence_split_df.columns
]
body_text_df["body_text"] = body_text_df["body_text"].apply(self._clean_text)
final_df = (
pd.concat([sentence_split_df, body_text_df])
.reset_index()
.drop(columns=["index"])
)
return final_df["body_text"]
# Instantiate data preprocessing object
text1 = TextPreprocessing()
# Import data
data_train = pd.read_csv('data_train_v5.csv', encoding='utf8', engine='python')
data_test = pd.read_csv('data_test_v5.csv', encoding='utf8', engine='python')
# Clean the data
data_train['Text'] = data_train['Text'].apply(text1._clean_text)
data_test['Text'] = data_test['Text'].apply(text1._clean_text)
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()
y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()
data = data_train.append(data_test, ignore_index=True)
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']
encoding = {
'joy': 0,
'sadness': 1,
'fear': 2,
'anger': 3,
'neutral': 4
}
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]
trn, val, preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
x_test=X_test, y_test=y_test,
class_names=class_names,
preprocess_mode='distilbert',
maxlen=350)
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
predictor = ktrain.get_predictor(learner.model, preproc)
# Save the model on a file for later use
predictor.save("models/bert_model")
message = "This is a happy message"
# Cleaning - takes 5 ms to run
clean = text1._clean_text(message)
# Prediction - takes 325 ms to run
predictor.predict_proba(clean)
The distilbert model in ktrain is created using Hugging Face transformers, which means you can use that library to prune the model. See this link for more information and the example script. You may need to convert the model to PyTorch before using the script (in addition to making some modifications to the script itself). The approach is based on the paper Are Sixteen Heads Really Better Than One?.

Python HDBScan class always fails on second iteration before even entering first function

I am attempting to look at conglomerated outlier information, utilizing several different SKLearn, HDBScan, and custom outlier detection classes. However, for some reason I am consistently running into an error where any class utilizing HDBScan cannot be iterated over. All other Sklearn and Custom classes can. The issue I am getting seems to consistently occur on the second pass of the HDBScan class and instantly happens upon algorithm.fit(tmp). Upon debugging the script, it looks like the error is thrown before even getting to the first line of the Class.
Any help? Below is the minimum viable reproduction:
import numpy as np
import pandas as pd
import hdbscan
from sklearn.datasets import make_blobs
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
class DBClass():
def __init__(self, random = None):
self.random = random
def fit(self, data):
self.train_data = data
cluster = hdbscan.HDBSCAN()
cluster.fit(self.train_data)
self.fit = cluster
def predict(self, data):
self.predict_data = data
if self.train_data.equals(self.predict_data):
return self.fit.probabilities_
def OutlierEnsemble(df, anomaly_algorithms = None, num_slices = 5, num_columns = 7, outliers_fraction = 0.05):
if isinstance(df, np.ndarray):
df = pd.DataFrame(df)
assert isinstance(df, pd.DataFrame)
if not anomaly_algorithms:
anomaly_algorithms = [
("Robust covariance",
EllipticEnvelope(contamination=outliers_fraction)),
("One-Class SVM",
OneClassSVM(nu=outliers_fraction,
kernel="rbf")),
("Isolation Forest",
IsolationForest(contamination=outliers_fraction)),
("HDBScan LOF",
DBClass()),
]
data = []
for i in range(1, num_slices + 1):
data.append(df.sample(n = num_columns, axis = 1, replace = False))
predictions = []
names = []
for tmp in data:
counter = 0
for name, algorithm in anomaly_algorithms:
algorithm.fit(tmp)
predictions.append(algorithm.predict(tmp))
counter += 1
names.append(f"{name}{counter}")
return predictions
blobs, labels = make_blobs(n_samples=3000, n_features=12)
OutlierEnsemble(blobs)
The error provided is not the most helpful.
Traceback (most recent call last):
File "<ipython-input-4-e1d4b63cfccd>", line 75, in <module>
OutlierEnsemble(blobs)
File "<ipython-input-4-e1d4b63cfccd>", line 66, in OutlierEnsemble
algorithm.fit(tmp)
TypeError: 'HDBSCAN' object is not callable
In your DBClass.fit, DBClass.fit is unintentionally redefined.
You could perhaps use something like,
class DBClass():
def __init__(self, random = None):
self.random = random
def fit(self, data):
self.train_data = data
cluster = hdbscan.HDBSCAN()
cluster.fit(self.train_data)
self.myfit = cluster # save calculated cluster
def predict(self, data):
self.predict_data = data
if self.train_data.equals(self.predict_data):
return self.myfit.probabilities_ # use calculated cluster

Use attributes of preceding estimators as parameters in ML pipeline

I am using Pipelines from Pyspark's ML library to preprocess text and calculate the TF-IDF values for all tokens. I also created a custom Transformer that returns for each text snippet the 5 tokens with the highest TF-IDF values. The main code looks like this:
%pyspark
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokenized", pattern="\\W")
remover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")
count_vectorizer = CountVectorizer(inputCol="filtered", outputCol="count", vocabSize=pow(2,10))
idf = IDF(inputCol="count", outputCol="TF-IDF")
normalizer = Normalizer(inputCol="TF-IDF", outputCol="normalized", p=2.0)
top_token_extractor = TopTokenExtractor(inputCol="normalized", outputCol="topTokens", vocabulary=model.stages[2].vocabulary) # !!! does not work
pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf, normalizer, top_token_extractor])
model = pipeline.fit(df)
And here is the implementation of TopTokenExtractor:
%pyspark
from pyspark import keyword_only
from pyspark.ml.pipeline import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
class TopTokenExtractor(Transformer, HasInputCol, HasOutputCol):
#keyword_only
def __init__(self, inputCol=None, outputCol=None, vocabulary=None):
super(TopTokenExtractor, self).__init__()
self.vocabulary = Param(self, "vocabulary", "")
self._setDefault(vocabulary=set())
kwargs = self._input_kwargs
self.setParams(**kwargs)
#keyword_only
def setParams(self, inputCol=None, outputCol=None, vocabulary=None):
kwargs = self._input_kwargs
return self._set(**kwargs)
def setVocabulary(self, value):
self._paramMap[self.vocabulary] = value
return self
def getVocabulary(self):
return self.getOrDefault(self.vocabulary)
def _transform(self, dataset):
out_col = self.getOutputCol()
in_col = dataset[self.getInputCol()]
vocabulary = self.getVocabulary()
def f(s):
token_tuples = sorted(list(zip(s.indices, s.values)), key=lambda x: x[1], reverse=True)
top_tokens = list()
for i in range(0, min(5, len(token_tuples))):
top_tokens.append(vocabulary[token_tuples[i][0]])
return top_tokens
t = ArrayType(StringType())
return dataset.withColumn(out_col, udf(f, t)(in_col))
The problem is that in order to return a list of tokens rather than indices, I need to pass the vocabulary from the CountVectorizer as a parameter to TopTokenExtractor. After calling pipeline.fit(df) the vocabulary could be accessed by model.stages[2].vocabulary, but I could not figure out how to pass it as a parameter in the course of a pipeline. Is this possible at all?
As a workaround, I might split up the pipeline into two parts, but I would really prefer to have a single pipeline if possible.

Error "AttributeError: 'Py4JError' object has no attribute 'message' building DecisionTreeModel

I'm following Chapter 4 from "Advanced Analytics with Spark" from O'Reilly. This book is in Scala and I'm having trouble converting this code to Python.
Scala Code
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._
val rawData = sc.textFile("hdfs:///user/ds/covtype.data")
val data = rawData.map { line =>
val values = line.split(',').map(_.toDouble)
val featureVector = Vectors.dense(values.init)
val label = values.last - 1
LabeledPoint(label, featureVector)
}
val Array(trainData, cvData, testData) =
data.randomSplit(Array(0.8, 0.1, 0.1))
trainData.cache()
cvData.cache()
testData.cache()
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._
import org.apache.spark.rdd._
def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]):
MulticlassMetrics = {
val predictionsAndLabels = data.map(example =>
(model.predict(example.features), example.label)
)
new MulticlassMetrics(predictionsAndLabels)
}
val model = DecisionTree.trainClassifier(
trainData, 7, Map[Int,Int](), "gini", 4, 100)
val metrics = getMetrics(model, cvData)
metrics.confusionMatrix
My Python Code
from pyspark.sql.functions import col, split
import pyspark.mllib.linalg as linal
import pyspark.mllib.regression as regre
import pyspark.mllib.evaluation as eva
import pyspark.mllib.tree as tree
import pyspark.rdd as rd
raw_data = sc.textFile("covtype.data")
def fstDecisionTree(line):
values = list(map(float,line.split(",")))
featureVector = linal.Vectors.dense(values[:-1])
label = values[-1]-1
ret=regre.LabeledPoint(label, featureVector)
return regre.LabeledPoint(label, featureVector)
data = raw_data.map(fstDecisionTree)
trainData,cvData,testData=data.randomSplit([0.8,0.1,0.1])
trainData.cache()
cvData.cache()
testData.cache()
def help_lam(model):
def _help_lam(dataline):
print(dataline)
a=dataline.collect()
return (model.predict(a[1]),a[0])
return _help_lam
def getMetrics(model, data):
print(type(model),type(data))
predictionsAndLabels= data.map(help_lam(model))
return eva.MulticlassMetrics(predictionsAndLabels)
n_targets=7
max_depth=4
max_bin_count=100
model = tree.DecisionTree.trainClassifier(trainData, n_targets, {}, "gini", max_depth, max_bin_count)
metrics=getMetrics(model,cvData)
When I run this, I have this error in the method def _help_lam(dataline) inside of def help_lam(model) when I try to implicitly pass the map iteration in:
AttributeError: 'Py4JError' object has no attribute 'message'
I think the problem is in the model.predict function
From pyspark mllib/tree.py
Note: In Python, predict cannot currently be used within an RDD
transformation or action.
Call predict directly on the RDD instead.
What you can do is pass the feature vector directly like so
>>> rdd = sc.parallelize([[1.0], [0.0]])
>>> model.predict(rdd).collect()
[1.0, 0.0]
Edit:
An update to your getMetrics could be:
def getMetrics(model, data):
labels = data.map(lambda d: d.label)
features = data.map(lambda d: d.features)
predictions = model.predict(features)
predictionsAndLabels = predictions.zip(labels)
return eva.MulticlassMetrics(predictionsAndLabels)

Categories

Resources