I am using Pipelines from Pyspark's ML library to preprocess text and calculate the TF-IDF values for all tokens. I also created a custom Transformer that returns for each text snippet the 5 tokens with the highest TF-IDF values. The main code looks like this:
%pyspark
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokenized", pattern="\\W")
remover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")
count_vectorizer = CountVectorizer(inputCol="filtered", outputCol="count", vocabSize=pow(2,10))
idf = IDF(inputCol="count", outputCol="TF-IDF")
normalizer = Normalizer(inputCol="TF-IDF", outputCol="normalized", p=2.0)
top_token_extractor = TopTokenExtractor(inputCol="normalized", outputCol="topTokens", vocabulary=model.stages[2].vocabulary) # !!! does not work
pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf, normalizer, top_token_extractor])
model = pipeline.fit(df)
And here is the implementation of TopTokenExtractor:
%pyspark
from pyspark import keyword_only
from pyspark.ml.pipeline import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
class TopTokenExtractor(Transformer, HasInputCol, HasOutputCol):
#keyword_only
def __init__(self, inputCol=None, outputCol=None, vocabulary=None):
super(TopTokenExtractor, self).__init__()
self.vocabulary = Param(self, "vocabulary", "")
self._setDefault(vocabulary=set())
kwargs = self._input_kwargs
self.setParams(**kwargs)
#keyword_only
def setParams(self, inputCol=None, outputCol=None, vocabulary=None):
kwargs = self._input_kwargs
return self._set(**kwargs)
def setVocabulary(self, value):
self._paramMap[self.vocabulary] = value
return self
def getVocabulary(self):
return self.getOrDefault(self.vocabulary)
def _transform(self, dataset):
out_col = self.getOutputCol()
in_col = dataset[self.getInputCol()]
vocabulary = self.getVocabulary()
def f(s):
token_tuples = sorted(list(zip(s.indices, s.values)), key=lambda x: x[1], reverse=True)
top_tokens = list()
for i in range(0, min(5, len(token_tuples))):
top_tokens.append(vocabulary[token_tuples[i][0]])
return top_tokens
t = ArrayType(StringType())
return dataset.withColumn(out_col, udf(f, t)(in_col))
The problem is that in order to return a list of tokens rather than indices, I need to pass the vocabulary from the CountVectorizer as a parameter to TopTokenExtractor. After calling pipeline.fit(df) the vocabulary could be accessed by model.stages[2].vocabulary, but I could not figure out how to pass it as a parameter in the course of a pipeline. Is this possible at all?
As a workaround, I might split up the pipeline into two parts, but I would really prefer to have a single pipeline if possible.
Related
I have trained a BERT model using ktrain (TensorFlow wrapper) to recognize emotion on text. It works, but it suffers from really slow inference. That makes my model not suitable for a production environment. I have done some research, and it seems pruning could help.
TensorFlow provides some options for pruning, e.g., tf.contrib.model_pruning. The problem is that it is not a not a widely used technique. What would be a simple enough example that could help me to understand how to use it?
I provide my working code below for reference.
import pandas as pd
import numpy as np
import preprocessor as p
import emoji
import re
import ktrain
from ktrain import text
from unidecode import unidecode
import nltk
# Text preprocessing class
class TextPreprocessing:
def __init__(self):
p.set_options(p.OPT.MENTION, p.OPT.URL)
def _punctuation(self, val):
val = re.sub(r'[^\w\s]', ' ', val)
val = re.sub('_', ' ', val)
return val
def _whitespace(self, val):
return " ".join(val.split())
def _removenumbers(self, val):
val = re.sub('[0-9] + ', '', val)
return val
def _remove_unicode(self, text):
text = unidecode(text).encode("ascii")
text = str(text, "ascii")
return text
def _split_to_sentences(self, body_text):
sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
return sentences
def _clean_text(self, val):
val = val.lower()
val = self._removenumbers(val)
val = p.clean(val)
val = ' '.join(self._punctuation(emoji.demojize(val)).split())
val = self._remove_unicode(val)
val = self._whitespace(val)
return val
def text_preprocessor(self, body_text):
body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])
sentence_split_df = body_text_df.copy()
sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
self._split_to_sentences)
lst_col = "body_text"
sentence_split_df = pd.DataFrame(
{
col: np.repeat(
sentence_split_df[col].values, sentence_split_df[lst_col].str.len(
)
)
for col in sentence_split_df.columns.drop(lst_col)
}
).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
sentence_split_df.columns
]
body_text_df["body_text"] = body_text_df["body_text"].apply(self._clean_text)
final_df = (
pd.concat([sentence_split_df, body_text_df])
.reset_index()
.drop(columns=["index"])
)
return final_df["body_text"]
# Instantiate data preprocessing object
text1 = TextPreprocessing()
# Import data
data_train = pd.read_csv('data_train_v5.csv', encoding='utf8', engine='python')
data_test = pd.read_csv('data_test_v5.csv', encoding='utf8', engine='python')
# Clean the data
data_train['Text'] = data_train['Text'].apply(text1._clean_text)
data_test['Text'] = data_test['Text'].apply(text1._clean_text)
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()
y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()
data = data_train.append(data_test, ignore_index=True)
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']
encoding = {
'joy': 0,
'sadness': 1,
'fear': 2,
'anger': 3,
'neutral': 4
}
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]
trn, val, preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
x_test=X_test, y_test=y_test,
class_names=class_names,
preprocess_mode='distilbert',
maxlen=350)
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
predictor = ktrain.get_predictor(learner.model, preproc)
# Save the model on a file for later use
predictor.save("models/bert_model")
message = "This is a happy message"
# Cleaning - takes 5 ms to run
clean = text1._clean_text(message)
# Prediction - takes 325 ms to run
predictor.predict_proba(clean)
The distilbert model in ktrain is created using Hugging Face transformers, which means you can use that library to prune the model. See this link for more information and the example script. You may need to convert the model to PyTorch before using the script (in addition to making some modifications to the script itself). The approach is based on the paper Are Sixteen Heads Really Better Than One?.
I am attempting to look at conglomerated outlier information, utilizing several different SKLearn, HDBScan, and custom outlier detection classes. However, for some reason I am consistently running into an error where any class utilizing HDBScan cannot be iterated over. All other Sklearn and Custom classes can. The issue I am getting seems to consistently occur on the second pass of the HDBScan class and instantly happens upon algorithm.fit(tmp). Upon debugging the script, it looks like the error is thrown before even getting to the first line of the Class.
Any help? Below is the minimum viable reproduction:
import numpy as np
import pandas as pd
import hdbscan
from sklearn.datasets import make_blobs
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
class DBClass():
def __init__(self, random = None):
self.random = random
def fit(self, data):
self.train_data = data
cluster = hdbscan.HDBSCAN()
cluster.fit(self.train_data)
self.fit = cluster
def predict(self, data):
self.predict_data = data
if self.train_data.equals(self.predict_data):
return self.fit.probabilities_
def OutlierEnsemble(df, anomaly_algorithms = None, num_slices = 5, num_columns = 7, outliers_fraction = 0.05):
if isinstance(df, np.ndarray):
df = pd.DataFrame(df)
assert isinstance(df, pd.DataFrame)
if not anomaly_algorithms:
anomaly_algorithms = [
("Robust covariance",
EllipticEnvelope(contamination=outliers_fraction)),
("One-Class SVM",
OneClassSVM(nu=outliers_fraction,
kernel="rbf")),
("Isolation Forest",
IsolationForest(contamination=outliers_fraction)),
("HDBScan LOF",
DBClass()),
]
data = []
for i in range(1, num_slices + 1):
data.append(df.sample(n = num_columns, axis = 1, replace = False))
predictions = []
names = []
for tmp in data:
counter = 0
for name, algorithm in anomaly_algorithms:
algorithm.fit(tmp)
predictions.append(algorithm.predict(tmp))
counter += 1
names.append(f"{name}{counter}")
return predictions
blobs, labels = make_blobs(n_samples=3000, n_features=12)
OutlierEnsemble(blobs)
The error provided is not the most helpful.
Traceback (most recent call last):
File "<ipython-input-4-e1d4b63cfccd>", line 75, in <module>
OutlierEnsemble(blobs)
File "<ipython-input-4-e1d4b63cfccd>", line 66, in OutlierEnsemble
algorithm.fit(tmp)
TypeError: 'HDBSCAN' object is not callable
In your DBClass.fit, DBClass.fit is unintentionally redefined.
You could perhaps use something like,
class DBClass():
def __init__(self, random = None):
self.random = random
def fit(self, data):
self.train_data = data
cluster = hdbscan.HDBSCAN()
cluster.fit(self.train_data)
self.myfit = cluster # save calculated cluster
def predict(self, data):
self.predict_data = data
if self.train_data.equals(self.predict_data):
return self.myfit.probabilities_ # use calculated cluster
I'm trying to set a GridSearchCV in sklearn that uses a TimeSeriesSplit with data normalized on the training set. What I did is to create a TransformerMixin called DivisorTransform that gets the divisor of the normalization and store it. The DivisorTransform is instantiated before the Pipeline. Into the pipeline, I set the DivisorTransform (in order to fit it) and then a NormalizeTransformer takes as input the DivisorTransform and perform the division. However, using this pipeline into the GridSearchCV the transformer are pickled. This causes the DivisorTransform to be pickled and fitted, then the NormalizeTransformer is pickled but having the DivisorTransform in itself, the DivisorTransform is pickled again. This causes the NormalizeTransformer to use un un-fitted DivisorTransform.
Here is an example
dt = DivisorTransform()
pipe = Pipeline([('divisor',dt),('normalize',NormalizeTransformer(dt))])
gridS = GridSearchCV(pipe,params={...},cv=TimeSeriesSplit())
How different normalizations have to be managed into a GridSearchCV? Which are the best practice?
Here a python example
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
class DivisorTransform(BaseEstimator,TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
print(f'{type(self).__name__} id {id(self)} fit')
self.divisor_ = X.max()
return self
def transform(self, X):
print(f'{type(self).__name__} id {id(self)} transform')
return X
def getDivisor(self):
return self.divisor_
class NormalizationTransform(BaseEstimator,TransformerMixin):
def __init__(self, divisorTransform, fakeParam):
self.divTrns = divisorTransform
self.fakeParam = fakeParam
print(f'{type(self).__name__} id {id(self)} init saving {type(self.divTrns).__name__} at {id(self.divTrns)}')
def fit(self, X, y=None):
print(f'{type(self).__name__} id {id(self)} fit going to fit {type(self.divTrns).__name__} {id(self.divTrns)}')
self.divisor_ = self.divTrns.fit(X).getDivisor()
return self
def transform(self, X):
print(f'{type(self).__name__} id {id(self)} transform')
res = X.copy()
res = res / self.divisor_
print('_______________________________________')
print(res)
return res
def anti_transform(self, X):
res = X.copy()
res = res * self.divisor_
return res
def score(self, X, y=None, sample_weight=None):
return 1
x = pd.DataFrame([[i+j*10 for j in range(3)] for i in range(10)],columns=['A','B','C'])
dvT = DivisorTransform()
print(type(dvT).__name__)
pipe = Pipeline([('divisor',dvT),('normalization',NormalizationTransform(dvT, 0))])
res1 = pipe.fit_transform(x)
params = {'normalization__fakeParam':[0,1]}
gs = GridSearchCV(pipe,params,cv=TimeSeriesSplit(n_splits=3).split(x))
print('Starting Grid Search')
gs.fit(x)
Thi produces as print:
Starting Grid Search
NormalizationTransform id 140321510292896 init saving NoneType at 94405154462352
NormalizationTransform id 140321722266344 init saving NoneType at 94405154462352
That shows the problem
I'm following Chapter 4 from "Advanced Analytics with Spark" from O'Reilly. This book is in Scala and I'm having trouble converting this code to Python.
Scala Code
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._
val rawData = sc.textFile("hdfs:///user/ds/covtype.data")
val data = rawData.map { line =>
val values = line.split(',').map(_.toDouble)
val featureVector = Vectors.dense(values.init)
val label = values.last - 1
LabeledPoint(label, featureVector)
}
val Array(trainData, cvData, testData) =
data.randomSplit(Array(0.8, 0.1, 0.1))
trainData.cache()
cvData.cache()
testData.cache()
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._
import org.apache.spark.rdd._
def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]):
MulticlassMetrics = {
val predictionsAndLabels = data.map(example =>
(model.predict(example.features), example.label)
)
new MulticlassMetrics(predictionsAndLabels)
}
val model = DecisionTree.trainClassifier(
trainData, 7, Map[Int,Int](), "gini", 4, 100)
val metrics = getMetrics(model, cvData)
metrics.confusionMatrix
My Python Code
from pyspark.sql.functions import col, split
import pyspark.mllib.linalg as linal
import pyspark.mllib.regression as regre
import pyspark.mllib.evaluation as eva
import pyspark.mllib.tree as tree
import pyspark.rdd as rd
raw_data = sc.textFile("covtype.data")
def fstDecisionTree(line):
values = list(map(float,line.split(",")))
featureVector = linal.Vectors.dense(values[:-1])
label = values[-1]-1
ret=regre.LabeledPoint(label, featureVector)
return regre.LabeledPoint(label, featureVector)
data = raw_data.map(fstDecisionTree)
trainData,cvData,testData=data.randomSplit([0.8,0.1,0.1])
trainData.cache()
cvData.cache()
testData.cache()
def help_lam(model):
def _help_lam(dataline):
print(dataline)
a=dataline.collect()
return (model.predict(a[1]),a[0])
return _help_lam
def getMetrics(model, data):
print(type(model),type(data))
predictionsAndLabels= data.map(help_lam(model))
return eva.MulticlassMetrics(predictionsAndLabels)
n_targets=7
max_depth=4
max_bin_count=100
model = tree.DecisionTree.trainClassifier(trainData, n_targets, {}, "gini", max_depth, max_bin_count)
metrics=getMetrics(model,cvData)
When I run this, I have this error in the method def _help_lam(dataline) inside of def help_lam(model) when I try to implicitly pass the map iteration in:
AttributeError: 'Py4JError' object has no attribute 'message'
I think the problem is in the model.predict function
From pyspark mllib/tree.py
Note: In Python, predict cannot currently be used within an RDD
transformation or action.
Call predict directly on the RDD instead.
What you can do is pass the feature vector directly like so
>>> rdd = sc.parallelize([[1.0], [0.0]])
>>> model.predict(rdd).collect()
[1.0, 0.0]
Edit:
An update to your getMetrics could be:
def getMetrics(model, data):
labels = data.map(lambda d: d.label)
features = data.map(lambda d: d.features)
predictions = model.predict(features)
predictionsAndLabels = predictions.zip(labels)
return eva.MulticlassMetrics(predictionsAndLabels)
I am new to Spark SQL DataFrames and ML on them (PySpark).
How can I create a custom tokenizer, which for example removes stop words and uses some libraries from nltk? Can I extend the default one?
Can I extend the default one?
Not really. Default Tokenizer is a subclass of pyspark.ml.wrapper.JavaTransformer and, same as other transfromers and estimators from pyspark.ml.feature, delegates actual processing to its Scala counterpart. Since you want to use Python you should extend pyspark.ml.pipeline.Transformer directly.
import nltk
from pyspark import keyword_only ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
# Available in PySpark >= 2.3.0
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
class NLTKWordPunctTokenizer(
Transformer, HasInputCol, HasOutputCol,
# Credits https://stackoverflow.com/a/52467470
# by https://stackoverflow.com/users/234944/benjamin-manns
DefaultParamsReadable, DefaultParamsWritable):
stopwords = Param(Params._dummy(), "stopwords", "stopwords",
typeConverter=TypeConverters.toListString)
#keyword_only
def __init__(self, inputCol=None, outputCol=None, stopwords=None):
super(NLTKWordPunctTokenizer, self).__init__()
self.stopwords = Param(self, "stopwords", "")
self._setDefault(stopwords=[])
kwargs = self._input_kwargs
self.setParams(**kwargs)
#keyword_only
def setParams(self, inputCol=None, outputCol=None, stopwords=None):
kwargs = self._input_kwargs
return self._set(**kwargs)
def setStopwords(self, value):
return self._set(stopwords=list(value))
def getStopwords(self):
return self.getOrDefault(self.stopwords)
# Required in Spark >= 3.0
def setInputCol(self, value):
"""
Sets the value of :py:attr:`inputCol`.
"""
return self._set(inputCol=value)
# Required in Spark >= 3.0
def setOutputCol(self, value):
"""
Sets the value of :py:attr:`outputCol`.
"""
return self._set(outputCol=value)
def _transform(self, dataset):
stopwords = set(self.getStopwords())
def f(s):
tokens = nltk.tokenize.wordpunct_tokenize(s)
return [t for t in tokens if t.lower() not in stopwords]
t = ArrayType(StringType())
out_col = self.getOutputCol()
in_col = dataset[self.getInputCol()]
return dataset.withColumn(out_col, udf(f, t)(in_col))
Example usage (data from ML - Features):
sentenceDataFrame = spark.createDataFrame([
(0, "Hi I heard about Spark"),
(0, "I wish Java could use case classes"),
(1, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = NLTKWordPunctTokenizer(
inputCol="sentence", outputCol="words",
stopwords=nltk.corpus.stopwords.words('english'))
tokenizer.transform(sentenceDataFrame).show()
For custom Python Estimator see How to Roll a Custom Estimator in PySpark mllib
⚠ This answer depends on internal API and is compatible with Spark 2.0.3, 2.1.1, 2.2.0 or later (SPARK-19348). For code compatible with previous Spark versions please see revision 8.