I am trying to deploy an sklearn pipeline using FastAPI,
so first I saved my pipeline to a joblib file.
My pipeline looks like this:
pipe = Pipeline([('encoder', MultiColumnLabelEncoder(columns=['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'])),
                 ('scaler', StandardScaler()),
                 ('model', lgb_model.best_estimator_)])
and my custom label encoder class looks like this:
class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        return self  # not relevant here

    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
The columns attribute in the instantiation specifies the categorical variables. I then tried loading the pipeline in my web app using the code below.
import models.ml.classifier as clf
from fastapi import FastAPI
from joblib import load
from models.data import data_input
import pandas as pd
from utils import MultiColumnLabelEncoder
app = FastAPI(title="deployment",
              description="API for machine learning project",
              version="1.0")

@app.on_event('startup')
def load_model():
    clf.model = load('models/ml/ckd_model.joblib')
I am getting the error below:
    raise AttributeError("Can't get attribute {!r} on {!r}")
AttributeError: Can't get attribute 'MultiColumnLabelEncoder' on <module '__main__' from 'C:\Users\User\anaconda3\Scripts\uvicorn.exe\__main__.py'>
ERROR: Application startup failed. Exiting.
Try importing the "main" module from the Anaconda home directory.
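For background, this error usually means the pipeline was dumped from a session where MultiColumnLabelEncoder was defined in __main__, so the pickle records the class as __main__.MultiColumnLabelEncoder and uvicorn's own __main__ cannot supply it. A minimal sketch of the usual workaround, assuming the class lives in utils.py (as the import in the app suggests) and that the training script re-dumps the pipeline:
# train.py -- hypothetical training script; the important part is that the
# encoder is imported from utils rather than defined in this file, so the
# saved pipeline references utils.MultiColumnLabelEncoder.
from joblib import dump
from utils import MultiColumnLabelEncoder

pipe.fit(X_train, y_train)                 # X_train / y_train stand for your training data
dump(pipe, 'models/ml/ckd_model.joblib')   # same path the FastAPI app loads from
The FastAPI app can then keep its existing from utils import MultiColumnLabelEncoder, and load() will be able to resolve the class.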
I work on Dataiku and I have a Jupyter notebook that works, and now I want to turn it into a Python recipe. The objective is to write a pandas dataframe into a dataset.
data_df is the name of my dataframe and output_gen_python is the name of my dataset in Dataiku.
I get this error:
Job failed: Error in Python process: At line 158: <class 'NameError'>: name 'data_df' is not defined
Here is my code:
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from datetime import datetime, timedelta
import pytz  # needed for pytz.timezone() used below
# Read recipe inputs
batches_types_copy = dataiku.Dataset("batches_types_copy")
batches_types_copy_df = batches_types_copy.get_dataframe()
Last_hour_extract = dataiku.Dataset("Last_hour_extract")
last_hour_extract_df = Last_hour_extract.get_dataframe()
class OutputMode(object):
    ...

class IDCalculation_I:
    def _preGenerateID(self, outputMode, data_df):
        ...
    def generateID(self, outputMode, data_df):
        pass

class IDCase1(IDCalculation_I):
    def generateID(self, outputMode, data_df):
        ...
        return data_df

class IDCase2(IDCalculation_I):
    def generateID(self, outputMode, data_df):
        ...
        return data_df

class Fingerprinter(object):
    def __init__(self, outputMode):
        self._outputMode = outputMode

    def _generateID(self, data_df):
        return self._outputMode.getCaseID().generateID(self._outputMode, data_df)

    def run(self, data_df):
        # GenerateID
        data_df = self._generateID(data_df)
        return data_df

    def __str__(self):
        return str(self._outputMode)
outputMode = OutputMode('EEA','06:00:00','08:00:00',pytz.timezone('Europe/Paris'),CONST_MODE_CONT,IDCase1())
fp_calculator = Fingerprinter(outputMode)
output_gen_python_df = data_df # Compute a Pandas dataframe to write into output_gen_python
# Write recipe outputs
output_gen_python = dataiku.Dataset("output_gen_python")
output_gen_python.write_with_schema(output_gen_python_df)
The error says it all: on line 52 you are attempting to assign data_df, which is never defined, to output_gen_python_df.
How to fix it?
Since you've initialized the variable fp_calculator with your Fingerprinter instance, the way to fix the error is:
data_df = fp_calculator.run(input_df)
where input_df is the dataframe you want to fingerprint; run() will return your dataframe.
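Putting it together, the end of the recipe would then look roughly like this (a sketch; which dataframe to pass into run() depends on your data, last_hour_extract_df is only an assumption):
# Assumption: last_hour_extract_df is the dataframe the fingerprinter should process.
data_df = fp_calculator.run(last_hour_extract_df)
output_gen_python_df = data_df  # Compute a Pandas dataframe to write into output_gen_python

# Write recipe outputs
output_gen_python = dataiku.Dataset("output_gen_python")
output_gen_python.write_with_schema(output_gen_python_df)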
I created a custom Pipeline in Python using the sklearn Pipeline, and it seems to run successfully.
But when I save the model as a pickle file and try to load that saved pickle file in a different notebook, it shows an error.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
path = 'C:/Users/Desktop/'
df = pd.read_excel(path + "df.xlsx", sheet_name='df')
###################################################################################
# import the BaseEstimator
from sklearn.base import BaseEstimator
# define the class OutletTypeEncoder
# custom transformer must have methods fit and transform
class OutletTypeEncoder(BaseEstimator):
    def __init__(self):
        pass
        #self.name = name

    def fit(self, df, y=None):
        return self

    def transform(self, df):
        # replace NaN
        df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']] = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']].fillna(value=-999)
        df[['pdf_tbl_cnt']] = df[['pdf_tbl_cnt']].fillna(value=0)

        # Replace counts other than 1 with 0
        df['pdf_tbl_cnt'] = np.where((df['pdf_tbl_cnt'] == '1'), 1, 0)
        df['part_cnt'] = np.where((df['part_cnt'] == '1'), 1, 0)

        # create numeric and categorical columns
        obj_df = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present',
                     'pdf_body_pn_identifier', 'pdf_body_qty_identifier', 'pdf_model_rel_returned', 'pdf_model_ent_returned']]
        num_df = df[['pdf_tbl_cnt', 'pdf_model_avg_relationship_score', 'pdf_model_avg_entity_score', 'part_cnt']]

        # Label-encode the categorical columns, then concatenate with the numeric ones
        obj_df = obj_df.apply(LabelEncoder().fit_transform)
        df = pd.concat([obj_df, num_df], axis=1)
        #df.reset_index(inplace=True, drop=True)

        df.pdf_tbl_pn_identifier = df.pdf_tbl_pn_identifier.astype(str)
        df.pdf_tbl_qty_identifier = df.pdf_tbl_qty_identifier.astype(str)
        df.pdf_body_pn_identifier = df.pdf_body_pn_identifier.astype(str)
        df.pdf_body_qty_identifier = df.pdf_body_qty_identifier.astype(str)
        df.pdf_model_rel_returned = df.pdf_model_rel_returned.astype(str)
        df.pdf_model_ent_returned = df.pdf_model_ent_returned.astype(str)
        df.pdf_header_present = df.pdf_header_present.astype(str)
        #df.matching = df.matching.astype(str)
        #df['pdf_tbl_cnt'] = df['pdf_tbl_cnt'].apply(np.int64)
        df.pdf_tbl_cnt = df.pdf_tbl_cnt.apply(np.int64)

        return df
#################################################################################
feature_cols = df.drop(['matching'], axis=1)
X = feature_cols # Features
y = df.matching # Target variable
# split into train test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
# Create Pipeline
logreg = LogisticRegression()
model_pipeline = Pipeline(steps=[('preprocess', OutletTypeEncoder()),
                                 ('logreg', LogisticRegression())
                                 ])
# fit the pipeline with the training data
model_pipeline.fit(X_train,y_train)
# Predict
y_pred=model_pipeline.predict(X_test)
Now I save the model as a pickle file and want to use that pickle file in another notebook.
But I got this error:
AttributeError: Can't get attribute 'OutletTypeEncoder' on <module '__main__'>
# Save the Model to file in the current working directory
import pickle

Pkl_Filename = "C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\logisic_Model_pipeline.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model_pipeline, file)

# Load the Model back from file
with open('C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\logisic_Model_pipeline.pkl', 'rb') as file:
    logisic_Model_pipeline = pickle.load(file)

logisic_Model_pipeline
Please help me out with how to resolve this issue.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-28376e81d621> in <module>
1 # Load the Model back from file
2 with open('C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\OutletTypeEncoder.pkl', 'rb') as file:
----> 3 OutletTypeEncoder = pickle.load(file)
4
5 OutletTypeEncoder
AttributeError: Can't get attribute 'OutletTypeEncoder' on <module '__main__'>
I am trying to get predictions from a TensorFlow custom routine served on AI Platform.
I have managed to serve it with the following settings: --runtime-version 2.3 --python-version 3.7 --machine-type mls1-c4-m2
But I keep getting this error when I try to make any predictions:
ERROR:root:Prediction failed: predict() got an unexpected keyword argument 'stats'
ERROR:root:Prediction failed: unknown error.
The routine has two steps:
Takes the input (a string) and transforms it into an embedding using a bag-of-words (bow) model in .pkl format
Uses the embedding to get predictions from a Keras model saved as an .h5 file
This is my setup.py:
from setuptools import setup, find_packages

REQUIRED_PACKAGES = ['Keras==2.3.1', 'sklearn==0.0', 'h5py<3.0.0', 'numpy==1.16.0', 'scipy==1.4.1', 'pyyaml==5.2']

setup(
    name='my_custom_code',
    version='0.1',
    scripts=['predictor.py'],
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=False,
    description=''
)
And this is my predictor.py:
import os
import pickle
import tensorflow as tf
import numpy as np

class MyPredictor(object):
    def __init__(self, model, bow_model):
        self._model = model
        self._bow_model = bow_model

    def predict(self, instances):
        outputs = []
        for x in instances:
            vector = self.embedding(x)
            output = self._model.predict(vector)
            outputs.append(output)
        return outputs

    def embedding(self, statement):
        vector = self._bow_model.transform(statement).toarray()
        vector = vector.to_list()
        return vector

    @classmethod
    def from_path(cls, model_dir):
        model_path = os.path.join(model_dir, 'model.h5')
        model = tf.keras.models.load_model(model_path, compile=False)

        preprocessor_path = os.path.join(model_dir, 'bow.pkl')
        with open(preprocessor_path, 'rb') as f:
            bow_model = pickle.load(f)

        return cls(model, bow_model)
The script I'm using for testing is:
import googleapiclient.discovery

instances = ['test', 'test']
service = googleapiclient.discovery.build('ml', 'v1')
name = 'projects/{}/models/{}/versions/{}'.format(PROJECT_ID, MODEL_NAME, VERSION_NAME)
response = service.projects().predict(
    name=name,
    body={'instances': instances}
).execute()

if 'error' in response:
    raise RuntimeError(response['error'])
else:
    print(response['predictions'])
According to the custom prediction routine documentation, when creating the Predictor class, the predict() method should accept self, instances, and **kwargs so that it can properly handle the prediction request.
instances: A list of prediction input instances.
**kwargs: A dictionary of keyword args provided as additional fields on the predict request body.
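In practice that means changing only the method signature in predictor.py so the extra fields sent with the request (such as 'stats') are absorbed, for example:
def predict(self, instances, **kwargs):
    # **kwargs receives any additional fields on the predict request body,
    # e.g. the 'stats' keyword that was causing the error.
    outputs = []
    for x in instances:
        vector = self.embedding(x)
        output = self._model.predict(vector)
        outputs.append(output)
    return outputs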
I have a class in one of my notebooks as below:
class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, with_mean=True, with_std=True, copy=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.array(np.mean(X[self.columns]))
        self.var_ = np.array(np.var(X[self.columns]))
        return self

    def transform(self, X, y=None, copy=None):
        initial_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[initial_col_order]
I am pickling this class as:
with open('Custom_Scaler', 'wb') as file:
    pickle.dump(MyScaler, file)
I have another module 'LogReg_Absent_Module' where I am trying to un-pickle this file. I have also defined this class in that module as follows:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

#The custom scaler that only scales the non-dummy value columns.
class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, with_mean=True, with_std=True, copy=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.array(np.mean(X[self.columns]))
        self.var_ = np.array(np.var(X[self.columns]))
        return self

    def transform(self, X, y=None, copy=None):
        initial_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[initial_col_order]
#The class that we are going to use from here on to predict new data
class absenteeism_model():
    def __init__(self, model_file, scaler_file):
        with open('Absenteeism_Model', 'rb') as model_file, open('Custom_Scaler', 'rb') as scaler_file:
            self.log_reg = pickle.load(model_file)  #Load the previously saved model
            self.scaler = pickle.load(scaler_file)  #and scaler.
            self.data = None
From a new notebook, when I try model = absenteeism_model('Absenteeism_Model','Custom_Scaler')
I get:
<ipython-input-66-8631c175353f> in <module>
----> 1 model = absenteeism_model('Absenteeism_Model','Custom_Scaler')
~\LogReg_Absent_Module.py in __init__(self, model_file, scaler_file)
37 with open('Absenteeism_Model','rb') as model_file,open('Custom_Scaler','rb') as scaler_file:
38 self.log_reg = pickle.load(model_file) #Load the previously saved model
---> 39 self.scaler = pickle.load(scaler_file) #and scaler.
40 self.data = None
41
AttributeError: Can't get attribute 'MyScaler' on <module '__main__'>
I don't understand why you pickle a class and not an object of that class.
Do you mind showing the part of the code where you instantiate your MyScaler? I think the issue is that you are pickling the MyScaler class itself rather than the fitted scaler object.
For example, if you have declared:
scaler = MyScaler(columns)
then you should pickle it with the following code:
with open('Custom_Scaler', 'wb') as file:
    pickle.dump(scaler, file)
See if this solves your issue.
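When loading it back, note that pickle still needs to resolve the MyScaler class under the module path recorded at dump time, so the class must be importable there. A rough sketch:
# The loading module must be able to resolve MyScaler, e.g. because it defines
# it (as LogReg_Absent_Module does) or imports it from the module used when dumping.
with open('Custom_Scaler', 'rb') as file:
    scaler = pickle.load(file)

new_data_scaled = scaler.transform(new_data_df)  # new_data_df is a placeholder dataframe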
I am trying to, for the first time, deploy an NLP ML model. To do this it was suggested that I use FastAPI and uvicorn. I have had some success in getting FastAPI to respond; however, I have not been able to successfully pass the dataframe and have it process it. I've tried using dictionaries and even attempted to convert the passed json to a dataframe.
With data_dict = data.dict() I get:
ValueError: Iterable over raw text documents expected, string object received.
With data_dict = pd.DataFrame(data.dict()) I get:
ValueError: If using all scalar values, you must pass an index
I believe I understand the problem: my Data class is expecting a string, which this is not; however, I have not been able to determine how to define and/or pass the expected data so that fit_transform() will work. Ultimately I want a prediction returned based on the submitted messages value. Bonus if I can pass a dataframe of one or more rows and have predictions made and returned for each row. The response will include the id, project, and prediction so that in future we can leverage this response to post the prediction back to the original (requesting) system.
test_connection.py
#%%
import requests
import pandas as pd
import json
import os
from pprint import pprint
url = 'http://127.0.0.1:8000/predict'
print(os.getcwd())
#%%
df = pd.DataFrame(
    {
        'id': ['ab410483801c38', 'cd34148639180'],
        'project': ['project1', 'project2'],
        'messages': ['This is message 1', 'This is message 2']
    }
)
to_predict_dict = df.iloc[0].to_dict()
#%%
r = requests.post(url, json=to_predict_dict)
main.py
#!/usr/bin/env python
# coding: utf-8
import pickle
import pandas as pd
import numpy as np
from pydantic import BaseModel
from sklearn.feature_extraction.text import TfidfVectorizer
# Server
import uvicorn
from fastapi import FastAPI
# Model
import xgboost as xgb
app = FastAPI()
clf = pickle.load(open('data/xgbmodel.pickle', 'rb'))

class Data(BaseModel):
    # id: str
    project: str
    messages: str

@app.get("/ping")
async def test():
    return {"ping": "pong"}

@app.post("/predict")
async def predict(data: Data):
    # data_dict = data.dict()
    data_dict = pd.DataFrame(data.dict())
    tfidf_vect = TfidfVectorizer(stop_words="english", analyzer='word', token_pattern=r'\w{1,}')
    tfidf_vect.fit_transform(data_dict['messages'])
    # to_predict = tfidf_vect.transform(data_dict['messages'])
    # prediction = clf.predict(to_predict)
    return {"response": "Success"}
Probably not the most elegant solution but I've made progress using the following:
def predict(data: Data):
    data_dict = pd.DataFrame(
        {
            'id': [data.id],
            'project': [data.project],
            'messages': [data.messages]
        }
    )
First, encode your DataFrame df as record-oriented JSON:
r = requests.post(url, json=df.to_json(orient='records'))
Then, decode your data inside the /predict/ endpoint with:
df = pd.DataFrame(jsonable_encoder(data))
Remember the import: from fastapi.encoders import jsonable_encoder.
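A minimal sketch of how the endpoint side can look with this approach, assuming the request body arrives as a JSON array of records (for example, sent with df.to_dict(orient='records')) and reusing app, Data, and pd from the question's main.py:
from typing import List
from fastapi.encoders import jsonable_encoder

@app.post("/predict/")
async def predict(data: List[Data]):
    df = pd.DataFrame(jsonable_encoder(data))  # one row per submitted record
    # ... run the vectorizer / model on df['messages'] here ...
    return {"rows_received": len(df)}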
A new library called pandera now supports direct passage of DataFrames without conversion via FastAPI. The docs are a bit basic as of posting this, but may be worth reading: https://pandera.readthedocs.io/en/latest/fastapi.html#fastapi-integration.
I was able to address the issue by simply converting data.messages into a list. I also had to make some unrelated changes: I had failed to pickle my vectorizer (string tokenizer).
import pickle
import pandas as pd
import numpy as np
import json
import time
from pydantic import BaseModel
from sklearn.feature_extraction.text import TfidfVectorizer
# Server / endpoint
import uvicorn
from fastapi import FastAPI
# Model
import xgboost as xgb
app = FastAPI(debug=True)
clf = pickle.load(open('data/xgbmodel.pickle', 'rb'))
vect = pickle.load(open('data/tfidfvect.pickle', 'rb'))
class Data(BaseModel):
    id: str = None
    project: str
    messages: str

@app.get("/ping")
async def ping():
    return {"ping": "pong"}

@app.post("/predict/")
def predict(data: Data):
    start = time.time()
    data_l = [data.messages]  # make messages iterable.
    to_predict = vect.transform(data_l)
    prediction = clf.predict(to_predict)
    exec_time = round((time.time() - start), 3)
    return {
        "id": data.id,
        "project": data.project,
        "prediction": prediction[0],
        "execution_time": exec_time
    }

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)
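For a quick check against this version of the endpoint (note the trailing slash on /predict/), a request like the one from the earlier test script works:
import requests

to_predict_dict = {
    'id': 'ab410483801c38',
    'project': 'project1',
    'messages': 'This is message 1'
}
r = requests.post('http://127.0.0.1:8000/predict/', json=to_predict_dict)
print(r.json())  # {'id': ..., 'project': ..., 'prediction': ..., 'execution_time': ...}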