pyspark-2.3 sparkml LogisticRegression model load issue - python

I am doing a sample PySpark ML exercise where I need to store a model and read it back. I can save the model successfully, but when I try to read/load it back it throws the exception below. I am new to Spark ML and Python, so please guide me on this.
Code:
from pyspark.sql import *
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.tuning import TrainValidationSplitModel
spark = SparkSession.builder.appName("LocalMLSparkSession").master("local").getOrCreate()
df = spark.read.json("/to_data/simpleml.json").orderBy("value2")
df.select(df.color).distinct().show(10, False)
train, test = df.randomSplit([0.7, 0.3])
rForm = RFormula()
ls = LogisticRegression().setLabelCol("label").setFeaturesCol("features")
# setting pipeline
stages = [rForm,ls]
pipeline = Pipeline().setStages(stages)
#setting param grid builder
params = ParamGridBuilder()\
.addGrid(rForm.formula,["lab ~ . + color:value1", "lab ~ . + color:value1 + color:value2"])\
.addGrid(ls.elasticNetParam, [0.0, 0.5, 1.0])\
.addGrid(ls.regParam,[0.1, 0.2])\
.build()
#setting evaluator
evaluator = BinaryClassificationEvaluator()\
.setMetricName("areaUnderROC")\
.setRawPredictionCol("prediction")\
.setLabelCol("label")
#checking hyperparameters to train datasets
tvs = TrainValidationSplit()\
.setTrainRatio(0.75)\
.setEstimatorParamMaps(params)\
.setEstimator(pipeline)\
.setEvaluator(evaluator)
tvsFitted = tvs.fit(train)
evl = evaluator.evaluate(tvsFitted.transform(test))
tvsFitted.transform(test).select("features", "label", "prediction").show(10,False)
print(evl)
pip_model = tvsFitted.bestModel
pip_model.write().overwrite().save("/to_path/sparkml/model")
model = TrainValidationSplitModel().load("/to_path/sparkml/model")
model.transform(test)
Exception:
Traceback (most recent call last):
File "/home/dd/dd/python-workspace/SparkMLPipelineDemo.py", line 59, in <module>
model = TrainValidationSplitModel().load("/to_path/sparkml/model")
TypeError: __init__() missing 1 required positional argument: 'bestModel'
Process finished with exit code 1

You need to remove the parentheses when you load: TrainValidationSplitModel() tries to construct a new instance, and its __init__ requires a bestModel argument (hence the TypeError), whereas load should be called on the class itself. Replace:
model = TrainValidationSplitModel().load("/to_path/sparkml/model")
with
model = TrainValidationSplitModel.load("/to_path/sparkml/model")
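Note also that in your code the object you save is tvsFitted.bestModel, which is a PipelineModel (your estimator is a Pipeline), so once the parentheses are removed you would typically load it back with the matching class. A minimal sketch, assuming the same path as in your code:
from pyspark.ml import PipelineModel
pip_model = tvsFitted.bestModel  # PipelineModel, since the estimator is a Pipeline
pip_model.write().overwrite().save("/to_path/sparkml/model")
loaded = PipelineModel.load("/to_path/sparkml/model")  # load is called on the class, no ()
loaded.transform(test).select("features", "label", "prediction").show(10, False)
If you save tvsFitted itself instead, then TrainValidationSplitModel.load("/to_path/sparkml/model") is the matching call.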

Related

Deploying a databricks model as a scoring webservice failed in Azure Machine Learning

I am doing Azure Databricks lab 04 (Integrating Azure Databricks and Azure Machine Learning) -> 2. Deploying Models in Azure Machine Learning. The idea is to (1) train a model, (2) deploy that model to an Azure Container Instance (ACI) in AML, and (3) make predictions via HTTPS. However, I get an error when deploying the model.
The full code from the notebook is displayed at the bottom or can be found here: https://adb-4934989010098757.17.azuredatabricks.net/?o=4934989010098757#notebook/4364513836468644/command/4364513836468645 .
I run the actual model deployment in the following way:
aci_service_name='nyc-taxi-service'
service = Model.deploy(workspace=ws,
name=aci_service_name,
models=[registered_model],
inference_config=inference_config,
deployment_config= aci_config,
overwrite=True)
service.wait_for_deployment(show_output=True)
print(service.state)
After running the model deployment, the cell runs for over 25 minutes and breaks when checking the status of the inference endpoint. It gives the following error:
"Service deployment polling reached non-successful terminal state, current service state: Failed
"code": "AciDeploymentFailed",
"statusCode": 400,
"message": "Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function."
The scoring script looks like this:
script_dir = 'scripts'
dbutils.fs.mkdirs(script_dir)
script_dir_path = os.path.join('/dbfs', script_dir)
print("Script directory path:", script_dir_path)
%%writefile $script_dir_path/score.py
import json
import numpy as np
import pandas as pd
import sklearn
import joblib
from azureml.core.model import Model
columns = ['passengerCount', 'tripDistance', 'hour_of_day', 'day_of_week',
'month_num', 'normalizeHolidayName', 'isPaidTimeOff', 'snowDepth',
'precipTime', 'precipDepth', 'temperature']
def init():
    global model
    model_path = Model.get_model_path('nyc-taxi-fare')
    model = joblib.load(model_path)
    print('model loaded')
def run(input_json):
    # Get predictions and explanations for each data point
    inputs = json.loads(input_json)
    data_df = pd.DataFrame(np.array(inputs).reshape(-1, len(columns)), columns=columns)
    # Make prediction
    predictions = model.predict(data_df)
    # You can return any data type as long as it is JSON-serializable
    return {'predictions': predictions.tolist()}
Does someone know how I could fix this potentially? Thanks in advance!
The full code is displayed below:
**Required Libraries**:
* `azureml-sdk[databricks]` via PyPI
* `sklearn-pandas==2.1.0` via PyPI
* `azureml-mlflow` via PyPI
import os
import numpy as np
import pandas as pd
import pickle
import sklearn
import joblib
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib
import matplotlib.pyplot as plt
import azureml
from azureml.core import Workspace, Experiment, Run
from azureml.core.model import Model
print('The azureml.core version is {}'.format(azureml.core.VERSION))
%md
### Connect to the AML workspace
%md
In the following cell, be sure to set the values for `subscription_id`, `resource_group`, and `workspace_name` as directed by the comments. Please note, you can copy the subscription ID and resource group name from the **Overview** page on the blade for the Azure ML workspace in the Azure portal.
#Provide the Subscription ID of your existing Azure subscription
subscription_id = " ..... "
#Replace the name below with the name of your resource group
resource_group = "RG_1"
#Replace the name below with the name of your Azure Machine Learning workspace
workspace_name = "aml-ws"
print("subscription_id:", subscription_id)
print("resource_group:", resource_group)
print("workspace_name:", workspace_name)
%md
**Important Note**: You will be prompted to login in the text that is output below the cell. Be sure to navigate to the URL displayed and enter the code that is provided. Once you have entered the code, return to this notebook and wait for the output to read `Workspace configuration succeeded`.
*Also note that the sign-on link and code only appear the first time in a session. If an authenticated session is already established, you won't be prompted to enter the code and authenticate when creating an instance of the Workspace.*
ws = Workspace(subscription_id, resource_group, workspace_name)
print(ws)
print('Workspace region:', ws.location)
print('Workspace configuration succeeded')
%md
### Load the training data
In this notebook, we will be using a subset of NYC Taxi & Limousine Commission - green taxi trip records available from [Azure Open Datasets]( https://azure.microsoft.com/en-us/services/open-datasets/). The data is enriched with holiday and weather data. Each row of the table represents a taxi ride that includes columns such as number of passengers, trip distance, datetime information, holiday and weather information, and the taxi fare for the trip.
Run the following cell to load the table into a Spark dataframe and review the dataframe.
dataset = spark.sql("select * from nyc_taxi_1_csv").toPandas()
display(dataset)
%md
### Use MLflow with Azure Machine Learning for Model Training
In the subsequent cells you will learn to do the following:
- Set up MLflow tracking URI so as to use Azure ML
- Create MLflow experiment – this will create a corresponding experiment in Azure ML Workspace
- Train a model on Azure Databricks cluster while logging metrics and artifacts using MLflow
- Save the trained model to Databricks File System (DBFS)
import mlflow
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
experiment_name = 'MLflow-AML-Exercise'
mlflow.set_experiment(experiment_name)
print("Training model...")
output_folder = 'outputs'
model_file_name = 'nyc-taxi.pkl'
dbutils.fs.mkdirs(output_folder)
model_file_path = os.path.join('/dbfs', output_folder, model_file_name)
with mlflow.start_run() as run:
    df = dataset.dropna(subset=['totalAmount'])
    x_df = df.drop(['totalAmount'], axis=1)
    y_df = df['totalAmount']
    X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)
    numerical = ['passengerCount', 'tripDistance', 'snowDepth', 'precipTime', 'precipDepth', 'temperature']
    categorical = ['hour_of_day', 'day_of_week', 'month_num', 'normalizeHolidayName', 'isPaidTimeOff']
    numeric_transformations = [([f], Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])) for f in numerical]
    categorical_transformations = [([f], OneHotEncoder(handle_unknown='ignore', sparse=False)) for f in categorical]
    transformations = numeric_transformations + categorical_transformations
    clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations, df_out=True)),
                          ('regressor', GradientBoostingRegressor())])
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    y_actual = y_test.values.flatten().tolist()
    rmse = math.sqrt(mean_squared_error(y_actual, y_predict))
    mlflow.log_metric('rmse', rmse)
    mae = mean_absolute_error(y_actual, y_predict)
    mlflow.log_metric('mae', mae)
    r2 = r2_score(y_actual, y_predict)
    mlflow.log_metric('R2 score', r2)
    plt.figure(figsize=(10,10))
    plt.scatter(y_actual, y_predict, c='crimson')
    plt.yscale('log')
    plt.xscale('log')
    p1 = max(max(y_predict), max(y_actual))
    p2 = min(min(y_predict), min(y_actual))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    results_graph = os.path.join('/dbfs', output_folder, 'results.png')
    plt.savefig(results_graph)
    mlflow.log_artifact(results_graph)
    joblib.dump(clf, open(model_file_path,'wb'))
    mlflow.log_artifact(model_file_path)
%md
Run the cell below to list the experiment run in Azure Machine Learning Workspace that you just completed.
aml_run = list(ws.experiments[experiment_name].get_runs())[0]
aml_run
%md
## Exercise 1: Register a databricks-trained model in AML
Azure Machine Learning provides a Model Registry that acts like a version-controlled repository for each of your trained models. To version a model, you use the SDK as follows. Run the following cell to register the model with Azure Machine Learning.
model_name = 'nyc-taxi-fare'
model_description = 'Model to predict taxi fares in NYC.'
model_tags = {"Type": "GradientBoostingRegressor",
"Run ID": aml_run.id,
"Metrics": aml_run.get_metrics()}
registered_model = Model.register(model_path=model_file_path, #Path to the saved model file
model_name=model_name,
tags=model_tags,
description=model_description,
workspace=ws)
print(registered_model)
%md
## Exercise 2: Deploy a service that uses the model
%md
### Create the scoring script
script_dir = 'scripts'
dbutils.fs.mkdirs(script_dir)
script_dir_path = os.path.join('/dbfs', script_dir)
print("Script directory path:", script_dir_path)
%%writefile $script_dir_path/score.py
import json
import numpy as np
import pandas as pd
import sklearn
import joblib
from azureml.core.model import Model
columns = ['passengerCount', 'tripDistance', 'hour_of_day', 'day_of_week',
'month_num', 'normalizeHolidayName', 'isPaidTimeOff', 'snowDepth',
'precipTime', 'precipDepth', 'temperature']
def init():
    global model
    model_path = Model.get_model_path('nyc-taxi-fare')
    model = joblib.load(model_path)
    print('model loaded')
def run(input_json):
    # Get predictions and explanations for each data point
    inputs = json.loads(input_json)
    data_df = pd.DataFrame(np.array(inputs).reshape(-1, len(columns)), columns=columns)
    # Make prediction
    predictions = model.predict(data_df)
    # You can return any data type as long as it is JSON-serializable
    return {'predictions': predictions.tolist()}
%md
### Create the deployment environment
from azureml.core import Environment
from azureml.core.environment import CondaDependencies
my_env_name="nyc-taxi-env"
myenv = Environment.get(workspace=ws, name='AzureML-Minimal').clone(my_env_name)
conda_dep = CondaDependencies()
conda_dep.add_pip_package("numpy==1.18.1")
conda_dep.add_pip_package("pandas==1.1.5")
conda_dep.add_pip_package("joblib==0.14.1")
conda_dep.add_pip_package("scikit-learn==0.24.1")
conda_dep.add_pip_package("sklearn-pandas==2.1.0")
conda_dep.add_pip_package("azure-ml-api-sdk")
myenv.python.conda_dependencies=conda_dep
print("Review the deployment environment.")
myenv
%md
### Create the inference configuration
from azureml.core.model import InferenceConfig
inference_config = InferenceConfig(entry_script='score.py', source_directory=script_dir_path, environment=myenv)
print("InferenceConfig created.")
%md
### Create the deployment configuration
In this exercise we will use an Azure Container Instance (ACI) to deploy the model.
from azureml.core.webservice import AciWebservice, Webservice
description = 'NYC Taxi Fare Predictor Service'
aci_config = AciWebservice.deploy_configuration(
cpu_cores=3,
memory_gb=15,
location='eastus',
description=description,
auth_enabled=True,
tags = {'name': 'ACI container',
'model_name': registered_model.name,
'model_version': registered_model.version
}
)
print("AciWebservice deployment configuration created.")
%md
### Deploy the model as a scoring webservice
Please note that it can take **10-15 minutes** for the deployment to complete.
aci_service_name='nyc-taxi-service'
service = Model.deploy(workspace=ws,
name=aci_service_name,
models=[registered_model],
inference_config=inference_config,
deployment_config= aci_config,
overwrite=True)
service.wait_for_deployment(show_output=True)
print(service.state)
%md
## Exercise 3: Consume the deployed service
%md
**Review the webservice endpoint URL and API key**
api_key, _ = service.get_keys()
print("Deployed ACI test Webservice: {} \nWebservice Uri: {} \nWebservice API Key: {}".
format(service.name, service.scoring_uri, api_key))
%md
**Prepare test data**
#['passengerCount', 'tripDistance', 'hour_of_day', 'day_of_week', 'month_num',
# 'normalizeHolidayName', 'isPaidTimeOff', 'snowDepth', 'precipTime', 'precipDepth', 'temperature']
data1 = [2, 5, 9, 4, 5, 'Memorial Day', True, 0, 0.0, 0.0, 65]
data2 = [[3, 10, 15, 4, 7, 'None', False, 0, 2.0, 1.0, 80],
[2, 5, 9, 4, 5, 'Memorial Day', True, 0, 0.0, 0.0, 65]]
print("Test data prepared.")
dataset.head()
%md
### Consume the deployed webservice over HTTP
import requests
import json
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}
response = requests.post(service.scoring_uri, json.dumps(data1), headers=headers)
print('Predictions for data1')
print(response.text)
print("")
response = requests.post(service.scoring_uri, json.dumps(data2), headers=headers)
print('Predictions for data2')
print(response.text)
%md
### Clean-up
When you are done with the exercise, delete the deployed webservice by running the cell below.
service.delete()
print("Deployed webservice deleted.")

Trying to run Flask and getting message saying 'ModuleNotFoundError: No module named 'sklearn''

I have two python files. Here is my ml.py file.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import pickle
data = pd.read_csv("C:\\Users\\ryans\\Desktop\\insurance.csv")
le = LabelEncoder()
le.fit(data['sex'])
data['Sex'] = le.transform(data['sex'])
le.fit(data['smoker'])
data['Smoker'] = le.transform(data['smoker'])
le.fit(data['region'])
data['Region'] = le.transform(data['region'])
#independent and dependent columns
x = data[["age", "bmi", "children", "Sex", "Smoker", "Region"]]
y = data['charges']
#split in train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
#model training
linreg = LinearRegression()
linreg.fit(x_train, y_train)
#model testing
predictions = linreg.predict(x_test)
linreg.score(x_test,y_test)
#save the model
file = open("C:\\Users\\ryans\\Desktop\\expense_model.pkl", 'wb')
pickle.dump(linreg, file)
Here is my run.py file.
from flask import Flask, render_template, request
import pickle
app = Flask(__name__)
model = pickle.load(open('C:\\Users\\ryans\\Desktop\\expense_model.pkl','rb')) #read mode
#app.route("/")
def home():
return render_template('index.html')
#app.route("/predict", methods=['GET','POST'])
def predict():
if request.method == 'POST':
#access the data from form
## Age
age = int(request.form["age"])
bmi = int(request.form["bmi"])
children = int(request.form["children"])
Sex = int(request.form["Sex"])
Smoker = int(request.form["Smoker"])
Region = int(request.form["Region"])
#get prediction
input_cols = [[age, bmi, children, Sex, Smoker, Region]]
prediction = model.predict(input_cols)
output = round(prediction[0], 2)
return render_template("index.html", prediction_text='Your predicted annual Healthcare Expense is $ {}'.format(output))
if __name__ == "__main__":
app.run(debug=True)
Now, when I try to run run.py in my Anaconda Prompt, I get this error message.
(base) C:\Users\ryans\Desktop\run.py
Traceback (most recent call last):
File "C:\Users\ryans\Desktop\run.py", line 5, in <module>
model = pickle.load(open('C:\\Users\\ryans\\Desktop\\expense_model.pkl','rb')) #read mode
ModuleNotFoundError: No module named 'sklearn'
Sklearn is installed and works fine when I run the code in Spyder. I think Flask is somehow not finding sklearn correctly, or something isn't exposed, or permissions aren't set right. I don't know what's going on with Flask; this is my first time using it. Sklearn itself is working fine, I know that. Any thoughts on what's wrong here?
You haven't imported sklearn in run.py, so it can't unpickle the sklearn objects in your file.
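A minimal sketch of that suggestion (assuming run.py is otherwise unchanged): add the scikit-learn imports at the top of run.py before the pickle is loaded.
from flask import Flask, render_template, request
import pickle
import sklearn  # make sklearn importable in this process
from sklearn.linear_model import LinearRegression  # class of the pickled model from ml.py
app = Flask(__name__)
model = pickle.load(open('C:\\Users\\ryans\\Desktop\\expense_model.pkl', 'rb'))  # read mode
If the error persists even with the import in place, it is worth confirming that the interpreter launching Flask is the same Anaconda environment where scikit-learn is installed.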

Error while unpickle custom pipeline Ml model in python

I created a custom pipeline in Python using the sklearn Pipeline, and it seems to run successfully.
But when I save the model as a pickle file and try to load that saved pickle file in a different notebook, it shows an error.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
path = 'C:/Users/Desktop/'
df = pd.read_excel (path + "df.xlsx", sheet_name='df')
###################################################################################
# import the BaseEstimator
from sklearn.base import BaseEstimator
# define the class OutletTypeEncoder
# custom transformer must have methods fit and transform
class OutletTypeEncoder(BaseEstimator):
    def __init__(self):
        pass
        #self.name = name
    def fit(self, df, y=None):
        return self
    def transform(self, df):
        # replace NaN
        df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']] = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']].fillna(value=-999)
        df[['pdf_tbl_cnt']] = df[['pdf_tbl_cnt']].fillna(value=0)
        # Replace gt 1 count as 0
        df['pdf_tbl_cnt'] = np.where((df['pdf_tbl_cnt'] == '1'), 1, 0)
        df['part_cnt'] = np.where((df['part_cnt'] == '1'), 1, 0)
        # create numeric and categorical columns
        obj_df = df[['pdf_tbl_pn_identifier','pdf_tbl_qty_identifier','pdf_header_present',
                     'pdf_body_pn_identifier','pdf_body_qty_identifier','pdf_model_rel_returned','pdf_model_ent_returned']]
        num_df = df[['pdf_tbl_cnt', 'pdf_model_avg_relationship_score','pdf_model_avg_entity_score','part_cnt']]
        # Label encoding for categorical columns
        obj_df = obj_df.apply(LabelEncoder().fit_transform)
        df = pd.concat([obj_df, num_df], axis=1)
        #df.reset_index(inplace=True, drop=True)
        df.pdf_tbl_pn_identifier = df.pdf_tbl_pn_identifier.astype(str)
        df.pdf_tbl_qty_identifier = df.pdf_tbl_qty_identifier.astype(str)
        df.pdf_body_pn_identifier = df.pdf_body_pn_identifier.astype(str)
        df.pdf_body_qty_identifier = df.pdf_body_qty_identifier.astype(str)
        df.pdf_model_rel_returned = df.pdf_model_rel_returned.astype(str)
        df.pdf_model_ent_returned = df.pdf_model_ent_returned.astype(str)
        df.pdf_header_present = df.pdf_header_present.astype(str)
        #df.matching = df.matching.astype(str)
        #df['pdf_tbl_cnt'] = df['pdf_tbl_cnt'].apply(np.int64)
        df.pdf_tbl_cnt = df.pdf_tbl_cnt.apply(np.int64)
        return df
#################################################################################
feature_cols = df.drop(['matching'], axis=1)
X = feature_cols # Features
y = df.matching # Target variable
# split into train test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
# Create Pipeline
logreg = LogisticRegression()
model_pipeline = Pipeline(steps=[('preprocess', OutletTypeEncoder()),
('logreg', LogisticRegression())
])
# fit the pipeline with the training data
model_pipeline.fit(X_train,y_train)
# Predict
y_pred=model_pipeline.predict(X_test)
Now I save the model as a pickle file and want to use that pickle file in another notebook.
But I get this error:
AttributeError: Can't get attribute 'OutletTypeEncoder' on <module '__main__'>
# Save the Modle to file in the current working directory
Pkl_Filename = "C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\logisic_Model_pipeline.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model_pipeline, file)
# Load the Model back from file
with open('C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\logisic_Model_pipeline.pkl', 'rb') as file:
    logisic_Model_pipeline = pickle.load(file)
logisic_Model_pipeline
Please help me resolve this issue.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-28376e81d621> in <module>
1 # Load the Model back from file
2 with open('C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\OutletTypeEncoder.pkl', 'rb') as file:
----> 3 OutletTypeEncoder = pickle.load(file)
4
5 OutletTypeEncoder
AttributeError: Can't get attribute 'OutletTypeEncoder' on <module '__main__'>

Is there a preferred method of saving and loading h2o word2vec models in python?

I have trained a word2vec model in the python h2o package.
Is there a simple way for me to save that word2vec model and load it back later for use?
I have tried the h2o.save_model() and h2o.load_model() functions with no luck.
I get an error like the following using that approach:
ERROR: Unexpected HTTP Status code: 412 Precondition Failed (url = http://localhost:54321/99/Models.bin/)
water.exceptions.H2OIllegalArgumentException
[1] "water.exceptions.H2OIllegalArgumentException: Illegal argument: dir of function: importModel:
I am using the same version of h2o to train and load the model back in, so the issue outlined in this question does not apply: Can't import binay h2o model with h2o.loadModel() function: 412 Precondition Failed
Anyone with any insights on how to save and load an h2o word2vec model?
Here is my sample code with a few of the important snippets:
import h2o
from h2o.estimators import H2OWord2vecEstimator
df['text'] = df['text'].ascharacter()
# Break text into sequence of words
words = tokenize(df["text"])
# Initializing h2o
print('Initializing h2o.')
h2o.init(ip=h2o_ip, port=h2o_port, min_mem_size=h2o_min_memory)
# Build word2vec model:
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)
# Calculate a vector for each row
word_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
#Save model to path
wv_path = '/models/wordvec/'
model_path = h2o.save_model(model = w2v_model, path= wv_path ,force=True)
# Load model in later script
w2v_model = h2o.load_model(model_path)
It sounds like you might have an access issue with the directory you're trying to read from. I just tested on H2O 3.30.0.1 following the w2v example from the docs and it ran fine:
job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
col_names = ["category", "jobtitle"],
col_types = ["string", "string"],
header = 1)
STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can",
"lines","re","what","there","all","we","one","the",
"a","an","of","or","in","for","by","on","but","is",
"in","a","not","with","as","was","if","they","are",
"this","and","it","have","from","at","my","be","by",
"not","that","to","from","com","org","like","likes",
"so"]
# Make the 'tokenize' function:
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words
# Break job titles into a sequence of words:
words = tokenize(job_titles["jobtitle"])
# Build word2vec model:
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)
#Save model
wv_path = 'models/'
model_path = h2o.save_model(model = w2v_model, path= wv_path ,force=True)
#Load Model
w2v_model2 = h2o.load_model(model_path)
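As a quick sanity check that the reloaded model behaves like the saved one, you can compare outputs from both. A minimal sketch, reusing find_synonyms (as in the H2O word2vec docs example) and the transform call from the code above:
# Both the original and reloaded models should return the same synonyms
print(w2v_model.find_synonyms("teacher", count=5))
print(w2v_model2.find_synonyms("teacher", count=5))
# And the reloaded model can still produce per-row vectors
word_vecs2 = w2v_model2.transform(words, aggregate_method="AVERAGE")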

flask _pickle.PicklingError:

I'm new to Flask. I'm trying to implement my text (bag-of-words) classifier model in Python and deploy it as a Flask web application, but I get an error when proceeding to other pages with the following code:
It is implemented using a Naive Bayes classifier and renders the result (pos or neg) on the news page.
from flask import Flask,render_template,url_for,request
import pandas as pd
import pickle
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import sklearn
import _pickle as cPickle
import numpy as np
from scipy.sparse.csr import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split,GridSearchCV,learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
app = Flask(__name__)
@app.route('/')
def home():
    return render_template('home.html')
@app.route('/predict', methods=['POST'])
def predict():
    messages = pd.read_csv('bitcoin_reddit.csv', usecols=["title","class"])
    messages['length'] = messages['title'].map(lambda text: len(text))
    def split_into_tokens(title):
        return TextBlob(title).words
    def split_into_lemmas(title):
        words = TextBlob(title).words.lower()
        # for each word, take its "base form" = lemma
        return [word.lemma for word in words]
    bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['title'])
    messages_bow = bow_transformer.transform(messages['title'])
    tfidf_transformer = TfidfTransformer().fit(messages_bow) #normalization can be done with TF-IDF
    messages_tfidf = tfidf_transformer.transform(messages_bow)
    #Training Model NB
    spam_detector = MultinomialNB().fit(messages_tfidf, messages['class'])
    all_predictions = spam_detector.predict(messages_tfidf)
    msg_train, msg_test, label_train, label_test = \
        train_test_split(messages['title'], messages['class'], test_size=0.2)
    pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier', MultinomialNB())])  # train on TF-IDF vectors w/ Naive Bayes classifier
    scores = cross_val_score(pipeline,  # steps to convert raw messages into models
                             msg_train,  # training data
                             label_train,  # training labels
                             cv=10,  # split data randomly into 10 parts: 9 for training, 1 for scoring
                             scoring='accuracy')  # which scoring metric?
    params = {
        'tfidf__use_idf': (True, False),
        'bow__analyzer': (split_into_lemmas, split_into_tokens),
    }
    grid = GridSearchCV(
        pipeline,  # pipeline from above
        params,  # parameters to tune via cross validation
        refit=True,  # fit using all available data at the end, on the best found param combination
        scoring='accuracy',  # what score are we optimizing?
        cv=StratifiedKFold(n_splits=5))  # what type of cross validation to use
    nb_detector = grid.fit(msg_train, label_train)
    predictions = nb_detector.predict(msg_test)
    joblib.dump(nb_detector, 'NB_model_bow.pkl')
    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        vect = cv.transform(data).toarray()
        my_prediction = nb_detector.predict(vect)
    return render_template('result.html', prediction=my_prediction)
if __name__ == '__main__':
    app.run(debug=True)
But I get this kind of error:
_pickle.PicklingError: Can't pickle <function predict.<locals>.split_into_lemmas at 0x000001ABF618AE18>: it's not found as __main__.predict.<locals>.split_into_lemmas
