Schedule training and testing machine learning - python

I have written a small machine learning script, a simple random forest regression, in the class Model. After creating an object of this class I print the predictions and the accuracy score, and I have also written code to schedule training every 30 days and testing every 7 days. But I'm facing an error.
Code:
import schedule
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from main import data as df


class Model():
    def __init__(self):
        self.df = df
        self.linear_reg = LinearRegression()
        self.random_forest = RandomForestRegressor()

    def split(self, test_size):
        X = np.array(self.df[['age', 'experience', 'certificates']])
        y = np.array(self.df['salary'])
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    def fit(self):
        self.model = self.random_forest.fit(self.X_train, self.y_train)

    def predict(self):
        self.result = self.random_forest.predict(self.X_test)
        print(self.result)
        print("Accuracy: ", self.model.score(self.X_test, self.y_test))


if __name__ == '__main__':
    model_instance = Model()
    model_instance.split(0.2)
    schedule.every(30).days.at("05:00").do(model_instance.fit())
    schedule.every(7).days.at("05:00").do(model_instance.predict())
    while 1:
        schedule.run_pending()
        time.sleep(1)
On this line schedule.every(30).days.at("05:00").do(model_instance.fit()) I'm getting the following error: the first argument must be callable

I'm not familiar with the schedule package, but I guess the argument to do must be a callable, which means you shouldn't actually call that function yourself. Try this:
schedule.every(30).days.at("05:00").do(model_instance.fit)
schedule.every(7).days.at("05:00").do(model_instance.predict)
Note I removed the parentheses after fit and predict.
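As a side note, do forwards any extra positional and keyword arguments to the callable when the job runs, so if a method did need parameters you could still schedule it without calling it yourself. A small sketch (the test_size value here is just an illustration):

schedule.every(30).days.at("05:00").do(model_instance.split, test_size=0.2)  # runs split(test_size=0.2) on schedule
schedule.every(30).days.at("05:00").do(model_instance.fit)                   # fit() takes no arguments, so the bound method alone is enough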

I figured it out. I created separate modules for training and testing, imported the Model class into each, and then wrote a function that performs the scheduling.
Function for Training:
import schedule
import time
from model import Model  # assuming the Model class is saved in model.py (module name not given in the original)

model_instance = Model()

def job():
    model_instance.split(0.2)
    model_instance.fit()
    print("Training Completed")

schedule.every().minute.at(":17").do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
Function for Testing:
import schedule
import time
from model import Model  # assuming the Model class is saved in model.py (module name not given in the original)

model_instance = Model()  # this instance must already be split and fitted before predict() can run

def job():
    model_instance.predict()
    print(model_instance.result)
    print("Accuracy: ", model_instance.model.score(model_instance.X_test, model_instance.y_test))
    print("Testing Completed")

schedule.every().minute.at(":17").do(job)

while True:
    schedule.run_pending()
    time.sleep(1)

Related

Trying to run Flask and getting message saying 'ModuleNotFoundError: No module named 'sklearn''

I have two python files. Here is my ml.py file.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import pickle
data = pd.read_csv("C:\\Users\\ryans\\Desktop\\insurance.csv")
le = LabelEncoder()
le.fit(data['sex'])
data['Sex'] = le.transform(data['sex'])
le.fit(data['smoker'])
data['Smoker'] = le.transform(data['smoker'])
le.fit(data['region'])
data['Region'] = le.transform(data['region'])
#independent and dependent columns
x = data[["age", "bmi", "children", "Sex", "Smoker", "Region"]]
y = data['charges']
#split in train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
#model training
linreg = LinearRegression()
linreg.fit(x_train, y_train)
#model testing
predictions = linreg.predict(x_test)
linreg.score(x_test,y_test)
#save the model
file = open("C:\\Users\\ryans\\Desktop\\expense_model.pkl", 'wb')
pickle.dump(linreg, file)
Here is my run.py file.
from flask import Flask, render_template, request
import pickle

app = Flask(__name__)
model = pickle.load(open('C:\\Users\\ryans\\Desktop\\expense_model.pkl', 'rb'))  # read mode

@app.route("/")
def home():
    return render_template('index.html')

@app.route("/predict", methods=['GET', 'POST'])
def predict():
    if request.method == 'POST':
        # access the data from form
        ## Age
        age = int(request.form["age"])
        bmi = int(request.form["bmi"])
        children = int(request.form["children"])
        Sex = int(request.form["Sex"])
        Smoker = int(request.form["Smoker"])
        Region = int(request.form["Region"])
        # get prediction
        input_cols = [[age, bmi, children, Sex, Smoker, Region]]
        prediction = model.predict(input_cols)
        output = round(prediction[0], 2)
        return render_template("index.html", prediction_text='Your predicted annual Healthcare Expense is $ {}'.format(output))

if __name__ == "__main__":
    app.run(debug=True)
Now, when I try to run run.py in my Anaconda Prompt, I get this error message.
(base) C:\Users\ryans\Desktop\run.py
Traceback (most recent call last):
File "C:\Users\ryans\Desktop\run.py", line 5, in <module>
model = pickle.load(open('C:\\Users\\ryans\\Desktop\\expense_model.pkl','rb')) #read mode
ModuleNotFoundError: No module named 'sklearn'
Sklearn is installed and it works fine when I run it in Spyder. I think, somehow, Flask is not finding the Sklearn correctly. Or, something isn't exposed. Or, rights are not set right. I don't know what's going on with Flask. This is my first time using it. Sklearn is working fine. I know that. Any thoughts on what's wrong here?
You haven't imported sklearn in run.py, so it can't unpickle the sklearn objects in your file.
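A minimal sketch of what that suggestion looks like at the top of run.py; note this assumes run.py is launched with the same Python environment (for example the activated conda environment) that has scikit-learn installed, otherwise the import itself will fail:

# run.py
from flask import Flask, render_template, request
import pickle
from sklearn.linear_model import LinearRegression  # make sure sklearn is importable before the pickle is loaded

app = Flask(__name__)
model = pickle.load(open('C:\\Users\\ryans\\Desktop\\expense_model.pkl', 'rb'))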

Error while unpickling custom pipeline ML model in python

I created a custom pipeline in python using the sklearn Pipeline, and it seems to run successfully.
But when I save the model as a pickle file and then try to load that pickle file in a different notebook, it shows an error.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
path = 'C:/Users/Desktop/'
df = pd.read_excel (path + "df.xlsx", sheet_name='df')
###################################################################################
# import the BaseEstimator
from sklearn.base import BaseEstimator
# define the class OutletTypeEncoder
# custom transformer must have methods fit and transform
class OutletTypeEncoder(BaseEstimator):
    def __init__(self):
        pass
        #self.name = name

    def fit(self, df, y=None):
        return self

    def transform(self, df):
        # replace NaN
        df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']] = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']].fillna(value=-999)
        df[['pdf_tbl_cnt']] = df[['pdf_tbl_cnt']].fillna(value=0)
        # Replace gt 1 count as 0
        df['pdf_tbl_cnt'] = np.where((df['pdf_tbl_cnt'] == '1'), 1, 0)
        df['part_cnt'] = np.where((df['part_cnt'] == '1'), 1, 0)
        # create numeric and categorical columns
        obj_df = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present',
                     'pdf_body_pn_identifier', 'pdf_body_qty_identifier', 'pdf_model_rel_returned', 'pdf_model_ent_returned']]
        num_df = df[['pdf_tbl_cnt', 'pdf_model_avg_relationship_score', 'pdf_model_avg_entity_score', 'part_cnt']]
        # label-encode the categorical columns, then recombine
        obj_df = obj_df.apply(LabelEncoder().fit_transform)
        df = pd.concat([obj_df, num_df], axis=1)
        #df.reset_index(inplace=True, drop=True)
        df.pdf_tbl_pn_identifier = df.pdf_tbl_pn_identifier.astype(str)
        df.pdf_tbl_qty_identifier = df.pdf_tbl_qty_identifier.astype(str)
        df.pdf_body_pn_identifier = df.pdf_body_pn_identifier.astype(str)
        df.pdf_body_qty_identifier = df.pdf_body_qty_identifier.astype(str)
        df.pdf_model_rel_returned = df.pdf_model_rel_returned.astype(str)
        df.pdf_model_ent_returned = df.pdf_model_ent_returned.astype(str)
        df.pdf_header_present = df.pdf_header_present.astype(str)
        #df.matching = df.matching.astype(str)
        #df['pdf_tbl_cnt'] = df['pdf_tbl_cnt'].apply(np.int64)
        df.pdf_tbl_cnt = df.pdf_tbl_cnt.apply(np.int64)
        return df
#################################################################################
feature_cols = df.drop(['matching'], axis=1)
X = feature_cols # Features
y = df.matching # Target variable
# split into train test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
# Create Pipeline
logreg = LogisticRegression()
model_pipeline = Pipeline(steps=[('preprocess', OutletTypeEncoder()),
                                 ('logreg', LogisticRegression())])
# fit the pipeline with the training data
model_pipeline.fit(X_train,y_train)
# Predict
y_pred=model_pipeline.predict(X_test)
Now I save the model as a pickle file and want to use that pickle file in another notebook.
But I get this error:
AttributeError: Can't get attribute 'OutletTypeEncoder' on <module '__main__'>
import pickle

# Save the model to a file in the current working directory
Pkl_Filename = "C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\logisic_Model_pipeline.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model_pipeline, file)

# Load the model back from the file
with open('C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\logisic_Model_pipeline.pkl', 'rb') as file:
    logisic_Model_pipeline = pickle.load(file)

logisic_Model_pipeline
Please help me figure out how to resolve this issue.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-28376e81d621> in <module>
1 # Load the Model back from file
2 with open('C:\\Users\\SafayetKarim\\Desktop\\confidence_score\\results_updated\\pdf\\OutletTypeEncoder.pkl', 'rb') as file:
----> 3 OutletTypeEncoder = pickle.load(file)
4
5 OutletTypeEncoder
AttributeError: Can't get attribute 'OutletTypeEncoder' on <module '__main__'>
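A common way around this error, sketched here under the assumption that the transformer is moved into its own file (outlet_type_encoder.py is a hypothetical name), is to define OutletTypeEncoder in an importable module and import it from there both when the pipeline is pickled and when it is loaded, so pickle no longer looks for the class on __main__:

# outlet_type_encoder.py  (hypothetical module holding the transformer)
# from sklearn.base import BaseEstimator
# class OutletTypeEncoder(BaseEstimator):
#     ...

# notebook that builds and saves the pipeline
from outlet_type_encoder import OutletTypeEncoder
model_pipeline = Pipeline(steps=[('preprocess', OutletTypeEncoder()),
                                 ('logreg', LogisticRegression())])
# ... fit and pickle.dump(model_pipeline, file) as before

# notebook that loads the pipeline
import pickle
from outlet_type_encoder import OutletTypeEncoder  # must be importable before pickle.load
with open('logisic_Model_pipeline.pkl', 'rb') as file:
    logisic_Model_pipeline = pickle.load(file)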

flask _pickle.PicklingError:

I'm new to flask. I'm trying to implement my text (bag-of-words) classifier model in python and deploy it with a flask web application, but I get the error below when proceeding to other pages with the following code.
It is implemented using a naive bayes classifier and renders the result on a news page, which gives pos or neg.
from flask import Flask, render_template, url_for, request
import pandas as pd
import pickle
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import sklearn
import _pickle as cPickle
import numpy as np
from scipy.sparse.csr import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV, learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('home.html')

@app.route('/predict', methods=['POST'])
def predict():
    messages = pd.read_csv('bitcoin_reddit.csv', usecols=["title", "class"])
    messages['length'] = messages['title'].map(lambda text: len(text))

    def split_into_tokens(title):
        return TextBlob(title).words

    def split_into_lemmas(title):
        words = TextBlob(title).words.lower()
        # for each word, take its "base form" = lemma
        return [word.lemma for word in words]

    bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['title'])
    messages_bow = bow_transformer.transform(messages['title'])
    tfidf_transformer = TfidfTransformer().fit(messages_bow)  # normalization can be done with TF-IDF
    messages_tfidf = tfidf_transformer.transform(messages_bow)

    # Training Model NB
    spam_detector = MultinomialNB().fit(messages_tfidf, messages['class'])
    all_predictions = spam_detector.predict(messages_tfidf)

    msg_train, msg_test, label_train, label_test = \
        train_test_split(messages['title'], messages['class'], test_size=0.2)

    pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),                         # integer counts to weighted TF-IDF scores
        ('classifier', MultinomialNB())])                      # train on TF-IDF vectors w/ Naive Bayes classifier

    scores = cross_val_score(pipeline,     # steps to convert raw messages into models
                             msg_train,    # training data
                             label_train,  # training labels
                             cv=10,        # split data randomly into 10 parts: 9 for training, 1 for scoring
                             scoring='accuracy')  # which scoring metric?

    params = {
        'tfidf__use_idf': (True, False),
        'bow__analyzer': (split_into_lemmas, split_into_tokens),
    }

    grid = GridSearchCV(
        pipeline,                        # pipeline from above
        params,                          # parameters to tune via cross validation
        refit=True,                      # fit using all available data at the end, on the best found param combination
        scoring='accuracy',              # what score are we optimizing?
        cv=StratifiedKFold(n_splits=5))  # what type of cross validation to use

    nb_detector = grid.fit(msg_train, label_train)
    predictions = nb_detector.predict(msg_test)
    joblib.dump(nb_detector, 'NB_model_bow.pkl')

    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        vect = cv.transform(data).toarray()
        my_prediction = nb_detector.predict(vect)
    return render_template('result.html', prediction=my_prediction)

if __name__ == '__main__':
    app.run(debug=True)
But I got this error:
_pickle.PicklingError: Can't pickle <function predict.<locals>.split_into_lemmas at 0x000001ABF618AE18>: it's not found as __main__.predict.<locals>.split_into_lemmas
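The message points at the cause: split_into_lemmas is defined inside predict, so it is a local function, and pickle/joblib can only serialize functions that are importable at module level; the fitted grid search keeps a reference to it, so joblib.dump fails. A minimal sketch of the usual workaround, assuming the rest of the training code stays as it is, is to move the two helpers to the top of the file:

from textblob import TextBlob

# module-level helpers, so the fitted pipeline that references them can be pickled
def split_into_tokens(title):
    return TextBlob(title).words

def split_into_lemmas(title):
    words = TextBlob(title).words.lower()
    return [word.lemma for word in words]

# predict() then builds the CountVectorizer, pipeline and grid search exactly as before,
# and joblib.dump(nb_detector, 'NB_model_bow.pkl') no longer raises PicklingError.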

pyspark-2.3 sparkml LogisticRegression model load issue

I am doing a sample pyspark ml exercise where I need to store a model and read it back. I am able to save the model successfully, but when I try to read/load it back it throws the exception below. I am new to spark ml and python; please guide me on this.
Code:
from pyspark.sql import *
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.tuning import TrainValidationSplitModel
spark = SparkSession.builder.appName("LocalMLSparkSession").master("local").getOrCreate()
df = spark.read.json("/to_data/simpleml.json").orderBy("value2")
df.select(df.color).distinct().show(10, False)
train, test = df.randomSplit([0.7, 0.3])
rForm = RFormula()
ls = LogisticRegression().setLabelCol("label").setFeaturesCol("features")
# setting pipeline
stages = [rForm,ls]
pipeline = Pipeline().setStages(stages)
#setting param grid builder
params = ParamGridBuilder()\
    .addGrid(rForm.formula, ["lab ~ . + color:value1", "lab ~ . + color:value1 + color:value2"])\
    .addGrid(ls.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(ls.regParam, [0.1, 0.2])\
    .build()

#setting evaluator
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")

#checking hyperparameters to train datasets
tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)
tvsFitted = tvs.fit(train)
evl = evaluator.evaluate(tvsFitted.transform(test))
tvsFitted.transform(test).select("features", "label", "prediction").show(10,False)
print(evl)
pip_model = tvsFitted.bestModel
pip_model.write().overwrite().save("/to_path/sparkml/model")
model = TrainValidationSplitModel().load("/to_path/sparkml/model")
model.transform(test)
Exception:
Traceback (most recent call last):
File "/home/dd/dd/python-workspace/SparkMLPipelineDemo.py", line 59, in <module>
model = TrainValidationSplitModel().load("/to_path/sparkml/model")
TypeError: __init__() missing 1 required positional argument: 'bestModel'
Process finished with exit code 1
You need to remove the parentheses when you load, since load is a classmethod and should be called on the class itself, e.g. replace:
model = TrainValidationSplitModel().load("/to_path/sparkml/model")
with
model = TrainValidationSplitModel.load("/to_path/sparkml/model")

Trying to wrap up a keras model in a flask REST app but getting a ValueError

I can create a simple keras model by running
python create-flask-model.py
create-flask-model.py
## points in square that are in or out of a quarter circle
import random
import math
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

training_size = 8000
testing_size = 2000
batch_size = 10
epoch_no = 30
modelStructureFileName = 'simple-flask.json'
modelWeightFileName = 'simple-flask.h5'

def get_model():
    model = Sequential()
    model.add(Dense(4, input_dim=2, activation='tanh'))
    model.add(Dense(4, activation='tanh'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    return model

def get_data_instances(size):
    result = []
    for i in range(0, size):
        number_1 = random.uniform(0, 1)
        number_2 = random.uniform(0, 1)
        squares = math.pow(number_1, 2) + math.pow(number_2, 2)
        target = 0
        if squares < 0.49:
            target = 1
        line = number_1, number_2, target
        result.append(line)
    return np.array(result)

## create data and split in to training and test, features and targets
data_instances = get_data_instances(training_size + testing_size)
train_x, train_y = data_instances[:training_size, 0:2], data_instances[:training_size, -1]
test_x, test_y = data_instances[training_size:, 0:2], data_instances[training_size:, -1]

## load model and train
model = get_model()
history = model.fit(train_x, train_y, batch_size=batch_size, epochs=epoch_no, validation_data=(test_x, test_y))

## save the model
model_json = model.to_json()
with open(modelStructureFileName, 'w') as json_file:
    json_file.write(model_json)
model.save_weights(modelWeightFileName)

## how to get prediction for an instance
#instance = np.array([0.3, 0.6])
#instance = instance.reshape(1,2)
#yhat = model.predict(instance)
#print(yhat)
I wish to load the resulting model into a flask app, be able to pass instances as json objects, and have predictions made and returned. Running
python flask-app.py
in the same directory as the model json and h5 files.
flask-app.py
import json
import numpy as np
from flask import Flask
from keras.models import model_from_yaml

app = Flask(__name__)
model = None
modelStructureFileName = 'simple-flask.json'
modelWeightFileName = 'simple-flask.h5'

def load_model():
    yaml_file = open(modelStructureFileName, 'r')
    loaded_model_yaml = yaml_file.read()
    yaml_file.close()
    global model
    model = model_from_yaml(loaded_model_yaml)
    model.load_weights(modelWeightFileName)

@app.route('/flask/<input>', methods=['GET'])
def predict(input):
    input_array = json.loads(input)
    instance = np.array(input_array)
    instance = instance.reshape(1, 2)
    yhat = model.predict(instance)
    return str(yhat)

if __name__ == '__main__':
    load_model()
    app.run(port=9000, debug=True)
If I navigate to http://localhost:9000/flask/[0.3,0.6] I get an error
builtins.ValueError
ValueError: Tensor Tensor("dense_3/Sigmoid:0", shape=(?, 1), dtype=float32) is not an element of this graph.
I think it's something to do with the scope of the model in the app, but can't figure it out. If I load the model in the request method it works once, but then fails with another error. I only want to load the model once. How can I get the flask app to work as expected?
EDIT: I ended up using bottle instead of flask and it worked no problem.
bottle-app.py
from bottle import route, run
import json
import numpy as np
from keras.models import model_from_yaml

modelStructureFileName = 'simple-flask.json'
modelWeightFileName = 'simple-flask.h5'

yaml_file = open(modelStructureFileName, 'r')
loaded_model_yaml = yaml_file.read()
yaml_file.close()
model = model_from_yaml(loaded_model_yaml)
model.load_weights(modelWeightFileName)
print('model loaded')

@route('/bottle/<input>')
def predict(input):
    input_array = json.loads(input)
    instance = np.array(input_array)
    instance = instance.reshape(1, 2)
    yhat = model.predict(instance)
    print(input_array, yhat)
    return str(yhat[0][0])

run(host='localhost', port=9000, debug=True)
This happens because Flask has multiple threads enabled by default, and TensorFlow models do not work well across multiple threads. You can read more about this in the links below:
https://github.com/keras-team/keras/issues/5640
https://github.com/tensorflow/tensorflow/issues/14356
The following workaround worked for me
import tensorflow as tf

global graph
graph = tf.get_default_graph()

with graph.as_default():
    model.compile()
    model.fit()

with graph.as_default():
    model.predict()
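Applied to the flask-app.py above, a minimal sketch of that workaround (assuming TensorFlow 1.x, where tf.get_default_graph() is available) captures the graph when the model is loaded and reuses it inside the request handler:

import json
import numpy as np
import tensorflow as tf
from flask import Flask
from keras.models import model_from_yaml

app = Flask(__name__)
model = None
graph = None
modelStructureFileName = 'simple-flask.json'
modelWeightFileName = 'simple-flask.h5'

def load_model():
    global model, graph
    with open(modelStructureFileName, 'r') as yaml_file:
        model = model_from_yaml(yaml_file.read())
    model.load_weights(modelWeightFileName)
    graph = tf.get_default_graph()  # remember the graph the model was built on

@app.route('/flask/<input>', methods=['GET'])
def predict(input):
    instance = np.array(json.loads(input)).reshape(1, 2)
    with graph.as_default():  # reuse that graph in the request thread
        yhat = model.predict(instance)
    return str(yhat)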
This answer is with respect to the Flask API.
The problem is that the Flask API works only once and then starts giving errors. So, in that case, you should call K.clear_session() at the end of the API handler, before the return statement.
And do not forget to add the line from keras import backend as K at the top.
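For illustration only, a rough sketch of where those two pieces would sit in the flask-app.py above (placement as described in this answer, not a tested implementation):

from keras import backend as K  # at the top of flask-app.py

@app.route('/flask/<input>', methods=['GET'])
def predict(input):
    instance = np.array(json.loads(input)).reshape(1, 2)
    yhat = model.predict(instance)
    K.clear_session()  # clear Keras/TensorFlow state before returning, as suggested above
    return str(yhat)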
