streaming kmeans prediction on Streaming Dataframe - python

I am trying to make k-means predictions on a streaming DataFrame read from Kafka, but it fails at several operations with errors saying that the function does not support streaming DataFrames, whereas the same prediction code works on a non-streaming DataFrame.
My code:
##===== reading from csv as STREAM
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.clustering import StreamingKMeansModel
from pyspark.mllib.linalg import DenseVector

inputDirectoryOfcsvFiles = "./providers"
fileSchema = (StructType()
              .add(StructField("UTILIZATION", IntegerType())))
inputDF = (spark
           .readStream
           .format("csv")
           .schema(fileSchema)
           .option("header", "true")
           .load(inputDirectoryOfcsvFiles))
records = inputDF.select("UTILIZATION")
print("====Dataset *read* from CSV!==")
def StreamingKmeansModel():
    initCenters = [[0], [999], [1500]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)
    print("streaming kmean model trained!")
    return stkm

stkm = StreamingKmeansModel()
assembler = VectorAssembler(inputCols=inputDF.columns, outputCol='features')
print(assembler)
dataset = assembler.transform(inputDF)
datasett = dataset.select('features')
data_list = dataset.collect()  # =====Point of error
data_list = [DenseVector(row.features) for row in data_list]  # =====Point of error
decayFactor = 0.0
data = spark1.sparkContext.parallelize(data_list)
stkm1 = stkm.update(data, 1, "points")
ClusterCenter = stkm1.centers
StreamingKmeanCost = stkm1.computeCost(data)
ccl = [ClusterCenter]
cclS = ssc.queueStream(ccl)
cclS.pprint()
print('==Final Centers==')
ssc.start()
ssc.stop(stopSparkContext=True, stopGraceFully=True)
Error:
raise converted from None
pyspark.sql.utils.AnalysisException: Queries with streaming sources must be executed with writeStream.start();
FileSource[./providers]
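The failure comes from calling collect() (an action) directly on a streaming DataFrame; Structured Streaming queries can only be materialized through writeStream. One way around it, sketched below under the assumption that the inputDF, the stkm model, and a SparkSession named spark from above are in scope, is to run the StreamingKMeansModel update inside foreachBatch, where each micro-batch arrives as a regular DataFrame that can be collected:
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import DenseVector

def update_model(batch_df, batch_id):
    # Each micro-batch is a static DataFrame, so collect() is allowed here
    assembler = VectorAssembler(inputCols=batch_df.columns, outputCol="features")
    rows = assembler.transform(batch_df).select("features").collect()
    if rows:
        points = spark.sparkContext.parallelize(
            [DenseVector(r.features.toArray()) for r in rows])
        stkm.update(points, decayFactor=1.0, timeUnit="points")
        print("current centers:", stkm.clusterCenters)

query = (inputDF.writeStream
         .foreachBatch(update_model)
         .start())
query.awaitTermination()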

Related

BioBERT Inferencing Using TF_Record file and GRPC

I have been trying to perform inference using the BioBERT model (https://github.com/dmis-lab/biobert) and TensorFlow Serving for a QA task.
I have successfully exported the model. My serving function looks like this:
feature_columns = [
    tf.feature_column.numeric_column("unique_ids", shape=(FLAGS.max_seq_length,), dtype=tf.int64),
    tf.feature_column.numeric_column("input_ids", shape=(FLAGS.max_seq_length,), dtype=tf.int64),
    tf.feature_column.numeric_column("input_mask", shape=(FLAGS.max_seq_length,), dtype=tf.int64),
    tf.feature_column.numeric_column("segment_ids", shape=(FLAGS.max_seq_length,), dtype=tf.int64)
]
serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
    tf.feature_column.make_parse_example_spec(feature_columns))
estimator._export_to_tpu = False
estimator_path = estimator.export_saved_model(estimator_base_path, serving_input_fn, checkpoint_path)
##############################################
I am also able to generate a TFRecord file, and I am trying to use tf_record_iterator to iterate over that TFRecord file and call the gRPC-generated stub.
# record_path is the path to the TFRecord file
The function is below:
all_results = []
record_iterator = tf.python_io.tf_record_iterator(pathToTfRecordFile)
for string_record in record_iterator:
    model_request.inputs['examples'].CopyFrom(
        tf.contrib.util.make_tensor_proto(string_record,
                                          dtype=tf.string,
                                          shape=[batch_size]))
    result_future = stub.Predict.future(model_request, 30.0)
    result = result_future.result().outputs
    all_results.append(process_result(result))
The Error I am getting is as follows:
_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.INVALID_ARGUMENT
details = "Name: <unknown>, Key: unique_ids, Index: 0. Number of int64 values != expected. Values size: 1 but output shape: [384]
Any help on this issue is appreciated.
Try shaping unique_ids like this in the serving_input_fn:
tf.feature_column.numeric_column("unique_ids", shape=(1,), dtype=tf.int64),

get a result from flask

I have trained a deep learning model (an LSTM) with Keras, saved it as an .h5 file, and now I want to "hit" a web service in order to get back a category. This is the first time I have tried to do this, so I am a little confused and cannot figure out how to get the categories back. Also, when I send a request to http://localhost:8000/predict I get the following error:
The server encountered an internal error and was unable to complete your
request. Either the server is overloaded or there is an error in the
application.
and in the notebook
ValueError: Tensor Tensor("dense_3/Softmax:0", shape=(?, 6), dtype=float32)
is not an element of this graph.
I tried the solution from enter link description here but it is not working.
The code so far is below:
from flask import Flask, request, jsonify  # jsonify will return the data
import os
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import load_model

app = Flask(__name__)
model = load_model('lstm-final-five-Copy1.h5')

@app.route('/predict', methods=["GET", "POST"])
def predict():
    df_final = pd.read_csv('flask.csv')
    activities = df_final['activity'].value_counts().index
    label = LabelEncoder()
    df_final['label'] = label.fit_transform(df_final['activity'])
    X = df_final[['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz']]
    y = df_final['label']
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    df_final = pd.DataFrame(X, columns=['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz'])
    df_final['label'] = y.values
    Fs = 50
    frame_size = Fs * 2  # 100 samples per frame
    hop_size = frame_size  # no overlap between frames

    def get_frames(df_final, frame_size, hop_size):
        N_FEATURES = 6  # accx, accy, accz, gyrx, gyry, gyrz
        frames = []
        labels = []
        for i in range(0, len(df_final) - frame_size, hop_size):
            accx = df_final['accx'].values[i: i + frame_size]
            accy = df_final['accy'].values[i: i + frame_size]
            accz = df_final['accz'].values[i: i + frame_size]
            gyrx = df_final['gyrx'].values[i: i + frame_size]
            gyry = df_final['gyry'].values[i: i + frame_size]
            gyrz = df_final['gyrz'].values[i: i + frame_size]
            # Retrieve the most frequent label in this segment
            label = stats.mode(df_final['label'][i: i + frame_size])[0][0]
            frames.append([accx, accy, accz, gyrx, gyry, gyrz])
            labels.append(label)
        # Bring the segments into a better shape
        frames = np.asarray(frames).reshape(-1, frame_size, N_FEATURES)
        labels = np.asarray(labels)
        return frames, labels

    X, y = get_frames(df_final, frame_size, hop_size)
    pred = model.predict_classes(X)
    return jsonify({"Prediction": pred}), 201

if __name__ == '__main__':
    app.run(host="localhost", port=8000, debug=False)
It seems that in your '/predict' POST endpoint you aren't returning any values, which is why you aren't getting back a category as you expect.
If you wanted to add a GET method, you could add something like what is shown below:
@app.route('/', methods=['GET'])
def check_server_status():
    return ("Server Running!")
And in your case, the POST method could return the prediction from the endpoint:
@app.route('/predict', methods=['POST'])
def predict():
    # Add in other steps here
    pred = model.predict_classes(X)
    return jsonify({"Prediction": pred}), 201
As far as I can see, you also need to install pandas (if you haven't already) with pip install pandas and import it as import pandas as pd.
You can also add the "GET" method to your /predict endpoint like:
@app.route("/predict", methods=["GET", "POST"])

Key Error while preprocessing of data (onehot encoding)

I am getting a KeyError while converting the variables using one-hot encoding. This is the code that I used:
def preprocessor(df):
    res_df = df.copy()
    le = preprocessing.LabelEncoder()
    res_df['"job"'] = le.fit_transform(res_df['"job"'])
    res_df['"marital"'] = le.fit_transform(res_df['"marital"'])
    res_df['"education"'] = le.fit_transform(res_df['"education"'])
    res_df['"default"'] = le.fit_transform(res_df['"default"'])
    res_df['"housing"'] = le.fit_transform(res_df['"housing"'])
    res_df['"month"'] = le.fit_transform(res_df['"month"'])
    res_df['"loan"'] = le.fit_transform(res_df['"loan"'])
    res_df['"contact"'] = le.fit_transform(res_df['"contact"'])
    res_df['"day_of_week"'] = le.fit_transform(res_df['"day"'])
    res_df['"poutcome"'] = le.fit_transform(res_df['"poutcome"'])
    res_df['"y"'] = le.fit_transform(res_df['"y"'])
    return res_df
While executing the following, I am getting a KeyError:
encoded_df = preprocessor(df1)
x = encoded_df.drop(['"y"'],axis =1).values
y = encoded_df['"y"'].values
While executing the function I am getting a KeyError, although I have split the columns using sep=';'. Can anyone please help?
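A guess, not from the original post: a KeyError here usually means the column labels in df1 do not literally contain the embedded quotes used in the lookups (for example the column is named job rather than "job"). A quick way to check, and a sketch of stripping stray quotes from the headers so plain names can be used (the file name bank.csv is hypothetical):
import pandas as pd

df1 = pd.read_csv("bank.csv", sep=";")   # hypothetical file name
print(df1.columns.tolist())              # inspect the actual column labels

# Strip any literal quote characters from the headers, then index with plain names
df1.columns = [c.strip().strip('"') for c in df1.columns]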

Updating a Dataframe row by a dictionary

I've already seen the Replace a row by a new DataFrame solution, but it's pretty unclear to me.
I have a DataFrame of results from some models, with a MultiIndex on the model name and the mode (train/test), that I want to update with the last execution while keeping the other models' results (create the DataFrame if it doesn't exist, or update the row that has the same name and mode with the dic variable). Here is my code:
def save_results(dic, path="../ModelsResults"):
    try:
        df_results = pd.read_pickle(path)
        print("Updating ModelResults...")
        df_now = pd.DataFrame.from_dict([dic])
        if df_results.index.isin([(dic["Model"], dic["Mode"])]).any():
            print("\tUpdating Model/Mode...")
            df_now.drop(["Model", "Mode"], axis=1)
            df_results.at[dic["Model"], dic["Mode"]] = df_now
        else:
            print("\tCreating Model/Mode...")
            df_results = df_results.append(df_now)
    except FileNotFoundError:
        print("Creating ModelResults...")
        df_results = pd.DataFrame.from_dict([dic])
        df_results = df_results.set_index(["Model", "Mode"])
    df_results.to_pickle(path)
    print("Done")
    return df_results
Every metric that I want to save is in the dic variable. For example:
dic = {
    "Model": "Dummy-PredictingAlwaysZero",
    "Mode": "Train",
    "MSE": mse,
    "nRMSE": nrmse,
    "nDCG#10": ndcg(train["rel"].values, y_pred, k=10),
    "nDCG#50": ndcg(train["rel"].values, y_pred, k=50)
}
df_results = save_results(dic, path="./ModelsResults")
And the expected DataFrame is like:
                                        MSE   nDCG#10   nDCG#50     nRMSE
Model                       Mode
Dummy-PredictingAlwaysZero  Train   0.08639  0.162948  0.106816  0.293922
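One way to do the "update or insert" step on the MultiIndexed frame, sketched under the assumption that dic always carries the Model and Mode keys alongside the metrics, is to build a one-row frame and drop any existing row with the same key before concatenating:
import pandas as pd

def upsert_result(df_results, dic):
    # Build a one-row frame indexed by (Model, Mode)
    new_row = pd.DataFrame([dic]).set_index(["Model", "Mode"])
    # Drop an existing row with the same key (ignored if absent), then append the new one
    df_results = df_results.drop(index=new_row.index, errors="ignore")
    return pd.concat([df_results, new_row])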

How to read from local directory, kmeans streaming pyspark

I need help with reading from a local directory when running streaming k-means with PySpark. There is no good answer on this topic on Stack Overflow.
Here is my code:
if __name__ == "__main__":
    ssc = StreamingContext(sc, 1)
    training_data_raw, training_data_df = prepare_data(TRAINING_DATA_SET)
    trainingData = parse2(training_data_raw)
    testing_data_raw, testing_data_df = prepare_data(TEST_DATA_SET)
    testingData = testing_data_raw.map(parse1)
    # print(testingData)
    trainingQueue = [trainingData]
    testingQueue = [testingData]
    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.queueStream(testingQueue)
    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)
    result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()
    ssc.textFileStream('file:///Users/userrname/PycharmProjects/MLtest/training/data/')
    ssc.start()
    ssc.awaitTermination()
Thanks!!
from pyspark.mllib.linalg import Vectors
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
For the test examples:
from pyspark.mllib.regression import LabeledPoint

def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)

testData = ssc.textFileStream("/testing/data/dir").map(parse)
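Putting it together with the setup in the question, the file-based streams can then replace the queueStream inputs; a minimal sketch, assuming the same StreamingContext (ssc) and the trainingData/testData streams defined above:
from pyspark.mllib.clustering import StreamingKMeans

model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

# Train on vectors arriving in the training directory and print predictions
# for labeled points arriving in the testing directory.
model.trainOn(trainingData)
result = model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
ssc.awaitTermination()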
