I need to run a Python script from a .NET Core Web API repository to get the cluster of the current user and recommend a list of courses based on their ratings. (I'm implementing a collaborative filtering recommender system that uses sklearn's NearestNeighbors to help with user clustering.)
List Course Repository
public async Task<IEnumerable<Course>> GetList(int? userId, string name, ContextSession session, bool includeDeleted = false)
{
    var entity = GetEntities(session, includeDeleted).AsQueryable();
    var courses = await entity.Where(obj => obj.Id > 0).ToListAsync();
    SaveToCsv<Course>(courses, Environment.CurrentDirectory + "/csv/course.csv");

    var ratings = _dbContext.Set<CourseRating>().AsQueryable();
    if (!includeDeleted)
    {
        ratings = ratings.Where(obj => !obj.IsDeleted);
    }
    var i = await ratings.Where(obj => obj.Id > 0).ToListAsync();
    SaveToCsv<CourseRating>(i, Environment.CurrentDirectory + "/csv/ratings_Final.csv");

    ScriptRuntimeSetup setup = Python.CreateRuntimeSetup(null);
    ScriptRuntime runtime = new ScriptRuntime(setup);
    ScriptEngine engine = Python.GetEngine(runtime);
    ScriptSource source = engine.CreateScriptSourceFromFile(Environment.CurrentDirectory + "/CB_RS.py");
    ScriptScope scope = engine.CreateScope();

    List<String> argv = new List<String>();
    argv.Add(session.UserId.ToString());
    argv.Add(Environment.CurrentDirectory + "/csv/course.csv");
    argv.Add(Environment.CurrentDirectory + "/csv/ratings_Final.csv");
    engine.GetSysModule().SetVariable("argv", argv);

    try
    {
        source.Execute(scope);
    }
    catch (Exception ex)
    {
        var error = ex;
    }

    return await entity.Where(obj => obj.Id > 0).ToListAsync();
}
The Python Script
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import operator

def get_rating_course_user(courses_path, ratings_path):
    course_df = pd.read_csv(courses_path)
    rating_df = pd.read_csv(ratings_path)
    # drop useless columns
    # course_df.drop(columns_list, axis = 1)
    # rating_df.drop([columns_list], axis = 1)
    # merge the 2 datasets on courseId
    df = rating_df.merge(course_df, on = 'courseId')
    # Get the ratings per course per user
    pvt = df.pivot_table(index='userId', columns='name', values='rating')
    # convert all NaN to -1.
    pvt.fillna(-1, inplace=True)
    # convert the pivot table to a sparse matrix
    spm = csr_matrix(pvt)
    return spm, pvt

# returns a list of recommended courses according to similar users' ratings
def make_prediction(userId, courses_path, ratings_path, n_neighbors):
    recommended = []
    spm, pvt = get_rating_course_user(courses_path, ratings_path)
    # Creating, training the model
    nn = NearestNeighbors(algorithm = 'brute')
    nn.fit(spm)
    # testing the model
    test = pvt.iloc[userId, :].values.reshape(1, len(pvt.columns))
    distance, suggested = nn.kneighbors(test, n_neighbors=n_neighbors)
    for i in range(len(suggested[0])):
        x = pvt.iloc[suggested[0][i], :].values.reshape(1, len(pvt.columns))
        result = np.where(x >= 3)
        for j in range(len(result[1])):
            course = pvt.iloc[:, result[1][j]].name
            if course not in recommended:
                recommended.append(course)
    return recommended

# return a dictionary {id_course: average_rating} of the k courses having the best average_rating
def recommend_K_best_courses(recommendedList, courses_path, k):
    courses = pd.read_csv(courses_path)
    courseIds = {}
    for i in (recommendedList):
        ID = (courses.loc[course_df.name==i].courseId).to_string(index=False)
        rate = (courses.loc[course_df.name==i].average).to_string(index=False)
        courseIds[int(ID)] = float(rate)
    sort = dict(sorted(courseIds.items(), key=operator.itemgetter(1), reverse=True)[:k])
    return sort

def main(userId, courses_path, ratings_path, n_neighbors=7, k=10){
    recommended_list = make_prediction(userId, courses_path, ratings_path, n_neighbors)
    return recommend_K_best_courses(recommended_list, courses_path, k)
}

if __name__ == "__main__":
    main(userId, courses_path, ratings_path, n_neighbors=7, k=10)
I got the following error:
unexpected token '{'
My current problem is running the Python script and then using the result returned by its main method.
I had made a C#-style mistake: I opened the main method with '{' instead of ':'
def main(userId, courses_path, ratings_path, n_neighbors=7, k=10):
    recommended_list = make_prediction(userId, courses_path, ratings_path, n_neighbors)
    return recommend_K_best_courses(recommended_list, courses_path, k)
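To get the value returned by main back into the host, one option (a minimal sketch, assuming the C# side keeps injecting the arguments with engine.GetSysModule().SetVariable("argv", argv) as shown above) is to read them from sys.argv and bind the result to a module-level variable, which the C# code can then read after source.Execute(scope) with scope.GetVariable("result"); the name result here is just a placeholder:

import sys

# argv is injected by the host: [userId, courses_path, ratings_path]
user_id = int(sys.argv[0])
courses_path = sys.argv[1]
ratings_path = sys.argv[2]

# Bind the dictionary returned by main() to a module-level name so the host
# can read it back from the ScriptScope after executing the script.
result = main(user_id, courses_path, ratings_path, n_neighbors=7, k=10)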
I have trained a BERT model using ktrain (a TensorFlow wrapper) to recognize emotion in text. It works, but it suffers from really slow inference, which makes the model unsuitable for a production environment. I have done some research, and it seems pruning could help.
TensorFlow provides some options for pruning, e.g., tf.contrib.model_pruning. The problem is that it is not a widely used technique. What would be a simple enough example that could help me understand how to use it?
I provide my working code below for reference.
import pandas as pd
import numpy as np
import preprocessor as p
import emoji
import re
import ktrain
from ktrain import text
from unidecode import unidecode
import nltk

# Text preprocessing class
class TextPreprocessing:
    def __init__(self):
        p.set_options(p.OPT.MENTION, p.OPT.URL)

    def _punctuation(self, val):
        val = re.sub(r'[^\w\s]', ' ', val)
        val = re.sub('_', ' ', val)
        return val

    def _whitespace(self, val):
        return " ".join(val.split())

    def _removenumbers(self, val):
        val = re.sub('[0-9] + ', '', val)
        return val

    def _remove_unicode(self, text):
        text = unidecode(text).encode("ascii")
        text = str(text, "ascii")
        return text

    def _split_to_sentences(self, body_text):
        sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
        return sentences

    def _clean_text(self, val):
        val = val.lower()
        val = self._removenumbers(val)
        val = p.clean(val)
        val = ' '.join(self._punctuation(emoji.demojize(val)).split())
        val = self._remove_unicode(val)
        val = self._whitespace(val)
        return val

    def text_preprocessor(self, body_text):
        body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])
        sentence_split_df = body_text_df.copy()
        sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
            self._split_to_sentences)
        lst_col = "body_text"
        sentence_split_df = pd.DataFrame(
            {
                col: np.repeat(
                    sentence_split_df[col].values,
                    sentence_split_df[lst_col].str.len()
                )
                for col in sentence_split_df.columns.drop(lst_col)
            }
        ).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
            sentence_split_df.columns
        ]
        body_text_df["body_text"] = body_text_df["body_text"].apply(self._clean_text)
        final_df = (
            pd.concat([sentence_split_df, body_text_df])
            .reset_index()
            .drop(columns=["index"])
        )
        return final_df["body_text"]

# Instantiate data preprocessing object
text1 = TextPreprocessing()

# Import data
data_train = pd.read_csv('data_train_v5.csv', encoding='utf8', engine='python')
data_test = pd.read_csv('data_test_v5.csv', encoding='utf8', engine='python')

# Clean the data
data_train['Text'] = data_train['Text'].apply(text1._clean_text)
data_test['Text'] = data_test['Text'].apply(text1._clean_text)

X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()
y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()

data = data_train.append(data_test, ignore_index=True)
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']

encoding = {
    'joy': 0,
    'sadness': 1,
    'fear': 2,
    'anger': 3,
    'neutral': 4
}

# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

trn, val, preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
                                          x_test=X_test, y_test=y_test,
                                          class_names=class_names,
                                          preprocess_mode='distilbert',
                                          maxlen=350)

model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
predictor = ktrain.get_predictor(learner.model, preproc)

# Save the model on a file for later use
predictor.save("models/bert_model")

message = "This is a happy message"

# Cleaning - takes 5 ms to run
clean = text1._clean_text(message)

# Prediction - takes 325 ms to run
predictor.predict_proba(clean)
The distilbert model in ktrain is created using Hugging Face transformers, which means you can use that library to prune the model. See this link for more information and the example script. You may need to convert the model to PyTorch before using the script (in addition to making some modifications to the script itself). The approach is based on the paper Are Sixteen Heads Really Better Than One?.
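For a rough idea of what that looks like on the transformers side, here is a minimal sketch of head pruning with the PyTorch API (the checkpoint name, number of labels, and choice of heads are placeholder assumptions, not values from the question; which heads can be removed safely has to be measured, e.g. with the head-importance analysis from the paper):

# Hypothetical sketch: prune attention heads from a DistilBERT classifier
# with the Hugging Face transformers PyTorch API. All names are placeholders.
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5
)

# Drop heads 0 and 1 in layer 0 and head 2 in layer 3.
model.prune_heads({0: [0, 1], 3: [2]})

model.save_pretrained("models/pruned_distilbert")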
I am attempting to look at conglomerated outlier information, using several different scikit-learn, HDBSCAN, and custom outlier detection classes. However, for some reason I consistently run into an error with any class that uses HDBSCAN, while all the other scikit-learn and custom classes work fine. The issue occurs consistently on the second pass over the HDBSCAN-based class and happens immediately upon algorithm.fit(tmp). When debugging the script, it looks like the error is thrown before execution even reaches the first line of the class.
Any help? Below is a minimal reproduction:
import numpy as np
import pandas as pd
import hdbscan
from sklearn.datasets import make_blobs
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

class DBClass():
    def __init__(self, random = None):
        self.random = random

    def fit(self, data):
        self.train_data = data
        cluster = hdbscan.HDBSCAN()
        cluster.fit(self.train_data)
        self.fit = cluster

    def predict(self, data):
        self.predict_data = data
        if self.train_data.equals(self.predict_data):
            return self.fit.probabilities_

def OutlierEnsemble(df, anomaly_algorithms = None, num_slices = 5, num_columns = 7, outliers_fraction = 0.05):
    if isinstance(df, np.ndarray):
        df = pd.DataFrame(df)
    assert isinstance(df, pd.DataFrame)

    if not anomaly_algorithms:
        anomaly_algorithms = [
            ("Robust covariance",
             EllipticEnvelope(contamination=outliers_fraction)),
            ("One-Class SVM",
             OneClassSVM(nu=outliers_fraction,
                         kernel="rbf")),
            ("Isolation Forest",
             IsolationForest(contamination=outliers_fraction)),
            ("HDBScan LOF",
             DBClass()),
        ]

    data = []
    for i in range(1, num_slices + 1):
        data.append(df.sample(n = num_columns, axis = 1, replace = False))

    predictions = []
    names = []
    for tmp in data:
        counter = 0
        for name, algorithm in anomaly_algorithms:
            algorithm.fit(tmp)
            predictions.append(algorithm.predict(tmp))
            counter += 1
            names.append(f"{name}{counter}")
    return predictions

blobs, labels = make_blobs(n_samples=3000, n_features=12)
OutlierEnsemble(blobs)
The error provided is not the most helpful.
Traceback (most recent call last):
File "<ipython-input-4-e1d4b63cfccd>", line 75, in <module>
OutlierEnsemble(blobs)
File "<ipython-input-4-e1d4b63cfccd>", line 66, in OutlierEnsemble
algorithm.fit(tmp)
TypeError: 'HDBSCAN' object is not callable
Inside DBClass.fit, the assignment self.fit = cluster unintentionally replaces the fit method with the fitted HDBSCAN object, so on the next pass algorithm.fit(tmp) tries to call that object and raises TypeError: 'HDBSCAN' object is not callable.
You could perhaps use something like,
class DBClass():
    def __init__(self, random = None):
        self.random = random

    def fit(self, data):
        self.train_data = data
        cluster = hdbscan.HDBSCAN()
        cluster.fit(self.train_data)
        self.myfit = cluster  # save calculated cluster

    def predict(self, data):
        self.predict_data = data
        if self.train_data.equals(self.predict_data):
            return self.myfit.probabilities_  # use calculated cluster
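As a quick sanity check (hypothetical usage, reusing the names from the reproduction above), the fixed class can now be fitted more than once:

# With the method no longer shadowed, a second fit() call works and
# predict() returns HDBSCAN's membership probabilities.
db = DBClass()
frame = pd.DataFrame(blobs)
db.fit(frame)
db.fit(frame)
print(db.predict(frame)[:5])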
I want to solve a multi-objective optimization problem using DEAP, a Python-based framework. The evaluation is time-consuming, so I need to use all of my CPU power to compute. I therefore used the multiprocessing library as suggested in the DEAP documentation and this example, but it results in PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed.
My full code is too long to post here, but the following code is similar to mine and produces the same error. Can you please tell me where I am making a mistake?
Thanks in advance
import multiprocessing
import numpy  # used by the statistics registered in main()
from deap import creator, base, tools, algorithms
import random
import matplotlib.pyplot as plt

def TEST(dec_var):
    return dec_var[0]**2 + dec_var[1]**2, (dec_var[0]-2)**2 + dec_var[1]**2

def feasible(dec_var):
    if all(i > 0 for i in dec_var):
        return True
    return False

creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0))
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("uniform", random.uniform, 0.0, 7.0)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.uniform, n=2)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1, indpb=0.1)
toolbox.register("select", tools.selNSGA2)
toolbox.register("evaluate", TEST)
toolbox.decorate("evaluate", tools.DeltaPenalty(feasible, (1000, 1000)))

def main(seed=None):
    random.seed(seed)

    NGEN = 250
    MU = 100
    CXPB = 0.9

    stats_func1 = tools.Statistics(key=lambda ind: ind.fitness.values[0])
    stats_func2 = tools.Statistics(key=lambda ind: ind.fitness.values[1])
    stats = tools.MultiStatistics(func1=stats_func1, func2=stats_func2)
    stats.register("avg", numpy.mean, axis=0)
    stats.register("std", numpy.std, axis=0)
    stats.register("min", numpy.min, axis=0)
    stats.register("max", numpy.max, axis=0)

    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "func1", "func2"
    logbook.chapters["func1"].header = "min", "max"
    logbook.chapters["func2"].header = "min", "max"

    pop = toolbox.population(n=MU)

    invalid_ind = [ind for ind in pop if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    pop = toolbox.select(pop, len(pop))

    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(invalid_ind), **record)
    print(logbook.stream)

    for gen in range(1, NGEN):
        offspring = tools.selTournamentDCD(pop, len(pop))
        offspring = [toolbox.clone(ind) for ind in offspring]

        for ind1, ind2 in zip(offspring[::2], offspring[1::2]):
            if random.random() <= CXPB:
                toolbox.mate(ind1, ind2)
            toolbox.mutate(ind1)
            toolbox.mutate(ind2)
            del ind1.fitness.values, ind2.fitness.values

        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        pop = toolbox.select(pop + offspring, MU)
        record = stats.compile(pop)
        logbook.record(gen=gen, evals=len(invalid_ind), **record)
        print(logbook.stream)

    return pop, logbook

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=4)
    toolbox.register("map", pool.map)
    pop, stats = main()
    pool.close()
    print pop
This is noted on the documentation page you linked:
The pickling of lambda function is not yet available in Python.
You are using lambda functions in stats_func1 and stats_func2 – just move them to global functions and try again:
def stats_key_1(ind):
    return ind.fitness.values[0]

def stats_key_2(ind):
    return ind.fitness.values[1]

# ... snip ...

def main(seed=None):
    # ... snip ...
    stats_func1 = tools.Statistics(key=stats_key_1)
    stats_func2 = tools.Statistics(key=stats_key_2)
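A quick way to see the difference (hypothetical check): a module-level function can be pickled, which is what multiprocessing needs, while the original lambda cannot:

import pickle

pickle.dumps(stats_key_1)                          # fine: picklable module-level function
# pickle.dumps(lambda ind: ind.fitness.values[0])  # raises PicklingError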
I need help with reading from a local directory when running streaming k-means with pyspark. There is no good answer on this topic on Stack Overflow.
Here is my code
if __name__ == "__main__":
    ssc = StreamingContext(sc, 1)

    training_data_raw, training_data_df = prepare_data(TRAINING_DATA_SET)
    trainingData = parse2(training_data_raw)

    testing_data_raw, testing_data_df = prepare_data(TEST_DATA_SET)
    testingData = testing_data_raw.map(parse1)
    #print(testingData)

    trainingQueue = [trainingData]
    testingQueue = [testingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.queueStream(testingQueue)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)
    result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.textFileStream('file:///Users/userrname/PycharmProjects/MLtest/training/data/')

    ssc.start()
    ssc.awaitTermination()
Thanks!!
from pyspark.mllib.linalg import Vectors
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
For the test examples:
from pyspark.mllib.regression import LabeledPoint
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)
testData = ssc.textFileStream("/testing/data/dir").map(parse)
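To tie this back to the streaming job in the question (a hypothetical sketch; the directory paths above are placeholders), the file-based streams simply take the place of the queue streams:

# Hypothetical wiring of the file-based streams into StreamingKMeans.
model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)

result = model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
ssc.awaitTermination()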
I'm following Chapter 4 of "Advanced Analytics with Spark" from O'Reilly. The book's code is in Scala, and I'm having trouble converting it to Python.
Scala Code
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._

val rawData = sc.textFile("hdfs:///user/ds/covtype.data")

val data = rawData.map { line =>
  val values = line.split(',').map(_.toDouble)
  val featureVector = Vectors.dense(values.init)
  val label = values.last - 1
  LabeledPoint(label, featureVector)
}

val Array(trainData, cvData, testData) =
  data.randomSplit(Array(0.8, 0.1, 0.1))
trainData.cache()
cvData.cache()
testData.cache()

import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._
import org.apache.spark.rdd._

def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
  val predictionsAndLabels = data.map(example =>
    (model.predict(example.features), example.label)
  )
  new MulticlassMetrics(predictionsAndLabels)
}

val model = DecisionTree.trainClassifier(
  trainData, 7, Map[Int,Int](), "gini", 4, 100)

val metrics = getMetrics(model, cvData)

metrics.confusionMatrix
My Python Code
from pyspark.sql.functions import col, split
import pyspark.mllib.linalg as linal
import pyspark.mllib.regression as regre
import pyspark.mllib.evaluation as eva
import pyspark.mllib.tree as tree
import pyspark.rdd as rd

raw_data = sc.textFile("covtype.data")

def fstDecisionTree(line):
    values = list(map(float, line.split(",")))
    featureVector = linal.Vectors.dense(values[:-1])
    label = values[-1] - 1
    ret = regre.LabeledPoint(label, featureVector)
    return regre.LabeledPoint(label, featureVector)

data = raw_data.map(fstDecisionTree)

trainData, cvData, testData = data.randomSplit([0.8, 0.1, 0.1])
trainData.cache()
cvData.cache()
testData.cache()

def help_lam(model):
    def _help_lam(dataline):
        print(dataline)
        a = dataline.collect()
        return (model.predict(a[1]), a[0])
    return _help_lam

def getMetrics(model, data):
    print(type(model), type(data))
    predictionsAndLabels = data.map(help_lam(model))
    return eva.MulticlassMetrics(predictionsAndLabels)

n_targets = 7
max_depth = 4
max_bin_count = 100
model = tree.DecisionTree.trainClassifier(trainData, n_targets, {}, "gini", max_depth, max_bin_count)

metrics = getMetrics(model, cvData)
When I run this, I get the following error in def _help_lam(dataline) inside def help_lam(model), where I try to pass the map iteration in implicitly:
AttributeError: 'Py4JError' object has no attribute 'message'
I think the problem is in the model.predict function.
From pyspark mllib/tree.py
Note: In Python, predict cannot currently be used within an RDD
transformation or action.
Call predict directly on the RDD instead.
What you can do is pass the feature vector directly like so
>>> rdd = sc.parallelize([[1.0], [0.0]])
>>> model.predict(rdd).collect()
[1.0, 0.0]
Edit:
An update to your getMetrics could be:
def getMetrics(model, data):
    labels = data.map(lambda d: d.label)
    features = data.map(lambda d: d.features)
    predictions = model.predict(features)
    predictionsAndLabels = predictions.zip(labels)
    return eva.MulticlassMetrics(predictionsAndLabels)
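With that in place, the usage mirrors the last lines of the Scala version (a small hypothetical follow-up; printing the matrix is just for inspection):

metrics = getMetrics(model, cvData)
print(metrics.confusionMatrix().toArray())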