PySpark & MLLib: random forest prediction result always 0 - python

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

df = pd.read_csv(r'main.csv', header=0)
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("myapp") \
    .getOrCreate()
s_df = spark.createDataFrame(df)
transformed_df = s_df.rdd.map(lambda row: LabeledPoint(row[0], Vectors.dense(row[1:])))
splits = [0.7, 0.3]
training_data, test_data = transformed_df.randomSplit(splits, 100)
model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
predictions = model.predict(test_data.map(lambda x: x.features))
When I print test_data.map(lambda x: x.features), the result is
[DenseVector([1431500000.0, 9.3347, 79.8337, 44.6364, 194.0, 853.0, 196.9998]),
DenseVector([1431553600.0, 9.5484, 80.7409, 39.5968, 78.0, 923.0, 196.9994]), ...]
The numbers inside each DenseVector([...]) are the correct feature values for prediction,
but the prediction comes back as 0 for every row:
[0.0, 0.0, 0.0, 0.0, 0.0, ...]
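A quick way to check whether the model really predicts 0 for every row is to pair each test label with its prediction and compute the test error. This is a diagnostic sketch based on the standard MLlib random-forest example, assuming the code above has already run:

labels_and_predictions = test_data.map(lambda lp: lp.label).zip(predictions)
test_error = labels_and_predictions.filter(lambda lp: lp[0] != lp[1]).count() / float(test_data.count())
print(labels_and_predictions.take(10))
print('Test error = ' + str(test_error))
print(model.toDebugString())  # inspect the learned trees and their split thresholds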


GridSearchCV with LightFM

I'm a newbie, so my apologies if what I'm asking is obvious, and my English is not great. I'm stuck trying to do a custom grid search with cross-validation for LightFM, which does not ship with those functions. It seems the way I split the dataset is wrong, but I do not understand why, since I replicated the code of the random_train_test_split function to build the folds. The error I get is: Incorrect number of features in item_features.
I'm stuck and I do not know how to proceed.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse
def create_processed_dataset():
    """
    One-time execution.
    Returns:
        embeddings.csv and observations.csv
    """
    output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
    os.makedirs(output_path, exist_ok=True)
    # Data imports
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
    vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
    # Load mappings and keep only rows for which a corresponding embedding is found
    mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None,
                           names=["movie_id", "movie_name", "movie_uri"])
    mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
    mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
    # Create a pandas dataframe with embeddings
    embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
    embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
    embeddings.set_index("movie_id", inplace=True)
    ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
    embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
    ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)
def generate_list_of_hyper_parameters(parameters_grid):
    return (
        {y: z for y, z in zip(parameters_grid.keys(), x)}
        for x in itertools.product(*parameters_grid.values())
    )
def create_csr_from_dataset(observations, embeddings):
    dataset = Dataset(item_identity_features=True, user_identity_features=False)
    feature_names = [str(i) for i in range(0, 200)]
    dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    num_items, num_fts = dataset.item_features_shape()
    print(f'Num items: {num_items}, num_features: {num_fts}.')
    interactions, weights = dataset.build_interactions(
        observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
    )
    item_features = []
    for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
        for x, y in row.items():
            item_features.append((item_id, {x: y}))
    item_features = dataset.build_item_features(item_features)
    return interactions, item_features
def folding(interactions, k_folds=10):
    if not scipy.sparse.issparse(interactions):
        return None
    coo = interactions.tocoo()
    kf = KFold(n_splits=k_folds)  # Define the split into k_folds folds
    shape = interactions.shape
    uids, iids, data = (coo.row, coo.col, coo.data)

    def to_coo_matrix(indexes):
        return scipy.sparse.coo_matrix(
            (data[indexes], (uids[indexes], iids[indexes])),
            shape=shape,
            dtype=coo.dtype,
        )

    return [
        (to_coo_matrix(train_index), to_coo_matrix(validation_index))
        for train_index, validation_index in kf.split(data)
    ]
def grid_search(parameters_grid, k_fold, interactions, item_features=None):
    results = []
    for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
        for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
            print(f"{hyper_params} && current_fold:{current_fold}")
            model = LightFM(**hyper_params)
            model.fit(train, epochs=50, item_features=item_features, num_threads=6)
            score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
            results.append((score, hyper_params, model))
            print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
    results.sort(key=lambda x: x[0])
    return results
def main():
    observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
    embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
    interactions, item_features = create_csr_from_dataset(observations, embeddings)
    train, test = random_train_test_split(interactions, test_percentage=0.2)
    print(embeddings.head())
    num_movies = len(embeddings.index)
    num_ratings = len(observations.index)
    num_users = observations.user_id.unique().size
    sparsity = 1 - num_ratings / (num_users * num_movies)
    print(
        f"num_users: {num_users}, num_movies: {num_movies}, "
        f"num_observations: {num_ratings}, "
        f"sparsity: ~{sparsity * 100}"
    )
    model = LightFM()
    # Parameters to test
    param_grid = {
        'no_components': range(10, 110, 10),
        'learning_rate': [0.01, 0.05, 0.1],
        'item_alpha': [0.0001, 0.001, 0.01],
        'user_alpha': [0.0001, 0.001, 0.01],
    }
    results = grid_search(param_grid, 10, train, item_features=item_features)
    print(results[0][0])
    # grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
    # grid.fit(train)
    #
    # # print the best parameters
    # print("Best parameters found: ", grid.best_params_)

if __name__ == "__main__":
    main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806
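One thing that may be worth checking (a diagnostic sketch, not a confirmed fix): when auc_score is called without item_features, LightFM falls back to an identity feature matrix, so a model that was fitted with an explicit item_features matrix will see a feature matrix of a different width at evaluation time, which is a common trigger for the "Incorrect number of features in item_features" error. A minimal sketch, reusing the train/validation folds and the item_features matrix built above:

model = LightFM(no_components=10)
model.fit(train, item_features=item_features, epochs=10)
score = auc_score(
    model,
    validation,                    # one of the folds produced by folding()
    train_interactions=train,
    item_features=item_features,   # the same matrix that was passed to fit()
).mean()
print(score)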

streaming kmeans prediction on Streaming Dataframe

I am trying to make k-means predictions on a streaming DataFrame read from Kafka, but it fails at various functions with errors saying that a streaming DataFrame is not compatible with that function, whereas the same prediction code works on a non-streaming DataFrame.
My code:
##===== reading from csv as STREAM
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.clustering import StreamingKMeansModel

inputDirectoryOfcsvFiles = "./providers"
fileSchema = (StructType()
              .add(StructField("UTILIZATION", IntegerType())))
inputDF = (spark
           .readStream
           .format("csv")
           .schema(fileSchema)
           .option("header", "true")
           .load(inputDirectoryOfcsvFiles))
records = inputDF.select("UTILIZATION")
print("====Dataset *read* from CSV!==")

def StreamingKmeansModel():
    initCenters = [[0], [999], [1500]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)
    print("streaming kmean model trained!")
    return stkm

stkm = StreamingKmeansModel()
assembler = VectorAssembler(inputCols=inputDF.columns, outputCol='features')
print(assembler)
dataset = assembler.transform(inputDF)
datasett = dataset.select('features')
data_list = dataset.collect()  # =====Point of error
data_list = [DenseVector(row.features) for row in data_list]  # =====Point of error
decayFactor = 0.0
data = spark1.sparkContext.parallelize(data_list)
stkm1 = stkm.update(data, 1, "points")
ClusterCenter = stkm1.centers
StreamingKmeanCost = stkm1.computeCost(data)
ccl = [ClusterCenter]
cclS = ssc.queueStream(ccl)
cclS.pprint()
print('==Final Centers==')
ssc.start()
ssc.stop(stopSparkContext=True, stopGraceFully=True)
Error:
raise converted from None
pyspark.sql.utils.AnalysisException: Queries with streaming sources must be executed with writeStream.start();
FileSource[./providers]
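The exception is Structured Streaming's way of saying that batch actions such as collect() cannot be applied directly to a streaming DataFrame; every streaming query has to be started through writeStream. One common pattern, sketched below under the assumption that the rest of the setup above stays the same (not tested against this exact pipeline), is foreachBatch, which passes each micro-batch to a function as an ordinary non-streaming DataFrame, where collect() and the StreamingKMeansModel update are allowed:

def update_model(batch_df, batch_id):
    # batch_df is a plain (non-streaming) DataFrame for this micro-batch
    points = [DenseVector(row.features) for row in batch_df.collect()]
    if points:
        rdd = spark.sparkContext.parallelize(points)
        stkm.update(rdd, decayFactor, "batches")
        print(batch_id, stkm.centers)

query = (dataset.writeStream
         .outputMode("append")
         .foreachBatch(update_model)
         .start())
query.awaitTermination()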

IndexError for df_valid

I am using Python 3.6.8.
I was using this loop to convert the values in some columns to int:
for i in cols:
    df_valid[[i]] = df_valid[[i]].astype(int)
and the error below was raised.
Error:
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
As the full code below shows, I do the same thing with df_train, and that does not generate any error. I think it has something to do with df_valid = imputer.transform(df_valid), but I am not able to resolve it.
Can you please help and provide direction for solving this error?
My full code is shown below:
import argparse
import os
import joblib
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn import metrics
import config
import model_dispatcher
def run(fold, model):
    df = pd.read_csv(config.TRAINING_FILE)
    df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})
    df["Married"] = df["Married"].map({"No": 0, "Yes": 1})
    df["Self_Employed"] = df["Self_Employed"].map({"No": 0, "Yes": 1})
    df["Dependents"] = df["Dependents"].map({"0": 0, "1": 1, "2": 2, "3+": 3})
    df["Education"] = df["Education"].map({"Graduate": 1, "Not Graduate": 0})
    df["Loan_Status"] = df["Loan_Status"].map({"N": 0, "Y": 1})
    cols = ["Gender",
            "Married",
            "Dependents",
            "Education",
            "Self_Employed",
            "Credit_History",
            "Loan_Status"]
    dummy = pd.get_dummies(df["Property_Area"])
    df = pd.concat([df, dummy], axis=1)
    df = df.drop(["Loan_ID", "Property_Area"], axis=1)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    imputer = KNNImputer(n_neighbors=18)
    df_train = pd.DataFrame(imputer.fit_transform(df_train),
                            columns=df_train.columns)
    for i in cols:
        df_train[[i]] = df_train[[i]].astype(int)
    df_valid = imputer.transform(df_valid)
    for i in cols:
        df_valid[[i]] = df_valid[[i]].astype(int)
    df_train['GxM'] = df_train.apply(lambda row:
                                     (row['Gender'] * row['Married']),
                                     axis=1)
    df_train['Income_sum'] = (
        df_train.apply(lambda row:
                       (row['ApplicantIncome'] +
                        row['CoapplicantIncome']),
                       axis=1))
    df_train['DxE'] = df_train.apply(lambda row: (row['Education'] *
                                                  row['Dependents']),
                                     axis=1)
    df_train['DxExG'] = (
        df_train.apply(lambda row:
                       (row['Education'] *
                        row['Dependents'] *
                        row['Gender']),
                       axis=1))
    df_valid['GxM'] = df_valid.apply(lambda row:
                                     (row['Gender'] * row['Married']),
                                     axis=1)
    df_valid['Income_sum'] = (
        df_valid.apply(lambda row:
                       (row['ApplicantIncome'] +
                        row['CoapplicantIncome']),
                       axis=1))
    df_valid['DxE'] = df_valid.apply(lambda row: (row['Education'] *
                                                  row['Dependents']),
                                     axis=1)
    df_valid['DxExG'] = (
        df_valid.apply(lambda row:
                       (row['Education'] *
                        row['Dependents'] *
                        row['Gender']),
                       axis=1))
    X_train = df_train.drop("Loan_Status", axis=1).values
    y_train = df_train.Loan_Status.values
    X_valid = df_valid.drop("Loan_Status", axis=1).values
    y_valid = df_valid.Loan_Status.values
    clf = model_dispatcher.models[model]
    clf.fit(X_train, y_train)
    preds = clf.predict(X_valid)
    rascore = metrics.roc_auc_score(y_valid, preds)
    print(f"Fold = {fold}, ROC-AUC = {rascore}")
    joblib.dump(
        clf,
        os.path.join(config.MODEL_OUTPUT, f"dt_{fold}.bin")
    )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--fold", type=int)
    parser.add_argument("--model", type=str)
    args = parser.parse_args()
    run(fold=args.fold, model=args.model)
To convert all of the columns to a numeric type, you can simply use:
df_valid.apply(pd.to_numeric).dtypes
For more details on pd.to_numeric, see the documentation.
You may also want to read more about converting data to different datatypes in this Stack Overflow response.
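That said, the IndexError itself most likely comes from the fact that KNNImputer.transform returns a NumPy array, so after df_valid = imputer.transform(df_valid) the variable is no longer a DataFrame when the column loop runs; note that df_train is wrapped back into a DataFrame after imputation while df_valid is not. A minimal sketch that mirrors the existing df_train handling (same columns, same dtype loop):

df_valid = pd.DataFrame(imputer.transform(df_valid),
                        columns=df_valid.columns)
for i in cols:
    df_valid[[i]] = df_valid[[i]].astype(int)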

LightGBM vs Sklearn LightGBM- Mistake in Implementation- Exact same parameters giving different results

While passing the exact same parameters to LightGBM's native API and to sklearn's implementation of LightGBM, I am getting different results. Initially I was getting identical results from both, but then I made some changes to my code and now I can't find out why they no longer match: the performance metrics and the feature importances come out differently. Please help me figure out the mistake I am making; it could be in the way I am using the native LightGBM library or in sklearn's implementation. Link to an explanation of why the two should give identical results: light gbm - python API vs Scikit-learn API
import csv
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score

x_train, x_test, y_train, y_test = train_test_split(df_dummy[df_merge.columns], labels, test_size=0.25, random_state=42)
n_folds = 5
lgb_train = lgb.Dataset(x_train, y_train)
def objective(params, n_folds=n_folds):
    """Objective function for Gradient Boosting Machine hyperparameter tuning."""
    print(params)
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_child_samples'] = int(params['min_child_samples'])
    params['subsample_freq'] = int(params['subsample_freq'])
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evaluate based on ROC AUC
    cv_results = lgb.cv(params, lgb_train, nfold=n_folds, num_boost_round=10000,
                        early_stopping_rounds=100, metrics='auc')
    # Extract the best score
    best_score = max(cv_results['auc-mean'])
    # Loss must be minimized
    loss = 1 - best_score
    num_iteration = int(np.argmax(cv_results['auc-mean']) + 1)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, num_iteration])
    of_connection.close()
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK, 'estimators': num_iteration}
space = {
    'min_child_samples': hp.quniform('min_child_samples', 5, 100, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'subsample_freq': hp.quniform('subsample_freq', 0, 10, 1),
    'min_gain_to_split': hp.quniform('min_gain_to_split', 0.01, 0.1, 0.01),
    'learning_rate': 0.05,
    'objective': 'binary',
}
out_file = 'results/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'estimators'])
of_connection.close()
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10)
bayes_trials_results = sorted(trials.results, key = lambda x: x['loss'])
results = pd.read_csv('results/gbm_trials.csv')
# Sort with best scores on top and reset index for slicing
results.sort_values('loss', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()
best_bayes_estimators = int(results.loc[0, 'estimators'])
best['max_depth'] = int(best['max_depth'])
best['num_leaves'] = int(best['num_leaves'])
best['min_child_samples'] = int(best['min_child_samples'])
num_boost_round=int(best_bayes_estimators * 1.1)
best['objective'] = 'binary'
best['boosting_type'] = 'gbdt'
best['subsample_freq'] = int(best['subsample_freq'])
#Actual LightGBM
best_gbm = lgb.train(params=best, train_set=lgb_train, num_boost_round=num_boost_round)
print('Plotting feature importances...')
ax = lgb.plot_importance(best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["importance_gain"] = best_gbm.feature_importance(importance_type='gain')
feature_imp["importance_split"] = best_gbm.feature_importance(importance_type='split')
feature_imp.to_clipboard()
y_pred_score = best_gbm.predict(x_test)
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score >= threshold, 1, 0)
    print(roc_auc_score(y_test, y_pred_score))
    print(f1_score(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    roc_auc_score_list.append(roc_auc_score(y_test, y_pred_score))
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))

performance_metrics = pd.DataFrame(
    {'thresholds': thresholds,
     'roc_auc_score': roc_auc_score_list,
     'f1_score': f1_score_list,
     'accuracy_score': accuracy_score_list,
     'precision_score': precision_score_list,
     'recall_score': recall_score_list})
performance_metrics.transpose().to_clipboard()
#Sklearn's Implementation of LightGBM
best_sk = dict(best)
del best_sk['min_gain_to_split']
sk_best_gbm = lgb.LGBMClassifier(**best_sk, n_estimators=num_boost_round, learning_rate=0.05, min_split_gain=best['min_gain_to_split'])
sk_best_gbm.fit(x_train, y_train)
sk_best_gbm.get_params()
print('Plotting feature importances...')
ax = lgb.plot_importance(sk_best_gbm, max_num_features=15)
plt.show()
feature_imp = pd.DataFrame()
feature_imp["feature"] = list(x_train.columns)
feature_imp["Importance"] = sk_best_gbm.feature_importances_
feature_imp.to_clipboard()
y_pred_score = sk_best_gbm.predict_proba(x_test)[:,1]
roc_auc_score_list = []
f1_score_list = []
accuracy_score_list = []
precision_score_list = []
recall_score_list = []
thresholds = [0.4,0.5,0.6,0.7]
for threshold in thresholds:
    print("threshold is {}".format(threshold))
    y_pred = np.where(y_pred_score >= threshold, 1, 0)
    print(roc_auc_score(y_test, y_pred_score))
    print(f1_score(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    roc_auc_score_list.append(roc_auc_score(y_test, y_pred_score))
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))

performance_metrics = pd.DataFrame(
    {'thresholds': thresholds,
     'roc_auc_score': roc_auc_score_list,
     'f1_score': f1_score_list,
     'accuracy_score': accuracy_score_list,
     'precision_score': precision_score_list,
     'recall_score': recall_score_list})
performance_metrics.transpose().to_clipboard()
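One difference that is easy to miss when comparing the two runs (an observation to check, not a confirmed diagnosis): hyperopt's fmin returns only the values of the parameters defined through hp.* expressions, so constants placed in space, such as learning_rate here, are typically not present in best and have to be re-added for the native lgb.train call in the same way objective and boosting_type are. A small sketch, assuming the variables defined above; the setdefault line would go before the lgb.train call, and the comparison loop after both models have been fitted:

# Re-add constants that were fixed in `space` but are not returned by fmin,
# so that lgb.train sees the same learning_rate as the sklearn wrapper.
best.setdefault('learning_rate', 0.05)

# Compare the two parameter dicts side by side to spot remaining differences.
native_params = dict(best)
sklearn_params = sk_best_gbm.get_params()
for key in sorted(set(native_params) | set(sklearn_params)):
    print(key, '| native:', native_params.get(key), '| sklearn:', sklearn_params.get(key))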

Python K means clustering

I am trying to implement the code from this post to estimate what value of K I should use for my K-means clustering:
https://datasciencelab.wordpress.com/2014/01/21/selection-of-k-in-k-means-clustering-reloaded/
However I am not having any success; in particular, I am trying to produce the f(K) vs. number-of-clusters K graph, from which I can read off the ideal value of K.
My data format is as follows:
Each coordinate has 5 dimensions/variables, i.e. the data points live in a five-dimensional space.
The lists of coordinates are below; for example, the first data point has coordinates (35.38361202590826, -24.022420305129415, 0.9608968122051765, -11.700331772145386, -9.4393980963685).
Variable1 = [35.38361202590826, 3.0, 10.0, 10.04987562112089, 5.385164807134505, 24.35159132377184, 10.77032961426901, 10.816653826391967, 18.384776310850235, 14.317821063276353, 24.18677324489565, 3.0, 24.33105012119288, 8.94427190999916, 2.82842712474619, 4.123105625617661, 4.47213595499958, 13.453624047073712, 12.529964086141668, 19.4164878389476, 5.385164807134505, 5.0, 24.041630560342618, 30.083217912982647, 15.132745950421555, 1.414213562373095, 21.470910553583888, 12.649110640673516, 9.0, 9.055385138137416, 16.124515496597102, 18.027756377319946, 7.615773105863908, 4.47213595499958, 5.0, 16.124515496597102, 8.246211251235321, 3.0, 23.02172886644268, 2.23606797749979, 10.0, 13.416407864998737, 14.7648230602334, 12.649110640673516, 2.82842712474619, 9.899494936611665, 12.806248474865697, 13.0, 10.19803902718557, 10.440306508910549]
Variable2 = [-24.022420305129415, -40.0, -21.0, -36.020346285601605, -14.298541039632994, -10.225204451297113, -7.242118188905023, -10.816653826391967, -16.263455967290593, -0.9079593845004517, -5.70559779110359, -1.0, -17.426292654367874, -0.4472135954999579, -12.727922061357855, -38.32062875574061, -15.205262246998569, -13.89960053482201, -6.943355894868313, -18.43793805396085, -14.298541039632994, -8.0, -9.899494936611665, -10.537436550735357, -9.251460406371256, -1.414213562373095, -0.23287321641631115, -4.743416490252569, -10.0, -25.951408627588936, -5.457528321925173, -11.648704120729812, -15.231546211727816, -9.838699100999074, -2.2, 4.713319914389921, -3.395498750508662, -32.0, -16.59301967354925, -4.47213595499958, -3.4, -13.416407864998737, 4.944183868793753, -3.478505426185217, -21.213203435596423, -18.384776310850235, -6.871645523098667, -21.0, -5.491251783869154, -8.620436566990362]
Variable3 = [0.9608968122051765, 22.0, 21.0, 18.507691737905798, 15.412713068695306, -8.08982038917884, -0.7427813527082074, -7.211102550927978, -14.849242404917499, -0.4190581774617469, -10.170848236315095, -7.0, 1.150792911137501, -5.366563145999495, -12.727922061357855, 4.85071250072666, 9.838699100999074, -8.473553267217696, 6.065460321953928, -10.249021432229634, 4.642383454426297, -9.0, 9.899494936611665, 4.354587344310195, -8.854969246098202, -8.48528137423857, -10.292996165600954, -11.067971810589327, -30.0, -10.932721081409808, -14.6360986815266, -22.188007849009164, 0.0, -7.155417527999327, -5.4, -12.279438724331637, 19.40285000290664, -7.0, 18.938629784469825, 8.94427190999916, 3.8, -8.94427190999916, -43.549455173073746, -8.538149682454623, -11.31370849898476, 1.4142135623730951, -10.619815808425212, 12.0, 7.060180864974626, -7.854175538813441]
Variable4 = [-11.700331772145386, -8.0, -5.0, -2.9851115706299676, -10.398938937914904, -8.459406092237773, -7.242118188905023, -10.539303728279352, -21.920310216782973, -8.03194840135015, -10.791021909261136, -10.0, -9.69954025101608, -2.6832815729997477, -23.33452377915607, -7.761140001162655, -17.44133022449836, -4.980070779856015, -2.7134954071899156, -6.48933015307002, -12.441587657862476, -5.2, -18.384776310850235, -10.603918800266811, -14.604091070057484, -4.949747468305833, -1.3506646552146047, -7.905694150420948, -14.0, -29.706080514133717, -2.4806946917841692, -23.574758339572238, -3.2826608214930637, -5.813776741499453, -13.4, -4.9613893835683385, -11.884245626780316, -19.0, -5.473090258814675, -2.23606797749979, -2.0, -2.6832815729997477, -6.163297699455227, -12.01665510863984, -12.727922061357855, -12.020815280171307, -8.589556903873333, -18.53846153846154, -5.491251783869154, -4.789131426105757]
Variable5 = [-9.4393980963685, -4.0, -2.0, -0.29851115706299675, -9.84185292338375, 6.118696639531204, -6.127946159842712, -2.218800784900916, 10.606601717798213, 0.6984302957695782, 0.7442084075352507, -0.0, 3.452378733412503, 1.3416407864998738, -6.363961030678928, 6.305926250944657, -5.813776741499453, -0.4459764877482998, -0.7980868844676221, 7.673890419106611, -1.4855627054164149, 1.4, -2.8284271247461903, -2.925218979383948, 3.9649116027305387, 0.7071067811865475, 0.4191717895493601, 1.5811388300841895, -4.0, 4.748555621218401, 4.341215710622296, 4.714951667914447, -5.120950881529179, 4.919349550499537, 6.2, 0.6201736729460423, -6.305926250944657, -9.0, -6.168085847235585, 0.0, -1.0, 1.3416407864998738, 3.3186987612451224, 4.427188724235731, 4.242640687119285, 4.949747468305833, 5.9346029517670305, 2.3076923076923075, -3.1378581622109447, 1.436739427831727]
I am able to use scikit-learn to create clusters with these coordinates, but I am interested in finding the optimal value of K, and scikit-learn does not offer this technique (or, as far as I am aware, any technique) for estimating it.
You can try the code in the last comment by Monte Shaffer.
Here's a simplified version:
import numpy as np
import random
from numpy import zeros

class KMeansFK():
    def __init__(self, K, X):
        self.K = K
        self.X = X
        self.N = len(X)
        self.mu = None
        self.clusters = None
        self.method = None

    def _cluster_points(self):
        mu = self.mu
        clusters = {}
        for x in self.X:
            bestmukey = min([(i[0], np.linalg.norm(x - mu[i[0]]))
                             for i in enumerate(mu)], key=lambda t: t[1])[0]
            try:
                clusters[bestmukey].append(x)
            except KeyError:
                clusters[bestmukey] = [x]
        self.clusters = clusters

    def _reevaluate_centers(self):
        clusters = self.clusters
        newmu = []
        keys = sorted(self.clusters.keys())
        for k in keys:
            newmu.append(np.mean(clusters[k], axis=0))
        self.mu = newmu

    def _has_converged(self):
        K = len(self.oldmu)
        return (set([tuple(a) for a in self.mu]) ==
                set([tuple(a) for a in self.oldmu])
                and len(set([tuple(a) for a in self.mu])) == K)

    def find_centers(self, K, method='random'):
        self.method = method
        X = self.X
        K = self.K
        # https://stackoverflow.com/questions/44372231/population-must-be-a-sequence-or-set-for-dicts-use-listd
        self.oldmu = random.sample(list(X), K)
        if method != '++':
            # Initialize to K random centers
            self.mu = random.sample(list(X), K)
        while not self._has_converged():
            self.oldmu = self.mu
            # Assign all points in X to clusters
            self._cluster_points()
            # Reevaluate centers
            self._reevaluate_centers()

    def _dist_from_centers(self):
        cent = self.mu
        X = self.X
        D2 = np.array([min([np.linalg.norm(x - c) ** 2 for c in cent]) for x in X])
        self.D2 = D2

    def _choose_next_center(self):
        self.probs = self.D2 / self.D2.sum()
        self.cumprobs = self.probs.cumsum()
        r = random.random()
        ind = np.where(self.cumprobs >= r)[0][0]
        return self.X[ind]

    def init_centers(self, K):
        self.K = K
        # self.mu = random.sample(self.X, 1)
        self.mu = random.sample(list(self.X), 1)
        while len(self.mu) < self.K:
            self._dist_from_centers()
            self.mu.append(self._choose_next_center())

    def get_ak(self, k, Nd):
        if k == 2:
            return 1 - 3.0 / (4.0 * Nd)
        else:
            previous_a = self.get_ak(k - 1, Nd)
            return previous_a + (1.0 - previous_a) / 6.0

    def fK(self, thisk, Skm1=0):
        X = self.X
        Nd = len(X[0])
        self.find_centers(thisk, method='++')
        mu, clusters = self.mu, self.clusters
        Sk = sum([np.linalg.norm(mu[i] - c) ** 2
                  for i in range(thisk) for c in clusters[i]])
        if thisk == 1:
            fs = 1
        elif Skm1 == 0:
            fs = 1
        else:
            fs = Sk / (self.get_ak(thisk, Nd) * Skm1)
        return fs, Sk

    def run(self, maxk):
        ks = range(1, maxk)
        fs = zeros(len(ks))
        Wks, Wkbs, sks = zeros(len(ks) + 1), zeros(len(ks) + 1), zeros(len(ks) + 1)
        # Special case K=1
        self.init_centers(1)
        fs[0], Sk = self.fK(1)
        # Rest of Ks
        for k in ks[1:]:
            self.init_centers(k)
            fs[k - 1], Sk = self.fK(k, Skm1=Sk)
        self.fs = fs
And then run it on your data (transposing so that X has one row per data point, i.e. 50 points with 5 dimensions each):
X = np.array([Variable1, Variable2, Variable3, Variable4, Variable5]).T
km = KMeansFK(2, X)
km.run(5)
Now km.fs holds the f(K) values for each K, and km.clusters has the resulting clustering.
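To get the f(K) vs. K graph the question asks for, a minimal plotting sketch (assuming matplotlib and the km object from above; km.run(5) evaluates K = 1..4):

import matplotlib.pyplot as plt

ks = range(1, 5)  # the same range used internally by km.run(5)
plt.plot(list(ks), km.fs, 'o-')
plt.xlabel('Number of clusters K')
plt.ylabel('f(K)')
plt.show()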
