IndexError for df_valid - python

I am using Python 3.6.8.
I was using this loop to convert the values in some columns to int:
for i in cols:
    df_valid[[i]] = df_valid[[i]].astype(int)
which raised the following error:
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
As the full code below shows, I do exactly the same thing with df_train, and that does not generate any error. I think it has something to do with
df_valid = imputer.transform(df_valid), but I am not able to resolve it.
Can you please help and point me in the right direction for solving this error?
My full code is shown below:
import argparse
import os
import joblib
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn import metrics
import config
import model_dispatcher

def run(fold, model):
    df = pd.read_csv(config.TRAINING_FILE)
    df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})
    df["Married"] = df["Married"].map({"No": 0, "Yes": 1})
    df["Self_Employed"] = df["Self_Employed"].map({"No": 0, "Yes": 1})
    df["Dependents"] = df["Dependents"].map({"0": 0, "1": 1, "2": 2, "3+": 3})
    df["Education"] = df["Education"].map({"Graduate": 1, "Not Graduate": 0})
    df["Loan_Status"] = df["Loan_Status"].map({"N": 0, "Y": 1})
    cols = ["Gender",
            "Married",
            "Dependents",
            "Education",
            "Self_Employed",
            "Credit_History",
            "Loan_Status"]
    dummy = pd.get_dummies(df["Property_Area"])
    df = pd.concat([df, dummy], axis=1)
    df = df.drop(["Loan_ID", "Property_Area"], axis=1)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    imputer = KNNImputer(n_neighbors=18)
    df_train = pd.DataFrame(imputer.fit_transform(df_train),
                            columns=df_train.columns)
    for i in cols:
        df_train[[i]] = df_train[[i]].astype(int)
    df_valid = imputer.transform(df_valid)
    for i in cols:
        df_valid[[i]] = df_valid[[i]].astype(int)
    df_train['GxM'] = df_train.apply(lambda row:
                                     (row['Gender']*row['Married']),
                                     axis=1)
    df_train['Income_sum'] = (
        df_train.apply(lambda row:
                       (row['ApplicantIncome'] +
                        row['CoapplicantIncome']),
                       axis=1))
    df_train['DxE'] = df_train.apply(lambda row: (row['Education'] *
                                                  row['Dependents']),
                                     axis=1)
    df_train['DxExG'] = (
        df_train.apply(lambda row:
                       (row['Education'] *
                        row['Dependents'] *
                        row['Gender']),
                       axis=1))
    df_valid['GxM'] = df_valid.apply(lambda row:
                                     (row['Gender']*row['Married']),
                                     axis=1)
    df_valid['Income_sum'] = (
        df_valid.apply(lambda row:
                       (row['ApplicantIncome'] +
                        row['CoapplicantIncome']),
                       axis=1))
    df_valid['DxE'] = df_valid.apply(lambda row: (row['Education'] *
                                                  row['Dependents']),
                                     axis=1)
    df_valid['DxExG'] = (
        df_valid.apply(lambda row:
                       (row['Education'] *
                        row['Dependents'] *
                        row['Gender']),
                       axis=1))
    X_train = df_train.drop("Loan_Status", axis=1).values
    y_train = df_train.Loan_Status.values
    X_valid = df_valid.drop("Loan_Status", axis=1).values
    y_valid = df_valid.Loan_Status.values
    clf = model_dispatcher.models[model]
    clf.fit(X_train, y_train)
    preds = clf.predict(X_valid)
    rascore = metrics.roc_auc_score(y_valid, preds)
    print(f"Fold = {fold}, ROC-AUC = {rascore}")
    joblib.dump(
        clf,
        os.path.join(config.MODEL_OUTPUT, f"dt_{fold}.bin")
    )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--fold", type=int)
    parser.add_argument("--model", type=str)
    args = parser.parse_args()
    run(fold=args.fold, model=args.model)

To convert all the columns to a numeric type, you can just use (note the assignment; .dtypes on its own only displays the resulting types):
df_valid = df_valid.apply(pd.to_numeric)
df_valid.dtypes
For more details on pd.to_numeric, see the documentation.
You may also want to read more about converting data to different datatypes in this Stack Overflow answer.
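As for the IndexError itself: KNNImputer.transform returns a plain NumPy array, so after df_valid = imputer.transform(df_valid) the label-based lookup df_valid[[i]] no longer works, which is exactly what the "only integers, slices ..." message is complaining about. A minimal sketch of a fix, mirroring what the code already does for df_train (wrap the transformed values back into a DataFrame before the loop):
# imputer.transform returns an ndarray; wrap it back into a DataFrame
# so that column labels like "Gender" are valid indices again
df_valid = pd.DataFrame(imputer.transform(df_valid),
                        columns=df_valid.columns)
for i in cols:
    df_valid[[i]] = df_valid[[i]].astype(int)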

GridSearchCV with LightFM

I'm a newbie, so my apologies if what I ask is too obvious; my English is also not great. I'm stuck doing a custom grid search with cross-validation for LightFM, which does not ship with those functions. It seems the way I split the dataset is wrong, but I do not understand why, since I've replicated the code of the function random_train_test_split to build the folds. The error I get is: Incorrect number of features in item_features.
I'm stuck and I do not know how to go on.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse

def create_processed_dataset():
    """
    One-time execution
    Returns:
        embeddings.csv and observations.csv
    """
    output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
    os.makedirs(output_path, exist_ok=True)
    """
    Data imports
    """
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
    vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
    # Load mappings and filter them if a corresponding embedding is found
    mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None, names=["movie_id", "movie_name", "movie_uri"])
    mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
    mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
    # Create a pandas dataframe with embeddings
    embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
    embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
    embeddings.set_index("movie_id", inplace=True)
    ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
    embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
    ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)

def generate_list_of_hyper_parameters(parameters_grid):
    return (
        {y: z for y, z in zip(parameters_grid.keys(), x)}
        for x in itertools.product(*parameters_grid.values())
    )

def create_csr_from_dataset(observations, embeddings):
    dataset = Dataset(item_identity_features=True, user_identity_features=False)
    feature_names = [str(i) for i in range(0, 200)]
    dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    num_items, num_fts = dataset.item_features_shape()
    print(f'Num items: {num_items}, num_features: {num_fts}.')
    interactions, weights = dataset.build_interactions(
        observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
    )
    item_features = []
    for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
        for x, y in row.items():
            item_features.append((item_id, {x: y}))
    item_features = dataset.build_item_features(item_features)
    return interactions, item_features

def folding(interactions, k_folds=10):
    if not scipy.sparse.issparse(interactions):
        return None
    coo = interactions.tocoo()
    kf = KFold(n_splits=k_folds)  # define the split into k folds
    shape = interactions.shape
    uids, iids, data = (coo.row, coo.col, coo.data)
    def to_coo_matrix(indexes):
        return scipy.sparse.coo_matrix(
            (data[indexes], (uids[indexes], iids[indexes])),
            shape=shape,
            dtype=coo.dtype,
        )
    return [
        (to_coo_matrix(train_index), to_coo_matrix(validation_index))
        for train_index, validation_index in kf.split(data)
    ]

def grid_search(parameters_grid, k_fold, interactions, item_features=None):
    results = []
    for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
        for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=k_fold)):
            print(f"{hyper_params} && current_fold:{current_fold}")
            model = LightFM(**hyper_params)
            model.fit(train, epochs=50, item_features=item_features, num_threads=6)
            score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
            results.append((score, hyper_params, model))
            print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
    results.sort(key=lambda x: x[0])
    return results

def main():
    observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
    embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
    interactions, item_features = create_csr_from_dataset(observations, embeddings)
    train, test = random_train_test_split(interactions, test_percentage=0.2)
    print(embeddings.head())
    num_movies = len(embeddings.index)
    num_ratings = len(observations.index)
    num_users = observations.user_id.unique().size
    sparsity = 1 - num_ratings / (num_users * num_movies)
    print(
        f"num_users: {num_users}, num_movies: {num_movies}, "
        f"num_observations: {num_ratings}, "
        f"sparsity: ~{sparsity * 100}"
    )
    model = LightFM()
    # parameters to test
    param_grid = {
        'no_components': range(10, 110, 10),
        'learning_rate': [0.01, 0.05, 0.1],
        'item_alpha': [0.0001, 0.001, 0.01],
        'user_alpha': [0.0001, 0.001, 0.01],
    }
    results = grid_search(param_grid, 10, train, item_features=item_features)
    print(results[0][0])
    # grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
    # grid.fit(train)
    #
    # # print the best parameters
    # print("Best parameters found: ", grid.best_params_)

if __name__ == "__main__":
    main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806
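A note on the error itself: a LightFM model that was fit with item_features generally needs those same features passed to the evaluation functions as well. When auc_score is called without them, LightFM falls back to identity features, so the feature count no longer matches the trained model, which produces exactly the "Incorrect number of features in item_features" message. A sketch of the scoring line with the features passed through (same variables as in the code above, untested against this exact dataset):
score = auc_score(
    model, validation,
    train_interactions=train,
    item_features=item_features,  # pass the same features used in model.fit
    num_threads=6,
).mean()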

Import output of kubectl get pods -o json in to pandas dataframe

I'd like to import the output of:
kubectl get pods -o json
into a Python pandas dataframe. It should also contain all containers and their resource requests and limits.
My code starts as follows:
import json
import numpy as np
import pandas as pd
import os
pods_raw = os.popen('kubectl get pods -o json').read()
pods_json = json.loads(pods_raw)['items']
From here on I struggle to get the data into the dataframe in the right shape; in particular, 'spec.containers' should be split up when multiple containers exist.
Here is an example of how you can extract the data of interest into a dataframe. The output is only an example (as you didn't specify the required output in the question):
import json
import pandas as pd

# open the Json data from file (or use os.popen):
with open("data.json", "r") as f_in:
    data = json.load(f_in)

df = pd.DataFrame(data["items"])

# metadata:
df = pd.concat(
    [df, df.pop("metadata").apply(pd.Series).add_prefix("meta_")], axis=1
)
# spec:
df = pd.concat(
    [df, df.pop("spec").apply(pd.Series).add_prefix("spec_")], axis=1
)
# status:
df = pd.concat(
    [df, df.pop("status").apply(pd.Series).add_prefix("status_")], axis=1
)

# keep only columns of interest:
df = df[["meta_name", "meta_namespace", "status_phase", "spec_containers"]]

# explode spec_containers column
df = df.explode("spec_containers")
df = pd.concat(
    [
        df,
        df.pop("spec_containers")
        .apply(pd.Series)
        .add_prefix("spec_")[["spec_image", "spec_name"]],
    ],
    axis=1,
)
print(df)
Prints:
meta_name meta_namespace status_phase spec_image spec_name
0 apache-lb-648c5cb8cb-mw5zh default Running httpd apache
0 apache-lb-648c5cb8cb-mw5zh default Running index.docker.io/istio/proxyv2:1.13.4 istio-proxy
1 csi-cephfsplugin-fc79l default Running rocks.canonical.com:443/cdk/sig-storage/csi-node-driver-registrar:v2.0.1 driver-registrar
1 csi-cephfsplugin-fc79l default Running rocks.canonical.com:443/cdk/cephcsi/cephcsi:v3.3.1 csi-cephfsplugin
1 csi-cephfsplugin-fc79l default Running rocks.canonical.com:443/cdk/cephcsi/cephcsi:v3.3.1 liveness-prometheus
...and so on.
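A possibly simpler alternative (a sketch, assuming the usual kubectl JSON layout rather than anything from the question): pd.json_normalize can explode the containers list and pull in pod-level metadata in one call:
import json
import pandas as pd

with open("data.json", "r") as f_in:
    data = json.load(f_in)

# one row per container; pod-level fields are repeated on each row
df = pd.json_normalize(
    data["items"],
    record_path=["spec", "containers"],
    meta=[["metadata", "name"], ["metadata", "namespace"], ["status", "phase"]],
    record_prefix="container.",
    errors="ignore",
)
print(df[["metadata.name", "metadata.namespace", "status.phase",
          "container.image", "container.name"]])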
Currently I have the following code to solve this:
#!/usr/bin/env python
import json
import pandas as pd
import os

kb = 1024
mb = kb * kb
gb = mb * kb
tb = gb * kb

def main():
    pods_raw = os.popen('kubectl get pods -A -o json').read()
    pods_json = json.loads(pods_raw)['items']
    first_split = ['status', 'metadata', 'spec']
    second_split = ['spec.containers', 'spec.containers.resources',
                    'spec.containers.resources.limits',
                    'spec.containers.resources.requests']
    df_pods = pd.DataFrame.from_dict(pods_json)
    df_pods = concat_data(df_pods, first_split)
    df_pods = expand_data(df_pods, ['spec.containers'])
    df_pods = concat_data(df_pods, second_split)
    df_pods.index.name = 'index'
    col_to_normalize = ['spec.containers.resources.limits.cpu',
                        'spec.containers.resources.limits.memory',
                        'spec.containers.resources.requests.cpu',
                        'spec.containers.resources.requests.memory']
    for col_name in col_to_normalize:
        df_pods[col_name] = df_pods[col_name].map(normalize_values)
    # sum resource values across the containers of each pod
    df_pods[col_to_normalize] = df_pods.groupby('index')[col_to_normalize].sum()
    df_pods = df_pods.drop_duplicates(['metadata.name'])
    df_pods = df_pods[df_pods['status.phase'] == 'Running']
    print(df_pods)

def concat_data(df: pd.DataFrame, expands: list) -> pd.DataFrame:
    for expantion in expands:
        df = pd.concat([df, df.pop(expantion).apply(pd.Series).add_prefix(f"{expantion}.")], axis=1)
    return df

def expand_data(df: pd.DataFrame, expands: list) -> pd.DataFrame:
    for expantion in expands:
        s = df[expantion].apply(pd.Series).stack()
        s.index = s.index.droplevel(-1)
        df.index = [x for x in df.index]
        del df[expantion]
        s.name = expantion
        df = df.join(s)
    return df

def normalize_values(val: str) -> int:
    try:
        if val[-1] == 'm':
            return int(val[:-1]) / 1000
        if val[-2].lower() == "k":
            return int(val[:-2]) * kb
        if val[-2].lower() == "m":
            return int(val[:-2]) * mb
        if val[-2].lower() == "g":
            return int(val[:-2]) * gb
        if val[-2].lower() == "t":
            return int(val[:-2]) * tb
        return int(val)
    except:
        return 0

if __name__ == '__main__':
    main()
This works fine except for the following FutureWarning, which I don't yet know how to resolve:
./resources.py:43: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
df = pd.concat( [df, df.pop(expantion).apply(pd.Series).add_prefix(f"{expantion}.")], axis=1)
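The warning comes from .apply(pd.Series) constructing an empty Series whenever a cell holds an empty collection (for example a container with resources: {}). One way to silence it, as the warning itself suggests, is to specify the dtype explicitly; a sketch of a drop-in replacement for the concat line inside concat_data:
df = pd.concat(
    [
        df,
        df.pop(expantion)
        .apply(lambda v: pd.Series(v, dtype="object"))  # explicit dtype avoids the empty-Series FutureWarning
        .add_prefix(f"{expantion}."),
    ],
    axis=1,
)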

Why is target encoder encoding some values as NaN?

I am using a target encoder from category_encoders to encode a feature. Here is the code I'm using:
from category_encoders import TargetEncoder

def encode_large_features(features, X_train, X_test, y_train):
    print('target encoding features ...')
    for _ in features:
        target_encoder = TargetEncoder(_)
        target_encoder.fit(X_train[_], y_train)
        name = _ + '_encoded'
        X_train[name] = target_encoder.transform(X_train[_])
        X_train.drop([_], axis=1, inplace=True)
        X_test[name] = target_encoder.transform(X_test[_])
        X_test.drop([_], axis=1, inplace=True)
    return X_train, X_test
The target encoder encodes some values as NaN and I don't know why. Here is an example:
Faced the same issue: raised an issue in the repo.
Found a workaround by building a custom KFold target encoder, which works better than the library version: a KFold target encoder is less susceptible to data leakage and has fewer chances of overfitting.
It will not return NaN in the training dataset the way the category_encoders library does.
In the example below, chid is a categorical column and KFoldTargetEncoder is applied to it.
Libraries required:
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn import base
import numpy as np  # used below for np.nan and np.corrcoef
Training Dataset:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, targetName, n_fold=5, verbosity=True, discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)
        mean_of_target = X[self.targetName].mean()
        # note: recent scikit-learn versions raise an error if random_state
        # is set while shuffle=False, so random_state is omitted here
        kf = KFold(n_splits=self.n_fold, shuffle=False)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # out-of-fold target means for each category
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(
                X_tr.groupby(self.colnames)[self.targetName].mean())
        # categories unseen in a fold fall back to the global target mean instead of NaN
        X[col_mean_name].fillna(mean_of_target, inplace=True)
        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                  self.targetName,
                                                                                  np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X
Fit_Transform on Training Data:
targetc_chid = KFoldTargetEncoderTrain('chid','target',n_fold=5)
train_df = targetc_chid.fit_transform(train_df)
Test Dataset:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # per-category mean of the encoded column, learned on the training set
        mean = self.train[[self.colNames,
                           self.encodedName]].groupby(
                               self.colNames).mean().reset_index()
        dd = {}
        for row in tqdm(mean.itertuples(index=False)):
            dd[row[0]] = row[1]
        X[self.encodedName] = X[self.colNames]
        X[self.encodedName] = X[self.encodedName].map(dd.get)
        return X
Fit on Test Data:
test_targetc_chid = KFoldTargetEncoderTest(train_df,'chid','chid_Kfold_Target_Enc')
valid_df = test_targetc_chid.fit_transform(valid_df)
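As an aside on the original question: with category_encoders, NaNs in the transformed output are usually tied to the handle_unknown / handle_missing settings, or to categories that appear in the test data but were never seen during fit. If you stay with the library encoder, it may be worth pinning those parameters explicitly; a sketch (parameter names as documented in category_encoders, column name taken from the example above):
from category_encoders import TargetEncoder

# 'value' substitutes the global target mean for unknown/missing categories;
# 'return_nan' is the setting that propagates NaN instead
encoder = TargetEncoder(cols=['chid'], handle_unknown='value', handle_missing='value')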

How to use Dask with .groupby.apply?

I have a dataframe df.
I would like to partition df into sub-dataframes and apply the function find_root to each of them. My function only takes the columns id and parent_id as input.
Then I would like to concatenate the resulting dataframes. Because my dataframe is huge (over 4 million rows), I would like to use Dask. However, I get this error:
ValueError: The columns in the computed data do not match the columns in the provided metadata
Extra: []
Missing: [2]
Could you please elaborate on how to solve this error?
import pandas as pd
import networkx as nx
from dask.distributed import Client
import dask.dataframe as dd

client = Client(n_workers=2, threads_per_worker=1, processes=False, memory_limit='4GB')

def find_root(df):
    g = nx.from_pandas_edgelist(df, source='parent_id', target='id', create_using=nx.DiGraph())
    roots = {n for n, d in g.in_degree() if d == 0}
    tmp = {}
    for r in roots:
        tree = nx.dfs_tree(g, r)  # dfs_tree lives in the networkx namespace
        tmp[r] = list(tree.nodes)
    tmp = pd.DataFrame.from_dict(tmp, orient='index').T
    tmp = tmp.melt(value_name='node', var_name='root').dropna()
    return tmp

path = 'https://raw.githubusercontent.com/leanhdung1994/WebMining/main/sample_df.csv'
df = dd.read_csv(path, header=0)
df = df[['id', 'created_utc', 'ups', 'link_id', 'author', 'body', 'parent_id']]
df['parent_id'] = df['parent_id'].str.split('_', expand=True, n=2)[1]
df['link_id'] = df['link_id'].str.split('_', expand=True, n=2)[1]
result = df.groupby('link_id').apply(find_root, meta=object)
computed_result = result.compute()
Update: I added dtype to dd.read_csv,
df = dd.read_csv(path, header=0, dtype={'id': 'str', 'parent_id': 'str', 'link_id': 'str'})
but the problem persists.
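The error message means that the frame Dask actually computed has different columns from the meta= hint: find_root returns the columns root and node, while meta=object tells Dask almost nothing about the output schema. A sketch of a possible fix, spelling out the schema explicitly (the dtypes here are assumptions):
result = df.groupby('link_id').apply(
    find_root,
    meta={'root': 'object', 'node': 'object'},
)
computed_result = result.compute()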

Porting PyMC2 code to PyMC3 - hierarchical model for sports analytics

I tried the following code, but I ran into problems.
I think .values is the problem, but how do I encode this as a Theano object?
The following is my data source:
home_team,away_team,home_score,away_score
Wales,Italy,23,15
France,England,26,24
Ireland,Scotland,28,6
Ireland,Wales,26,3
Scotland,England,0,20
France,Italy,30,10
Wales,France,27,6
Italy,Scotland,20,21
England,Ireland,13,10
Ireland,Italy,46,7
Scotland,France,17,19
England,Wales,29,18
Italy,England,11,52
Wales,Scotland,51,3
France,Ireland,20,22
Here is the PyMC2 code, which works:
import numpy as np
import pandas as pd
import pymc  # imports added for completeness; DATA_DIR is defined elsewhere

data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', 1)
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

# hyperpriors
home = pymc.Normal('home', 0, .0001, value=0)
tau_att = pymc.Gamma('tau_att', .1, .1, value=10)
tau_def = pymc.Gamma('tau_def', .1, .1, value=10)
intercept = pymc.Normal('intercept', 0, .0001, value=0)

# team-specific parameters
atts_star = pymc.Normal("atts_star",
                        mu=0,
                        tau=tau_att,
                        size=num_teams,
                        value=att_starting_points.values)
defs_star = pymc.Normal("defs_star",
                        mu=0,
                        tau=tau_def,
                        size=num_teams,
                        value=def_starting_points.values)

# trick to code the sum-to-zero constraint
@pymc.deterministic
def atts(atts_star=atts_star):
    atts = atts_star.copy()
    atts = atts - np.mean(atts_star)
    return atts

@pymc.deterministic
def defs(defs_star=defs_star):
    defs = defs_star.copy()
    defs = defs - np.mean(defs_star)
    return defs

@pymc.deterministic
def home_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    return np.exp(intercept +
                  home +
                  atts[home_team] +
                  defs[away_team])

@pymc.deterministic
def away_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    return np.exp(intercept +
                  atts[away_team] +
                  defs[home_team])

home_points = pymc.Poisson('home_points',
                           mu=home_theta,
                           value=observed_home_goals,
                           observed=True)
away_points = pymc.Poisson('away_points',
                           mu=away_theta,
                           value=observed_away_goals,
                           observed=True)

mcmc = pymc.MCMC([home, intercept, tau_att, tau_def,
                  home_theta, away_theta,
                  atts_star, defs_star, atts, defs,
                  home_points, away_points])
map_ = pymc.MAP(mcmc)
map_.fit()
mcmc.sample(200000, 40000, 20)
My attempt at porting to PyMC3 :)
I include the wrangling code as well; I defined my own data directory, etc.
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', 1)
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

import theano.tensor as T
import pymc3 as pm3

# hyperpriors
x = att_starting_points.values
y = def_starting_points.values
model = pm3.Model()
with pm3.Model() as model:
    home3 = pm3.Normal('home', 0, .0001)
    tau_att3 = pm3.Gamma('tau_att', .1, .1)
    tau_def3 = pm3.Gamma('tau_def', .1, .1)
    intercept3 = pm3.Normal('intercept', 0, .0001)
    # team-specific parameters
    atts_star3 = pm3.Normal("atts_star",
                            mu=0,
                            tau=tau_att3,
                            observed=x)
    defs_star3 = pm3.Normal("defs_star",
                            mu=0,
                            tau=tau_def3,
                            observed=y)
    # Seems to be the error here.
    atts = pm3.Deterministic('regression',
                             atts_star3 - np.mean(atts_star3))
    home_theta3 = pm3.Deterministic('regression',
                                    T.exp(intercept3 + atts[away_team] + defs[home_team]))
    # Unknown model parameters
    home_points3 = pm3.Poisson('home_points', mu=home_theta3, observed=observed_home_goals)
    away_points3 = pm3.Poisson('away_points', mu=home_theta3, observed=observed_away_goals)

    start = pm3.find_MAP()
    step = pm3.NUTS(state=start)
    trace = pm3.sample(2000, step, start=start, progressbar=True)

pm3.traceplot(trace)
And I get an error saying values isn't a Theano object.
I think this is the .values part above, but I'm confused about how to convert it into a Theano tensor. The tensors are confusing me :)
And here is the error for clarity, because I've misunderstood something in the PyMC3 syntax:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-71-ce51c1a64412> in <module>()
23
24 #Seems to be the error here.
---> 25 atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
26 home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 + atts[away_team] + defs[home_team]))
27
/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2733
2734 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2735 out=out, keepdims=keepdims)
2736
2737 def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 ret = ret.dtype.type(ret / rcount)
72 else:
---> 73 ret = ret / rcount
74
75 return ret
TypeError: unsupported operand type(s) for /: 'ObservedRV' and 'int'
Here is my translation of your PyMC2 model:
import pymc3 as pm
import theano.tensor as tt

model = pm.Model()
with pm.Model() as model:
    # global model parameters
    home = pm.Normal('home', 0, .0001)
    tau_att = pm.Gamma('tau_att', .1, .1)
    tau_def = pm.Gamma('tau_def', .1, .1)
    intercept = pm.Normal('intercept', 0, .0001)

    # team-specific model parameters
    atts_star = pm.Normal("atts_star",
                          mu=0,
                          tau=tau_att,
                          shape=num_teams)
    defs_star = pm.Normal("defs_star",
                          mu=0,
                          tau=tau_def,
                          shape=num_teams)

    atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
    defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
    home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
    away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

    # likelihood of observed data
    home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
    away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
The big difference, as I see it, between PyMC2 and PyMC3 model building is that the whole business of initial values in PyMC2 is not part of model building in PyMC3; it is pushed off into the model-fitting portion of the code.
Here is a notebook that puts this model in context with your data and some fitting code: http://nbviewer.ipython.org/gist/aflaxman/55e23195fe0a0b089103
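(If you do want per-variable starting values in PyMC3, the analogue of PyMC2's value= keyword is testval; a sketch, reusing the starting points computed in the question:)
atts_star = pm.Normal("atts_star", mu=0, tau=tau_att,
                      shape=num_teams,
                      testval=att_starting_points.values)  # starting value, like PyMC2's value=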
Your model is failing because you can't use NumPy functions on Theano tensors. Thus
np.mean(atts_star3)
will give you an error. You could remove atts_star3 = pm3.Normal("atts_star", ...) and just use the NumPy array directly, atts_star3 = x.
I don't think you need to explicitly model tau_att3, tau_def3 or defs_star either.
Alternatively, if you want to keep those variables, you can replace np.mean with theano.tensor.mean, which should work.
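For example, a sketch using the T alias already imported in the question (note that each Deterministic also needs a unique name; reusing 'regression' twice will itself raise an error):
atts = pm3.Deterministic('atts', atts_star3 - T.mean(atts_star3))  # theano mean instead of np.mean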
So I did this. It isn't a direct port of my previous version, but it gives me an answer. Does anyone have any feedback?
import os
import math
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm3  # I know folks are switching to "as pm" but I'm just not there yet
%matplotlib inline
import seaborn as sns
from IPython.core.pylabtools import figsize
import theano.tensor as T

figsize(12, 12)
DATA_DIR = os.path.join(os.getcwd(), 'data/')
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', 1)
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

# hyperpriors
'''
def atts3(atts_star3=atts_star3):
    atts3 = atts_star.copy()
    atts3 = atts3 - np.mean(atts_star)
    return atts3
def defs3(defs_star3=defs_star3):
    defs3 = defs_star3.copy()
    defs3 = defs3 - np.mean(defs_star3)
    return defs
'''
model = pm3.Model()
with pm3.Model() as model:
    home3 = pm3.Normal('home', 0, .0001)
    tau_att3 = pm3.Gamma('tau_att', .1, .1)
    tau_def3 = pm3.Gamma('tau_def', .1, .1)
    intercept3 = pm3.Normal('intercept', 0, .0001)
    # team-specific parameters
    atts_star3 = pm3.Normal("atts_star",
                            mu=0,
                            tau=tau_att3,
                            shape=num_teams,
                            observed=att_starting_points.values)
    defs_star3 = pm3.Normal("defs_star",
                            mu=0,
                            tau=tau_def3,
                            shape=num_teams,
                            observed=def_starting_points.values)
    #home_theta3 = atts3 + defs3
    #away_theta3 = atts3 + defs3
    # Unknown model parameters
    home_points3 = pm3.Poisson('home_points', mu=1, observed=observed_home_goals)
    away_points3 = pm3.Poisson('away_points', mu=1, observed=observed_away_goals)

    start = pm3.find_MAP()
    step = pm3.NUTS(state=start)
    trace = pm3.sample(2000, step, start=start, progressbar=True)

pm3.traceplot(trace)
