Why is target encoder encoding some values as NaN? - python

I am using a target encoder from category_encoders to encode a feature, here is the code I m using:
from category_encoders import TargetEncoder
def encode_large_features(features, X_train, X_test, y_train):
print('target encoding features ...')
for _ in features:
target_encoder = TargetEncoder(_)
target_encoder.fit(X_train[_], y_train)
name = _ + '_encoded'
X_train[name] = target_encoder.transform(X_train[_])
X_train.drop([_], axis=1, inplace=True)
X_test[name] = target_encoder.transform(X_test[_])
X_test.drop([_], axis=1, inplace=True)
return X_train, X_test
the target encoder encodes some values as NaN and I dont know why? here is an example:

Faced the same issue: Raised Issue n Repo
Found a workaround by Building a Custom KFold-Target Encoder which is better than the library version. KFold Target Encoder is less susceptible to data leakage / fewer chances of overfitting.
This will not return NaN in the training Dataset like category_encoder library.
Below example: chid is a categorical column apply KFoldTargetEncoder on it.
Libraries required:
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn import base
Training Dataset:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):
self.colnames = colnames
self.targetName = targetName
self.n_fold = n_fold
self.verbosity = verbosity
self.discardOriginal_col = discardOriginal_col
def fit(self, X, y=None):
return self
def transform(self,X):
assert(type(self.targetName) == str)
assert(type(self.colnames) == str)
assert(self.colnames in X.columns)
assert(self.targetName in X.columns)
mean_of_target = X[self.targetName].mean()
kf = KFold(n_splits = self.n_fold, shuffle = False, random_state=2019)
col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
X[col_mean_name] = np.nan
for tr_ind, val_ind in kf.split(X):
X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
X[col_mean_name].fillna(mean_of_target, inplace = True)
if self.verbosity:
encoded_feature = X[col_mean_name].values
print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
self.targetName,
np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
if self.discardOriginal_col:
X = X.drop(self.targetName, axis=1)
return X
Fit_Transform on Training Data:
targetc_chid = KFoldTargetEncoderTrain('chid','target',n_fold=5)
train_df = targetc_chid.fit_transform(train_df)
Test Dataset:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
def __init__(self,train,colNames,encodedName):
self.train = train
self.colNames = colNames
self.encodedName = encodedName
def fit(self, X, y=None):
return self
def transform(self,X):
mean = self.train[[self.colNames,
self.encodedName]].groupby(
self.colNames).mean().reset_index()
dd = {}
for row in tqdm(mean.itertuples(index=False)):
dd[row[0]] = row[1]
X[self.encodedName] = X[self.colNames]
X[self.encodedName] = X[self.encodedName].map(dd.get)
return X
Fit on Test Data:
test_targetc_chid = KFoldTargetEncoderTest(train_df,'chid','chid_Kfold_Target_Enc')
valid_df = test_targetc_chid.fit_transform(valid_df)

Related

GridSearchCV with LightFM

I'm a newbie so my apologies if something I ask might be to obvious and my english is not quite good. I'm stuck in doing a custom grid search with cross validation with LightFM which does not come with those functions. It seem the way I split the dataset is wrong but I do not understand why since I've replicated the code of the function random_train_test_split to get the folds. The error I get is Incorrect number of features in item_features.
I'm stuck and I do not know how to go on.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse
def create_processed_dataset():
"""
One-Time execution
Returns:
embeddings.csv and observations.csv
"""
output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
os.makedirs(output_path, exist_ok=True)
"""
Data imports
"""
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
# Load mappings and filter them if a corresponding embedding is found
mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None, names=["movie_id", "movie_name", "movie_uri"])
mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
# Create a pandas dataframe with embeddings
embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
embeddings.set_index("movie_id", inplace=True)
ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)
def generate_list_of_hyper_parameters(parameters_grid):
return (
{y: z for y, z in zip(parameters_grid.keys(), x)}
for x in itertools.product(*parameters_grid.values())
)
def create_csr_from_dataset(observations, embeddings):
dataset = Dataset(item_identity_features=True, user_identity_features=False)
feature_names = [str(i) for i in range(0, 200)]
dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
num_items, num_fts = dataset.item_features_shape()
print(f'Num items: {num_items}, num_features: {num_fts}.')
interactions, weights = dataset.build_interactions(
observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
)
item_features = []
for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
for x, y in row.items():
item_features.append((item_id, {x: y}))
item_features = dataset.build_item_features(item_features)
return interactions, item_features
def folding(interactions, k_folds=10):
if not scipy.sparse.issparse(interactions):
return None
coo = interactions.tocoo()
kf = KFold(n_splits=k_folds) # Define the split - into 2 folds
shape = interactions.shape
uids, iids, data = (coo.row, coo.col, coo.data)
def to_coo_matrix(indexes):
return scipy.sparse.coo_matrix(
(data[indexes], (uids[indexes], iids[indexes])),
shape=shape,
dtype=coo.dtype,
)
return [
(to_coo_matrix(train_index), to_coo_matrix(validation_index))
for train_index, validation_index in kf.split(data)
]
def grid_search(parameters_grid, k_fold, interactions, item_features=None):
results = []
for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
print(f"{hyper_params} && current_fold:{current_fold}")
model = LightFM(**hyper_params)
model.fit(train, epochs=50, item_features=item_features, num_threads=6)
score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
results.append((score, hyper_params, model))
print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
results.sort(key=lambda x: x[0])
return results
def main():
observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
interactions, item_features = create_csr_from_dataset(observations, embeddings)
train, test = random_train_test_split(interactions, test_percentage=0.2)
print(embeddings.head())
num_movies = len(embeddings.index)
num_ratings = len(observations.index)
num_users = observations.user_id.unique().size
sparsity = 1 - num_ratings / (num_users * num_movies)
print(
f"num_users: {num_users}, num_movies: {num_movies}, "
f"num_observations: {num_ratings}, "
f"sparsity: ~{sparsity * 100}"
)
model = LightFM()
# parametri da testare
param_grid = {
'no_components': range(10, 110, 10),
'learning_rate': [0.01, 0.05, 0.1],
'item_alpha': [0.0001, 0.001, 0.01],
'user_alpha': [0.0001, 0.001, 0.01],
}
results = grid_search(param_grid, 10, train, item_features=item_features)
print(results[0][0])
# grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
# grid.fit(train)
#
# # stampare i migliori parametri
# print("Best parameters found: ", grid.best_params_)
if __name__ == "__main__":
main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806

None Type error in Python when running streamlit

Hi I am trying create an App in python that will allow users to choose which classification model they want to implement on one of three open source data in SK-Learn library The code is the following:
import streamlit as st
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
st.title("Streamlit example")
st.write("""
# Explore different classifier
which one is the best?
""")
dataset_name = st.sidebar.selectbox("Select Dataset", ("Iris","Breast Cancer","Wine Dataset") )
classifier_name = st.sidebar.selectbox("Select Classifier", ("KNN","SVM","Random Forest") )
def get_dataset(dataset_name):
if dataset_name == "Iris":
data = datasets.load_iris()
elif dataset_name == "Breast Cancer":
data = datasets.load_breast_cancer()
else:
data = datasets.load_wine()
X = data.data
y = data.target
return X, y
X, y = get_dataset(dataset_name)
st.write("Shape of datset", X.shape)
st.write("Number of classes", len(np.unique(y)))
def add_parameter_ui(clf_name):
params = dict()
if clf_name =="KNN":
K = st.sidebar.slider("K",1,15)
params["K"] = K
elif clf_name =="SVM":
C = st.sidebar.slider("C", 0.01,10.0)
params["C"] = C
else:
max_depth = st.sidebar.slider("max_depth", 2,15)
n_estimators = st.sidebar.slider("n_estimators",1,100)
params["max_depth"]= max_depth
params["n_estimators"] = n_estimators
return params
params = add_parameter_ui(classifier_name)
def get_classifier(clf_name,params):
if clf_name == "KNN":
clf = KNeighborsClassifier(n_neighbors=params['K'])
elif clf_name == "SVM":
clf = SVC(C= params['C'])
else:
clf = RandomForestClassifier(n_estimators=params["n_estimators"],max_depth=params["max_depth"],random_state=1234)
return clf
clf = get_classifier(classifier_name,params)
The error is:
clf = KNeighborsClassifier(n_neighbors=params['K'])
TypeError: 'NoneType' object is not subscriptable
I know the error is supposed to be self-explanatory but I tried to state clf = None but still get the same error and i'm asking someone to put me in the right direction.
The problem is in your add_parameter_ui function. you are not returning a value in the case of clf_name is KNN or SVM and this causes params in the main code to be None so calling params['K'] is not because 'NoneType' object is not subscriptable.
Here is the fixed code:
def add_parameter_ui(clf_name):
params = dict()
if clf_name =="KNN":
K = st.sidebar.slider("K",1,15)
params["K"] = K
return params
elif clf_name =="SVM":
C = st.sidebar.slider("C", 0.01,10.0)
params["C"] = C
return params
# If Random Forest
max_depth = st.sidebar.slider("max_depth", 2,15)
n_estimators = st.sidebar.slider("n_estimators",1,100)
params["max_depth"]= max_depth
params["n_estimators"] = n_estimators
return params

How can I apply pruning on a BERT model?

I have trained a BERT model using ktrain (TensorFlow wrapper) to recognize emotion on text. It works, but it suffers from really slow inference. That makes my model not suitable for a production environment. I have done some research, and it seems pruning could help.
TensorFlow provides some options for pruning, e.g., tf.contrib.model_pruning. The problem is that it is not a not a widely used technique. What would be a simple enough example that could help me to understand how to use it?
I provide my working code below for reference.
import pandas as pd
import numpy as np
import preprocessor as p
import emoji
import re
import ktrain
from ktrain import text
from unidecode import unidecode
import nltk
# Text preprocessing class
class TextPreprocessing:
def __init__(self):
p.set_options(p.OPT.MENTION, p.OPT.URL)
def _punctuation(self, val):
val = re.sub(r'[^\w\s]', ' ', val)
val = re.sub('_', ' ', val)
return val
def _whitespace(self, val):
return " ".join(val.split())
def _removenumbers(self, val):
val = re.sub('[0-9] + ', '', val)
return val
def _remove_unicode(self, text):
text = unidecode(text).encode("ascii")
text = str(text, "ascii")
return text
def _split_to_sentences(self, body_text):
sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
return sentences
def _clean_text(self, val):
val = val.lower()
val = self._removenumbers(val)
val = p.clean(val)
val = ' '.join(self._punctuation(emoji.demojize(val)).split())
val = self._remove_unicode(val)
val = self._whitespace(val)
return val
def text_preprocessor(self, body_text):
body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])
sentence_split_df = body_text_df.copy()
sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
self._split_to_sentences)
lst_col = "body_text"
sentence_split_df = pd.DataFrame(
{
col: np.repeat(
sentence_split_df[col].values, sentence_split_df[lst_col].str.len(
)
)
for col in sentence_split_df.columns.drop(lst_col)
}
).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
sentence_split_df.columns
]
body_text_df["body_text"] = body_text_df["body_text"].apply(self._clean_text)
final_df = (
pd.concat([sentence_split_df, body_text_df])
.reset_index()
.drop(columns=["index"])
)
return final_df["body_text"]
# Instantiate data preprocessing object
text1 = TextPreprocessing()
# Import data
data_train = pd.read_csv('data_train_v5.csv', encoding='utf8', engine='python')
data_test = pd.read_csv('data_test_v5.csv', encoding='utf8', engine='python')
# Clean the data
data_train['Text'] = data_train['Text'].apply(text1._clean_text)
data_test['Text'] = data_test['Text'].apply(text1._clean_text)
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()
y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()
data = data_train.append(data_test, ignore_index=True)
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']
encoding = {
'joy': 0,
'sadness': 1,
'fear': 2,
'anger': 3,
'neutral': 4
}
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]
trn, val, preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
x_test=X_test, y_test=y_test,
class_names=class_names,
preprocess_mode='distilbert',
maxlen=350)
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
predictor = ktrain.get_predictor(learner.model, preproc)
# Save the model on a file for later use
predictor.save("models/bert_model")
message = "This is a happy message"
# Cleaning - takes 5 ms to run
clean = text1._clean_text(message)
# Prediction - takes 325 ms to run
predictor.predict_proba(clean)
The distilbert model in ktrain is created using Hugging Face transformers, which means you can use that library to prune the model. See this link for more information and the example script. You may need to convert the model to PyTorch before using the script (in addition to making some modifications to the script itself). The approach is based on the paper Are Sixteen Heads Really Better Than One?.

Always getting Attribute Error when using GridSearchCSV with KNN

I am trying to solve a twitter sentiment analysis problem. I am using the code:
print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
def getting_data(train_dataset_name, test_dataset_name):
print()
print("Getting the data")
print()
#Parameter names are self explanatory - file names for datasets
#This assumes you are executing this code statement from inside the directory with your datasets
train = pd.read_csv(train_dataset_name).values
train_y = train[:,1]
train_x = train[:,2]
test = pd.read_csv(test_dataset_name).values
test = test[:,1]
test = np.reshape(test,(test.shape[0],1))
return train_x,train_y,test
def bagOfWords(test,train_x):
print()
print("Creating bag of words model")
print()
#Creates and returns bag-of-words versions of the test and train x
#Train transformations
corpus_train = []
for i in range(0,train_x.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', train_x[i])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_train.append(review)
#Test transformations
corpus_test = []
for i in range(0,test.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', test[i][0])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_test.append(review)
return corpus_train,corpus_test
def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
print()
print("Performing Dimensionality Reduction")
print()
#CountVectorizer
cv = CountVectorizer(max_features = 1500)
train_x = cv.fit_transform(corpus_train).toarray()
#PCA
pca = PCA(n_components=components)
train_x = pca.fit_transform(train_x)
explained_variance = pca.explained_variance_ratio_
test = cv.transform(corpus_test).toarray()
test = pca.transform(test)
test = test.astype('float32')
if (return_ratio):
return train_x,test, explained_variance
else:
return train_x,test
def getOptimumParameters(train_x,train_y, return_stats):
print()
print("Getting optimum parameters")
print("This optimization algorithm may take a while, so please be patient.")
print("Please do not do other tasks while this runs.")
print()
train_x = train_x.astype('float32')
train_y = train_y.astype('float32')
classifier = KNeighborsClassifier()
classifier.fit(train_x,train_y)
#For the sake of my program I used my own parameter lists.
#If you use this code, please change them
neighbor_list = [1,3,6,9,12,15,18,21,25]
algorithm_list = ['brute', 'kd_tree', 'ball_tree']
weights_list = ['uniform', 'distance']
p_list = [1] #p_list = [1,2,3,4]
leaf_list = [10,15,20,25,30,35,40,45,50]
parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
clf = clf.fit(train_x,train_y)
bc = clf.best_score_
bp = clf.best_params_
if return_stats:
return clf, bc, bp
else:
return clf
def predictions(classifier, train_x, train_y, test, ratio):
print()
print("Making predictions")
print()
#Changing types to work with a classifier
train_x= train_x.astype('float32')
train_y = train_y.astype('float32')
#Splitting training set into a training + dev set
train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
#Making predictions
test = test.astype('float32')
pred = classifier.predict(test)
return pred
def convertPredToCsv(pred, csv_name):
df = pd.DataFrame(pred)
df.index.name = 'id'
df.columns = ['label']
df.to_csv("predictions.csv")
def main():
#Retrieving the data
train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
#Constructing Bag of words model
corpus_train,corpus_test = bagOfWords(test,train_x)
#Performing Dimensionality Reduction
train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
#Getting the optimum classifier
classifier= getOptimumParameters(train_x,train_y, False)
#Predicting + converting to csv
pred = predictions(classifier, train_x, train_y, test, 0.1)
convertPredToCsv(pred, 'predictions.csv')
if __name__ == "__main__":
main()
Every time it comes around to the getOptimumParameters function, I get a multitude of errors. Some say AttributeError, but for most of them, I cannot find an error name. I think most of those other errors are meant to direct me to the AttributeError. I cannot figure out why this error is occurring. I know that something is wrong with my GridSearch, but I do not know if something is wrong with the parameters(which I triple checked and cannot find any problems with), or if there is some other problem. Any help is greatly appreciated. Thanks.
D:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj=array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899,
25.22553572, 0. ]]), method='argpartition', *args=(0,), **kwds={'axis': 1, 'kind': 'introselect', 'order': None})
47 return result
48
49
50 def _wrapfunc(obj, method, *args, **kwds):
51 try:
---> 52 return getattr(obj, method)(*args, **kwds)
obj = array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899,
25.22553572, 0. ]])
method = 'argpartition'
args = (0,)
kwds = {'axis': 1, 'kind': 'introselect', 'order': None}
53
54 # An AttributeError occurs if the object does not have
55 # such a method in its class.
56
MemoryError:
The data is from a problem my analyticsvidhya. Here is the link for the download of the training data - it is a dropbox link.
https://www.dropbox.com/s/w4tagiewcuoxgkt/train.csv?dl=0
Here is the test data link:
https://www.dropbox.com/s/qiitwlpnkbs2c3m/test_tweets.csv?dl=0
Thanks.
Have you updated your modules ?
It's bizarre because the following code runs without any error on my macbook:
print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
def getting_data(train_dataset_name, test_dataset_name):
print()
print("Getting the data")
print()
#Parameter names are self explanatory - file names for datasets
#This assumes you are executing this code statement from inside the directory with your datasets
train = pd.read_csv(train_dataset_name).values
train_y = train[:,1]
train_x = train[:,2]
test = pd.read_csv(test_dataset_name).values
test = test[:,1]
test = np.reshape(test,(test.shape[0],1))
return train_x,train_y,test
def bagOfWords(test,train_x):
print()
print("Creating bag of words model")
print()
#Creates and returns bag-of-words versions of the test and train x
#Train transformations
corpus_train = []
for i in range(0,train_x.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', train_x[i])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_train.append(review)
#Test transformations
corpus_test = []
for i in range(0,test.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', test[i][0])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_test.append(review)
return corpus_train,corpus_test
def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
print()
print("Performing Dimensionality Reduction")
print()
#CountVectorizer
cv = CountVectorizer(max_features = 1500)
train_x = cv.fit_transform(corpus_train).toarray()
#PCA
pca = PCA(n_components=components)
train_x = pca.fit_transform(train_x)
explained_variance = pca.explained_variance_ratio_
test = cv.transform(corpus_test).toarray()
test = pca.transform(test)
test = test.astype('float32')
if (return_ratio):
return train_x,test, explained_variance
else:
return train_x,test
def getOptimumParameters(train_x,train_y, return_stats):
print()
print("Getting optimum parameters")
print("This optimization algorithm may take a while, so please be patient.")
print("Please do not do other tasks while this runs.")
print()
train_x = train_x.astype('float32')
train_y = train_y.astype('float32')
classifier = KNeighborsClassifier()
#classifier.fit(train_x,train_y)
#For the sake of my program I used my own parameter lists.
#If you use this code, please change them
neighbor_list = [1]
algorithm_list = ['brute', 'kd_tree', 'ball_tree']
weights_list = ['uniform', 'distance']
p_list = [1] #p_list = [1,2,3,4]
leaf_list = [10]
parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
clf = clf.fit(train_x,train_y)
bc = clf.best_score_
bp = clf.best_params_
if return_stats:
return clf, bc, bp
else:
return clf
def predictions(classifier, train_x, train_y, test, ratio):
print()
print("Making predictions")
print()
#Changing types to work with a classifier
train_x= train_x.astype('float32')
train_y = train_y.astype('float32')
#Splitting training set into a training + dev set
train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
#Making predictions
test = test.astype('float32')
pred = classifier.predict(test)
return pred
def convertPredToCsv(pred, csv_name):
df = pd.DataFrame(pred)
df.index.name = 'id'
df.columns = ['label']
df.to_csv("predictions.csv")
def main():
#Retrieving the data
train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
#Constructing Bag of words model
corpus_train,corpus_test = bagOfWords(test,train_x)
#Performing Dimensionality Reduction
train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
#Getting the optimum classifier
classifier= getOptimumParameters(train_x,train_y, False)
#Predicting + converting to csv
pred = predictions(classifier, train_x, train_y, test, 0.1)
convertPredToCsv(pred, 'predictions.csv')
if __name__ == "__main__":
main()
My versions:
import sklearn
print(sklearn.__version__)
#0.19.1
import nltk
print(nltk.__version__)
#3.3
I know it has been a while, so sorry.
Just wanted to let you guys know that for long Grid Searches, it is NECESSARY, at least for Windows users, to import not
sklearn.model_selection.GridSearchCV
but actually
sklearn.grid_search.GridSearchCV
The former almost always throws a memory error, while the latter works fine even on long Grid Searches.

K-fold cross validation implementation python

I am trying to implement the k-fold cross-validation algorithm in python.
I know SKLearn provides an implementation but still...
This is my code as of right now.
from sklearn import metrics
import numpy as np
class Cross_Validation:
#staticmethod
def partition(vector, fold, k):
size = vector.shape[0]
start = (size/k)*fold
end = (size/k)*(fold+1)
validation = vector[start:end]
if str(type(vector)) == "<class 'scipy.sparse.csr.csr_matrix'>":
indices = range(start, end)
mask = np.ones(vector.shape[0], dtype=bool)
mask[indices] = False
training = vector[mask]
elif str(type(vector)) == "<type 'numpy.ndarray'>":
training = np.concatenate((vector[:start], vector[end:]))
return training, validation
#staticmethod
def Cross_Validation(learner, k, examples, labels):
train_folds_score = []
validation_folds_score = []
for fold in range(0, k):
training_set, validation_set = Cross_Validation.partition(examples, fold, k)
training_labels, validation_labels = Cross_Validation.partition(labels, fold, k)
learner.fit(training_set, training_labels)
training_predicted = learner.predict(training_set)
validation_predicted = learner.predict(validation_set)
train_folds_score.append(metrics.accuracy_score(training_labels, training_predicted))
validation_folds_score.append(metrics.accuracy_score(validation_labels, validation_predicted))
return train_folds_score, validation_folds_score
The learner parameter is a classifier from SKlearn library, k is the number of folds, examples is a sparse matrix produced by the CountVectorizer (again SKlearn) that is the representation of the bag of words.
For example:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from Cross_Validation import Cross_Validation as cv
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform("""textual data""")
clfMNB = MultinomialNB(alpha=.0001)
score = cv.Cross_Validation(clfMNB, 10, data, labels)
print "Train score" + str(score[0])
print "Test score" + str(score[1])
I'm assuming there is some logic error somewhere since the scores are 95% on the training set (as expected) but practically 0 on the test test, but I can't find it.
I hope I was clear.
Thanks in advance.
________________________________EDIT___________________________________
This is the code that loads the text into the vector that can be passed to the vectorizer. It also returns the label vector.
from nltk.tokenize import word_tokenize
from Categories_Data import categories
import numpy as np
import codecs
import glob
import os
import re
class Data_Preprocessor:
def tokenize(self, text):
tokens = word_tokenize(text)
alpha = [t for t in tokens if unicode(t).isalpha()]
return alpha
def header_not_fully_removed(self, text):
if ":" in text.splitlines()[0]:
return len(text.splitlines()[0].split(":")[0].split()) == 1
else:
return False
def strip_newsgroup_header(self, text):
_before, _blankline, after = text.partition('\n\n')
if len(after) > 0 and self.header_not_fully_removed(after):
after = self.strip_newsgroup_header(after)
return after
def strip_newsgroup_quoting(self, text):
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'r'|^In article|^Quoted from|^\||^>)')
good_lines = [line for line in text.split('\n')
if not _QUOTE_RE.search(line)]
return '\n'.join(good_lines)
def strip_newsgroup_footer(self, text):
lines = text.strip().split('\n')
for line_num in range(len(lines) - 1, -1, -1):
line = lines[line_num]
if line.strip().strip('-') == '':
break
if line_num > 0:
return '\n'.join(lines[:line_num])
else:
return text
def raw_to_vector(self, path, to_be_stripped=["header", "footer", "quoting"], noise_threshold=-1):
base_dir = os.getcwd()
train_data = []
label_data = []
for category in categories:
os.chdir(base_dir)
os.chdir(path+"/"+category[0])
for filename in glob.glob("*"):
with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as target:
data = target.read()
if "quoting" in to_be_stripped:
data = self.strip_newsgroup_quoting(data)
if "header" in to_be_stripped:
data = self.strip_newsgroup_header(data)
if "footer" in to_be_stripped:
data = self.strip_newsgroup_footer(data)
if len(data) > noise_threshold:
train_data.append(data)
label_data.append(category[1])
os.chdir(base_dir)
return np.array(train_data), np.array(label_data)
This is what "from Categories_Data import categories" imports...
categories = [
('alt.atheism',0),
('comp.graphics',1),
('comp.os.ms-windows.misc',2),
('comp.sys.ibm.pc.hardware',3),
('comp.sys.mac.hardware',4),
('comp.windows.x',5),
('misc.forsale',6),
('rec.autos',7),
('rec.motorcycles',8),
('rec.sport.baseball',9),
('rec.sport.hockey',10),
('sci.crypt',11),
('sci.electronics',12),
('sci.med',13),
('sci.space',14),
('soc.religion.christian',15),
('talk.politics.guns',16),
('talk.politics.mideast',17),
('talk.politics.misc',18),
('talk.religion.misc',19)
]
The reason why your validation score is low is subtle.
The issue is how you have partitioned the dataset. Remember, when doing cross-validation you should randomly split the dataset. It is the randomness that you are missing.
Your data is loaded category by category, which means in your input dataset, class labels and examples follow one after the other. By not doing the random split, you have completely removed a class which your model never sees during the training phase and hence you get a bad result on your test/validation phase.
You can solve this by doing a random shuffle. So, do this:
from sklearn.utils import shuffle
processor = Data_Preprocessor()
td, tl = processor.raw_to_vector(path="C:/Users/Pankaj/Downloads/ng/")
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform(td)
# Shuffle the data and labels
data, tl = shuffle(data, tl, random_state=0)
clfMNB = MultinomialNB(alpha=.0001)
score = Cross_Validation.Cross_Validation(clfMNB, 10, data, tl)
print("Train score" + str(score[0]))
print("Test score" + str(score[1]))

Categories

Resources