Always getting AttributeError when using GridSearchCV with KNN - python
I am trying to solve a Twitter sentiment analysis problem, using the following code:
#IMPORTS (the __future__ import must be the first statement in the file)
from __future__ import print_function

print()
print("Importing")
print()

import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV

def getting_data(train_dataset_name, test_dataset_name):
    print()
    print("Getting the data")
    print()
    #Parameter names are self explanatory - file names for datasets
    #This assumes you are executing this code from inside the directory with your datasets
    train = pd.read_csv(train_dataset_name).values
    train_y = train[:,1]
    train_x = train[:,2]
    test = pd.read_csv(test_dataset_name).values
    test = test[:,1]
    test = np.reshape(test,(test.shape[0],1))
    return train_x,train_y,test

def bagOfWords(test,train_x):
    print()
    print("Creating bag of words model")
    print()
    #Creates and returns bag-of-words versions of the test and train x
    #Train transformations
    corpus_train = []
    for i in range(0,train_x.shape[0]):
        review = re.sub('[^a-zA-Z]', ' ', train_x[i])
        review = review.lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus_train.append(review)
    #Test transformations
    corpus_test = []
    for i in range(0,test.shape[0]):
        review = re.sub('[^a-zA-Z]', ' ', test[i][0])
        review = review.lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus_test.append(review)
    return corpus_train,corpus_test

def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
    print()
    print("Performing Dimensionality Reduction")
    print()
    #CountVectorizer
    cv = CountVectorizer(max_features = 1500)
    train_x = cv.fit_transform(corpus_train).toarray()
    #PCA
    pca = PCA(n_components=components)
    train_x = pca.fit_transform(train_x)
    explained_variance = pca.explained_variance_ratio_
    test = cv.transform(corpus_test).toarray()
    test = pca.transform(test)
    test = test.astype('float32')
    if (return_ratio):
        return train_x,test, explained_variance
    else:
        return train_x,test

def getOptimumParameters(train_x,train_y, return_stats):
    print()
    print("Getting optimum parameters")
    print("This optimization algorithm may take a while, so please be patient.")
    print("Please do not do other tasks while this runs.")
    print()
    train_x = train_x.astype('float32')
    train_y = train_y.astype('float32')
    classifier = KNeighborsClassifier()
    classifier.fit(train_x,train_y)
    #For the sake of my program I used my own parameter lists.
    #If you use this code, please change them
    neighbor_list = [1,3,6,9,12,15,18,21,25]
    algorithm_list = ['brute', 'kd_tree', 'ball_tree']
    weights_list = ['uniform', 'distance']
    p_list = [1] #p_list = [1,2,3,4]
    leaf_list = [10,15,20,25,30,35,40,45,50]
    parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
    clf = GridSearchCV(estimator=classifier, param_grid=parameters, cv=5, refit=True, error_score=0, n_jobs=-1)
    clf = clf.fit(train_x,train_y)
    bc = clf.best_score_
    bp = clf.best_params_
    if return_stats:
        return clf, bc, bp
    else:
        return clf

def predictions(classifier, train_x, train_y, test, ratio):
    print()
    print("Making predictions")
    print()
    #Changing types to work with a classifier
    train_x = train_x.astype('float32')
    train_y = train_y.astype('float32')
    #Splitting training set into a training + dev set
    train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
    #Making predictions
    test = test.astype('float32')
    pred = classifier.predict(test)
    return pred

def convertPredToCsv(pred, csv_name):
    df = pd.DataFrame(pred)
    df.index.name = 'id'
    df.columns = ['label']
    df.to_csv(csv_name)

def main():
    #Retrieving the data
    train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
    #Constructing Bag of words model
    corpus_train,corpus_test = bagOfWords(test,train_x)
    #Performing Dimensionality Reduction
    train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
    #Getting the optimum classifier
    classifier = getOptimumParameters(train_x,train_y, False)
    #Predicting + converting to csv
    pred = predictions(classifier, train_x, train_y, test, 0.1)
    convertPredToCsv(pred, 'predictions.csv')

if __name__ == "__main__":
    main()
Every time it reaches the getOptimumParameters function, I get a multitude of errors. Some say AttributeError, but for most of them I cannot find an error name; I think those other errors are meant to direct me to the AttributeError. I cannot figure out why this error is occurring. I know that something is wrong with my grid search, but I do not know whether the problem is with the parameters (which I triple-checked and cannot find any problems with) or something else. Any help is greatly appreciated. Thanks.
D:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj=array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899, 25.22553572, 0. ]]), method='argpartition', *args=(0,), **kwds={'axis': 1, 'kind': 'introselect', 'order': None})
     47         return result
     48
     49
     50 def _wrapfunc(obj, method, *args, **kwds):
     51     try:
---> 52         return getattr(obj, method)(*args, **kwds)
        obj = array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899, 25.22553572, 0. ]])
        method = 'argpartition'
        args = (0,)
        kwds = {'axis': 1, 'kind': 'introselect', 'order': None}
     53
     54     # An AttributeError occurs if the object does not have
     55     # such a method in its class.
     56
MemoryError:
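For scale, an illustration of why the traceback ends in MemoryError rather than a real AttributeError (this note and the numbers in it are added for context, not from the original post; the fold sizes below assume a training set on the order of 32k tweets, split 5 ways by cv=5):

# With algorithm='brute', kNN materializes a dense m x n float64 matrix of
# pairwise distances between the m points being predicted and the n training
# points, and argpartition (the call in the traceback) then runs over it.
n_train = 25000   # assumed: ~4/5 of the training set used for fitting each fold
m_test = 6000     # assumed: the held-out fold being scored
print(m_test * n_train * 8 / 1e9, "GB")  # ~1.2 GB for one distance matrix
# With n_jobs=-1, each parallel worker holds its own matrix at the same time,
# multiplying the memory footprint by the number of cores.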
The data is from a practice problem on Analytics Vidhya. Here is the link for the download of the training data - it is a Dropbox link.
https://www.dropbox.com/s/w4tagiewcuoxgkt/train.csv?dl=0
Here is the test data link:
https://www.dropbox.com/s/qiitwlpnkbs2c3m/test_tweets.csv?dl=0
Thanks.
Have you updated your modules?
It's bizarre, because the following code runs without any error on my MacBook. It is your script unchanged except inside getOptimumParameters: the redundant classifier.fit call is commented out (GridSearchCV fits the estimator itself), and the neighbor and leaf-size grids are reduced to a single value each.
def getOptimumParameters(train_x,train_y, return_stats):
    print()
    print("Getting optimum parameters")
    print("This optimization algorithm may take a while, so please be patient.")
    print("Please do not do other tasks while this runs.")
    print()
    train_x = train_x.astype('float32')
    train_y = train_y.astype('float32')
    classifier = KNeighborsClassifier()
    #classifier.fit(train_x,train_y)
    #For the sake of my program I used my own parameter lists.
    #If you use this code, please change them
    neighbor_list = [1]
    algorithm_list = ['brute', 'kd_tree', 'ball_tree']
    weights_list = ['uniform', 'distance']
    p_list = [1] #p_list = [1,2,3,4]
    leaf_list = [10]
    parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
    clf = GridSearchCV(estimator=classifier, param_grid=parameters, cv=5, refit=True, error_score=0, n_jobs=-1)
    clf = clf.fit(train_x,train_y)
    bc = clf.best_score_
    bp = clf.best_params_
    if return_stats:
        return clf, bc, bp
    else:
        return clf
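For scale (arithmetic added here, not part of the original answer): the original grids define 9 × 3 × 2 × 1 × 9 = 486 parameter combinations, which at cv=5 means 2,430 model fits, whereas the reduced grids above define only 1 × 3 × 2 × 1 × 1 = 6 combinations, i.e. 30 fits.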
My versions:
import sklearn
print(sklearn.__version__)
#0.19.1
import nltk
print(nltk.__version__)
#3.3
I know it has been a while, so sorry. Just wanted to let you all know that for long grid searches, it is NECESSARY, at least for Windows users, to import not sklearn.model_selection.GridSearchCV but sklearn.grid_search.GridSearchCV. The former almost always throws a memory error, while the latter works fine even on long grid searches.
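For readers comparing the two, a minimal sketch of the import paths this refers to (as a packaging note: sklearn.grid_search is the legacy pre-0.18 module; it was deprecated in 0.18 and removed in 0.20, so the legacy import only exists on older installs such as the 0.19.1 shown above):

# Current location of GridSearchCV (scikit-learn >= 0.18):
from sklearn.model_selection import GridSearchCV

# Legacy location (deprecated in 0.18, removed in 0.20):
# from sklearn.grid_search import GridSearchCV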
Related
GridSearchCV with LightFM
I'm a newbie, so my apologies if something I ask might be too obvious, and my English is not quite good. I'm stuck doing a custom grid search with cross-validation for LightFM, which does not come with those functions. It seems the way I split the dataset is wrong, but I do not understand why, since I've replicated the code of the function random_train_test_split to get the folds. The error I get is Incorrect number of features in item_features. I'm stuck and I do not know how to go on.

import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse

def create_processed_dataset():
    """
    One-Time execution
    Returns: embeddings.csv and observations.csv
    """
    output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
    os.makedirs(output_path, exist_ok=True)
    """
    Data imports
    """
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
    vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
    # Load mappings and filter them if a corresponding embedding is found
    mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None,
                           names=["movie_id", "movie_name", "movie_uri"])
    mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
    mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
    # Create a pandas dataframe with embeddings
    embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
    embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
    embeddings.set_index("movie_id", inplace=True)
    ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
    embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
    ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)

def generate_list_of_hyper_parameters(parameters_grid):
    return (
        {y: z for y, z in zip(parameters_grid.keys(), x)}
        for x in itertools.product(*parameters_grid.values())
    )

def create_csr_from_dataset(observations, embeddings):
    dataset = Dataset(item_identity_features=True, user_identity_features=False)
    feature_names = [str(i) for i in range(0, 200)]
    dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))
    num_items, num_fts = dataset.item_features_shape()
    print(f'Num items: {num_items}, num_features: {num_fts}.')
    interactions, weights = dataset.build_interactions(
        observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
    )
    item_features = []
    for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
        for x, y in row.items():
            item_features.append((item_id, {x: y}))
    item_features = dataset.build_item_features(item_features)
    return interactions, item_features

def folding(interactions, k_folds=10):
    if not scipy.sparse.issparse(interactions):
        return None
    coo = interactions.tocoo()
    kf = KFold(n_splits=k_folds)  # Define the split - into 2 folds
    shape = interactions.shape
    uids, iids, data = (coo.row, coo.col, coo.data)

    def to_coo_matrix(indexes):
        return scipy.sparse.coo_matrix(
            (data[indexes], (uids[indexes], iids[indexes])),
            shape=shape,
            dtype=coo.dtype,
        )

    return [
        (to_coo_matrix(train_index), to_coo_matrix(validation_index))
        for train_index, validation_index in kf.split(data)
    ]

def grid_search(parameters_grid, k_fold, interactions, item_features=None):
    results = []
    for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
        for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
            print(f"{hyper_params} && current_fold:{current_fold}")
            model = LightFM(**hyper_params)
            model.fit(train, epochs=50, item_features=item_features, num_threads=6)
            score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
            results.append((score, hyper_params, model))
            print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
    results.sort(key=lambda x: x[0])
    return results

def main():
    observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
    embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
    interactions, item_features = create_csr_from_dataset(observations, embeddings)
    train, test = random_train_test_split(interactions, test_percentage=0.2)
    print(embeddings.head())
    num_movies = len(embeddings.index)
    num_ratings = len(observations.index)
    num_users = observations.user_id.unique().size
    sparsity = 1 - num_ratings / (num_users * num_movies)
    print(
        f"num_users: {num_users}, num_movies: {num_movies}, "
        f"num_observations: {num_ratings}, "
        f"sparsity: ~{sparsity * 100}"
    )
    model = LightFM()
    # parameters to test
    param_grid = {
        'no_components': range(10, 110, 10),
        'learning_rate': [0.01, 0.05, 0.1],
        'item_alpha': [0.0001, 0.001, 0.01],
        'user_alpha': [0.0001, 0.001, 0.01],
    }
    results = grid_search(param_grid, 10, train, item_features=item_features)
    print(results[0][0])
    # grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
    # grid.fit(train)
    #
    # # print the best parameters
    # print("Best parameters found: ", grid.best_params_)

if __name__ == "__main__":
    main()

Head of embeddings.csv (the header row lists movie_id plus embedding columns 0-199; the 200 floats per row are abridged here):

movie_id,0,1,2,3,...,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,...,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,...,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,...,-0.34246898

Head of observations.csv:

user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806
NoneType error in Python when running Streamlit
Hi, I am trying to create an app in Python that will allow users to choose which classification model they want to apply to one of three open-source datasets in the scikit-learn library. The code is the following:

import streamlit as st
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

st.title("Streamlit example")
st.write("""
# Explore different classifier
which one is the best?
""")

dataset_name = st.sidebar.selectbox("Select Dataset", ("Iris", "Breast Cancer", "Wine Dataset"))
classifier_name = st.sidebar.selectbox("Select Classifier", ("KNN", "SVM", "Random Forest"))

def get_dataset(dataset_name):
    if dataset_name == "Iris":
        data = datasets.load_iris()
    elif dataset_name == "Breast Cancer":
        data = datasets.load_breast_cancer()
    else:
        data = datasets.load_wine()
    X = data.data
    y = data.target
    return X, y

X, y = get_dataset(dataset_name)
st.write("Shape of dataset", X.shape)
st.write("Number of classes", len(np.unique(y)))

def add_parameter_ui(clf_name):
    params = dict()
    if clf_name == "KNN":
        K = st.sidebar.slider("K", 1, 15)
        params["K"] = K
    elif clf_name == "SVM":
        C = st.sidebar.slider("C", 0.01, 10.0)
        params["C"] = C
    else:
        max_depth = st.sidebar.slider("max_depth", 2, 15)
        n_estimators = st.sidebar.slider("n_estimators", 1, 100)
        params["max_depth"] = max_depth
        params["n_estimators"] = n_estimators
        return params

params = add_parameter_ui(classifier_name)

def get_classifier(clf_name, params):
    if clf_name == "KNN":
        clf = KNeighborsClassifier(n_neighbors=params['K'])
    elif clf_name == "SVM":
        clf = SVC(C=params['C'])
    else:
        clf = RandomForestClassifier(n_estimators=params["n_estimators"],
                                     max_depth=params["max_depth"], random_state=1234)
    return clf

clf = get_classifier(classifier_name, params)

The error is:

clf = KNeighborsClassifier(n_neighbors=params['K'])
TypeError: 'NoneType' object is not subscriptable

I know the error is supposed to be self-explanatory, but I tried setting clf = None and still get the same error, so I'm asking someone to point me in the right direction.
The problem is in your add_parameter_ui function: you are not returning a value when clf_name is KNN or SVM, which causes params in the main code to be None, so calling params['K'] fails because 'NoneType' object is not subscriptable. Here is the fixed code:

def add_parameter_ui(clf_name):
    params = dict()
    if clf_name == "KNN":
        K = st.sidebar.slider("K", 1, 15)
        params["K"] = K
        return params
    elif clf_name == "SVM":
        C = st.sidebar.slider("C", 0.01, 10.0)
        params["C"] = C
        return params
    # If Random Forest
    max_depth = st.sidebar.slider("max_depth", 2, 15)
    n_estimators = st.sidebar.slider("n_estimators", 1, 100)
    params["max_depth"] = max_depth
    params["n_estimators"] = n_estimators
    return params
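An equivalent fix (a sketch in the same spirit, not from the original answer) keeps a single return at the end, so that no branch can fall through and produce None:

def add_parameter_ui(clf_name):
    params = dict()
    if clf_name == "KNN":
        params["K"] = st.sidebar.slider("K", 1, 15)
    elif clf_name == "SVM":
        params["C"] = st.sidebar.slider("C", 0.01, 10.0)
    else:  # Random Forest
        params["max_depth"] = st.sidebar.slider("max_depth", 2, 15)
        params["n_estimators"] = st.sidebar.slider("n_estimators", 1, 100)
    return params  # single exit point: every branch returns the dict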
Why is target encoder encoding some values as NaN?
I am using a target encoder from category_encoders to encode a feature. Here is the code I'm using:

from category_encoders import TargetEncoder

def encode_large_features(features, X_train, X_test, y_train):
    print('target encoding features ...')
    for _ in features:
        target_encoder = TargetEncoder(_)
        target_encoder.fit(X_train[_], y_train)
        name = _ + '_encoded'
        X_train[name] = target_encoder.transform(X_train[_])
        X_train.drop([_], axis=1, inplace=True)
        X_test[name] = target_encoder.transform(X_test[_])
        X_test.drop([_], axis=1, inplace=True)
    return X_train, X_test

The target encoder encodes some values as NaN and I don't know why. Here is an example:
Faced the same issue: raised an issue in the repo. Found a workaround by building a custom KFold target encoder, which is better than the library version. A KFold target encoder is less susceptible to data leakage / fewer chances of overfitting, and it will not return NaN in the training dataset like the category_encoders library does. Example below: chid is a categorical column; apply KFoldTargetEncoderTrain on it.

Libraries required:

import numpy as np   # used by the classes below
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn import base

Training dataset:

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):

    def __init__(self, colnames, targetName, n_fold=5, verbosity=True, discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits=self.n_fold, shuffle=False)  # random_state only applies when shuffle=True
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(
                X_tr.groupby(self.colnames)[self.targetName].mean())
        X[col_mean_name].fillna(mean_of_target, inplace=True)
        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(
                col_mean_name, self.targetName,
                np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X

Fit_transform on training data:

targetc_chid = KFoldTargetEncoderTrain('chid', 'target', n_fold=5)
train_df = targetc_chid.fit_transform(train_df)

Test dataset:

class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):

    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        mean = self.train[[self.colNames, self.encodedName]].groupby(
            self.colNames).mean().reset_index()
        dd = {}
        for row in tqdm(mean.itertuples(index=False)):
            dd[row[0]] = row[1]
        X[self.encodedName] = X[self.colNames]
        X[self.encodedName] = X[self.encodedName].map(dd.get)
        return X

Fit on test data:

test_targetc_chid = KFoldTargetEncoderTest(train_df, 'chid', 'chid_Kfold_Target_Enc')
valid_df = test_targetc_chid.fit_transform(valid_df)
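A quick smoke test of the train-side encoder (toy data invented for illustration, with chid and target named as in the answer):

import pandas as pd

toy = pd.DataFrame({'chid':   ['a', 'a', 'b', 'b', 'c', 'c'],
                    'target': [1,   0,   1,   1,   0,   0]})
enc = KFoldTargetEncoderTrain('chid', 'target', n_fold=2, verbosity=False)
toy = enc.fit_transform(toy)
print(toy)  # gains a 'chid_Kfold_Target_Enc' column of out-of-fold target means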
Save the output of xgb.train of XGBoost as a log file with Python logging
I tried to save the output of XGBoost's xgb.train as a log file with logging, but I could not record the output. How can I record it? I tried to refer to an existing Stack Overflow question, but it did not help. I would like you to show it with a concrete sample.

import sys
import logging
# ---------------------------------------------- #
# Some logging settings
# ---------------------------------------------- #
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_digits

rng = np.random.RandomState(31337)
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X):
    param = {'max_depth':2, 'eta':0.3, 'silent':1, 'objective':'binary:logistic'}
    dtrain = xgb.DMatrix(X[train_index], y[train_index])
    dtest = xgb.DMatrix(X[test_index], y[test_index])
    # specify validations set to watch performance
    watchlist = [(dtest,'eval'), (dtrain,'train')]
    num_round = 2
    bst = xgb.train(param, dtrain, num_round, watchlist)

# I want to record this output.
# Zeros and Ones from the Digits dataset: binary classification
# [0] eval-error:0.011111 train-error:0.011111
# [1] eval-error:0.011111 train-error:0.005556
# [0] eval-error:0.016667 train-error:0.005556
# [1] eval-error:0.005556 train-error:0
xgboost prints its log to standard output directly, and you cannot change that behaviour. But the callbacks parameter of xgb.train can record the results at the same time as the internal prints. The following code is a sample that uses a callback to record the xgboost log with a logger. log_evaluation() returns a callback function called from inside xgboost, and you can add that callback function to callbacks:

from logging import getLogger, basicConfig, INFO

import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold

# Some logging settings
basicConfig(level=INFO)
logger = getLogger(__name__)


def log_evaluation(period=1, show_stdv=True):
    """Create a callback that logs evaluation result with logger.

    Parameters
    ----------
    period : int
        The period to log the evaluation results
    show_stdv : bool, optional
        Whether show stdv if provided

    Returns
    -------
    callback : function
        A callback that logs evaluation every period iterations into logger.
    """

    def _fmt_metric(value, show_stdv=True):
        """format metric string"""
        if len(value) == 2:
            return '%s:%g' % (value[0], value[1])
        elif len(value) == 3:
            if show_stdv:
                return '%s:%g+%g' % (value[0], value[1], value[2])
            else:
                return '%s:%g' % (value[0], value[1])
        else:
            raise ValueError("wrong metric value")

    def callback(env):
        if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False:
            return
        i = env.iteration
        if i % period == 0 or i + 1 == env.begin_iteration or i + 1 == env.end_iteration:
            msg = '\t'.join([_fmt_metric(x, show_stdv) for x in env.evaluation_result_list])
            logger.info('[%d]\t%s\n' % (i, msg))

    return callback


rng = np.random.RandomState(31337)
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X):
    param = {'max_depth': 2, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
    dtrain = xgb.DMatrix(X[train_index], y[train_index])
    dtest = xgb.DMatrix(X[test_index], y[test_index])
    # specify validations set to watch performance
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 2
    # add logger
    callbacks = [log_evaluation(1, True)]
    bst = xgb.train(param, dtrain, num_round, watchlist, callbacks=callbacks)
The accepted solution does not work with xgboost version 1.3 and above (tested on 1.6.1), because in XGBoost 1.3 a new callback interface was designed for the Python package (source: https://xgboost.readthedocs.io/en/latest/python/callbacks.html). You can achieve Python logging for xgboost.train by defining a custom logging callback and passing it as an argument to xgb.train, as shown below:

import logging

logger = logging.getLogger(__name__)

import xgboost


class XGBLogging(xgboost.callback.TrainingCallback):
    """log train logs to file"""

    def __init__(self, epoch_log_interval=100):
        self.epoch_log_interval = epoch_log_interval

    def after_iteration(self, model, epoch, evals_log):
        if epoch % self.epoch_log_interval == 0:
            for data, metric in evals_log.items():
                metrics = list(metric.keys())
                metrics_str = ""
                for m_key in metrics:
                    metrics_str = metrics_str + f"{m_key}: {metric[m_key][-1]}"
                logger.info(f"Epoch: {epoch}, {data}: {metrics_str}")
        # False to indicate training should not stop.
        return False


model = xgboost.train(
    xgboost_parms,
    dtrain=dtrain,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    callbacks=[XGBLogging(epoch_log_interval=100)]
)
import sys
%logstart -o "test.log"   # IPython magic: this line works only inside an IPython/Jupyter session
sys.stdout = open('test.log', 'a')

import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_digits

rng = np.random.RandomState(31337)
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X):
    param = {'max_depth':2, 'eta':0.3, 'silent':1, 'objective':'binary:logistic'}
    dtrain = xgb.DMatrix(X[train_index], y[train_index])
    dtest = xgb.DMatrix(X[test_index], y[test_index])
    # specify validations set to watch performance
    watchlist = [(dtest,'eval'), (dtrain,'train')]
    num_round = 2
    bst = xgb.train(param, dtrain, num_round, watchlist)

This will start saving everything to the file test.log: the output as well as the input.
K-fold cross-validation implementation in Python
I am trying to implement the k-fold cross-validation algorithm in Python. I know SKLearn provides an implementation, but still... This is my code as of right now.

from sklearn import metrics
import numpy as np

class Cross_Validation:

    @staticmethod
    def partition(vector, fold, k):
        size = vector.shape[0]
        start = (size/k)*fold
        end = (size/k)*(fold+1)
        validation = vector[start:end]
        if str(type(vector)) == "<class 'scipy.sparse.csr.csr_matrix'>":
            indices = range(start, end)
            mask = np.ones(vector.shape[0], dtype=bool)
            mask[indices] = False
            training = vector[mask]
        elif str(type(vector)) == "<type 'numpy.ndarray'>":
            training = np.concatenate((vector[:start], vector[end:]))
        return training, validation

    @staticmethod
    def Cross_Validation(learner, k, examples, labels):
        train_folds_score = []
        validation_folds_score = []
        for fold in range(0, k):
            training_set, validation_set = Cross_Validation.partition(examples, fold, k)
            training_labels, validation_labels = Cross_Validation.partition(labels, fold, k)
            learner.fit(training_set, training_labels)
            training_predicted = learner.predict(training_set)
            validation_predicted = learner.predict(validation_set)
            train_folds_score.append(metrics.accuracy_score(training_labels, training_predicted))
            validation_folds_score.append(metrics.accuracy_score(validation_labels, validation_predicted))
        return train_folds_score, validation_folds_score

The learner parameter is a classifier from the SKlearn library, k is the number of folds, and examples is a sparse matrix produced by the CountVectorizer (again SKlearn) that is the representation of the bag of words. For example:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from Cross_Validation import Cross_Validation as cv

vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform("""textual data""")
clfMNB = MultinomialNB(alpha=.0001)
score = cv.Cross_Validation(clfMNB, 10, data, labels)
print "Train score" + str(score[0])
print "Test score" + str(score[1])

I'm assuming there is a logic error somewhere, since the scores are 95% on the training set (as expected) but practically 0 on the test set, but I can't find it. I hope I was clear. Thanks in advance.

EDIT

This is the code that loads the text into the vector that can be passed to the vectorizer. It also returns the label vector.

from nltk.tokenize import word_tokenize
from Categories_Data import categories
import numpy as np
import codecs
import glob
import os
import re

class Data_Preprocessor:

    def tokenize(self, text):
        tokens = word_tokenize(text)
        alpha = [t for t in tokens if unicode(t).isalpha()]
        return alpha

    def header_not_fully_removed(self, text):
        if ":" in text.splitlines()[0]:
            return len(text.splitlines()[0].split(":")[0].split()) == 1
        else:
            return False

    def strip_newsgroup_header(self, text):
        _before, _blankline, after = text.partition('\n\n')
        if len(after) > 0 and self.header_not_fully_removed(after):
            after = self.strip_newsgroup_header(after)
        return after

    def strip_newsgroup_quoting(self, text):
        _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'r'|^In article|^Quoted from|^\||^>)')
        good_lines = [line for line in text.split('\n') if not _QUOTE_RE.search(line)]
        return '\n'.join(good_lines)

    def strip_newsgroup_footer(self, text):
        lines = text.strip().split('\n')
        for line_num in range(len(lines) - 1, -1, -1):
            line = lines[line_num]
            if line.strip().strip('-') == '':
                break
        if line_num > 0:
            return '\n'.join(lines[:line_num])
        else:
            return text

    def raw_to_vector(self, path, to_be_stripped=["header", "footer", "quoting"], noise_threshold=-1):
        base_dir = os.getcwd()
        train_data = []
        label_data = []
        for category in categories:
            os.chdir(base_dir)
            os.chdir(path+"/"+category[0])
            for filename in glob.glob("*"):
                with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as target:
                    data = target.read()
                    if "quoting" in to_be_stripped:
                        data = self.strip_newsgroup_quoting(data)
                    if "header" in to_be_stripped:
                        data = self.strip_newsgroup_header(data)
                    if "footer" in to_be_stripped:
                        data = self.strip_newsgroup_footer(data)
                    if len(data) > noise_threshold:
                        train_data.append(data)
                        label_data.append(category[1])
        os.chdir(base_dir)
        return np.array(train_data), np.array(label_data)

This is what "from Categories_Data import categories" imports...

categories = [
    ('alt.atheism', 0),
    ('comp.graphics', 1),
    ('comp.os.ms-windows.misc', 2),
    ('comp.sys.ibm.pc.hardware', 3),
    ('comp.sys.mac.hardware', 4),
    ('comp.windows.x', 5),
    ('misc.forsale', 6),
    ('rec.autos', 7),
    ('rec.motorcycles', 8),
    ('rec.sport.baseball', 9),
    ('rec.sport.hockey', 10),
    ('sci.crypt', 11),
    ('sci.electronics', 12),
    ('sci.med', 13),
    ('sci.space', 14),
    ('soc.religion.christian', 15),
    ('talk.politics.guns', 16),
    ('talk.politics.mideast', 17),
    ('talk.politics.misc', 18),
    ('talk.religion.misc', 19)
]
The reason why your validation score is low is subtle. The issue is how you have partitioned the dataset. Remember, when doing cross-validation you should randomly split the dataset; it is the randomness that you are missing. Your data is loaded category by category, which means that in your input dataset class labels and examples follow one after the other, so each validation fold consists of whole classes that the model never sees during the training phase, and hence you get a bad result on your test/validation phase. You can solve this by doing a random shuffle. So, do this:

from sklearn.utils import shuffle

processor = Data_Preprocessor()
td, tl = processor.raw_to_vector(path="C:/Users/Pankaj/Downloads/ng/")
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform(td)
# Shuffle the data and labels
data, tl = shuffle(data, tl, random_state=0)
clfMNB = MultinomialNB(alpha=.0001)
score = Cross_Validation.Cross_Validation(clfMNB, 10, data, tl)
print("Train score" + str(score[0]))
print("Test score" + str(score[1]))
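For comparison (a sketch using scikit-learn's built-in utilities, not part of the original answer), the same shuffled 10-fold evaluation can be obtained in one call, since cross_val_score accepts the sparse matrix directly:

from sklearn.model_selection import KFold, cross_val_score

cv_splitter = KFold(n_splits=10, shuffle=True, random_state=0)
validation_scores = cross_val_score(clfMNB, data, tl, cv=cv_splitter)
print("Test scores: " + str(validation_scores))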