I'm a newbie so my apologies if something I ask might be to obvious and my english is not quite good. I'm stuck in doing a custom grid search with cross validation with LightFM which does not come with those functions. It seem the way I split the dataset is wrong but I do not understand why since I've replicated the code of the function random_train_test_split to get the folds. The error I get is Incorrect number of features in item_features.
I'm stuck and I do not know how to go on.
import pandas as pd
import scipy.ndimage.tests
import turicreate as tc
from gensim.models import KeyedVectors
import os
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from lightfm.cross_validation import random_train_test_split
import itertools
import scipy.sparse
def create_processed_dataset():
"""
One-Time execution
Returns:
embeddings.csv and observations.csv
"""
output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'ml-100k-filtered')
os.makedirs(output_path, exist_ok=True)
"""
Data imports
"""
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
vectors: KeyedVectors = KeyedVectors.load('data/dbpedia/model.kv')
# Load mappings and filter them if a corresponding embedding is found
mappings = pd.read_csv('data/LODrecsys/mappings.tsv', sep='\t', header=None, names=["movie_id", "movie_name", "movie_uri"])
mappings = mappings[mappings.apply(lambda x: vectors.__contains__(x["movie_uri"]), axis=1)]
mappings = mappings[mappings["movie_id"].isin(ratings["movie_id"])]
# Create a pandas dataframe with embeddings
embeddings = pd.DataFrame([vectors[uri] for uri in mappings["movie_uri"]])
embeddings.insert(loc=0, column='movie_id', value=list(mappings["movie_id"]))
embeddings.set_index("movie_id", inplace=True)
ratings = ratings[ratings["movie_id"].isin(mappings["movie_id"])]
embeddings.to_csv(os.path.join(output_path, 'embeddings.csv'))
ratings.to_csv(os.path.join(output_path, 'observations.csv'), index=False)
def generate_list_of_hyper_parameters(parameters_grid):
return (
{y: z for y, z in zip(parameters_grid.keys(), x)}
for x in itertools.product(*parameters_grid.values())
)
def create_csr_from_dataset(observations, embeddings):
dataset = Dataset(item_identity_features=True, user_identity_features=False)
feature_names = [str(i) for i in range(0, 200)]
dataset.fit(observations['user_id'], observations['movie_id'], item_features=feature_names)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
num_items, num_fts = dataset.item_features_shape()
print(f'Num items: {num_items}, num_features: {num_fts}.')
interactions, weights = dataset.build_interactions(
observations[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
)
item_features = []
for item_id, row in zip(embeddings.index.to_list(), embeddings.to_dict(orient="records")):
for x, y in row.items():
item_features.append((item_id, {x: y}))
item_features = dataset.build_item_features(item_features)
return interactions, item_features
def folding(interactions, k_folds=10):
if not scipy.sparse.issparse(interactions):
return None
coo = interactions.tocoo()
kf = KFold(n_splits=k_folds) # Define the split - into 2 folds
shape = interactions.shape
uids, iids, data = (coo.row, coo.col, coo.data)
def to_coo_matrix(indexes):
return scipy.sparse.coo_matrix(
(data[indexes], (uids[indexes], iids[indexes])),
shape=shape,
dtype=coo.dtype,
)
return [
(to_coo_matrix(train_index), to_coo_matrix(validation_index))
for train_index, validation_index in kf.split(data)
]
def grid_search(parameters_grid, k_fold, interactions, item_features=None):
results = []
for hyper_params in generate_list_of_hyper_parameters(parameters_grid):
for current_fold, (train, validation) in enumerate(folding(interactions, k_folds=10)):
print(f"{hyper_params} && current_fold:{current_fold}")
model = LightFM(**hyper_params)
model.fit(train, epochs=50, item_features=item_features, num_threads=6)
score = auc_score(model, validation, train_interactions=train, num_threads=6).mean()
results.append((score, hyper_params, model))
print(f"{hyper_params} && current_fold:{current_fold} && score: {score}")
results.sort(key=lambda x: x[0])
return results
def main():
observations = pd.read_csv('data/ml-100k-filtered/observations.csv')
embeddings = pd.read_csv('data/ml-100k-filtered/embeddings.csv').set_index("movie_id")
interactions, item_features = create_csr_from_dataset(observations, embeddings)
train, test = random_train_test_split(interactions, test_percentage=0.2)
print(embeddings.head())
num_movies = len(embeddings.index)
num_ratings = len(observations.index)
num_users = observations.user_id.unique().size
sparsity = 1 - num_ratings / (num_users * num_movies)
print(
f"num_users: {num_users}, num_movies: {num_movies}, "
f"num_observations: {num_ratings}, "
f"sparsity: ~{sparsity * 100}"
)
model = LightFM()
# parametri da testare
param_grid = {
'no_components': range(10, 110, 10),
'learning_rate': [0.01, 0.05, 0.1],
'item_alpha': [0.0001, 0.001, 0.01],
'user_alpha': [0.0001, 0.001, 0.01],
}
results = grid_search(param_grid, 10, train, item_features=item_features)
print(results[0][0])
# grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=10)
# grid.fit(train)
#
# # stampare i migliori parametri
# print("Best parameters found: ", grid.best_params_)
if __name__ == "__main__":
main()
Head of embeddings.csv
movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
781,0.104976304,-0.28639936,0.263389,-0.063659474,0.2572639,-0.020438952,-0.539728,-0.5362033,0.044485092,-0.2635477,-0.16790706,-0.3090492,-0.16604371,-0.17663258,-0.52484894,0.18765214,0.023662027,0.30391097,-0.20567082,0.0017149863,-0.5396369,0.5048874,-0.1330814,0.20542468,0.30167308,-0.7394157,-0.72330767,0.19829337,0.114596725,-0.21563736,0.036049057,0.17444284,-0.048169367,0.072739236,0.45243305,0.30419606,0.05917972,0.095685355,0.47091144,0.82561576,0.39543882,-0.17032664,0.20288855,0.9243431,0.8003851,0.38405365,0.6077287,0.013964407,0.17004211,-0.3161952,-0.026656324,-0.53144175,0.51453334,-0.088666946,-0.043593623,-0.40192905,0.16968574,0.49007356,-0.061701216,0.22878993,0.39561245,0.68686026,0.19645824,-0.29711974,-0.39910316,0.75740165,0.19224961,-0.5461575,-0.5391435,-0.039670262,-0.41069844,-0.0040386477,-0.46357092,0.31994164,0.4489141,0.029307673,0.14275625,0.598504,0.30107188,0.17440903,0.19279842,-0.5319882,-0.16329569,0.13279761,0.3125511,-0.076068535,0.04027855,0.15937261,0.030322008,-0.25054383,0.3420725,0.0023631598,-0.15594675,-0.02108332,-0.33198243,-0.09107834,0.10918749,-0.20812488,0.48240393,0.1413759,0.19932991,-0.04550627,-0.4199228,-0.30975172,-0.16584149,0.13618651,0.032270815,0.21531013,-0.34754023,0.38745317,-0.3141335,-0.0076772026,-0.15902501,-0.1922333,-0.91181076,0.30101702,-0.5477423,0.21788768,-0.37916282,0.2178647,-0.23305914,0.39835364,0.29663038,0.17434639,-0.2767167,-0.079150155,-0.020879027,0.24703448,0.026067395,0.30733135,-0.18035492,0.098099545,0.012437648,-0.37087408,-0.43842456,-0.0740163,-0.16759877,0.2330794,0.36284205,0.042673703,0.08767547,-0.26393065,-0.044456694,0.519393,0.6997318,-0.015339097,-0.12928426,0.3939398,0.21620893,0.08203938,0.59946024,-0.01698428,0.0012696922,0.22144872,-0.7580897,-0.15163377,0.22549058,0.21746552,0.5356927,0.20340215,-0.15772144,-0.12937415,-0.10244009,0.25065783,0.094861135,0.172628,-0.287088,0.23041421,-0.14308949,0.13672677,-0.37433547,0.33438677,0.80673337,-0.34667587,0.47028127,-0.4950244,0.24330682,0.11687778,-0.44560146,-0.119554825,0.22739832,0.2406247,-0.091462746,-0.9168895,-0.40797755,-0.09773033,0.21946639,-0.15086696,-0.20639573,-0.012351767,1.1847337,0.12334188,0.101606116,0.19813639,-0.4772674,-0.6815623,-0.48542064,-0.278218,-0.2703869,0.35741097
521,0.4834846,-0.23845299,-0.21415482,-0.14914818,0.37452647,-0.2639882,-0.19339855,-0.5819728,-0.5480068,-0.680737,-0.5018884,0.15885419,-0.52158093,-0.32109717,-0.4306464,-0.15114668,0.19270135,-0.25596684,0.3264883,0.038799148,-0.5314147,0.5727659,-0.6976444,-0.0031756312,0.4308029,-0.9178242,-0.4543698,-0.07639094,-0.048227325,-0.21814795,-0.12718941,0.25438586,-0.076513454,-0.007188802,0.06668828,0.28282973,0.31041262,0.011750209,-0.06269789,0.6973704,0.15802476,0.0066345967,-0.017412819,0.43328476,0.016537199,0.40507087,0.7983648,0.29395765,0.05465501,-0.42503813,-0.07169553,-0.22310269,-0.0841079,-0.28536376,-0.29453915,0.18276429,0.51880515,-0.1363985,-0.20796828,-0.23383135,0.21936962,0.16077477,-0.08352809,-0.44291374,-0.006436026,0.5807399,0.3369641,-0.42017564,-0.1765961,0.002688498,-0.49212384,0.44475305,0.4833789,0.4590813,0.19189888,0.18402466,-0.5216376,0.35626128,-0.26259816,0.10202889,0.33155227,0.1554108,-0.34849754,-0.0835181,0.3608791,-0.24104835,-0.3426349,-0.39945003,0.19826588,-0.013716115,-0.18012097,0.017895179,-0.20326746,-0.28829327,-0.27310565,0.08799436,-0.090023905,-0.33734864,-0.4057884,0.4391738,-0.19845818,0.28421938,-0.13515925,-0.034714248,-0.14890312,-0.6278702,0.16775073,0.29424798,-0.37155896,-0.04562982,-0.16632678,-0.48772115,-0.0829048,-0.12879832,-1.1941701,0.036262244,-0.54917175,0.08452879,-0.020562846,0.5727009,-0.38378647,-0.16947998,0.23402393,0.1757261,0.18268874,0.19349255,0.5213705,0.04873449,0.26911566,-0.15686822,-0.7430511,0.35789433,0.025986547,-0.73101807,-0.15174152,-0.6247366,-0.3085124,0.06883673,0.283824,-0.29984295,-0.15076798,0.07029077,-0.31470934,0.27179474,0.24899411,-0.057006147,-0.46430832,0.293169,0.20246102,0.11565917,0.4896067,-0.16753878,0.053250737,0.42725414,0.031641196,0.2438955,-0.020254094,0.13220254,-0.08638797,0.4737355,0.26201698,-0.17828363,-0.2764023,-0.04341643,-0.07235413,-0.44729337,-0.095581695,0.15628703,-0.017644022,-0.10891184,-0.1982593,0.1994896,0.6321398,0.036708854,0.49601346,-0.3402982,-0.095669836,0.037039768,-0.2889446,-0.1277229,-0.113685735,0.57858396,0.030328764,-0.6693496,-0.39052898,-0.64047015,0.58858204,-0.24054149,0.034169126,0.3630536,0.5616578,-0.29867598,-0.07564583,0.2850233,0.056441583,-0.49339303,-0.5660689,-0.65997607,-0.47282198,1.8606243e-05
1590,0.05941767,-0.3993399,-0.1298459,-0.080818005,0.44435924,-0.11421722,-0.31332758,-0.81384706,0.08015667,-0.39844254,-0.81037426,-0.30531615,-0.48657808,-0.16939472,-0.046779584,-0.20503436,-0.40876153,0.24482553,-0.045942448,0.5312148,-0.8579908,0.6439102,-0.5025662,-0.19216116,0.32369378,-0.17766032,-0.3439799,-0.09829475,0.48353088,-0.19016655,0.13181841,0.5165478,-0.43528923,0.14950746,0.26477075,0.20312098,-0.20503096,0.050996274,0.2862533,0.8499676,-0.26986682,-0.114738576,-0.15050523,0.2713783,0.20189986,0.12967147,0.22785097,-0.079153396,0.36194524,-0.6376741,-0.21367697,0.041446075,-0.12271453,-0.65323865,-0.28616807,-0.111520484,0.43526977,0.5031802,0.4039687,-0.279708,0.2243983,0.28985283,-0.1668437,-0.2898966,-0.5576508,0.491614,0.30399892,-0.69570065,-0.43999743,0.117331214,-0.67416537,0.047031827,0.5364804,-0.041629195,0.66792035,0.35590017,-0.16253334,0.46751112,-0.79641575,0.14861014,0.31830528,-0.567578,0.15521573,-0.19457583,-0.23927484,-0.31114638,0.4783339,-0.041086923,0.33376405,-0.17237572,-0.13189459,0.062240843,0.018567545,0.20897199,-0.41638336,-0.034222282,-0.00867459,-0.41689333,-0.03165012,0.49717176,0.10709976,0.19650076,-0.3332431,-0.103964016,-0.53446937,0.32072574,0.16265534,0.5113785,-0.10267297,-0.27707252,0.1787905,-0.37411007,0.21731602,0.10512698,-0.8509798,0.36154267,-0.4811016,0.57361645,-0.49470577,0.48559442,-0.6293668,0.16920403,0.1583842,0.3939669,-0.19239852,0.012528246,0.045776017,0.11170228,0.64706856,0.20509283,-0.509191,-0.05886244,-0.5023932,-0.29391384,-0.20070714,-0.3791569,0.09131153,0.13778323,-0.099376984,-0.7821524,0.34264925,-0.2860546,-0.0055139684,0.08234838,0.32018226,-0.28082213,0.20966247,0.039263353,0.5605049,-0.23947746,0.4547303,0.6292773,-0.7470398,0.18514062,-0.6196754,0.23065008,-0.21438336,0.09843864,0.26463908,0.44211373,0.22545318,-0.23579475,-0.4698368,0.119940385,-0.33248,-0.17298971,-0.047025036,-0.31992626,-0.13884223,0.33602548,-0.14379616,0.01660432,0.69129556,-0.2623254,0.48632252,-0.2283669,0.07059559,0.1516157,-0.44664145,0.054038346,0.029984698,0.6208362,-0.2540388,-0.43699056,-0.69213647,-0.41838953,0.4951119,0.24951442,0.041442018,0.3817064,0.4745367,-0.13778052,0.092584506,0.28134617,-0.23201333,-0.22493492,-0.0953396,-0.17562813,0.17628315,-0.34246898
Head of observations.csv
user_id,movie_id,rating,unix_timestamp
196,242,3,881250949
22,377,1,878887116
166,346,1,886397596
298,474,4,884182806
I am trying to solve a twitter sentiment analysis problem. I am using the code:
print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
def getting_data(train_dataset_name, test_dataset_name):
print()
print("Getting the data")
print()
#Parameter names are self explanatory - file names for datasets
#This assumes you are executing this code statement from inside the directory with your datasets
train = pd.read_csv(train_dataset_name).values
train_y = train[:,1]
train_x = train[:,2]
test = pd.read_csv(test_dataset_name).values
test = test[:,1]
test = np.reshape(test,(test.shape[0],1))
return train_x,train_y,test
def bagOfWords(test,train_x):
print()
print("Creating bag of words model")
print()
#Creates and returns bag-of-words versions of the test and train x
#Train transformations
corpus_train = []
for i in range(0,train_x.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', train_x[i])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_train.append(review)
#Test transformations
corpus_test = []
for i in range(0,test.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', test[i][0])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_test.append(review)
return corpus_train,corpus_test
def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
print()
print("Performing Dimensionality Reduction")
print()
#CountVectorizer
cv = CountVectorizer(max_features = 1500)
train_x = cv.fit_transform(corpus_train).toarray()
#PCA
pca = PCA(n_components=components)
train_x = pca.fit_transform(train_x)
explained_variance = pca.explained_variance_ratio_
test = cv.transform(corpus_test).toarray()
test = pca.transform(test)
test = test.astype('float32')
if (return_ratio):
return train_x,test, explained_variance
else:
return train_x,test
def getOptimumParameters(train_x,train_y, return_stats):
print()
print("Getting optimum parameters")
print("This optimization algorithm may take a while, so please be patient.")
print("Please do not do other tasks while this runs.")
print()
train_x = train_x.astype('float32')
train_y = train_y.astype('float32')
classifier = KNeighborsClassifier()
classifier.fit(train_x,train_y)
#For the sake of my program I used my own parameter lists.
#If you use this code, please change them
neighbor_list = [1,3,6,9,12,15,18,21,25]
algorithm_list = ['brute', 'kd_tree', 'ball_tree']
weights_list = ['uniform', 'distance']
p_list = [1] #p_list = [1,2,3,4]
leaf_list = [10,15,20,25,30,35,40,45,50]
parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
clf = clf.fit(train_x,train_y)
bc = clf.best_score_
bp = clf.best_params_
if return_stats:
return clf, bc, bp
else:
return clf
def predictions(classifier, train_x, train_y, test, ratio):
print()
print("Making predictions")
print()
#Changing types to work with a classifier
train_x= train_x.astype('float32')
train_y = train_y.astype('float32')
#Splitting training set into a training + dev set
train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
#Making predictions
test = test.astype('float32')
pred = classifier.predict(test)
return pred
def convertPredToCsv(pred, csv_name):
df = pd.DataFrame(pred)
df.index.name = 'id'
df.columns = ['label']
df.to_csv("predictions.csv")
def main():
#Retrieving the data
train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
#Constructing Bag of words model
corpus_train,corpus_test = bagOfWords(test,train_x)
#Performing Dimensionality Reduction
train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
#Getting the optimum classifier
classifier= getOptimumParameters(train_x,train_y, False)
#Predicting + converting to csv
pred = predictions(classifier, train_x, train_y, test, 0.1)
convertPredToCsv(pred, 'predictions.csv')
if __name__ == "__main__":
main()
Every time it comes around to the getOptimumParameters function, I get a multitude of errors. Some say AttributeError, but for most of them, I cannot find an error name. I think most of those other errors are meant to direct me to the AttributeError. I cannot figure out why this error is occurring. I know that something is wrong with my GridSearch, but I do not know if something is wrong with the parameters(which I triple checked and cannot find any problems with), or if there is some other problem. Any help is greatly appreciated. Thanks.
D:\Anaconda\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj=array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899,
25.22553572, 0. ]]), method='argpartition', *args=(0,), **kwds={'axis': 1, 'kind': 'introselect', 'order': None})
47 return result
48
49
50 def _wrapfunc(obj, method, *args, **kwds):
51 try:
---> 52 return getattr(obj, method)(*args, **kwds)
obj = array([[ 0. , 30.70562651, 27.84020028, .... 38.11465899,
25.22553572, 0. ]])
method = 'argpartition'
args = (0,)
kwds = {'axis': 1, 'kind': 'introselect', 'order': None}
53
54 # An AttributeError occurs if the object does not have
55 # such a method in its class.
56
MemoryError:
The data is from a problem my analyticsvidhya. Here is the link for the download of the training data - it is a dropbox link.
https://www.dropbox.com/s/w4tagiewcuoxgkt/train.csv?dl=0
Here is the test data link:
https://www.dropbox.com/s/qiitwlpnkbs2c3m/test_tweets.csv?dl=0
Thanks.
Have you updated your modules ?
It's bizarre because the following code runs without any error on my macbook:
print()
print("Importing")
print()
#IMPORTS
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
def getting_data(train_dataset_name, test_dataset_name):
print()
print("Getting the data")
print()
#Parameter names are self explanatory - file names for datasets
#This assumes you are executing this code statement from inside the directory with your datasets
train = pd.read_csv(train_dataset_name).values
train_y = train[:,1]
train_x = train[:,2]
test = pd.read_csv(test_dataset_name).values
test = test[:,1]
test = np.reshape(test,(test.shape[0],1))
return train_x,train_y,test
def bagOfWords(test,train_x):
print()
print("Creating bag of words model")
print()
#Creates and returns bag-of-words versions of the test and train x
#Train transformations
corpus_train = []
for i in range(0,train_x.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', train_x[i])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_train.append(review)
#Test transformations
corpus_test = []
for i in range(0,test.shape[0]):
review = re.sub('[^a-zA-Z]', ' ', test[i][0])
review = review.lower().split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus_test.append(review)
return corpus_train,corpus_test
def dimensionality_reduction(corpus_train,corpus_test, return_ratio, components):
print()
print("Performing Dimensionality Reduction")
print()
#CountVectorizer
cv = CountVectorizer(max_features = 1500)
train_x = cv.fit_transform(corpus_train).toarray()
#PCA
pca = PCA(n_components=components)
train_x = pca.fit_transform(train_x)
explained_variance = pca.explained_variance_ratio_
test = cv.transform(corpus_test).toarray()
test = pca.transform(test)
test = test.astype('float32')
if (return_ratio):
return train_x,test, explained_variance
else:
return train_x,test
def getOptimumParameters(train_x,train_y, return_stats):
print()
print("Getting optimum parameters")
print("This optimization algorithm may take a while, so please be patient.")
print("Please do not do other tasks while this runs.")
print()
train_x = train_x.astype('float32')
train_y = train_y.astype('float32')
classifier = KNeighborsClassifier()
#classifier.fit(train_x,train_y)
#For the sake of my program I used my own parameter lists.
#If you use this code, please change them
neighbor_list = [1]
algorithm_list = ['brute', 'kd_tree', 'ball_tree']
weights_list = ['uniform', 'distance']
p_list = [1] #p_list = [1,2,3,4]
leaf_list = [10]
parameters = [{'n_neighbors':neighbor_list, 'weights':weights_list, 'algorithm':algorithm_list, 'p':p_list, 'leaf_size':leaf_list}]
clf = GridSearchCV(estimator=classifier, param_grid = parameters, cv=5,refit=True, error_score=0, n_jobs = -1)
clf = clf.fit(train_x,train_y)
bc = clf.best_score_
bp = clf.best_params_
if return_stats:
return clf, bc, bp
else:
return clf
def predictions(classifier, train_x, train_y, test, ratio):
print()
print("Making predictions")
print()
#Changing types to work with a classifier
train_x= train_x.astype('float32')
train_y = train_y.astype('float32')
#Splitting training set into a training + dev set
train_x,dev_x,train_y,dev_y = train_test_split(train_x,train_y,test_size = ratio, random_state=0)
#Making predictions
test = test.astype('float32')
pred = classifier.predict(test)
return pred
def convertPredToCsv(pred, csv_name):
df = pd.DataFrame(pred)
df.index.name = 'id'
df.columns = ['label']
df.to_csv("predictions.csv")
def main():
#Retrieving the data
train_x,train_y,test = getting_data('train.csv', 'test_tweets.csv')
#Constructing Bag of words model
corpus_train,corpus_test = bagOfWords(test,train_x)
#Performing Dimensionality Reduction
train_x,test = dimensionality_reduction(corpus_train,corpus_test,False,350)
#Getting the optimum classifier
classifier= getOptimumParameters(train_x,train_y, False)
#Predicting + converting to csv
pred = predictions(classifier, train_x, train_y, test, 0.1)
convertPredToCsv(pred, 'predictions.csv')
if __name__ == "__main__":
main()
My versions:
import sklearn
print(sklearn.__version__)
#0.19.1
import nltk
print(nltk.__version__)
#3.3
I know it has been a while, so sorry.
Just wanted to let you guys know that for long Grid Searches, it is NECESSARY, at least for Windows users, to import not
sklearn.model_selection.GridSearchCV
but actually
sklearn.grid_search.GridSearchCV
The former almost always throws a memory error, while the latter works fine even on long Grid Searches.