I am currently working on a text classification problem and want to experiment with Doc2vec. I googled and found the tutorial post by @susanli2016. How can I use cross-validation for evaluation instead of a train/test split in this case?
P.S.: As I couldn't really find anyone who uses cross-validation for evaluation, is a train/test split better here?
Here is her code:
train, test = train_test_split(df, test_size=0.3, random_state=42)
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    """Split *text* into lowercase word tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens shorter than two
    characters are dropped; the remaining tokens are lowercased.

    Returns:
        list: the kept tokens, in the order they appear in *text*.
    """
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # Skip tokens shorter than two characters.
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
# Wrap every training row in a TaggedDocument: tokens come from the
# 'narrative' column and the single tag is the row's Product value.
train_tagged = train.apply(
lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)
# Same wrapping for the held-out rows.
test_tagged = test.apply(
lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)
# dm=0 presumably selects the distributed bag-of-words mode, as the variable
# name suggests — confirm in the gensim docs. `cores` is not defined in this excerpt.
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
# The vocabulary is built from the training documents only.
# NOTE(review): no training call on model_dbow is visible in this excerpt
# before vectors are inferred from it — confirm against the original tutorial.
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])
def vec_for_learning(model, tagged_docs):
    """Infer one vector per tagged document and pair it with its first tag.

    Args:
        model: object exposing ``infer_vector(words, steps=...)``.
        tagged_docs: object whose ``.values`` iterates over documents that
            have ``.tags`` and ``.words`` attributes.

    Returns:
        tuple: ``(targets, regressors)``, two tuples of equal length holding
        the first tag and the inferred vector of each document. Two empty
        tuples are returned when there are no documents.
    """
    # NOTE(review): newer gensim releases appear to name this keyword
    # `epochs` instead of `steps` — confirm against the installed version.
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, steps=20))
        for doc in tagged_docs.values
    ]
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return (), ()
    targets, regressors = zip(*pairs)
    return targets, regressors
# vec_for_learning returns (targets, regressors), hence the y-then-X unpacking order.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
# Fit a logistic regression on the inferred training vectors.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
# Predict labels for the held-out vectors.
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
# Report accuracy on the held-out split (f1_score is imported but not used here).
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
Here is my thought:
# Tag the whole dataframe: tokens from the 'Text' column, tag from the 'label' column.
tagged = df.apply(
lambda r: TaggedDocument(words=tokenize_text(r['Text']), tags=[r.label]), axis=1)
# NOTE: vec_for_learning returns a 2-tuple (targets, regressors), so `doc`
# has length 2 rather than one entry per row — consistent with the
# "[2, 2234]" sample counts in the error quoted below.
doc = vec_for_learning(model_dbow, tagged)
text_clf = SVC(C=1, kernel= 'linear', max_iter= 1000, tol=0.0001, probability=True)
# 10-fold cross-validation; `doc` is passed as X and the label column as y.
scores = cross_validate(text_clf, doc, df['label'],
cv=10, return_train_score=False)
sorted(scores.keys())
scores['test_score']
but I got the following error message:
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-77-105f56d4a65a> in <module>()
1 scores = cross_validate(text_clf, doc, df['label'],
----> 2 cv=10, return_train_score=False)
3 sorted(scores.keys())
4
5 scores['test_score']
/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
223
224 """
--> 225 X, y, groups = indexable(X, y, groups)
226
227 cv = check_cv(cv, y, classifier=is_classifier(estimator))
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in indexable(*iterables)
258 else:
259 result.append(np.array(X))
--> 260 check_consistent_length(*result)
261 return result
262
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
233 if len(uniques) > 1:
234 raise ValueError("Found input variables with inconsistent numbers of"
--> 235 " samples: %r" % [int(l) for l in lengths])
236
237
ValueError: Found input variables with inconsistent numbers of samples: [2, 2234]
I don't know what I did wrong or how to fix it.
Since I am very new to Python and the NLP domain, I would be glad if someone could give me a hint so that my work can proceed :)
Thank you in advance!
Related
I wrote a function to find the best combination of given dataframe features, f1 score, and auc score using LogisticRegression. The problem is that when I try to pass a list of dataframes combinations, using itertools combinations, LogisticRegression doesn't recognize each combination as its own X variable/ dataframe.
I'm starting with a dataframe of 10 feature columns and 10k rows. When I run the below code I get a "ValueError: X has 10 features, but LogisticRegression is expecting 1 features as input".
def find_best_combination(X, y):
    """Search column subsets of *X* for the best LogisticRegression fit.

    Every combination of 1 to ``X.shape[1] - 1`` columns is fitted and then
    scored on the same data with F1 and ROC AUC. A combination replaces the
    current best only when it strictly improves both scores at once.

    Returns:
        tuple: ``(best_variables, best_f1, best_auc)``. ``best_variables``
        stays ``[]`` and both scores stay ``0`` if no combination ever
        improved on the initial values.
    """
    # initialize variables
    best_f1 = 0
    best_auc = 0
    best_variables = []
    # get all possible combinations of variables
    # (the range stops before X.shape[1], so the full column set is not tried)
    for i in range(1, X.shape[1]):
        for combination in combinations(X.columns, i):
            X_subset = X[list(combination)]
            logreg = LogisticRegression()
            logreg.fit(X_subset, y)
            y_pred = logreg.predict(X_subset)
            f1 = f1_score(y, y_pred)
            # NOTE: the model was fitted on X_subset but is scored on the full
            # X here; this column-count mismatch is what raises the
            # "X has 10 features ... expecting 1" ValueError quoted below.
            auc = roc_auc_score(y, logreg.predict_proba(X)[:,1])
            # evaluate performance on current combination of variables
            if f1> best_f1 and auc > best_auc:
                best_f1 = f1
                best_auc = auc
                best_variables = combination
    return best_variables, best_f1, best_auc
and the error
C:\Users\katurner\Anaconda3\lib\site-packages\sklearn\base.py:493: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names unseen at fit time:
- IBE1273_01_11.0
- IBE1273_01_6.0
- IBE7808
- IBE8439_2.0
- IBE8557_7.0
- ...
warnings.warn(message, FutureWarning)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\2\ipykernel_15932\895415673.py in <module>
----> 1 best_combo = ml.find_best_combination(X,lg_y)
2 best_combo
~\Documents\Arcadia\modeling_library.py in find_best_combination(X, y)
176 # print(y_test)
177 f1 = f1_score(y, y_pred)
--> 178 auc = roc_auc_score(y, logreg.predict_proba(X)[:,1])
179 # evaluate performance on current combination of variables
180 if f1> best_f1 and auc > best_auc:
~\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in predict_proba(self, X)
1309 )
1310 if ovr:
-> 1311 return super()._predict_proba_lr(X)
1312 else:
1313 decision = self.decision_function(X)
~\Anaconda3\lib\site-packages\sklearn\linear_model\_base.py in _predict_proba_lr(self, X)
459 multiclass is handled by normalizing that over all classes.
460 """
--> 461 prob = self.decision_function(X)
462 expit(prob, out=prob)
463 if prob.ndim == 1:
~\Anaconda3\lib\site-packages\sklearn\linear_model\_base.py in decision_function(self, X)
427 check_is_fitted(self)
428
--> 429 X = self._validate_data(X, accept_sparse="csr", reset=False)
430 scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
431 return scores.ravel() if scores.shape[1] == 1 else scores
~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
598
599 if not no_val_X and check_params.get("ensure_2d", True):
--> 600 self._check_n_features(X, reset=reset)
601
602 return out
~\Anaconda3\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset)
398
399 if n_features != self.n_features_in_:
--> 400 raise ValueError(
401 f"X has {n_features} features, but {self.__class__.__name__} "
402 f"is expecting {self.n_features_in_} features as input."
ValueError: X has 10 features, but LogisticRegression is expecting 1 features as input.
I'm expecting the function to return the best combination of variables (best_variables) together with the associated best_f1 and best_auc.
I've also tried running the function using train, test, split. When I add train, test, split into the below code the function does run but returns "[], 0, 0" for best_variables, best_f1, best_auc.
def find_best_combination(X, y):
    """Search column subsets of *X* using a held-out split for scoring.

    Every combination of 1 to ``X.shape[1] - 1`` columns is split 80/20
    (stratified on *y*, fixed ``random_state``), fitted on the training part
    and scored on the test part with F1 and ROC AUC. A combination replaces
    the current best only when it strictly improves both scores at once.

    Returns:
        tuple: ``(best_variables, best_f1, best_auc)``. Because both bests
        start at ``0`` and the comparison is strict, the initial
        ``([], 0, 0)`` is returned whenever no combination reaches an F1
        above zero together with an AUC above zero.
    """
    # initialize variables
    best_f1 = 0
    best_auc = 0
    best_variables = []
    # get all possible combinations of variables
    # (the range stops before X.shape[1], so the full column set is not tried)
    for i in range(1, X.shape[1]):
        for combination in combinations(X.columns, i):
            X_subset = X[list(combination)]
            # Split only the selected columns, so fit and scoring below see
            # the same set of features.
            X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, stratify=y, random_state=73)
            logreg = LogisticRegression()
            logreg.fit(X_train, y_train)
            y_pred = logreg.predict(X_test)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1])
            # evaluate performance on current combination of variables
            if f1> best_f1 and auc > best_auc:
                best_f1 = f1
                best_auc = auc
                best_variables = combination
    return best_variables, best_f1, best_auc
I'm not sure what's going on under the hood of train, test, split that enables the function to iterate through and not error like before.
I hope this explains it enough. Thanks in advance for any help.
I'm working on a text classification project.
While exploring different classifiers I came across XGBClassifier
My classification task is multi class.
I'm getting the above mentioned error when trying to score the classifier - I'm guessing some reshaping is needed, but I fail to understand why.
What's strange to me is that other classifiers work just fine (even this one with its default params)
Here's the relevant section from my code:
algorithms = [
svm.LinearSVC(), # <<<=== Works
linear_model.RidgeClassifier(), # <<<=== Works
XGBClassifier(), # <<<=== Works
XGBClassifier(objective='multi:softprob', num_class=len(groups_count_dict), eval_metric='merror') # <<<=== Not working
]
def train(algorithm, X_train, y_train):
    """Fit a vectorise-then-classify pipeline on the training data.

    The pipeline applies the module-level ``transformer`` and then a
    ``OneVsRestClassifier`` wrapped around *algorithm*.

    Returns:
        The fitted ``Pipeline``.
    """
    model = Pipeline([
        ('vect', transformer),
        # NOTE: the post reports further down that removing this
        # OneVsRestClassifier wrapper resolved the broadcast ValueError.
        ('classifier', OneVsRestClassifier(algorithm)),
    ])
    model.fit(X_train, y_train)
    return model
# Train and score every candidate; both dicts are keyed by the estimator object.
score_dict = {}
algorithm_to_model_dict = {}
for algorithm in algorithms:
    print()
    print(f'trying {algorithm}')
    model = train(algorithm, X_train, y_train)
    score = model.score(X_test, y_test)
    # Store the score as a whole-number percentage (int() truncates).
    score_dict[algorithm] = int(score * 100)
    algorithm_to_model_dict[algorithm] = model
# Re-order the results by ascending score before printing.
sorted_score_dict = dict(sorted(score_dict.items(), key=lambda item: item[1]))
for classifier, score in sorted_score_dict.items():
    print(f'{classifier.__class__.__name__}: score is {score}%')
Here's the error again:
ValueError: operands could not be broadcast together with shapes (2557,) (8,) (2557,)
Not sure it's related but I'll mention it anyway - my transformer is being created as such:
tuples = []
tfidf_kwargs = {'ngram_range': (1, 2), 'stop_words': 'english', 'sublinear_tf': True}
for col in list(features.columns):
tuples.append((f'vec_{col}', TfidfVectorizer(**tfidf_kwargs), col))
transformer = ColumnTransformer(tuples, remainder='passthrough')
Thanks in advance
EDIT:
Adding the full trace:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-15-576cd62f3df0> in <module>
84 print(f'trying {algorithm}')
85 model = train(algorithm, X_train, y_train)
---> 86 score = model.score(X_test, y_test)
87 score_dict[algorithm] = int(score * 100)
88 algorithm_to_model_dict[algorithm] = model
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
118
119 # lambda, but not partial, allows help() to work with update_wrapper
--> 120 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
121 # update the docstring of the returned function
122 update_wrapper(out, self.fn)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/pipeline.py in score(self, X, y, sample_weight)
620 if sample_weight is not None:
621 score_params['sample_weight'] = sample_weight
--> 622 return self.steps[-1][-1].score(Xt, y, **score_params)
623
624 @property
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/base.py in score(self, X, y, sample_weight)
498 """
499 from .metrics import accuracy_score
--> 500 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
501
502 def _more_tags(self):
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/multiclass.py in predict(self, X)
365 for i, e in enumerate(self.estimators_):
366 pred = _predict_binary(e, X)
--> 367 np.maximum(maxima, pred, out=maxima)
368 argmaxima[maxima == pred] = i
369 return self.classes_[argmaxima]
ValueError: operands could not be broadcast together with shapes (2557,) (8,) (2557,)
Printing the shapes of X_test and y_test yields: (2557, 12) (2557,)
I was able to work out where the (8,) comes from - it's the length of groups_count_dict
Turns out the solution was to remove the OneVsRestClassifier usage from the pipeline
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
I'm getting an error with sklearn classification report.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-30-6a63be1ce4c8> in <module>
----> 1 classification_report(y_test, predictions)
/usr/local/lib/python3.7/site-packages/sklearn/metrics/classification.py in classification_report(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict)
1522 """
1523
-> 1524 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
1525
1526 labels_given = True
/usr/local/lib/python3.7/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
69 y_pred : array or indicator matrix
70 """
---> 71 check_consistent_length(y_true, y_pred)
72 type_true = type_of_target(y_true)
73 type_pred = type_of_target(y_pred)
/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
233 if len(uniques) > 1:
234 raise ValueError("Found input variables with inconsistent numbers of"
--> 235 " samples: %r" % [int(l) for l in lengths])
236
237
ValueError: Found input variables with inconsistent numbers of samples: [360, 144]
This is the only thing I'm passing in, and y_test.shape is (360,) and predictions.shape is (144,).
classification_report(y_test, predictions)
Do they need to be the same length? (I'm assuming so because of that second stack trace).. If so, how can the length of X and Y can be the same when you split your data? Wouldn't they have different length always?
It seems like there's a bit of a misunderstanding here about the stats/ML data splitting framework.
Like you suspected, y_test and pred need to be the same length—let's call it k. Why? Because we need there to be k testing examples ((x, y) pairs) to test the model. X_test and y_test are each k entries long. (Each entry x in X_test may have several features, but it counts as one record.) For each x in X_test, we make a prediction about its label. Then, to compute a metric like classification accuracy, we compare the predicted label to the true label for each testing example.
If so, how can the length of X and Y can be the same when you split your data?
Peek at the API of sklearn.model_selection.train_test_split. You'd call it something like this:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
What this shows is that X_test and y_test will have the same number of records in them—they'll always be the same shape, by design. Then for each entry in X_test, you make a prediction using your model. It'll be paired with the corresponding entry in y_test, and that's how you can compute your classification score.
I'm working through Aurelien Geron's Hands-On ML textbook and have got stuck trying to train an SGDClassifier.
I'm using the MNIST handwritten numbers data and running my code in a Jupyter Notebook via Anaconda. Both my anaconda (1.7.0) and sklearn (0.20.dev0) are updated. I've pasted the code I used to load the data, select the first 60k rows, shuffle the order and convert the labels to 1 (True) for all 5's and 0 (False) for all other numbers. Both X_train and y_train_5 are numpy arrays.
I've pasted the error message I get below.
Nothing seems to be wrong with the dimensions of the data, I tried converting X_train to a sparse matrix (the suggested format for SGDClassifier) and various max_iter values and got the same error message each time. Am I missing something obvious? Do I need to use a different version of sklearn? I've searched online but couldn't find any posts describing similar issues with SGDClassifier. I'd be super grateful for any kind of pointer.
Code
from six.moves import urllib
from scipy.io import loadmat
import numpy as np
from sklearn.linear_model import SGDClassifier
# Load MNIST data #
from scipy.io import loadmat
mnist_alternative_url = "https://github.com/amplab/datascience-
sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
content = response.read()
f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
"data": mnist_raw["data"].T,
"target": mnist_raw["label"][0],
"COL_NAMES": ["label", "data"],
"DESCR": "mldata.org dataset: mnist-original",
}
# Assign X and y #
X, y = mnist['data'], mnist['target']
# Select first 60000 numbers #
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000],
y[60000:]
# Shuffle order #
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
# Convert labels to binary (5 or "not 5") #
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
# Train SGDClassifier #
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
Error Message
---------------------------------------------------------------------------
TypeError
Traceback (most recent call last)
<ipython-input-10-5a25eed28833> in <module>()
37 # Train SGDClassifier
38 sgd_clf = SGDClassifier(max_iter=5, random_state=42)
---> 39 sgd_clf.fit(X_train, y_train_5)
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
712 loss=self.loss, learning_rate=self.learning_rate,
713 coef_init=coef_init, intercept_init=intercept_init,
--> 714 sample_weight=sample_weight)
715
716
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
570
571 self._partial_fit(X, y, alpha, C, loss, learning_rate, self._max_iter,
--> 572 classes, sample_weight, coef_init, intercept_init)
573
574 if (self._tol is not None and self._tol > -np.inf
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init)
529 learning_rate=learning_rate,
530 sample_weight=sample_weight,
--> 531 max_iter=max_iter)
532 else:
533 raise ValueError(
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _fit_binary(self, X, y, alpha, C, sample_weight, learning_rate, max_iter)
587 self._expanded_class_weight[1],
588 self._expanded_class_weight[0],
--> 589 sample_weight)
590
591 self.t_ += n_iter_ * X.shape[0]
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, pos_weight, neg_weight, sample_weight)
419 pos_weight, neg_weight,
420 learning_rate_type, est.eta0,
--> 421 est.power_t, est.t_, intercept_decay)
422
423 else:
~\Anaconda3\lib\site-packages\sklearn\linear_model\sgd_fast.pyx in sklearn.linear_model.sgd_fast.plain_sgd()
TypeError: plain_sgd() takes at most 21 positional arguments (25 given)
It appears your version of scikit-learn is just a little outdated. Try running:
pip install -U scikit-learn
then your code will run (with some slight formatting updates):
from six.moves import urllib
from scipy.io import loadmat
import numpy as np
from sklearn.linear_model import SGDClassifier
from scipy.io import loadmat
# Load MNIST data #
mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
content = response.read()
f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
"data": mnist_raw["data"].T,
"target": mnist_raw["label"][0],
"COL_NAMES": ["label", "data"],
"DESCR": "mldata.org dataset: mnist-original",
}
# Assign X and y #
X, y = mnist['data'], mnist['target']
# Select first 60000 numbers #
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Shuffle order #
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
# Convert labels to binary (5 or "not 5") #
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
# Train SGDClassifier #
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
I'm using anaconda-navigator -> python3.6
when I run the following code I get this error:
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.grid_search import GridSearchCV
def fit_model(X, y):
    """Perform grid search over the 'max_depth' parameter for a
    decision tree regressor trained on the input data [X, y].

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)
    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    # NOTE: on Python 3, range(1,10) is a range object (values 1..9), not a
    # list. The legacy grid_search check quoted later in this post accepts
    # only list, tuple or np.ndarray, which is what raises the
    # "need to be a sequence" ValueError shown below.
    params = {'max_depth':range(1,10)}
    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)
    # TODO: Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = GridSearchCV(regressor, params, scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    return grid.best_estimator_
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)
# Produce the value for 'max_depth'
print ("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
Here are the error messages:
ValueError Traceback (most recent call last)
<ipython-input-12-05857a84a7c5> in <module>()
1 # Fit the training data to the model using grid search
----> 2 reg = fit_model(X_train, y_train)
3
4 # Produce the value for 'max_depth'
5 print ("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
<ipython-input-11-2c0c19498236> in fit_model(X, y)
26 # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
27
---> 28 grid = GridSearchCV(regressor, params, scoring_fnc, cv=cv_sets)
29
30 # Fit the grid search object to the data to compute the optimal model
~/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py in __init__(self, estimator, param_grid, scoring, fit_params, n_jobs, iid, refit, cv, verbose, pre_dispatch, error_score)
819 refit, cv, verbose, pre_dispatch, error_score)
820 self.param_grid = param_grid
--> 821 _check_param_grid(param_grid)
822
823 def fit(self, X, y=None):
~/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py in _check_param_grid(param_grid)
349 if True not in check:
350 raise ValueError("Parameter values for parameter ({0}) need "
--> 351 "to be a sequence.".format(name))
352
353 if len(v) == 0:
ValueError: Parameter values for parameter (max_depth) need to be a sequence.
grid_search.py checks this:
check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
It seems like you can't use a range. I would try this:
params = {'max_depth': np.arange(1,10)}
or without numpy:
# list(range(...)) materialises the values 1..9 as a real list, one of the types the check accepts.
params = {'max_depth': list(range(1, 10))}