I have a Python project, written in a Jupyter notebook, that deals with text classification on an unbalanced dataset. To handle the imbalance I used SMOTE, but when I split the dataset and created a pipeline to fit a machine learning model, it crashed with the error below:
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-7ae8518f1892> in <module>
     15     ('clf', MultinomialNB()),  # model classifier
     16 ])
---> 17 nb.fit(x_train, y_train)

f:\AIenv\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    328         """
    329         fit_params_steps = self._check_fit_params(**fit_params)
--> 330         Xt = self._fit(X, y, **fit_params_steps)
    331         with _print_elapsed_time('Pipeline',
    332                                  self._log_message(len(self.steps) - 1)):

f:\AIenv\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
    294                 message_clsname='Pipeline',
    295                 message=self._log_message(step_idx),
--> 296                 **fit_params_steps[name])
    297             # Replace the transformer of the step with the fitted
    298             # transformer. This is necessary when loading the transformer

f:\AIenv\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
    353
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356
    357     def call_and_shelve(self, *args, **kwargs):

f:\AIenv\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    738     with _print_elapsed_time(message_clsname, message):
    739         if hasattr(transformer, 'fit_transform'):
--> 740             res = transformer.fit_transform(X, y, **fit_params)
    741         else:
    742             res = transformer.fit(X, y, **fit_params).transform(X)

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
   1197
   1198         vocabulary, X = self._count_vocab(raw_documents,
-> 1199                                           self.fixed_vocabulary_)
   1200
   1201         if self.binary:

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1108         for doc in raw_documents:
   1109             feature_counter = {}
-> 1110             for feature in analyze(doc):
   1111                 try:
   1112                     feature_idx = vocabulary[feature]

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    102     else:
    103         if preprocessor is not None:
--> 104             doc = preprocessor(doc)
    105         if tokenizer is not None:
    106             doc = tokenizer(doc)

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
     67     """
     68     if lower:
---> 69         doc = doc.lower()
     70     if accent_function is not None:
     71         doc = accent_function(doc)

f:\AIenv\lib\site-packages\scipy\sparse\base.py in __getattr__(self, attr)
    685             return self.getnnz()
    686         else:
--> 687             raise AttributeError(attr + " not found")
    688
    689     def transpose(self, axes=None, copy=False):

AttributeError: lower not found
Code:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE  # for the imbalanced dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv("data/emotion_dataset_raw.csv")
df["clean_text"] = df["Text"].apply(clean_text)  # clean_text is defined elsewhere in the notebook

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vect_df = vectorizer.fit_transform(df["clean_text"])

oversample = SMOTE(random_state=42)
x_smote, y_smote = oversample.fit_resample(vect_df, df["Emotion"])

print("shape x before SMOTE: {}".format(vect_df.shape))
print("shape x after SMOTE: {}".format(x_smote.shape))
print("balance of target field %")
y_smote.value_counts(normalize=True) * 100

# the result of the code above:
# shape x before SMOTE: (34792, 209330)
# shape x after SMOTE: (88360, 209330)

x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.2, random_state=42)

# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),  # model classifier
])
nb.fit(x_train, y_train)
Where is the error in my code, and what does it mean?
The immediate cause: x_train is already a SMOTE-resampled sparse TF-IDF matrix, but the pipeline's CountVectorizer expects raw text documents, and calling .lower() on a sparse matrix raises AttributeError: lower not found. A TfidfVectorizer (which combines CountVectorizer and TfidfTransformer) is enough to generate the text features. Drop the separate CountVectorizer and TfidfTransformer steps, feed the pipeline raw text, and run the code again. It should work!
# a sampler can only live inside imblearn's Pipeline, not sklearn's
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

pipe = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('sampler', RandomOverSampler(sampling_strategy='not majority', random_state=7)),
        ('model', XGBClassifier())
    ]
)
pipe.fit(data['features'], data['labels'])  # raw text column and labels
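Applied to the original code, a minimal sketch of the same idea (assuming df["clean_text"] and df["Emotion"] from the question; SMOTE then runs only during fit, on the training data):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# split the raw text, not the resampled matrix
x_train, x_test, y_train, y_test = train_test_split(
    df["clean_text"], df["Emotion"], test_size=0.2, random_state=42)

nb = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),  # raw text in, TF-IDF matrix out
    ('smote', SMOTE(random_state=42)),               # resamples during fit only
    ('clf', MultinomialNB()),
])
nb.fit(x_train, y_train)  # x_train is raw text, so .lower() works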
Related
I was trying to do some text classification with spaCy, but I get an error about my vocabulary being empty.
I tried a classic dataset and got the same error. I've seen suggestions to split the text, but I have many short lines rather than one huge one.
This is the code:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

df_amazon = pd.read_csv("amazon_alexa.tsv", sep="\t")

# spacy_tokenizer, predictors(), X_train and y_train are defined elsewhere in the notebook
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)
classifier = LogisticRegression()

pipe = Pipeline([("cleaner", predictors()),
                 ("vectorizer", bow_vector),
                 ("classifier", classifier)])
pipe.fit(X_train, y_train)
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-b5a14e655d5a> in <module>
10
11 # Model generation
---> 12 pipe.fit(X_train, y_train)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1131 vocabulary = dict(vocabulary)
1132 if not vocabulary:
-> 1133 raise ValueError("empty vocabulary; perhaps the documents only"
1134 " contain stop words")
1135
ValueError: empty vocabulary; perhaps the documents only contain stop words
It looks like you're just using the spaCy tokenizer? I'm not sure what's going on, but you should check the output of the tokenizer on your documents.
Note that while I think you can use the tokenizer that way, it would be more typical to use a blank pipeline, like this:
import spacy
nlp = spacy.blank("en")
words = [tok.text for tok in nlp("this is my input text")]
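For example, a quick sanity check might look like this (a sketch; 'verified_reviews' as the text column of amazon_alexa.tsv is an assumption, and spacy_tokenizer is your own function):

sample = df_amazon["verified_reviews"].iloc[0]
print(spacy_tokenizer(sample))  # should print a non-empty list of token strings;
                                # an empty list here would explain the empty vocabulary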
I've got a dataset with multiple text columns and a target column. I'm trying to use a custom spaCy-based class to get GloVe embeddings for my text columns, and to do it inside a Pipeline, but I'm getting a ValueError. Following is my code:
data_features = df.copy()[["title", "description"]]
train_data, test_data, train_target, test_target = train_test_split(data_features, df['target'], test_size = 0.1)
I created this custom class to use GloVe embeddings. I got the code from this tutorial.
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return [self.nlp(text).vector for text in X]
Loading the nlp model:
nlp = spacy.load("en_core_web_sm")
This is the column transformer that I'm trying to use in my pipeline:
col_preprocessor = ColumnTransformer(
    [
        ('title_glove', SpacyVectorTransformer(nlp), 'title'),
        ('description_glove', SpacyVectorTransformer(nlp), 'description'),
    ],
    remainder='drop',
    n_jobs=1
)
Here is my pipeline:
pipeline_glove = Pipeline([
    ('col_preprocessor', col_preprocessor),
    ('classifier', LogisticRegression())
])
When I run the fit method, I get the error that follows:
pipeline_glove.fit(train_data, train_target)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-219-8543ea744205> in <module>
----> 1 pipeline_glove.fit(train_data, train_target)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
549
550 self._update_fitted_transformers(transformers)
--> 551 self._validate_output(Xs)
552
553 return self._hstack(list(Xs))
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_output(self, result)
410 raise ValueError(
411 "The output of the '{0}' transformer should be 2D (scipy "
--> 412 "matrix, array, or pandas DataFrame).".format(name))
413
414 def _validate_features(self, n_features, feature_names):
ValueError: The output of the 'title_glove' transformer should be 2D (scipy matrix, array, or pandas DataFrame).
The error message tells you what you need to fix:

ValueError: The output of the 'title_glove' transformer should be 2D
(scipy matrix, array, or pandas DataFrame).

But what your current transformer (SpacyVectorTransformer) returns is a list. You can fix it by turning the list into a pandas DataFrame, for instance like this:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin  # needed for the mixins

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        # a DataFrame is one of the accepted 2D outputs
        return pd.DataFrame([self.nlp(text).vector for text in X])
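A 2D NumPy array satisfies the check as well; for illustration only, a variant of the same transformer:

import numpy as np

class SpacyVectorArrayTransformer(SpacyVectorTransformer):
    def transform(self, X):
        # np.vstack stacks the per-text vectors into an (n_samples, dim)
        # array, which ColumnTransformer also accepts as 2D output
        return np.vstack([self.nlp(text).vector for text in X])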
Next time, please also provide a minimal, reproducible example. In your provided code, there are no imports, and no DataFrame called "df" is defined.
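As an aside, en_core_web_sm ships without static word vectors, so .vector will not give you real 300-dimensional GloVe embeddings; for actual vectors, load en_core_web_md or en_core_web_lg instead (e.g. nlp = spacy.load("en_core_web_md")).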
I am attempting to assign binary values to 10 labels using 3 features: a headline of an article, a summary of the article, and an id of who created the labels. I'm stuck trying to create a model that accepts all 3 fields as input; currently it only works if I pass just one field. I know I am likely messing something up with the TfidfVectorizer, but I can't quite figure it out. Any help would be appreciated. The error I receive (full traceback below) is usually:
ValueError: Found input variables with inconsistent numbers of samples: [3, 75897]
[screenshot of dataframe]
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import string

df = pd.read_csv('../data/homework_clean.csv')
emotion_cols = ['emotion_0', 'emotion_1', 'emotion_2', 'emotion_3', 'emotion_4',
                'emotion_5', 'emotion_6', 'emotion_7', 'emotion_8', 'emotion_9']

def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^\w\s]+', '')
df['summary'] = df['summary'].str.lower()
df['summary'] = df['summary'].str.replace(r'[^\w\s]+', '')

stop_words = set(stopwords.words('english'))
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
df['headline'] = df['headline'].apply(removeStopWords)
df['summary'] = df['summary'].apply(removeStopWords)

stemmer = SnowballStemmer('english')
df['headline'] = df['headline'].apply(stemming)
df['summary'] = df['summary'].apply(stemming)

from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state=42, test_size=.2, shuffle=True)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

vectorizer = FeatureUnion([
    ('headline', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), norm='l2')),
    ('summary', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), norm='l2'))])

x_train = train[['headline', 'summary', 'worker_id']]
y_train = train.drop(labels=['headline', 'summary', 'worker_id'], axis=1)
x_test = test[['headline', 'summary', 'worker_id']]
y_test = test.drop(labels=['headline', 'summary', 'worker_id'], axis=1)

# If I only use one feature it works fine:
# x_train = train['headline']
# y_train = train.drop(labels=['headline', 'summary', 'worker_id'], axis=1)
# x_test = test['headline']
# y_test = test.drop(labels=['headline', 'summary', 'worker_id'], axis=1)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.multiclass import OneVsRestClassifier

OneVsRest_pipeline = Pipeline(steps=[
    ('featureunion', vectorizer),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])

OneVsRest_pipeline.fit(x_train, y_train)
predictions = OneVsRest_pipeline.predict(x_test)
prediction_prob = OneVsRest_pipeline.predict_proba(x_test)
Full Traceback
ValueError Traceback (most recent call last)
<ipython-input-27-6394288c65f8> in <module>
4 ])
5
----> 6 OneVsRest_pipeline.fit(x_train, y_train)
7 predictions = OneVsRest_pipeline.predict(x_test)
8 prediction_prob = OneVsRest_pipeline.predict_proba(x_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
354 self._log_message(len(self.steps) - 1)):
355 if self._final_estimator != 'passthrough':
--> 356 self._final_estimator.fit(Xt, y, **fit_params)
357 return self
358
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in fit(self, X, y)
214 "not %s" % self.label_binarizer_.classes_[i],
215 self.label_binarizer_.classes_[i]])
--> 216 for i, column in enumerate(columns))
217
218 return self
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in _fit_binary(estimator, X, y, classes)
78 else:
79 estimator = clone(estimator)
---> 80 estimator.fit(X, y)
81 return estimator
82
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1530
1531 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1532 accept_large_sparse=solver != 'liblinear')
1533 check_classification_targets(y)
1534 self.classes_ = np.unique(y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
727 y = y.astype(np.float64)
728
--> 729 check_consistent_length(X, y)
730
731 return X, y
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
203 if len(uniques) > 1:
204 raise ValueError("Found input variables with inconsistent numbers of"
--> 205 " samples: %r" % [int(l) for l in lengths])
206
207
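For what it's worth, the mismatch [3, 75897] has a likely explanation: FeatureUnion passes the entire DataFrame to every transformer, and iterating over a DataFrame yields its column names, so each TfidfVectorizer is fit on just 3 "documents" (the column names) while y_train has 75897 rows. A sketch of one possible fix, routing each text column to its own vectorizer with ColumnTransformer (column names as in the question; worker_id is dropped here and would need its own encoder):

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer([
    # a string column name hands that single column to the transformer
    ('headline', TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2)), 'headline'),
    ('summary', TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2)), 'summary'),
], remainder='drop')

OneVsRest_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
OneVsRest_pipeline.fit(x_train, y_train)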
I am using Python 3.7 in a Jupyter notebook. I am creating classification models based on Jason Brownlee's ebook Machine Learning Mastery with Python; the code is essentially cut and pasted from the ebook into the notebook. The models work fine when I split the data, but when I use k-fold cross-validation it generates a FutureWarning. I'll paste the code and message below. I entered error_score=np.nan and it didn't fix the problem, but I don't know where that should go in the code. I would appreciate any advice, but keep in mind that I am a novice. Thanks.
# Logistic Regression Classification
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('Diabetes_Classification.csv')
array = df.values
X = array[:, 0:8]
Y = array[:, 8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression(solver='liblinear')
error_score = np.nan  # note: a bare assignment like this has no effect on cross_val_score
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
/Users/roberthoyt/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:530: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.
  FutureWarning)

ValueError                                Traceback (most recent call last)
<ipython-input-105-010e5612fd63> in <module>
     11 model = LogisticRegression(solver='liblinear')
     12 error_score = np.nan
---> 13 results = cross_val_score(model, X, Y, cv=kfold)
     14 print(results.mean())

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    389                                 fit_params=fit_params,
    390                                 pre_dispatch=pre_dispatch,
--> 391                                 error_score=error_score)
    392     return cv_results['test_score']
    393

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    230             return_times=True, return_estimator=return_estimator,
    231             error_score=error_score)
--> 232         for train, test in cv.split(X, y, groups))
    233
    234     zipped_scores = list(zip(*scores))

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
    919             # remaining jobs.
    920             self._iterating = False
--> 921             if self.dispatch_one_batch(iterator):
    922                 self._iterating = self._original_iterator is not None
    923

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550
    551     def get(self):

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226
    227     def __len__(self):

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226
    227     def __len__(self):

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    514             estimator.fit(X_train, **fit_params)
    515         else:
--> 516             estimator.fit(X_train, y_train, **fit_params)
    517
    518     except Exception as e:

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py in fit(self, X, y, sample_weight)
   1531         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
   1532                          accept_large_sparse=solver != 'liblinear')
-> 1533         check_classification_targets(y)
   1534         self.classes_ = np.unique(y)
   1535         n_samples, n_features = X.shape

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
    167     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    168                       'multilabel-indicator', 'multilabel-sequences']:
--> 169         raise ValueError("Unknown label type: %r" % y_type)
    170
    171

ValueError: Unknown label type: 'continuous'
The problem is that your targets are continuous while you're doing a classification task. Make sure the column you're using as a target is categorical; you may have to convert it to integer. All of this is reported in the traceback:

check_classification_targets(y)
    167     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    168                       'multilabel-indicator', 'multilabel-sequences']:
--> 169         raise ValueError("Unknown label type: %r" % y_type)

Your target is not among the accepted types; it is continuous:

ValueError: Unknown label type: 'continuous'

Check whether your target is an integer with df.dtypes, and change it to integer if it isn't:
Y = array[:,8].astype(int)
That is assuming you haven't made the mistake of running a classification task on genuinely continuous values. You can also check whether all values are 0s and 1s:
np.unique(array[:, 8])
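Put together, a minimal sketch of the check and the cast (column 8 is the target, as in the code above):

import numpy as np

print(np.unique(array[:, 8]))  # e.g. [0. 1.] -> float labels trip the check
Y = array[:, 8].astype(int)    # cast to integer class labels
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())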
My first multiclass classification. I have values Xtrn and Ytrn; Ytrn has 5 values: [0, 1, 2, 3, 4]. But when I start training I get "multiclass format is not supported".
This is an example of the values:
Xtrn Ytrn
-1.35173485 1.50224188 2.04951167 0.43759658 0.24381777 2
2.81047260 1.31259056 1.39265240 0.16384002 0.65438366 3
2.32878809 -1.92845940 -2.06453246 0.73132270 0.11771229 2
-0.12810555 -2.07268765 -2.40760215 0.97855042 0.11144164 1
1.88682063 0.75792329 -0.09754671 0.46571931 0.62111648 2
-1.09361266 1.74758304 2.49960891 0.36679883 0.88895562 2
0.71760095 -1.30711698 -2.15681966 0.33700593 0.07171119 2
4.60060308 -1.60544855 -1.88996123 0.94500124 0.63776116 4
-0.84223064 2.78233537 3.07299711 0.31470071 0.34424704 1
-0.71236435 0.53140549 0.46677096 0.12320728 0.58829090 2
-0.35333909 1.12463059 1.70104349 0.89084673 0.16585229 2
3.04322100 -1.36878116 -2.31056167 0.81178387 0.04095645 1
-1.04088918 -1.97497570 -1.93285343 0.54101882 0.02528487 1
-0.41624939 0.54592833 0.95458283 0.40004902 0.55062705 2
-1.77706795 0.29061278 0.68186697 0.17430716 0.75095729 0
This is the code:
# import data
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from sklearn import metrics, cross_validation, grid_search, preprocessing

Xtrn = pd.read_csv('x_train_secret.csv', header=None, delimiter=';', na_values='?')
Ytrn = pd.read_csv('y_train_secret.csv', header=None)
Test = pd.read_csv('x_test_secret.csv', header=None, delimiter=';', na_values='?')

# number of unique values in Ytrn
n_classes_ = len(np.unique(Ytrn))

# learning model
X_train, X_test, y_train, y_test = train_test_split(Xtrn, Ytrn, test_size=0.30, random_state=42)
xgb_model = xgb.XGBClassifier(objective='multi:softmax')
xgb_params = [{'num_class': n_classes_}]  # note: immediately overwritten below
xgb_params = [
    {
        "n_estimators": range(50, 501, 50),
    }
]

# cv
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)
This is the error:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-233-77d3e8d4b8c3> in <module>()
10
11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
---> 12 xgb_grid.fit(X_train, y_train)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
827
828 """
--> 829 return self._fit(X, y, ParameterGrid(self.param_grid))
830
831
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
571 self.fit_params, return_parameters=True,
572 error_score=self.error_score)
--> 573 for parameters in parameter_iterable
574 for train, test in cv)
575
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1682
1683 else:
-> 1684 test_score = _score(estimator, X_test, y_test, scorer)
1685 if return_train_score:
1686 train_score = _score(estimator, X_train, y_train, scorer)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1739 score = scorer(estimator, X_test)
1740 else:
-> 1741 score = scorer(estimator, X_test, y_test)
1742 if hasattr(score, 'item'):
1743 try:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight)
169 y_type = type_of_target(y)
170 if y_type not in ("binary", "multilabel-indicator"):
--> 171 raise ValueError("{0} format is not supported".format(y_type))
172
173 if is_regressor(clf):
ValueError: multiclass format is not supported
I found the answer: scoring='roc_auc' is only for binary classification (and multilabel-indicator targets), so you need another scorer, e.g. 'accuracy'.
The line xgb_params = [{'num_class': n_classes_}] also needs to be deleted; it is immediately overwritten anyway, and XGBClassifier sets num_class from the data.
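A sketch of the corrected grid search, keeping the question's (since-removed) sklearn 0.x API and the same X_train/y_train:

# 'roc_auc' supports only binary or multilabel-indicator targets, so switch
# to a multiclass-capable scorer such as 'accuracy'; XGBClassifier infers
# the number of classes from y, so the num_class parameter list can go.
xgb_params = [{"n_estimators": range(50, 501, 50)}]
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='accuracy',
                                    cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)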