I'm trying to modify this example to use a Pandas dataframe instead of the test datasets. I am not able to do so, as ItemSelector does not seem to recognise the column name.
Please do note the columns of the dataframe df_resolved.columns returns:
Index(['u_category', ... ... 'resolution_time', 'rawtext'],
dtype='object')
So I obviously do have this in my dataframe.
However, when I try to run the solution, I get the error
"ValueError: no field of name u_category"
Also, I don't seem to be able to modify the code to support choosing multiple columns in the ItemSelector, so in this solution, I'd have to apply the transformers separately with each column.
My code is:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'length': len(text),
'num_sentences': text.count('.')}
for text in posts]
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
"""Extract the subject & body from a usenet post in a single pass.
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
return self
def transform(self, posts):
features = np.recarray(shape=(len(posts),),
dtype=[('subject', object), ('body', object)])
for i, text in enumerate(posts):
headers, _, bod = text.partition('\n\n')
bod = strip_newsgroup_footer(bod)
bod = strip_newsgroup_quoting(bod)
features['body'][i] = bod
prefix = 'Subject:'
sub = ''
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features['subject'][i] = sub
return features
pipeline = Pipeline([
# Extract the subject & body
('subjectbody', SubjectBodyExtractor()),
# Use FeatureUnion to combine the features from subject and body
('union', FeatureUnion(
transformer_list=[
# Pipeline for pulling features from the post's subject line
('rawtext', Pipeline([
('selector', ItemSelector(key='u_category')),
('labelenc', preprocessing.LabelEncoder()),
])),
# Pipeline for standard bag-of-words model for body
('features', Pipeline([
('selector', ItemSelector(key='rawtext')),
('tfidf', TfidfVectorizer(max_df=0.5, min_df=1,
stop_words='english',
token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')),
])),
],
# weight components in FeatureUnion
transformer_weights={
'rawtext': 1.0,
'features': 1.0,
},
)),
# Use a SVC classifier on the combined features
('linear_svc', LinearSVC(penalty="l2")),
])
# limit the list of categories to make running this example faster.
X_train, X_test, y_train, y_test = train_test_split(df_resolved.ix[:, (df_resolved.columns != 'assignment_group.name')], df_resolved['assignment_group.name'], test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
print(pipeline.score(X_test, y_test))
How can I modify this code to work properly with my dataframe, and possibly support applying a transformer to multiple columns at once?
If I take the ItemSelector out, it seems to work. So this works:
ds = ItemSelector(key='u_category')
ds.fit(df_resolved)
labelenc = preprocessing.LabelEncoder()
labelenc_transformed = labelenc.fit_transform(ds.transform(df_resolved))
FULL STACK TRACE:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-93-a4ba29c137ec> in <module>()
136
137
--> 138 pipeline.fit(X_train, y_train)
139 #y = pipeline.predict(X_test)
140 #print(classification_report(y, test.target))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
299 """
300 last_step = self._final_estimator
--> 301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
303 return last_step.fit_transform(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
<ipython-input-93-a4ba29c137ec> in transform(self, data_dict)
55
56 def transform(self, data_dict):
---> 57 return data_dict[self.key]
58
59
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/numpy/core/records.py in __getitem__(self, indx)
497
498 def __getitem__(self, indx):
--> 499 obj = super(recarray, self).__getitem__(indx)
500
501 # copy behavior of getattr, except that here
ValueError: no field of name u_category
UPDATE:
Even if I use dataframes (NO train_test_split), the issue persists:
UPDATE 2:
OK so I removed the SubjectBodyExtractor, since I won't need that. Now the ValueError: no field of name u_category is gone, but I have a new error: TypeError: fit_transform() takes 2 positional arguments but 3 were given.
Stack trace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-110-292294015e44> in <module>()
129
130
--> 131 pipeline.fit(X_train.ix[:, (X_test.columns != 'assignment_group.name')], X_test['assignment_group.name'])
132 #y = pipeline.predict(X_test)
133 #print(classification_report(y, test.target))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
--> 303 return last_step.fit_transform(Xt, y, **fit_params)
304 elif last_step is None:
305 return Xt
TypeError: fit_transform() takes 2 positional arguments but 3 were given
Yes, thats because LabelEncoder only requires a single array y whereas FeatureUnion will try sending X and y both to it.
See this: https://github.com/scikit-learn/scikit-learn/issues/3956
You can use a simple workaround for this:
Define a custom labelEncoder like this:
class MyLabelEncoder(BaseEstimator, TransformerMixin):
def __init__(self):
self.le = LabelEncoder()
def fit(self, x, y=None):
return self.le.fit(x)
def transform(self, x, y=None):
return self.le.transform(x).reshape(-1,1)
def fit_transform(self, x, y=None):
self.fit(x)
return self.transform(x)
And in the pipeline, do this:
....
....
('selector', ItemSelector(key='u_category')),
('labelenc', MyLabelEncoder()),
Please note the reshape(-1,1) in the trasform() method. Thats because FeatureUnion only works with 2-d data. All the individual transformers inside the FeatureUnion should only return 2-d data.
you may need to add them in the features array like this , please try to add the two selectors in the features like this and show me the results
features = np.recarray(shape=(len(posts),),
dtype=[('u_category', object), ('rawtext', object)])
Related
i want to establish a pipe line to pubg data on kaggle to procces it but when i implement a pipe line this error get to me:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_35/3879657662.py in <module>
8 ])
9
---> 10 pubg_num_tr = num_pipeline.fit_transform(pubg_num)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
424 """
425 fit_params_steps = self._check_fit_params(**fit_params)
--> 426 Xt = self._fit(X, y, **fit_params_steps)
427
428 last_step = self._final_estimator
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
--> 355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
845 if y is None:
846 # fit method of arity 1 (unsupervised transformation)
--> 847 return self.fit(X, **fit_params).transform(X)
848 else:
849 # fit method of arity 2 (supervised transformation)
/tmp/ipykernel_35/2077244363.py in transform(self, X)
13 total_distance = X[:, walkDistance_ix] + X[:, rideDistance_ix]+X[:, swimDistance_ix]
14 if self.add_total_distance_per_seconda:
---> 15 add_total_distance_per_seconda = X[:, total_distance] / X[:, matchDuration_ix]
16 return np.c_[X, walk_distance_per_seconda, total_distance,
17 add_total_distance_per_seconda]
IndexError: arrays used as indices must be of integer (or boolean) type
my pipeline code is:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler())
])
pubg_num_tr = num_pipeline.fit_transform(pubg_num)
i implemented an attribute adder and it worked properly but when i turn on the pipline it fails, i need a solution without the need to converse a float to integers because it harms data.
I am building a model using customized transformers (KeyError: "None of [Index([('A','B','C')] , dtype='object')] are in the [columns]).
When I run the below code, I get an error because of .fit:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-165-289e1d466eb9> in <module>
10
11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
13
14 # metrics
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1000 transformers = list(self._iter())
1001
-> 1002 return Parallel(n_jobs=self.n_jobs)(delayed(func)(
1003 transformer, X, y, weight,
1004 message_clsname='FeatureUnion',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1042 self._iterating = self._original_iterator is not None
1043
-> 1044 while self.dispatch_one_batch(iterator):
1045 pass
1046
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
The code is
# MODEL
from sklearn import tree
# Decision Tree
decision_tree = tree.DecisionTreeClassifier()
# define full pipeline --> preprocessing + model
full_pipeline = Pipeline(steps=[
('preprocess_pipeline', preprocess_pipeline),
('model', decision_tree)])
# fit on the complete pipeline
training = full_pipeline.fit(X, y) # <- this step returns the error
# metrics
score_test = \
round(training.score(X, y) * 100, 2)
print(f"\nTraining Accuracy: {score_test}")
I have also tried with .fit_transform but I get the same error.
I read this: AttributeError: 'numpy.ndarray' object has no attribute 'lower' fitting logistic model data but it seems that I am not passing X or y in the Decision tree like in that example, but maybe I am wrong.
Adding
# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
('text_transformer', TextTransformer()),
('cv', CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=False))])
I get this new error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-159-289e1d466eb9> in <module>
10
11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
13
14 # metrics
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1000 transformers = list(self._iter())
1001
-> 1002 return Parallel(n_jobs=self.n_jobs)(delayed(func)(
1003 transformer, X, y, weight,
1004 message_clsname='FeatureUnion',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1042 self._iterating = self._original_iterator is not None
1043
-> 1044 while self.dispatch_one_batch(iterator):
1045 pass
1046
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
104 doc = preprocessor(doc)
105 if tokenizer is not None:
--> 106 doc = tokenizer(doc)
107 if ngrams is not None:
108 if stop_words is not None:
TypeError: cannot use a string pattern on a bytes-like object
If I remove text_pipeline, the error does not occur, so it seems that something is going wrong because of the way to use countVectorizer.
An example of text is
an example
example number 1
this is another small example
I have other columns that are numerical and categorical.
Have you experienced a similar issue? If yes, how did you handle it?
A common error in text transformers of sklearn involves the shape of the data: unlike most other sklearn preprocessors, text transformers generally expect a one-dimensional input, and python's duck-typing causes weird errors from both arrays and strings being iterables.
Your TextTransformer.transform returns X[['Tweet']], which is 2-dimensional, and will cause problems with the subsequent CountVectorizer. (Converting to a numpy array with .values doesn't change the dimensionality problem, but there's also no compelling reason to do that conversion.) Returning X['Tweet'] instead should cure that problem.
I am newbie in programming and machine learning. I am doing an assignment on KNN and amazon fine food reviews but getting this error.
My code:
from sklearn.model_selection import train_test_split
Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values
X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)
print(X_with_stop_train.shape, y_train.shape,y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
neighbors = list(range(3,99,2))
cv_scores = []
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()
The output:
(413629,) (413629,) (203729,)
The error i am getting is as below:
MemoryError Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
43 for k in neighbors:
44 knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45 scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46 cv_scores.append(scores.mean())
47
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
99 super(_PredictScorer, self).__call__(estimator, X, y_true,
100 sample_weight=sample_weight)
--> 101 y_pred = estimator.predict(X)
102 if sample_weight is not None:
103 return self._sign * self._score_func(y_true, y_pred,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
477 if self.shape[1] != other.shape[0]:
478 raise ValueError('dimension mismatch')
--> 479 return self._mul_sparse_matrix(other)
480
481 # If it's a list or whatever, treat it like a matrix
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
500 maxval=nnz)
501 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502 indices = np.empty(nnz, dtype=idx_dtype)
503 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
504
A MemoryError usually means that you ran out of RAM. And seeing the size of your dataset, I think it might be a plausible explanation.
To be sure, just look at your RAM usage while executing your code.
I am using GridSearchCV in order to find the best parameters for my pipeline.
My pipeline seems to work well as I can apply:
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
And I get a decent result.
But GridSearchCV obviously doesn't like something, and I cannot figure it out.
My pipeline:
feats = FeatureUnion([('age', age),
('education_num', education_num),
('is_education_favo', is_education_favo),
('is_marital_status_favo', is_marital_status_favo),
('hours_per_week', hours_per_week),
('capital_diff', capital_diff),
('sex', sex),
('race', race),
('native_country', native_country)
])
pipeline = Pipeline([
('adhocFC',AdHocFeaturesCreation()),
('imputers', KnnImputer(target = 'native-country', n_neighbors = 5)),
('features',feats),('clf',LogisticRegression())])
My GridSearch:
hyperparameters = {'imputers__n_neighbors' : [5,21,41], 'clf__C' : [1.0, 2.0]}
GSCV = GridSearchCV(pipeline, hyperparameters, cv=3, scoring = 'roc_auc' , refit = False) #change n_jobs = 2, refit = False
GSCV.fit(X_train, y_train)
I receive 11 similar warnings:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:11:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
and this is the error message:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:11:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:12:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:14:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) in ()
3 GSCV = GridSearchCV(pipeline, hyperparameters, cv=3, scoring = 'roc_auc' ,refit = False) #change n_jobs = 2, refit = False
4
----> 5 GSCV.fit(X_train, y_train)
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/model_selection/_search.py
in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/model_selection/_search.py
in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in call(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py
in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py
in init(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in call(self)
129
130 def call(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def len(self):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in (.0)
129
130 def call(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def len(self):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/model_selection/_validation.py
in _fit_and_score(estimator, X, y, scorer, train, test, verbose,
parameters, fit_params, return_train_score, return_parameters,
return_n_test_samples, return_times, error_score)
236 estimator.fit(X_train, **fit_params)
237 else:
--> 238 estimator.fit(X_train, y_train, **fit_params)
239
240 except Exception as e:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/pipeline.py
in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/pipeline.py
in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/base.py
in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
in fit(self, X, y)
16 self.ohe.fit(X_full)
17 #Create a Dataframe that does not contain any nulls, categ variables are OHE, with all each rows
---> 18 X_ohe_full = self.ohe.transform(X_full[~X[self.col].isnull()].drop(self.col,
axis=1))
19
20 #Fit the classifier on lines where col is null
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/frame.py
in getitem(self, key) 2057 return
self._getitem_multilevel(key) 2058 else:
-> 2059 return self._getitem_column(key) 2060 2061 def _getitem_column(self, key):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/frame.py
in _getitem_column(self, key) 2064 # get column 2065
if self.columns.is_unique:
-> 2066 return self._get_item_cache(key) 2067 2068 # duplicate columns & possible reduce dimensionality
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py
in _get_item_cache(self, item) 1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item) 1387 res = self._box_item_values(item, values) 1388
cache[item] = res
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py
in get(self, item, fastpath) 3550 loc =
indexer.item() 3551 else:
-> 3552 raise ValueError("cannot label index with a null key") 3553 3554 return self.iget(loc,
fastpath=fastpath)
ValueError: cannot label index with a null key
Without additional information I believe it is because your X_train and y_train variables are pandas dataframe, the basic sci-kit learn library isn't comparable with these: e.g., the .fit method of a classifier is expecting an array like object.
By feeding in pandas dataframes you are inadvertently indexing them like numpy arrays, which is not that stable in pandas.
Try converting your training data to numpy arrays:
X_train_arr = X_train.to_numpy()
y_train_arr = y_train.to_numpy()
I keep getting an
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
while trying to fit my dataframe to the following pipeline. Train and Test are two dataframes with same columns. There are different columns but I only want to focus on three of them through the ItemSelector.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.column]
def predictCases(train, test):
target_names = sorted(list(set(train['TARGET'].values)))
y_train = np.array([target_names.index(x) for x in train['TARGET'].values])
y_test = np.array([target_names.index(x) for x in test['TARGET'].values])
# train and predict
classifier = Pipeline([
('union', FeatureUnion([
('text', Pipeline([
('selector', ItemSelector(column='TEXT')),
('tfidf_vec', TfidfVectorizer())
])),
('feature1', Pipeline([
('selector', ItemSelector(column='CATEG_FEAT1')),
('lbe', LabelEncoder())
])),
('feature2', Pipeline([
('selector', ItemSelector(column='CATEG_FEAT2')),
('lbe', LabelEncoder())
]))
])),
('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(train.values, y_train)
predicted = classifier.predict(test.values)
return(metrics.precision_recall_fscore_support(y_test, predicted))
Full Error:
IndexError Traceback (most recent call last)
<ipython-input-19-95d9d0c337f4> in <module>()
----> 1 tt = predictCases(train_resampled, validate)
<ipython-input-17-efc951f4192e> in predictCases(train, test)
24 ])),
25 ('clf', OneVsRestClassifier(LinearSVC()))])
---> 26 classifier.fit(train.values, y_train)
27 predicted = classifier.predict(test.values)
28 return(metrics.precision_recall_fscore_support(y_test, predicted))
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
299 """
300 last_step = self._final_estimator
--> 301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
303 return last_step.fit_transform(Xt, y, **fit_params)
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
C:\\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
<ipython-input-2-fdc42fd9d831> in transform(self, X)
10
11 def transform(self, X):
---> 12 return X[self.column]
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
Edit:
If I use train instead of train.values in fit I get the following error:
TypeError: fit_transform() takes 2 positional arguments but 3 were given
You're passing test.values (i.e. a numpy array with raw DataFrame values) to classifier.predict and classifier.fit, while your transformer expects a DataFrame object.