IndexError while fitting pipeline with FeatureUnion - python

I keep getting an
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
while trying to fit my dataframe to the following pipeline. Train and Test are two dataframes with same columns. There are different columns but I only want to focus on three of them through the ItemSelector.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.column]
def predictCases(train, test):
target_names = sorted(list(set(train['TARGET'].values)))
y_train = np.array([target_names.index(x) for x in train['TARGET'].values])
y_test = np.array([target_names.index(x) for x in test['TARGET'].values])
# train and predict
classifier = Pipeline([
('union', FeatureUnion([
('text', Pipeline([
('selector', ItemSelector(column='TEXT')),
('tfidf_vec', TfidfVectorizer())
])),
('feature1', Pipeline([
('selector', ItemSelector(column='CATEG_FEAT1')),
('lbe', LabelEncoder())
])),
('feature2', Pipeline([
('selector', ItemSelector(column='CATEG_FEAT2')),
('lbe', LabelEncoder())
]))
])),
('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(train.values, y_train)
predicted = classifier.predict(test.values)
return(metrics.precision_recall_fscore_support(y_test, predicted))
Full Error:
IndexError Traceback (most recent call last)
<ipython-input-19-95d9d0c337f4> in <module>()
----> 1 tt = predictCases(train_resampled, validate)
<ipython-input-17-efc951f4192e> in predictCases(train, test)
24 ])),
25 ('clf', OneVsRestClassifier(LinearSVC()))])
---> 26 classifier.fit(train.values, y_train)
27 predicted = classifier.predict(test.values)
28 return(metrics.precision_recall_fscore_support(y_test, predicted))
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
299 """
300 last_step = self._final_estimator
--> 301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
303 return last_step.fit_transform(Xt, y, **fit_params)
C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
C:\\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
<ipython-input-2-fdc42fd9d831> in transform(self, X)
10
11 def transform(self, X):
---> 12 return X[self.column]
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
Edit:
If I use train instead of train.values in fit I get the following error:
TypeError: fit_transform() takes 2 positional arguments but 3 were given

You're passing test.values (i.e. a numpy array with raw DataFrame values) to classifier.predict and classifier.fit, while your transformer expects a DataFrame object.

Related

i want to establish a pipe line to pubg data on kaggle to procces it but when i implement a pipe line this error get to me

i want to establish a pipe line to pubg data on kaggle to procces it but when i implement a pipe line this error get to me:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_35/3879657662.py in <module>
8 ])
9
---> 10 pubg_num_tr = num_pipeline.fit_transform(pubg_num)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
424 """
425 fit_params_steps = self._check_fit_params(**fit_params)
--> 426 Xt = self._fit(X, y, **fit_params_steps)
427
428 last_step = self._final_estimator
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
--> 355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
845 if y is None:
846 # fit method of arity 1 (unsupervised transformation)
--> 847 return self.fit(X, **fit_params).transform(X)
848 else:
849 # fit method of arity 2 (supervised transformation)
/tmp/ipykernel_35/2077244363.py in transform(self, X)
13 total_distance = X[:, walkDistance_ix] + X[:, rideDistance_ix]+X[:, swimDistance_ix]
14 if self.add_total_distance_per_seconda:
---> 15 add_total_distance_per_seconda = X[:, total_distance] / X[:, matchDuration_ix]
16 return np.c_[X, walk_distance_per_seconda, total_distance,
17 add_total_distance_per_seconda]
IndexError: arrays used as indices must be of integer (or boolean) type
my pipeline code is:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler())
])
pubg_num_tr = num_pipeline.fit_transform(pubg_num)
i implemented an attribute adder and it worked properly but when i turn on the pipline it fails, i need a solution without the need to converse a float to integers because it harms data.

ValueError: blocks[0,:] has incompatible row dimensions

Im trying to extract few text features(word_count.char_count...) & tf-idf from a twitter dataset for sentiment analysis. Using sklearn's featureUnion to combine them and give them to a classifier in a Pipeline.
Im getting the following error ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,8].shape[0] == 7920, expected 1. Here is the code:
features_union = FeatureUnion(transformer_list = [('word_count', WordCalculator()),
('char_count', CharCalculator()),
('avg_word_len', AvdWordLengthCalculater()),
('stop_words_count', StopWordsCalculater()),
('spl_char_count', SplCharCalculater()),
('hash_tag_count', HashTagCalculator()),
('num_count',NumericsCalculator()),
('cap_letter_count',CapsCalculator()),
('tfidf_feature',Pipeline([('preprocessor', Preprocessor()),
('selector', ItemSelector('tweet')),
('count', CountVectorizer()),
('tfidf', TfidfTransformer())]))])
pipeline = Pipeline([('noise_remover', UrlRemover()),
('features', features_union),
('model', MultinomialNB())])
pipeline.fit(train, train['label'])```
Here is the complete error log
ValueError Traceback (most recent call last)
<ipython-input-33-bb532fc90bb0> in <module>
14 ('features', features_union),
15 ('model', MultinomialNB())])
---> 16 pipeline.fit(train, train['label'])
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
348 This estimator
349 """
--> 350 Xt, fit_params = self._fit(X, y, **fit_params)
351 with _print_elapsed_time('Pipeline',
352 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
313 message_clsname='Pipeline',
314 message=self._log_message(step_idx),
--> 315 **fit_params_steps[name])
316 # Replace the transformer of the step with the fitted
317 # transformer. This is necessary when loading the transformer
~/opt/anaconda3/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
943
944 if any(sparse.issparse(f) for f in Xs):
--> 945 Xs = sparse.hstack(Xs).tocsr()
946 else:
947 Xs = np.hstack(Xs)
~/opt/anaconda3/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
~/opt/anaconda3/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,8].shape[0] == 7920, expected 1.
data set sample:
0 1 0 #fingerprint #Pregnancy Test https://google.com...
1 2 0 Finally a transparant silicon case ^^ Thanks t...
2 3 0 We love this! Would you go? #talk #makememorie...
3 4 0 I'm wired I know I'm George I was made that wa...
4 5 1 What amazing service! Apple won't even talk to...
dataset shape - (7920, 3)
Any immediate help on this would be grateful.

Grid Search Cross Validation error when trying to fit X, y with GridSearchCV sklearn

Python sci-kit learn KNN Grid Search Cross Validation error
I am trying to recreated KNN model for prediction of car destination.
https://github.com/carlosbkm/car-destination-prediction
The code is not working at Grid search cross validation here:
https://github.com/carlosbkm/car-destination-prediction/blob/master/k-nearest-model.ipynb
At first geodash was not working so I switched it to geodash2 and there was no problem.
When I try to fit the model I get.
TypeError: unsupported operand type(s) for /: 'str' and 'int'
When I try to fit X and y for Grid Search Cross Validation I get an error.
The problem is coming from
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
if score_func:
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
else:
gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X, y)
print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
best = gs.best_estimator_
return best
I can not fit the model to X and y:
gs.fit(X, y)
I tried to make X and y into floats but nothing changed
When I execute this:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-124-34b56429c6b5> in <module>()
4 #knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
5 knn_parameters = {"n_neighbors": [1,2,5]}
----> 6 knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
<ipython-input-116-1a00f84f1047> in cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func)
6 else:
7 gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
----> 8 gs.fit(X, y)
9 print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
10 best = gs.best_estimator_
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
258 else:
259 fit_time = time.time() - start_time
--> 260 test_score = _score(estimator, X_test, y_test, scorer)
261 score_time = time.time() - start_time - fit_time
262 if return_train_score:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
286 score = scorer(estimator, X_test)
287 else:
--> 288 score = scorer(estimator, X_test, y_test)
289 if hasattr(score, 'item'):
290 try:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
89 super(_PredictScorer, self).__call__(estimator, X, y_true,
90 sample_weight=sample_weight)
---> 91 y_pred = estimator.predict(X)
92 if sample_weight is not None:
93 return self._sign * self._score_func(y_true, y_pred,
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
151
152 if weights is None:
--> 153 y_pred = np.mean(_y[neigh_ind], axis=1)
154 else:
155 y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2907
2908 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2909 out=out, **kwargs)
2910
2911
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 if isinstance(ret, mu.ndarray):
72 ret = um.true_divide(
---> 73 ret, rcount, out=ret, casting='unsafe', subok=False)
74 if is_float16_result and out is None:
75 ret = arr.dtype.type(ret)
TypeError: unsupported operand type(s) for /: 'str' and 'int'

MemoryError cross_val_score Jupyter Notebook

I am newbie in programming and machine learning. I am doing an assignment on KNN and amazon fine food reviews but getting this error.
My code:
from sklearn.model_selection import train_test_split
Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values
X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)
print(X_with_stop_train.shape, y_train.shape,y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
neighbors = list(range(3,99,2))
cv_scores = []
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()
The output:
(413629,) (413629,) (203729,)
The error i am getting is as below:
MemoryError Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
43 for k in neighbors:
44 knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45 scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46 cv_scores.append(scores.mean())
47
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
99 super(_PredictScorer, self).__call__(estimator, X, y_true,
100 sample_weight=sample_weight)
--> 101 y_pred = estimator.predict(X)
102 if sample_weight is not None:
103 return self._sign * self._score_func(y_true, y_pred,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
477 if self.shape[1] != other.shape[0]:
478 raise ValueError('dimension mismatch')
--> 479 return self._mul_sparse_matrix(other)
480
481 # If it's a list or whatever, treat it like a matrix
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
500 maxval=nnz)
501 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502 indices = np.empty(nnz, dtype=idx_dtype)
503 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
504
A MemoryError usually means that you ran out of RAM. And seeing the size of your dataset, I think it might be a plausible explanation.
To be sure, just look at your RAM usage while executing your code.

Custom FeatureUnion won't work?

I'm trying to modify this example to use a Pandas dataframe instead of the test datasets. I am not able to do so, as ItemSelector does not seem to recognise the column name.
Please do note the columns of the dataframe df_resolved.columns returns:
Index(['u_category', ... ... 'resolution_time', 'rawtext'],
dtype='object')
So I obviously do have this in my dataframe.
However, when I try to run the solution, I get the error
"ValueError: no field of name u_category"
Also, I don't seem to be able to modify the code to support choosing multiple columns in the ItemSelector, so in this solution, I'd have to apply the transformers separately with each column.
My code is:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'length': len(text),
'num_sentences': text.count('.')}
for text in posts]
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
"""Extract the subject & body from a usenet post in a single pass.
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
return self
def transform(self, posts):
features = np.recarray(shape=(len(posts),),
dtype=[('subject', object), ('body', object)])
for i, text in enumerate(posts):
headers, _, bod = text.partition('\n\n')
bod = strip_newsgroup_footer(bod)
bod = strip_newsgroup_quoting(bod)
features['body'][i] = bod
prefix = 'Subject:'
sub = ''
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features['subject'][i] = sub
return features
pipeline = Pipeline([
# Extract the subject & body
('subjectbody', SubjectBodyExtractor()),
# Use FeatureUnion to combine the features from subject and body
('union', FeatureUnion(
transformer_list=[
# Pipeline for pulling features from the post's subject line
('rawtext', Pipeline([
('selector', ItemSelector(key='u_category')),
('labelenc', preprocessing.LabelEncoder()),
])),
# Pipeline for standard bag-of-words model for body
('features', Pipeline([
('selector', ItemSelector(key='rawtext')),
('tfidf', TfidfVectorizer(max_df=0.5, min_df=1,
stop_words='english',
token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')),
])),
],
# weight components in FeatureUnion
transformer_weights={
'rawtext': 1.0,
'features': 1.0,
},
)),
# Use a SVC classifier on the combined features
('linear_svc', LinearSVC(penalty="l2")),
])
# limit the list of categories to make running this example faster.
X_train, X_test, y_train, y_test = train_test_split(df_resolved.ix[:, (df_resolved.columns != 'assignment_group.name')], df_resolved['assignment_group.name'], test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
print(pipeline.score(X_test, y_test))
How can I modify this code to work properly with my dataframe, and possibly support applying a transformer to multiple columns at once?
If I take the ItemSelector out, it seems to work. So this works:
ds = ItemSelector(key='u_category')
ds.fit(df_resolved)
labelenc = preprocessing.LabelEncoder()
labelenc_transformed = labelenc.fit_transform(ds.transform(df_resolved))
FULL STACK TRACE:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-93-a4ba29c137ec> in <module>()
136
137
--> 138 pipeline.fit(X_train, y_train)
139 #y = pipeline.predict(X_test)
140 #print(classification_report(y, test.target))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
299 """
300 last_step = self._final_estimator
--> 301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
303 return last_step.fit_transform(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
<ipython-input-93-a4ba29c137ec> in transform(self, data_dict)
55
56 def transform(self, data_dict):
---> 57 return data_dict[self.key]
58
59
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/numpy/core/records.py in __getitem__(self, indx)
497
498 def __getitem__(self, indx):
--> 499 obj = super(recarray, self).__getitem__(indx)
500
501 # copy behavior of getattr, except that here
ValueError: no field of name u_category
UPDATE:
Even if I use dataframes (NO train_test_split), the issue persists:
UPDATE 2:
OK so I removed the SubjectBodyExtractor, since I won't need that. Now the ValueError: no field of name u_category is gone, but I have a new error: TypeError: fit_transform() takes 2 positional arguments but 3 were given.
Stack trace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-110-292294015e44> in <module>()
129
130
--> 131 pipeline.fit(X_train.ix[:, (X_test.columns != 'assignment_group.name')], X_test['assignment_group.name'])
132 #y = pipeline.predict(X_test)
133 #print(classification_report(y, test.target))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
--> 303 return last_step.fit_transform(Xt, y, **fit_params)
304 elif last_step is None:
305 return Xt
TypeError: fit_transform() takes 2 positional arguments but 3 were given
Yes, thats because LabelEncoder only requires a single array y whereas FeatureUnion will try sending X and y both to it.
See this: https://github.com/scikit-learn/scikit-learn/issues/3956
You can use a simple workaround for this:
Define a custom labelEncoder like this:
class MyLabelEncoder(BaseEstimator, TransformerMixin):
def __init__(self):
self.le = LabelEncoder()
def fit(self, x, y=None):
return self.le.fit(x)
def transform(self, x, y=None):
return self.le.transform(x).reshape(-1,1)
def fit_transform(self, x, y=None):
self.fit(x)
return self.transform(x)
And in the pipeline, do this:
....
....
('selector', ItemSelector(key='u_category')),
('labelenc', MyLabelEncoder()),
Please note the reshape(-1,1) in the trasform() method. Thats because FeatureUnion only works with 2-d data. All the individual transformers inside the FeatureUnion should only return 2-d data.
you may need to add them in the features array like this , please try to add the two selectors in the features like this and show me the results
features = np.recarray(shape=(len(posts),),
dtype=[('u_category', object), ('rawtext', object)])

Categories

Resources