I'm trying to implement a pipeline with FAMD, SMOTENC, and other preprocessing steps. However, it gives an error each time. If I remove FAMD from the pipeline, it works fine.
My code:
#Separate the dataset into two parts
num_df= X_train_new.select_dtypes(include=[np.number]).columns
cat_df= X_train_new.select_dtypes(exclude=[np.number]).columns
#Create a mask for categorical features
categorical_feature_mask = X_train_new.dtypes == object
print(categorical_feature_mask)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
#Create a pipeline to automate the preprocessing steps and SMOTENC together
num_pipe = make_pipeline(SimpleImputer(strategy='median'))
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         OneHotEncoder(handle_unknown='ignore'))
transformer = make_column_transformer((num_pipe, selector(dtype_include='number')),
                                      (cat_pipe, selector(dtype_include='object')), n_jobs=2)
#Oversampling with SMOTENC
from imblearn.over_sampling import SMOTENC
smote= SMOTENC(categorical_features=categorical_feature_mask,random_state=99)
!pip install prince
from prince import FAMD
famd=FAMD(n_components=4,random_state=99)
from imblearn.pipeline import make_pipeline as imb_pipeline
#Fit the random forest learner
rf = RandomForestClassifier(n_estimators=300, random_state=99)
pipe=imb_pipeline(transformer,smote,famd,rf)
pipe.fit(X_train_new,y_train_new)
print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
The error:
AttributeError Traceback (most recent call last)
<ipython-input-24-2b7ea084a318> in <module>()
3 rf=RandomForestClassifier(n_estimators=300,max_features=3,criterion='entropy',random_state=99)
4 pipe=imb_pipeline(transformer,smote,famd,rf)
----> 5 pipe.fit(X_train_new,y_train_new)
6 print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
6 frames
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
235
236 """
--> 237 Xt, yt, fit_params = self._fit(X, y, **fit_params)
238 if self._final_estimator is not None:
239 self._final_estimator.fit(Xt, yt, **fit_params)
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
195 Xt, fitted_transformer = fit_transform_one_cached(
196 cloned_transformer, None, Xt, yt,
--> 197 **fit_params_steps[name])
198 elif hasattr(cloned_transformer, "fit_resample"):
199 Xt, yt, fitted_transformer = fit_resample_one_cached(
/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
564 def _fit_transform_one(transformer, weight, X, y, **fit_params):
565 if hasattr(transformer, 'fit_transform'):
--> 566 res = transformer.fit_transform(X, y, **fit_params)
567 else:
568 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
--> 574 return self.fit(X, y, **fit_params).transform(X)
575
576
/usr/local/lib/python3.7/dist-packages/prince/famd.py in fit(self, X, y)
27
28 # Separate numerical columns from categorical columns
---> 29 num_cols = X.select_dtypes(np.number).columns.tolist()
30 cat_cols = list(set(X.columns) - set(num_cols))
31
/usr/local/lib/python3.7/dist-packages/scipy/sparse/base.py in __getattr__(self, attr)
689 return self.getnnz()
690 else:
--> 691 raise AttributeError(attr + " not found")
692
693 def transpose(self, axes=None, copy=False):
AttributeError: select_dtypes not found
tl;dr: try adding sparse=False to your OneHotEncoder. Consider raising an issue with prince so it can handle sparse inputs.
You can see from the traceback that the problem is that FAMD.fit tries X.select_dtypes to separate categorical and numeric data. select_dtypes is a pandas method, so normally I would assume that prince is written to operate on dataframes and not the numpy arrays that sklearn uses internally (after converting from frames if necessary). Looking at the source, however, a few lines above that one they do convert from a numpy array to a dataframe. But the last traceback frame is from scipy, which hints that your X may actually be a sparse array. And indeed OneHotEncoder (earlier in your pipeline) outputs sparse arrays by default, and ColumnTransformer decides whether to produce sparse or dense output depending on its component parts and the sparse_threshold parameter.
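For concreteness, here is a minimal sketch of both options, reusing the pipeline pieces defined in the question (note that in scikit-learn 1.2+ the OneHotEncoder argument is sparse_output rather than sparse):
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

num_pipe = make_pipeline(SimpleImputer(strategy='median'))

# Option 1: ask the encoder itself for dense output
# (in scikit-learn >= 1.2 the argument is sparse_output=False instead of sparse=False)
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         OneHotEncoder(handle_unknown='ignore', sparse=False))

# Option 2: force the ColumnTransformer to emit a dense array regardless of
# what its sub-transformers produce, via sparse_threshold=0
transformer = make_column_transformer(
    (num_pipe, selector(dtype_include='number')),
    (cat_pipe, selector(dtype_include='object')),
    sparse_threshold=0,
    n_jobs=2)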
I'm trying to perform a support vector regression on my dataset; however, I have run into a problem when trying to standard-scale it.
Originally I was facing the error "ValueError: Expected 2D array, got 1D array instead".
I tried to change the shape of my dataset, but I'm now facing ValueError: Found array with dim 3. StandardScaler expected <= 2.
Could anyone please help me figure out how to eliminate this issue?
Below is what I have done:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset=pd.read_excel(r'C:\Users\Sammy\OneDrive - International Campus, Zhejiang University\Desktop\Data\BAYC Data.xlsx')
print(dataset)
dataset.columns
x=dataset.iloc[:,1].values
y=dataset.iloc[:,2].values
x=dataset.iloc[:,0].to_frame()
x=np.expand_dims(x, axis = -1)
from sklearn.preprocessing import StandardScaler
st_x=StandardScaler()
st_y=StandardScaler()
X=st_x.fit_transform(x)
Y=st_y.fit_transform(y)
And this is the error I'm receiving:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11140\1840045772.py in <module>
----> 1 X=st_x.fit_transform(x)
2 Y=st_y.fit_transform(y)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
804 # Reset internal state before fitting
805 self._reset()
--> 806 return self.partial_fit(X, y, sample_weight)
807
808 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
839 """
840 first_call = not hasattr(self, "n_samples_seen_")
--> 841 X = self._validate_data(
842 X,
843 accept_sparse=("csr", "csc"),
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
792 ) from e
793 if not allow_nd and array.ndim >= 3:
--> 794 raise ValueError(
795 "Found array with dim %d. %s expected <= 2."
796 % (array.ndim, estimator_name)
ValueError: Found array with dim 3. StandardScaler expected <= 2.
You are trying to "push" a 3D array into a function that needs a 2D array.
I would recommend deleting rows 9 and 10 of your code (the to_frame and expand_dims calls), and editing the values inside [] in rows 7 and 8 so that dataset.iloc gives you a 2D array :)
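A minimal sketch of that suggestion, assuming (as in the question's original iloc calls) that column 1 holds the feature to scale and column 2 the target:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# dataset = pd.read_excel(...)  # loaded as in the question

# Selecting with a list of column positions keeps a 2D shape (n_samples, 1),
# which is what StandardScaler expects; no to_frame/expand_dims needed
x = dataset.iloc[:, [1]].values
y = dataset.iloc[:, [2]].values

st_x = StandardScaler()
st_y = StandardScaler()
X = st_x.fit_transform(x)
Y = st_y.fit_transform(y)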
Beginner here.
I'm trying to find the best number of n_estimators using xgboost.
But, I'm getting this error.
diabetes.head() #this is a toy dataset in sklearn.datasets.
x=diabetes.drop('y',axis=1).values
y=diabetes.y.values
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=16,test_size=0.25)
import xgboost as xgb
xgbmodel=xgb.XGBRegressor(objective="reg:squarederror",eval_metric='rmse',early_stopping_rounds=10,n_estimators=1000,random_state=16)
xgbmodel.fit(x_train,y_train,eval_set=[x_test,y_test])
I think the problem lies within:
eval_set=[x_test,y_test]
P.S. I double-checked that the diabetes dataset from sklearn can be used for regression. I was wondering if my error lies within the eval_metric method.
Full error here:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_10180/532743731.py in <module>
4 import xgboost as xgb
5 xgbmodel=xgb.XGBRegressor(objective="reg:squarederror",eval_metric='rmse',early_stopping_rounds=10,n_estimators=1000,random_state=16)
----> 6 xgbmodel.fit(x_train,y_train,eval_set=[x_test,y_test])
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
530 for k, arg in zip(sig.parameters, args):
531 kwargs[k] = arg
--> 532 return f(**kwargs)
533
534 return inner_f
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
929 """
930 evals_result: TrainingCallback.EvalsLog = {}
--> 931 train_dmatrix, evals = _wrap_evaluation_matrices(
932 missing=self.missing,
933 X=X,
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\sklearn.py in _wrap_evaluation_matrices(missing, X, y, group, qid, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, create_dmatrix, enable_categorical)
434
435 evals = []
--> 436 for i, (valid_X, valid_y) in enumerate(eval_set):
437 # Skip the duplicated entry.
438 if all(
ValueError: too many values to unpack (expected 2)
Yes, it has to be a tuple; see below. Your x_test, y_test should be enclosed in parentheses:
eval_setparam = [(self.X_valid, self.y_valid)]
xg_model.fit(self.X_train, self.y_train,
             eval_set=eval_setparam,
             verbose=False)
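Applied to the variable names from the question (and assuming an xgboost version, 1.6 or later, that accepts eval_metric and early_stopping_rounds in the constructor, as the question's own code does), a minimal sketch would be:
import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# load the toy dataset directly from sklearn
diabetes = load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, random_state=16, test_size=0.25)

xgbmodel = xgb.XGBRegressor(objective="reg:squarederror", eval_metric='rmse',
                            early_stopping_rounds=10, n_estimators=1000,
                            random_state=16)

# eval_set expects a list of (X, y) tuples, not a flat [X, y] list
xgbmodel.fit(x_train, y_train, eval_set=[(x_test, y_test)])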
Background
I'm struggling to implement a Naive Bayes classifier in python with sklearn across multiple features.
The features I have are:
Title - some short text
Description - some longer text
Timestamp - a float representing an hour of the day (e.g. 18.0 = 6:00PM, 11.5 = 11:30AM)
The labels/classes are categorical strings: e.g. "Class1", "Class2", "Class3"
Aim
My goal is to use the 3 features together to construct a Naive Bayes classifier that predicts the class label. I specifically wish to use all of the features at the same time, i.e. not simply the description feature.
Initial Approach
I have setup some pre-processing pipelines using sklearn as follows:
from sklearn import preprocessing, naive_bayes, feature_extraction, pipeline, model_selection, compose
text_columns = ['title', 'description']
time_columns = ['timestamp']
# get an 80-20 test-train split
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
# convert the text data into vectors
text_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
])
# preprocess by scaling the data, and binning the data
time_pipeline = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('bin', preprocessing.KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')),
])
# combine the pre-processors
preprocessor = compose.ColumnTransformer([
    ('text', text_pipeline, text_columns),
    ('time', time_pipeline, time_columns),
])
clf = pipeline.Pipeline([
    ('preprocessor', preprocessor),
    ('clf', naive_bayes.MultinomialNB()),
])
Here train is a pandas dataframe with the features and labels, read straight from a .csv file like this:
ID,title,description,timestamp,class
1,First Title String,"A description of the first title",13.0,Class1
2,Second Title String,"A description of the second title",17.5,Class2
Also note that I'm not setting most of the params for the transformers/classifiers, as I want to use a grid-search to find the optimum ones later on.
The problem
When I call clf.fit(X_train, y_train), I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/3039541201.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
~/.local/lib/python3.9/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
697 self._record_output_indices(Xs)
698
--> 699 return self._hstack(list(Xs))
700
701 def transform(self, X):
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in _hstack(self, Xs)
789 else:
790 Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
--> 791 return np.hstack(Xs)
792
793 def _sk_visual_block_(self):
<__array_function__ internals> in hstack(*args, **kwargs)
~/.local/lib/python3.9/site-packages/numpy/core/shape_base.py in hstack(tup)
344 return _nx.concatenate(arrs, 0)
345 else:
--> 346 return _nx.concatenate(arrs, 1)
347
348
<__array_function__ internals> in concatenate(*args, **kwargs)
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2 and the array at index 1 has size 3001
I have the following shapes for X_train and y_train:
X_train: (3001, 3)
y_train: (3001,)
Steps Taken
Individual Features
I can use the same pipelines with individual features (by altering the text_columns and time_columns arrays) and get a perfectly fine classifier, e.g. only using the "title" field, or only using the "timestamp". Unfortunately, these individual features are not accurate enough, so I would like to use all the features to build a more accurate classifier. The issue seems to arise when I attempt to combine more than one feature.
I'm open to potentially using multiple Naive Bayes classifiers, and trying to multiply the probabilities together to get some overall probability, but I honestly have no clue how to do that, and I'm sure I'm just missing something simple here.
Dropping the Time Features
I have tried running only the text_features, i.e. "title" and "description", and I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/1900884535.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
395
396 return self
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in fit(self, X, y, sample_weight)
661 Returns the instance itself.
662 """
--> 663 X, y = self._check_X_y(X, y)
664 _, n_features = X.shape
665
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in _check_X_y(self, X, y, reset)
521 def _check_X_y(self, X, y, reset=True):
522 """Validate X and y in fit methods."""
--> 523 return self._validate_data(X, y, accept_sparse="csr", reset=reset)
524
525 def _update_class_log_prior(self, class_prior=None):
~/.local/lib/python3.9/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
980
--> 981 check_consistent_length(X, y)
982
983 return X, y
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
330 uniques = np.unique(lengths)
331 if len(uniques) > 1:
--> 332 raise ValueError(
333 "Found input variables with inconsistent numbers of samples: %r"
334 % [int(l) for l in lengths]
ValueError: Found input variables with inconsistent numbers of samples: [2, 3001]
And I have the following shapes:
X_train: (3001, 2)
y_train: (3001,)
Reshaping the Labels
I have also tried reshaping y_train variable by calling it wrapped in [] like so:
# new
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train[['class']], test_size=0.2, random_state=RANDOM_STATE)
# previous
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
so that the resultant shapes are:
X_train: (3001, 3)
y_train: (3001, 1)
But unfortunately this doesn't appear to fix this.
Removing Naive Bayes Classifier
When I remove the final step of the pipeline (the naive_bayes.MultinomialNB()) and also remove the time feature ("timestamp"), then I can build a pre-processor that works just fine for the text. I.e. I can pre-process the text fields ("title", "description"), but when I add the classifier back, I get the error above under "Dropping the Time Features".
When vectorizing multiple text features, you should create a separate CountVectorizer (or TfidfVectorizer) instance for every feature: a vectorizer expects a 1D sequence of documents, so handing it two columns at once makes it treat the two column names as the documents, which is where the size 2 in your error comes from:
title_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
])
description_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
])
preprocessor = compose.ColumnTransformer([
    ('title', title_pipeline, text_columns[0]),
    ('description', description_pipeline, text_columns[1]),
    ('time', time_pipeline, time_columns),
])
P.S. The combination of CountVectorizer and TfidfTransformer is equivalent to TfidfVectorizer. Also, you may just skip tf-idf weighting and use only CountVectorizer for MultinomialNB.
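Putting it together, here is a minimal sketch of the whole pipeline using TfidfVectorizer directly, reusing the column names from the question; note that each text column is passed to the ColumnTransformer as a single string, so its vectorizer receives a 1D series of documents:
from sklearn import compose, feature_extraction, naive_bayes, pipeline, preprocessing

text_columns = ['title', 'description']
time_columns = ['timestamp']

time_pipeline = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('bin', preprocessing.KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')),
])

preprocessor = compose.ColumnTransformer([
    # one vectorizer per text column, each selected by name (a string, not a list)
    ('title', feature_extraction.text.TfidfVectorizer(), 'title'),
    ('description', feature_extraction.text.TfidfVectorizer(), 'description'),
    ('time', time_pipeline, time_columns),
])

clf = pipeline.Pipeline([
    ('preprocessor', preprocessor),
    ('clf', naive_bayes.MultinomialNB()),
])

# clf.fit(X_train, y_train)  # X_train, y_train as in the question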
I am new to the Scikit-Learn package and am trying to use LeaveOneGroupOut cross-validation for a simple classification task.
I used the following code, which I adapted from the documentation at [this link] on the scikit-learn.org website:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score
from sklearn import svm
X = Selected_Dataset[:,:-1]
y = Selected_Labels
groups = Selected_SubjIDs
clf = svm.SVC(kernel='linear', C=1)
cv = LeaveOneGroupOut()
cv.get_n_splits(X, y, groups=groups)
cross_val_score(clf, X, y, cv=cv)
But this code generates the following exception:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-27b53a67db71> in <module>
14
15
---> 16 cross_val_score(clf, X, y, cv=cv)
17
18
~/miniconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
~/miniconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
~/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
618
619 with self._lock:
--> 620 tasks = BatchedCalls(itertools.islice(iterator, batch_size))
621 if len(tasks) == 0:
622 # No more tasks available in the iterator: tell caller to stop.
~/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, iterator_slice)
125
126 def __init__(self, iterator_slice):
--> 127 self.items = list(iterator_slice)
128 self._size = len(self.items)
129
~/miniconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in <genexpr>(.0)
200 pre_dispatch=pre_dispatch)
201 scores = parallel(
--> 202 delayed(_fit_and_score)(
203 clone(estimator), X, y, scorers, train, test, verbose, None,
204 fit_params, return_train_score=return_train_score,
~/miniconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py in split(self, X, y, groups)
93 X, y, groups = indexable(X, y, groups)
94 indices = np.arange(_num_samples(X))
---> 95 for test_index in self._iter_test_masks(X, y, groups):
96 train_index = indices[np.logical_not(test_index)]
97 test_index = indices[test_index]
~/miniconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py in _iter_test_masks(self, X, y, groups)
822 def _iter_test_masks(self, X, y, groups):
823 if groups is None:
--> 824 raise ValueError("The 'groups' parameter should not be None.")
825 # We make a copy of groups to avoid side-effects during iteration
826 groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
ValueError: The 'groups' parameter should not be None.
I found these two related bugs reported in 2016 and 2017.
Is there any way around it?
You have to use
cross_val_score(clf, X, y, cv=cv, groups=groups)
and you can remove the get_n_splits call.
Working example
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score
from sklearn import svm
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import Normalizer
#load the data
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
groups = np.random.binomial(1, 0.5, size=len(X))
clf = svm.SVC(kernel='linear', C=1)
cv = LeaveOneGroupOut()
cross_val_score(clf, X, y, cv=cv, groups=groups)
I am trying to concatenate tf-idf features with other categorical features to perform classification on the resulting dataset. From various blogs I understand that FeatureUnion can be used to concatenate the features and then pipe them into an algorithm (in my case Naive Bayes).
I have followed the code from this link - http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
When I try to execute the code it is giving error
TypeError: no supported conversion for types: (dtype('O'),)
Below is the code which I am trying to execute:
class textdata():
    def transform(self, X, Y):
        return X[desc]
    def fit(self, X, Y):
        return self

class one_hot_trans():
    def transform(self, X, Y):
        X = pd.get_dummies(X, columns=obj_cols)
        return X
    def fit(self, X, Y):
        return self
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('ngram_tf_idf', Pipeline([
            ('text', textdata()),
            ('tf_idf', TfidfTransformer())
        ])),
        ('one_hot', one_hot_trans())
    ])),
    ('classifier', MultinomialNB())
])
d_train, d_test, y_train, y_test = train_test_split(data, data[target], test_size=0.2, random_state = 2018)
pipeline.fit(d_train, y_train)
Can anyone help me resolve this error?
Note: data has total 9 columns with 1 target variable (categorical) and 1 text column (on which I want to perform tfidf) and rest are categorical (obj_cols in above code).
Edit:
Thanks Vivek, I did not notice that. I had put the Transformer instead of the Vectorizer by mistake. Even after replacing it, I am getting the error below.
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
745 self._update_transformer_list(transformers)
746 if any(sparse.issparse(f) for f in Xs):
--> 747 Xs = sparse.hstack(Xs).tocsr()
748 else:
749 Xs = np.hstack(Xs)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\sparse\construct.py in hstack(blocks, format, dtype)
462
463 """
--> 464 return bmat([blocks], format=format, dtype=dtype)
465
466
~\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\sparse\construct.py in bmat(blocks, format, dtype)
598 if dtype is None:
599 all_dtypes = [blk.dtype for blk in blocks[block_mask]]
--> 600 dtype = upcast(*all_dtypes) if all_dtypes else None
601
602 row_offsets = np.append(0, np.cumsum(brow_lengths))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\sparse\sputils.py in upcast(*args)
50 return t
51
---> 52 raise TypeError('no supported conversion for types: %r' % (args,))
53
54
TypeError: no supported conversion for types: (dtype('float64'), dtype('O'))
Edit:
I have checked the unique values in all the categorical variables except the description column, and I found no values appearing in the test data that are not in the train data. Am I doing something wrong?
for col in d_train.columns.drop(desc):
    ext = set(d_test[col].unique().tolist()) - set(d_train[col].unique().tolist())
    if ext: print("extra columns: \n\n", ext)
Edit 2:
Additional info - details of the d_train and d_test features are below. Can anyone help? I am still getting a "dimension mismatch" error on the predict method.
obj cols:: ['priority', 'ticket_type', 'created_group', 'Classification', 'Component', 'ATR_OWNER_PLANT', 'created_day']
d_train cols:: Index(['priority', 'ticket_type', 'created_group', 'Description_ticket', 'Classification', 'Component', 'ATR_OWNER_PLANT', 'created_day'], dtype='object')
d_test cols:: Index(['priority', 'ticket_type', 'created_group', 'Description_ticket','Classification', 'Component', 'ATR_OWNER_PLANT', 'created_day'], dtype='object')
d_train shape:: (95080, 8)
d_test shape:: (23770, 8)
desc:: Description_ticket
I think you are also passing the text column through the one_hot_trans function.
Can you try making the output of one_hot_trans as follows:
class one_hot_trans():
    def transform(self, X, Y):
        X = pd.get_dummies(X.drop(desc, axis=1), columns=obj_cols)
        return X
    def fit(self, X, Y):
        return self
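For completeness, here is a minimal sketch of both custom steps written with sklearn-friendly signatures (y optional, fit_transform inherited from TransformerMixin); the class names are just illustrative, and desc, obj_cols, data and target are assumed to be defined as in the question:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

class TextSelector(BaseEstimator, TransformerMixin):
    # select the single text column so the vectorizer gets a 1D series
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[desc]

class OneHotTransformer(BaseEstimator, TransformerMixin):
    # one-hot encode the categorical columns, leaving the text column out
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.get_dummies(X.drop(desc, axis=1), columns=obj_cols)

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('ngram_tf_idf', Pipeline([
            ('text', TextSelector()),
            ('tf_idf', TfidfVectorizer()),
        ])),
        ('one_hot', OneHotTransformer()),
    ])),
    ('classifier', MultinomialNB()),
])

d_train, d_test, y_train, y_test = train_test_split(data, data[target], test_size=0.2, random_state=2018)
pipeline.fit(d_train, y_train)
Be aware that pd.get_dummies computed independently at fit and predict time can still produce different column sets when the two splits contain different category values, which is one common cause of the "dimension mismatch" error mentioned in Edit 2.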