Dimensions issue using CalibratedClassifierCV with Pipeline - python

I'm trying to use CalibratedClassifierCV in scikit-learn after finding the best parameters for a Pipeline with the following code.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
pipeline = Pipeline([
('vect', CountVectorizer(token_pattern=r'(?u)\b\w+\b')),
('tfidf', TfidfTransformer()),
('clf', LinearSVC()),
])
parameters = {
'vect__max_features': (1000, ),
'vect__max_df': (0.75, 1.0),
'vect__min_df': (1, 5),
'vect__ngram_range': ((1, 1), (1, 2)),
'vect__binary': (True, False),
'tfidf__use_idf': (True, False),
'clf__class_weight': (None, 'balanced'),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
data = fetch_20newsgroups(
categories=['alt.atheism', 'talk.religion.misc', 'sci.med'],
remove=['headers', 'footers', 'quotes']
)
grid_search.fit(data.data, data.target)
best_parameters = grid_search.best_estimator_.get_params()
pipeline.set_params(**dict(best_parameters.items()))
model = CalibratedClassifierCV(base_estimator=pipeline, method='sigmoid')
model = model.fit(data.data, data.target)
This fails on the last fit with ValueError: Found input variables with inconsistent numbers of samples: [1, 1451].
Looking at the documentation I don't see why this shouldn't work. I've tried reshaping the array but it fails due to the Pipeline expecting a string sample as input.
I'm using scikit-learn 0.18, but had the same problem with 0.17.1
Full trace below.
.../lib/python3.5/site-packages/sklearn/calibration.py in fit(self, X, y, sample_weight)
123 """
124 X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
--> 125 force_all_finite=False)
126 X, y = indexable(X, y)
127 lb = LabelBinarizer().fit(y)
.../lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
529 y = y.astype(np.float64)
530
--> 531 check_consistent_length(X, y)
532
533 return X, y
.../lib/python3.5/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
179 if len(uniques) > 1:
180 raise ValueError("Found input variables with inconsistent numbers of"
--> 181 " samples: %r" % [int(l) for l in lengths])
182
183

Related

I've added a OneHotEncoder to a ColumnTransformer that I'm using in a Pipeline, but I get an error because it can't convert one of the columns to a float

I'm working with the Loan Approvals Dataset from Analytics Vidhya (https://www.kaggle.com/datasets/anmolkumar/analytics-vidhya-loan-prediction?select=train.csv) which is a dataframe with a mixture of categorical and numerical data aiming to predict if a loan is approved or not.
I'm trying to preprocess the data by building a ColumnTransformer so that I can perform specific preprocessing steps on the numerical and the categorical columns. The last step of the ColumnTransformer is a One Hot Encoder to one hot encode the categorical data. I then add this ColumnTransformer to a Pipeline with a LogisticRegression as a baseline model. When I try and fit this Pipeline, I get 'ValueError: could not convert string to float: 'Male''. I must be doing something wrong here, but I can't figure out what.
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
path = "https://raw.githubusercontent.com/richrussell1991/datasets/main/analytics_vidhya_loan_approval_practice_train.csv"
df = pd.read_csv(path, on_bad_lines='skip')
df.drop('Loan_ID', axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(df.drop('Loan_Status', axis=1), df['Loan_Status'], test_size=0.33)
mean_imputer = SimpleImputer(strategy='mean')
mode_imputer = SimpleImputer(strategy='most_frequent')
standard_scaler = StandardScaler()
one_hot_encoder = OneHotEncoder()
ct = ColumnTransformer(
transformers=[
('mean_imputer', mean_imputer, ['LoanAmount', 'Loan_Amount_Term']),
('mode_imputer', mode_imputer, ['Credit_History']),
('standard_scaler', standard_scaler, ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']),
('impute_most_common', mode_imputer, make_column_selector(dtype_include='object')),
('one_hot_encode', one_hot_encoder, make_column_selector(dtype_include='object'))
],
remainder='passthrough'
)
clf = Pipeline(
[('ct', ct), ('classifier', LogisticRegression())]
)
clf.fit(X_train, y_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-4f66baab8f93> in <module>
----> 1 clf.fit(X_train, y_train)
4 frames
/usr/local/lib/python3.8/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
395
396 return self
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py in fit(self, X, y, sample_weight)
1506 _dtype = [np.float64, np.float32]
1507
-> 1508 X, y = self._validate_data(
1509 X,
1510 y,
/usr/local/lib/python3.8/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583
/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
962 raise ValueError("y cannot be None")
963
--> 964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,
/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
ValueError: could not convert string to float: 'Male'
I've built this ColumnTransformer in order to apply certain transformations to the different columns, i.e. standard scaling to the numerical columns and one-hot encoding to the categorical columns; however, I get an error when trying to fit the overall Pipeline on X_train, y_train.

Calculating score of regression model gives me dimensionality error

I want to build a regression model with scikit-learn.
I have features that are categorical and numerical. This is how I handled that.
features = df[["text", "title_len", "lead_len", "exclamation_question", "number_of_ent", "punct_count"]]
results = df["shares"]
features = features.to_numpy()
results = results.to_numpy()
print("Shape of Features:", features.shape) # Shape of Features: (14706, 6)
print("Shape of Result:", results.shape) # Shape of Result: (14706,)
# Creating vectorizer
transformerVectoriser = ColumnTransformer(transformers=[('text_vocab', TfidfVectorizer(analyzer='word', ngram_range=(1, 4), vocabulary=vocabulary, lowercase = True), 0)
],
remainder='passthrough'
)
# Making final prediction with classification report and confusion matrix with model with highest accuracy
x_train, x_test, y_train, y_test = train_test_split(features, results, test_size=0.25, random_state=0)
print("X Train Shape", x_train.shape) # X Train Shape (11029, 6)
print("Y Train Shape", y_train.shape) # Y Train Shape (11029,)
print("X Test Shape", x_test.shape) # X Test Shape (3677, 6)
print("Y Test Shape", y_test.shape) # Y Test Shape (3677,)
x_train = transformerVectoriser.fit_transform(x_train)
x_test = transformerVectoriser.transform(x_test)
print("X Train Vectorized Shape", x_train.shape) # X Train Vectorized Shape (11029, 1091)
print("X Test Vectorized Shape", x_test.shape) # X Test Vectorized Shape (3677, 1091)
This is how I created a model:
regression_models = [["SVR C1", SVR(kernel='rbf', gamma='scale', C=1.0)],
["SVR C2", SVR(kernel='rbf', gamma='scale', C=2.0)],
["SVR C5", SVR(kernel='rbf', gamma='scale', C=5.0)]]
for regressor in regression_models:
name = regressor[0]
regressor = regressor[1]
print("Model Name:", name)
model = regressor.fit(x_train, y_train)
y_pred = model.predict(x_test)
score = model.score(y_test, y_pred) # ERROR
print(score)
The error that I get:
ValueError Traceback (most recent call last)
<ipython-input-1351-c5fbe26b2474> in <module>
22 print(y_test)
23 print(y_test.shape)
---> 24 score = model.score(y_test, y_pred)
25 print(score)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/base.py in score(self, X, y, sample_weight)
551
552 from .metrics import r2_score
--> 553 y_pred = self.predict(X)
554 return r2_score(y, y_pred, sample_weight=sample_weight)
555
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/linear_model/_base.py in predict(self, X)
236 Returns predicted values.
237 """
--> 238 return self._decision_function(X)
239
240 _preprocess_data = staticmethod(_preprocess_data)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/linear_model/_base.py in _decision_function(self, X)
218 check_is_fitted(self)
219
--> 220 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
221 return safe_sparse_dot(X, self.coef_.T,
222 dense_output=True) + self.intercept_
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
692 # If input is 1D raise error
693 if array.ndim == 1:
--> 694 raise ValueError(
695 "Expected 2D array, got 1D array instead:\narray={}.\n"
696 "Reshape your data either using array.reshape(-1, 1) if "
ValueError: Expected 2D array, got 1D array instead:
array=[13. 8. 71. ... 43. 61. 55.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
What am I doing wrong?
If I print:
print(y_pred)
print(y_pred.shape)
print(y_test)
print(y_test.shape)
I'm getting this:
[ 81.54398324 43.34783895 111.73805915 ... 75.27910881 89.46342907
78.93812588]
(4235,)
[13. 8. 71. ... 43. 61. 55.]
(4235,)
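According to the documentation, score takes X and y as input, i.e. the test features and the true targets. It computes the predictions itself (as the traceback shows, it calls self.predict(X) internally), so passing y_test as X makes it call predict on a 1D array. Therefore, it should be changed to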
score = model.score(x_test, y_test)
Else, you can do
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)

Sklearn Naive Bayes with multiple features

Background
I'm struggling to implement a Naive Bayes classifier in python with sklearn across multiple features.
The features I have are:
Title - some short text
Description - some longer text
Timestamp - a float representing an hour of the day (e.g. 18.0 = 6:00PM, 11.5 = 11:30AM)
The labels/classes are categorical strings: e.g. "Class1", "Class2", "Class3"
Aim
My goal is to construct a Naive Bayes classifier that uses all 3 features to predict the class label. I specifically wish to use all of the features at the same time, i.e. not simply the description feature.
Initial Approach
I have setup some pre-processing pipelines using sklearn as follows:
from sklearn import preprocessing, naive_bayes, feature_extraction, pipeline, model_selection, compose,
text_columns = ['title', 'description']
time_columns = ['timestamp']
# get an 80-20 test-train split
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
# convert the text data into vectors
text_pipeline = pipeline.Pipeline([
('vect', feature_extraction.text.CountVectorizer()),
('tfidf', feature_extraction.text.TfidfTransformer()),
])
# preprocess by scaling the data, and binning the data
time_pipeline = pipeline.Pipeline([
('scaler', preprocessing.StandardScaler()),
('bin', preprocessing.KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')),
])
# combine the pre-processors
preprocessor = compose.ColumnTransformer([
('text', text_pipeline, text_columns),
('time', time_pipeline, time_columns),
])
clf = pipeline.Pipeline([
('preprocessor', preprocessor),
('clf', naive_bayes.MultinomialNB()),
])
Here train is a pandas dataframe with the features and labels, read straight from a .csv file like this:
ID,title,description,timestamp,class
1,First Title String,"A description of the first title",13.0,Class1
2,Second Title String,"A description of the second title",17.5,Class2
Also note that I'm not setting most of the params for the transformers/classifiers, as I want to use a grid-search to find the optimum ones later on.
The problem
When I call clf.fit(X_train, y_train), I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/3039541201.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
~/.local/lib/python3.9/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
697 self._record_output_indices(Xs)
698
--> 699 return self._hstack(list(Xs))
700
701 def transform(self, X):
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in _hstack(self, Xs)
789 else:
790 Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
--> 791 return np.hstack(Xs)
792
793 def _sk_visual_block_(self):
<__array_function__ internals> in hstack(*args, **kwargs)
~/.local/lib/python3.9/site-packages/numpy/core/shape_base.py in hstack(tup)
344 return _nx.concatenate(arrs, 0)
345 else:
--> 346 return _nx.concatenate(arrs, 1)
347
348
<__array_function__ internals> in concatenate(*args, **kwargs)
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2 and the array at index 1 has size 3001
I have the following shapes for X_train and y_train:
X_train: (3001, 3)
y_train: (3001,)
Steps Taken
Individual Features
I can use the same pipelines with individual features (by altering the text_columns and time_columns lists), and get a perfectly fine classifier. E.g. only using the "title" field, or only using the "timestamp". Unfortunately, these individual features are not accurate enough, so I would like to use all the features to build a more accurate classifier. The issue seems to be when I attempt to combine more than one feature.
I'm open to potentially using multiple Naive Bayes classifiers, and trying to multiply the probabilities together to get some overall probability, but I honestly have no clue how to do that, and I'm sure I'm just missing something simple here.
Dropping the Time Features
I have tried running only the text columns, i.e. "title" and "description", and I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/1900884535.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
395
396 return self
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in fit(self, X, y, sample_weight)
661 Returns the instance itself.
662 """
--> 663 X, y = self._check_X_y(X, y)
664 _, n_features = X.shape
665
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in _check_X_y(self, X, y, reset)
521 def _check_X_y(self, X, y, reset=True):
522 """Validate X and y in fit methods."""
--> 523 return self._validate_data(X, y, accept_sparse="csr", reset=reset)
524
525 def _update_class_log_prior(self, class_prior=None):
~/.local/lib/python3.9/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
980
--> 981 check_consistent_length(X, y)
982
983 return X, y
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
330 uniques = np.unique(lengths)
331 if len(uniques) > 1:
--> 332 raise ValueError(
333 "Found input variables with inconsistent numbers of samples: %r"
334 % [int(l) for l in lengths]
ValueError: Found input variables with inconsistent numbers of samples: [2, 3001]
And I have the following shapes:
X_train: (3001, 2)
y_train: (3001,)
Reshaping the Labels
I have also tried reshaping the y_train variable by wrapping the column name in [] like so:
# new
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train[['class']], test_size=0.2, random_state=RANDOM_STATE)
# previous
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
so that the resultant shapes are:
X_train: (3001, 3)
y_train: (3001, 1)
But unfortunately this doesn't appear to fix the problem.
Removing Naive Bayes Classifier
When I remove the final step of the pipeline (the naive_bayes.MultinomialNB()), and I remove the time_columns (the "timestamp" feature), then I can build a pre-processor that works just fine for the text. I.e. I can pre-process the text fields ("title", "description"), but when I add the classifier, I get the error above under "Dropping the Time Features".
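When vectorizing multiple text features, you should create a separate CountVectorizer (or TfidfVectorizer) instance for each feature, and select each text column by a single column name (a string) rather than a list. CountVectorizer expects a one-dimensional iterable of strings; when it is given a two-column DataFrame it iterates over the column names and therefore produces only 2 rows, which is where the size mismatch (2 vs 3001) in the error comes from: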
title_pipeline = pipeline.Pipeline([
('vect', feature_extraction.text.CountVectorizer()),
('tfidf', feature_extraction.text.TfidfTransformer()),
])
description_pipeline = pipeline.Pipeline([
('vect', feature_extraction.text.CountVectorizer()),
('tfidf', feature_extraction.text.TfidfTransformer()),
])
preprocessor = compose.ColumnTransformer([
('title', title_pipeline, text_columns[0]),
('description', description_pipeline, text_columns[1]),
('time', time_pipeline, time_columns),
])
P.S. The combination of CountVectorizer and TfidfTransformer is equivalent to TfidfVectorizer. Also, you may just skip tf-idf weighting and use only CountVectorizer for MultinomialNB.

Building a custom RandomSearchCV using Python

I am trying to build a custom K-fold RandomSearchCV from scratch. I understand how RandomSearchCV works and I'm trying to implement it from scratch on a randomly generated dataset. When I try to run the code I get the following error. I think it has something to do with how I've created the groups in my x_train list. What does this error mean, and how do I fix it?
ValueError Traceback (most recent call last)
<ipython-input-12-229cc493eeb9> in <module>
41
42 classifier = KNeighborsClassifier()
---> 43 RandomSearchCV(X_train,y_train, classifier, folds = 3)
44
45
<ipython-input-12-229cc493eeb9> in RandomSearchCV(x_train, y_train, classifier, folds)
26 #classifier (K-NN)
27 classifier.n_neighbors = parameter
---> 28 classifier.fit(x_train_group, y_train_group)
29
30 #Predicton
~\anaconda3\lib\site-packages\sklearn\neighbors\_base.py in fit(self, X, y)
1128 """
1129 if not isinstance(X, (KDTree, BallTree)):
-> 1130 X, y = check_X_y(X, y, "csr", multi_output=True)
1131
1132 if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
753 ensure_min_features=ensure_min_features,
754 warn_on_dtype=warn_on_dtype,
--> 755 estimator=estimator)
756 if multi_output:
757 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
572 if not allow_nd and array.ndim >= 3:
573 raise ValueError("Found array with dim %d. %s expected <= 2."
--> 574 % (array.ndim, estimator_name))
575
576 if force_all_finite:
ValueError: Found array with dim 3. Estimator expected <= 2.
Here's my implementation:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
x,y = make_classification(n_samples=10000, n_features=2, n_informative=2, n_redundant= 0, n_clusters_per_class=1, random_state=60)
X_train, X_test, y_train, y_test = train_test_split(x,y,stratify=y,random_state=42)
def RandomSearchCV(x_train,y_train, classifier, folds):
train_scores = []
test_scores = []
#1. Generating 10 unique values from given range
params = random.sample(range(0, 50), 10)
x_train_split = []
y_train_split = []
#dividing x_train into groups
for i in range(0, len(x_train), int(len(x_train)/folds)):
x_train_split.append(x_train[i:i+int(len(x_train)/folds)])
y_train_split.append(y_train[i:i+int(len(y_train)/folds)])
#3.for each hyperparameter that we generated in step 1 and dividing dataset into training and CV datasets:
for parameter in params:
trainscores_folds = []
testscores_folds = []
for group in range(len(x_train_split)):
x_train_group = x_train_split[0:group] + x_train_split[group+1:]
x_cv_group = [x_train_split[group]]
y_train_group = y_train_split[0:group] + y_train_split[group+1:]
y_cv_group = [y_train_split[group]]
#classifier (K-NN)
classifier.n_neighbors = parameter
classifier.fit(x_train_group, y_train_group)
#Predicton
y_pred = classifier.predict(x_cv_group)
testscores_folds.append(accuracy_score(y_cv_group, Y_pred))
y_pred = classifier.predict(x_train_group)
trainscores_folds.append(accuracy_score(y_train_group, Y_pred))
trainscores.append(np.mean(np.array(trainscores_folds)))
testscores.append(np.mean(np.array(testscores_folds)))
return trainscores, testscores
classifier = KNeighborsClassifier()
RandomSearchCV(X_train,y_train, classifier, folds = 3)
Thank you for your help.
x_train_group is a list of arrays, which makes it 3-dimensional (as mentioned in the error). This does not work with fitting the classifier, as it expects 2-dimensional input. Try calling np.concatenate(x_train_group) to concatenate the folds and make it a 2-dimensional input.
As the error states, you are using an array of three dimensions while the classifier.fit() method expects a two-dimensional matrix. All you need to do to fix this issue is to change your train/cv/test groups as follows:
for group in range(len(x_train_split)):
x_train_group = np.concatenate(x_train_split[0:group] + x_train_split[group+1:])
x_cv_group = x_train_split[group]
y_train_group = np.concatenate(y_train_split[0:group] + y_train_split[group+1:])
y_cv_group = y_train_split[group]
....

Sklearn fitting SVM with StandardScaler

I am fitting an SVR on my dataset and am getting this error message. It was working before I included StandardScaler. I have tried everything I can think of, but it is still not working.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(np.array(y).reshape(1,-1))
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X,y)
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-75416c35e495> in <module>
2 from sklearn.svm import SVR
3 regressor = SVR(kernel = 'rbf') # rbf means radial basis function
----> 4 regressor.fit(X,y)
C:\anconda\lib\site-packages\sklearn\svm\_base.py in fit(self, X, y, sample_weight)
146 X, y = check_X_y(X, y, dtype=np.float64,
147 order='C', accept_sparse='csr',
--> 148 accept_large_sparse=False)
149 y = self._validate_targets(y)
150
C:\anconda\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
758 dtype=None)
759 else:
--> 760 y = column_or_1d(y, warn=True)
761 _assert_all_finite(y)
762 if y_numeric and y.dtype.kind == 'O':
C:\anconda\lib\site-packages\sklearn\utils\validation.py in column_or_1d(y, warn)
795 return np.ravel(y)
796
--> 797 raise ValueError("bad input shape {0}".format(shape))
798
799
ValueError: bad input shape (1, 10)
You are feeding the SVM a target vector of shape (1, 10), which means one row and ten columns. This is wrong, and it is caused by the reshape in
y = sc_y.fit_transform(np.array(y).reshape(1,-1))
Please note that this line is also conceptually wrong: standardisation should be applied only to the training features, not to the target vector, so you do not need to define
sc_y = StandardScaler()

Categories

Resources