I followed this tutorial to implement sentiment analysis:
https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/
but I'm not a pro, so I don't understand every step in detail.
Now, I wanted to apply it to new data, using this tutorial:
https://stackabuse.com/scikit-learn-save-and-restore-models/
but at the point
score = pickle_model.score(Xtest, Ytest)
I get the ValueError: could not convert string to float: 'positive' ('positive' being a label from the sentiment analysis done earlier). What surprises me is that the error happens even when using X_train and y_train (from the first tutorial), but
text_classifier.fit(X_train, y_train)
works just fine without any errors. So I am assuming that the fit() method does something that the score() method does not, and this creates the problem. However, I have no idea how to fix it.
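One way to narrow this down (a guess based on the traceback below, which goes through sklearn's regression metrics) is to check what kind of estimator was actually unpickled: a regressor's score() computes R² and needs numeric targets, while a classifier's score() computes accuracy and handles string labels.
print(type(pickle_model))            # e.g. RandomForestRegressor vs. RandomForestClassifier
print(pickle_model._estimator_type)  # 'regressor' or 'classifier'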
Here is the full error message:
ValueError Traceback (most recent call last)
<ipython-input-210-070f6faef44c> in <module>
34 print(len(X_train))
35 print(len(y_train))
---> 36 score = pickle_model.score(X_train, y_train)
37 print("Test score: {0:.2f} %".format(100 * score))
38
~\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
408 y_pred = self.predict(X)
409 # XXX: Remove the check in 0.23
--> 410 y_type, _, _, _ = _check_reg_targets(y, y_pred, None)
411 if y_type == 'continuous-multioutput':
412 warnings.warn("The default value of multioutput (not exposed in "
~\Anaconda3\lib\site-packages\sklearn\metrics\regression.py in _check_reg_targets(y_true, y_pred, multioutput)
76 """
77 check_consistent_length(y_true, y_pred)
---> 78 y_true = check_array(y_true, ensure_2d=False)
79 y_pred = check_array(y_pred, ensure_2d=False)
80
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
494 try:
495 warnings.simplefilter('error', ComplexWarning)
--> 496 array = np.asarray(array, dtype=dtype, order=order)
497 except ComplexWarning:
498 raise ValueError("Complex data not supported\n"
~\Anaconda3\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: could not convert string to float: 'positive'
and here's the piece of code the error occurs in:
vectorizer = TfidfVectorizer(max_features=2500, min_df=1, max_df=1, stop_words=stopwords.words('english'))
chat_data = vectorizer.fit_transform(chat_data).toarray()
X_train, X_test, y_train, y_test = train_test_split(chat_data, chat_labels, test_size=0.2, random_state=0)
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)
predictions = text_classifier.predict(X_test)
X_train = np.array(X_train).reshape((-1,1))
y_train = np.array(y_train).reshape((-1,1))
print(len(X_train))
print(len(y_train))
score = pickle_model.score(X_train, y_train)
print("Test score: {0:.2f} %".format(100 * score))
This is the error I am getting. I am new to Python, please help with this.
##########################################################################
ValueError Traceback (most recent call last)
Cell In[97], line 4
1 LR= LinearRegression()
3 #fit
----> 4 LR.fit(X,Y)
6 #predict
7 y_predict = LR.predict(X_test)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_base.py:649, in LinearRegression.fit(self, X, y, sample_weight)
645 n_jobs_ = self.n_jobs
647 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 649 X, y = self._validate_data(
650 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
651 )
653 sample_weight = _check_sample_weight(
654 sample_weight, X, dtype=X.dtype, only_non_negative=True
655 )
657 X, y, X_offset, y_offset, X_scale = _preprocess_data(
658 X,
659 y,
(...)
662 sample_weight=sample_weight,
663 )
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:554, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
552 y = check_array(y, input_name="y", **check_y_params)
553 else:
--> 554 X, y = check_X_y(X, y, **check_params)
555 out = X, y
557 if not no_val_X and check_params.get("ensure_2d", True):
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:1104, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1099 estimator_name = _check_estimator_name(estimator)
1100 raise ValueError(
1101 f"{estimator_name} requires y to be passed, but the target y is None"
1102 )
-> 1104 X = check_array(
1105 X,
1106 accept_sparse=accept_sparse,
1107 accept_large_sparse=accept_large_sparse,
1108 dtype=dtype,
1109 order=order,
1110 copy=copy,
1111 force_all_finite=force_all_finite,
1112 ensure_2d=ensure_2d,
1113 allow_nd=allow_nd,
1114 ensure_min_samples=ensure_min_samples,
1115 ensure_min_features=ensure_min_features,
1116 estimator=estimator,
1117 input_name="X",
1118 )
1120 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1122 check_consistent_length(X, y)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:919, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
913 raise ValueError(
914 "Found array with dim %d. %s expected <= 2."
915 % (array.ndim, estimator_name)
916 )
918 if force_all_finite:
--> 919 _assert_all_finite(
920 array,
921 input_name=input_name,
922 estimator_name=estimator_name,
923 allow_nan=force_all_finite == "allow-nan",
924 )
926 if ensure_min_samples > 0:
927 n_samples = _num_samples(array)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:111, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
109 if X.dtype == np.dtype("object") and not allow_nan:
110 if _object_dtype_isnan(X).any():
--> 111 raise ValueError("Input contains NaN")
113 # We need only consider float arrays, hence can early return for all else.
114 if X.dtype.kind not in "fc":
ValueError: Input contains NaN
##########################################################################
This is the code I am running that produces the error above.
LR = LinearRegression()
#fit
LR.fit(X,Y)
#predict
y_predict = LR.predict(X_test)
It looks like the X and Y values you're passing to the fit function are not properly formatted, which is causing the issue. Specifically, it seems like one of the values in X might be null. I'm not familiar with the modules you used, but checking that X and Y are set correctly is a good first step.
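A minimal sketch of that first step, assuming X and Y are pandas objects (names as in the question):
from sklearn.impute import SimpleImputer

# count missing values per column
print(X.isna().sum())
print(Y.isna().sum())

# option 1: drop rows with any missing value, keeping X and Y aligned
mask = X.notna().all(axis=1) & Y.notna()
X_clean, Y_clean = X[mask], Y[mask]

# option 2: impute missing feature values instead of dropping rows
X_imputed = SimpleImputer(strategy='mean').fit_transform(X)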
Use train_test_split. A simple LinearRegression example:
X = dataset[['statezip','city','bedrooms','sqft_living','sqft_lot','sqft_above','floors',]]
y = dataset['price']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
from sklearn import linear_model
import numpy as np
regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)
y_pred_RMSE = regr.predict(X_test)
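To compute the RMSE the variable name hints at, one might finish with this small sketch (np is already imported above):
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred_RMSE))
print("RMSE:", rmse)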
I am trying to fit a logistic regression model and run some tests, but I keep getting this error. Not really sure what I have done differently from everyone else.
from sklearn import preprocessing
X = df.iloc[:,:len(df.columns)-1]
y = df.iloc[:,len(df.columns)-1]
This is how I am separating my columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
This is the train-test split.
logReg = LogisticRegression(n_jobs=-1)
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_train)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:" , mae)
ValueError Traceback (most recent call last)
Cell In [112], line 1
----> 1 mae = mean_absolute_error(y_test, y_pred)
2 print("MAE:" , mae)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:196, in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
141 def mean_absolute_error(
142 y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
143 ):
144 """Mean absolute error regression loss.
145
146 Read more in the :ref:`User Guide <mean_absolute_error>`.
(...)
194 0.85...
195 """
--> 196 y_type, y_true, y_pred, multioutput = _check_reg_targets(
197 y_true, y_pred, multioutput
198 )
199 check_consistent_length(y_true, y_pred, sample_weight)
200 output_errors = np.average(np.abs(y_pred - y_true), weights=sample_weight, axis=0)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:100, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
66 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
67 """Check that y_true and y_pred belong to the same regression task.
68
69 Parameters
(...)
98 correct keyword.
99 """
--> 100 check_consistent_length(y_true, y_pred)
101 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
102 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:387, in check_consistent_length(*arrays)
385 uniques = np.unique(lengths)
386 if len(uniques) > 1:
--> 387 raise ValueError(
388 "Found input variables with inconsistent numbers of samples: %r"
389 % [int(l) for l in lengths]
390 )
ValueError: Found input variables with inconsistent numbers of samples: [25404, 101612]
I thought it was the way I split the columns but that doesn't seem to be the issue
It works when the test size is 50/50 but no other test size works
You are comparing the predicted labels for the train set with the labels for the test set, which are of different sizes, hence the error.
Replace
y_pred = logReg.predict(X_train)
with
y_pred = logReg.predict(X_test)
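For reference, the evaluation block then becomes (everything else as posted):
y_pred = logReg.predict(X_test)            # predictions for the test split
mae = mean_absolute_error(y_test, y_pred)  # both now have len(y_test) samples
print("MAE:", mae)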
Background
I'm struggling to implement a Naive Bayes classifier in python with sklearn across multiple features.
The features I have are:
Title - some short text
Description - some longer text
Timestamp - a float representing an hour of the day (e.g. 18.0 = 6:00PM, 11.5 = 11:30AM)
The labels/classes are categorical strings: e.g. "Class1", "Class2", "Class3"
Aim
My goal is to use the 3 features to construct a Naive Bayes classifier that predicts the class label. I specifically wish to use all of the features at the same time, i.e. not simply the description feature.
Initial Approach
I have setup some pre-processing pipelines using sklearn as follows:
from sklearn import preprocessing, naive_bayes, feature_extraction, pipeline, model_selection, compose
text_columns = ['title', 'description']
time_columns = ['timestamp']
# get an 80-20 test-train split
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
# convert the text data into vectors
text_pipeline = pipeline.Pipeline([
('vect', feature_extraction.text.CountVectorizer()),
('tfidf', feature_extraction.text.TfidfTransformer()),
])
# preprocess by scaling the data, and binning the data
time_pipeline = pipeline.Pipeline([
('scaler', preprocessing.StandardScaler()),
('bin', preprocessing.KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')),
])
# combine the pre-processors
preprocessor = compose.ColumnTransformer([
('text', text_pipeline, text_columns),
('time', time_pipeline, time_columns),
])
clf = pipeline.Pipeline([
('preprocessor', preprocessor),
('clf', naive_bayes.MultinomialNB()),
])
Here train is a pandas dataframe with the features and labels, read straight from a .csv file like this:
ID,title,description,timestamp,class
1,First Title String,"A description of the first title",13.0,Class1
2,Second Title String,"A description of the second title",17.5,Class2
Also note that I'm not setting most of the params for the transformers/classifiers, as I want to use a grid-search to find the optimum ones later on.
The problem
When I call clf.fit(X_train, y_train), I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/3039541201.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
~/.local/lib/python3.9/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
697 self._record_output_indices(Xs)
698
--> 699 return self._hstack(list(Xs))
700
701 def transform(self, X):
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in _hstack(self, Xs)
789 else:
790 Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
--> 791 return np.hstack(Xs)
792
793 def _sk_visual_block_(self):
<__array_function__ internals> in hstack(*args, **kwargs)
~/.local/lib/python3.9/site-packages/numpy/core/shape_base.py in hstack(tup)
344 return _nx.concatenate(arrs, 0)
345 else:
--> 346 return _nx.concatenate(arrs, 1)
347
348
<__array_function__ internals> in concatenate(*args, **kwargs)
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2 and the array at index 1 has size 3001
I have the following shapes for X_train and y_train:
X_train: (3001, 3)
y_train: (3001,)
Steps Taken
Individual Features
I can use the same pipelines with individual features (by altering the text_columns and time_columns arrays), and get a perfectly fine classifier. E.g. only using the "title" field, or only using the "timestamp". Unfortunately, these individual features are not accurate enough, so I would like to use all the features to build a more accurate classifier. The issue seems to arise when I attempt to combine more than one feature.
I'm open to potentially using multiple Naive Bayes classifiers, and trying to multiply the probabilities together to get some overall probability, but I honestly have no clue how to do that, and I'm sure I'm just missing something simple here.
Dropping the Time Features
I have tried running only the text_features, i.e. "title" and "description", and I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/1900884535.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
395
396 return self
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in fit(self, X, y, sample_weight)
661 Returns the instance itself.
662 """
--> 663 X, y = self._check_X_y(X, y)
664 _, n_features = X.shape
665
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in _check_X_y(self, X, y, reset)
521 def _check_X_y(self, X, y, reset=True):
522 """Validate X and y in fit methods."""
--> 523 return self._validate_data(X, y, accept_sparse="csr", reset=reset)
524
525 def _update_class_log_prior(self, class_prior=None):
~/.local/lib/python3.9/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
980
--> 981 check_consistent_length(X, y)
982
983 return X, y
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
330 uniques = np.unique(lengths)
331 if len(uniques) > 1:
--> 332 raise ValueError(
333 "Found input variables with inconsistent numbers of samples: %r"
334 % [int(l) for l in lengths]
ValueError: Found input variables with inconsistent numbers of samples: [2, 3001]
And I have the following shapes:
X_train: (3001, 2)
y_train: (3001,)
Reshaping the Labels
I have also tried reshaping the y_train variable by wrapping the column selection in [] like so:
# new
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train[['class']], test_size=0.2, random_state=RANDOM_STATE)
# previous
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
so that the resultant shapes are:
X_train: (3001, 3)
y_train: (3001, 1)
But unfortunately this doesn't fix the problem.
Removing Naive Bayes Classifier
When I remove the final step of the pipeline (the naive_bayes.MultinomialNB()) and drop the time features (the "timestamp" column), I can build a pre-processor that works just fine for the text fields ("title", "description"). But as soon as I add the classifier back, I get the error shown above under "Dropping the Time Features".
When vectorizing multiple text features, you should create separate CountVectorizer (or TfidfVectorizer) instances for every feature, each bound to a single column name. A vectorizer expects a 1-D sequence of documents; when the ColumnTransformer hands it a two-column selection, it sees only 2 "documents", which is where the size-2 array in the error message comes from:
title_pipeline = pipeline.Pipeline([
('vect', feature_extraction.text.CountVectorizer()),
('tfidf', feature_extraction.text.TfidfTransformer()),
])
description_pipeline = pipeline.Pipeline([
('vect', feature_extraction.text.CountVectorizer()),
('tfidf', feature_extraction.text.TfidfTransformer()),
])
preprocessor = compose.ColumnTransformer([
('title', title_pipeline, text_columns[0]),
('description', description_pipeline, text_columns[1]),
('time', time_pipeline, time_columns),
])
P.S. The combination of CountVectorizer and TfidfTransformer is equivalent to TfidfVectorizer. Also, you may just skip tf-idf weighting and use only CountVectorizer for MultinomialNB.
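Following that P.S., a more compact version of the same ColumnTransformer might look like this (each vectorizer is bound to a single column name, i.e. a string, so it receives a 1-D sequence of documents):
preprocessor = compose.ColumnTransformer([
    ('title', feature_extraction.text.TfidfVectorizer(), 'title'),
    ('description', feature_extraction.text.TfidfVectorizer(), 'description'),
    ('time', time_pipeline, time_columns),
])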
I am trying to build a custom K-fold RandomSearchCV from scratch. I understand how RandomSearchCV works and I'm trying to implement it from scratch on a randomly generated dataset. When I try to run the code I get the following error. I think it has something to do with how I've created groups in my x_train list. What is this error and how do I fix it?
ValueError Traceback (most recent call last)
<ipython-input-12-229cc493eeb9> in <module>
41
42 classifier = KNeighborsClassifier()
---> 43 RandomSearchCV(X_train,y_train, classifier, folds = 3)
44
45
<ipython-input-12-229cc493eeb9> in RandomSearchCV(x_train, y_train, classifier, folds)
26 #classifier (K-NN)
27 classifier.n_neighbors = parameter
---> 28 classifier.fit(x_train_group, y_train_group)
29
30 #Predicton
~\anaconda3\lib\site-packages\sklearn\neighbors\_base.py in fit(self, X, y)
1128 """
1129 if not isinstance(X, (KDTree, BallTree)):
-> 1130 X, y = check_X_y(X, y, "csr", multi_output=True)
1131
1132 if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
753 ensure_min_features=ensure_min_features,
754 warn_on_dtype=warn_on_dtype,
--> 755 estimator=estimator)
756 if multi_output:
757 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
572 if not allow_nd and array.ndim >= 3:
573 raise ValueError("Found array with dim %d. %s expected <= 2."
--> 574 % (array.ndim, estimator_name))
575
576 if force_all_finite:
ValueError: Found array with dim 3. Estimator expected <= 2.
Here's my implementation:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
x,y = make_classification(n_samples=10000, n_features=2, n_informative=2, n_redundant= 0, n_clusters_per_class=1, random_state=60)
X_train, X_test, y_train, y_test = train_test_split(x,y,stratify=y,random_state=42)
def RandomSearchCV(x_train,y_train, classifier, folds):
train_scores = []
test_scores = []
#1. Generating 10 unique values from given range
params = random.sample(range(0, 50), 10)
x_train_split = []
y_train_split = []
#dividing x_train into groups
for i in range(0, len(x_train), int(len(x_train)/folds)):
x_train_split.append(x_train[i:i+int(len(x_train)/folds)])
y_train_split.append(y_train[i:i+int(len(y_train)/folds)])
#3.for each hyperparameter that we generated in step 1 and dividing dataset into training and CV datasets:
for parameter in params:
trainscores_folds = []
testscores_folds = []
for group in range(len(x_train_split)):
x_train_group = x_train_split[0:group] + x_train_split[group+1:]
x_cv_group = [x_train_split[group]]
y_train_group = y_train_split[0:group] + y_train_split[group+1:]
y_cv_group = [y_train_split[group]]
#classifier (K-NN)
classifier.n_neighbors = parameter
classifier.fit(x_train_group, y_train_group)
#Predicton
y_pred = classifier.predict(x_cv_group)
testscores_folds.append(accuracy_score(y_cv_group, y_pred))
y_pred = classifier.predict(x_train_group)
trainscores_folds.append(accuracy_score(y_train_group, y_pred))
train_scores.append(np.mean(np.array(trainscores_folds)))
test_scores.append(np.mean(np.array(testscores_folds)))
return train_scores, test_scores
classifier = KNeighborsClassifier()
RandomSearchCV(X_train,y_train, classifier, folds = 3)
Thank you for your help.
x_train_group is a list of arrays, which makes it 3-dimensional (as mentioned in the error). This does not work with fitting the classifier, as it expects 2-dimensional input. Try calling np.concatenate(x_train_group) to concatenate the folds and make it a 2-dimensional input.
As the error states, you are passing an array of three dimensions while the classifier.fit() method expects a two-dimensional matrix. All you need to fix this issue is to change your train/cv/test groups to the following:
for group in range(len(x_train_split)):
x_train_group = np.concatenate(x_train_split[0:group] + x_train_split[group+1:])
x_cv_group = x_train_split[group]
y_train_group = np.concatenate(y_train_split[0:group] + y_train_split[group+1:])
y_cv_group = y_train_split[group]
....
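One more thing worth checking, unrelated to the dimensionality error: random.sample(range(0, 50), 10) can draw 0, and KNeighborsClassifier requires n_neighbors >= 1, so sampling from a range starting at 1 avoids a second failure:
params = random.sample(range(1, 51), 10)  # k must be at least 1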
I'm trying to classify mobiles according to their features, but when I apply the Gaussian NB code through sklearn, I'm unable to do so because of the following error:
The code:
clf = GaussianNB()
clf.fit(X_train,y_train)
GaussianNB()
accuracy = clf.score(X_test,y_test)
print(accuracy)
Error:
ValueError Traceback (most recent call last)
<ipython-input-18-e9515ccc2439> in <module>()
2 clf.fit(X_train,y_train)
3 GaussianNB()
----> 4 accuracy = clf.score(X_test,y_test)
5 print(accuracy)
/Users/kiran/anaconda/lib/python3.6/site-packages/sklearn/base.py in score(self, X, y, sample_weight)
347 """
348 from .metrics import accuracy_score
--> 349 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
350
351
/Users/kiran/anaconda/lib/python3.6/site-packages/sklearn/naive_bayes.py in predict(self, X)
63 Predicted target values for X
64 """
---> 65 jll = self._joint_log_likelihood(X)
66 return self.classes_[np.argmax(jll, axis=1)]
67
/Users/kiran/anaconda/lib/python3.6/site-packages/sklearn/naive_bayes.py in _joint_log_likelihood(self, X)
422 check_is_fitted(self, "classes_")
423
--> 424 X = check_array(X)
425 joint_log_likelihood = []
426 for i in range(np.size(self.classes_)):
/Users/kiran/anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: could not convert string to float:
My dataset has been scraped, so it contains string as well as float values. It would be helpful if someone could suggest how I can clean the data and avoid the error.
Try the following:
accuracy = clf.score(X_test.astype('float'),y_test.astype('float'))
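Note that astype('float') only works if every string is numeric, e.g. '123.4'. If some columns hold categorical strings, encode them first; a sketch with sklearn's OrdinalEncoder (the column list is hypothetical, and this assumes X_train/X_test are DataFrames):
from sklearn.preprocessing import OrdinalEncoder

cat_cols = ['brand']  # hypothetical: whichever columns hold strings
enc = OrdinalEncoder()
X_train[cat_cols] = enc.fit_transform(X_train[cat_cols])
X_test[cat_cols] = enc.transform(X_test[cat_cols])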
ValueError: could not convert string to float
I think this says it all. You need float as a consistent datatype in your dataset.
To convert a string in Python to float:
>>> a = "123.345"
>>> float(a)
123.345
>>> int(float(a))
123