I am trying to plot a confusion matrix for a human activity recognition model. Everything works fine except the confusion matrix: I am not able to print it.
cm = confusion_matrix(y_test.values,y_pred)
plot_confusion_matrix(cm, np.unique(y_pred))
result:
ValueError Traceback (most recent call last)
<ipython-input-29-7fc4511c54b7> in <module>
----> 1 cm = confusion_matrix(y_test.values,y_pred)
2 plot_confusion_matrix(cm, np.unique(y_pred)) # plotting confusion matrix
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight, normalize)
274
275 """
--> 276 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
277 if y_type not in ("binary", "multiclass"):
278 raise ValueError("%s is not supported" % y_type)
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in _check_targets(y_true, y_pred)
79 y_pred : array or indicator matrix
80 """
---> 81 check_consistent_length(y_true, y_pred)
82 type_true = type_of_target(y_true)
83 type_pred = type_of_target(y_pred)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
254 uniques = np.unique(lengths)
255 if len(uniques) > 1:
--> 256 raise ValueError("Found input variables with inconsistent numbers of"
257 " samples: %r" % [int(l) for l in lengths])
258
ValueError: Found input variables with inconsistent numbers of samples: [2947, 2937]
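y_test.values and y_pred aren't the same length: one contains 2947 entries and the other 2937. Both arrays must describe the same samples, so make sure y_pred was produced from exactly the rows of X_test that correspond to y_test.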
Related
I am trying to evaluate my xgboost model using accuracy_score(). And I have the code:
predictions = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test.to_numpy(), predictions)
The error message looks like this:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [126], in <cell line: 1>()
----> 1 accuracy = accuracy_score(y_test.to_numpy(), predictions)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:211, in accuracy_score(y_true, y_pred, normalize, sample_weight)
145 """Accuracy classification score.
146
147 In multilabel classification, this function computes subset accuracy:
(...)
207 0.5
208 """
210 # Compute accuracy for each possible representation
--> 211 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
212 check_consistent_length(y_true, y_pred, sample_weight)
213 if y_type.startswith("multilabel"):
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:93, in _check_targets(y_true, y_pred)
90 y_type = {"multiclass"}
92 if len(y_type) > 1:
---> 93 raise ValueError(
94 "Classification metrics can't handle a mix of {0} and {1} targets".format(
95 type_true, type_pred
96 )
97 )
99 # We can't have more than one value on y_type => The set is no more needed
100 y_type = y_type.pop()
ValueError: Classification metrics can't handle a mix of unknown and binary targets
The parameters look like this
y_test.to_numpy()
predictions
They are both 1-d arrays and I cannot find where the problem is.
How should I calculate the accuracy score?
Thanks!
I am trying to fit a Logistic Regression model and run some tests, but I keep getting this error. I'm not really sure what I have done differently from everyone else.
from sklearn import preprocessing
X = df.iloc[:,:len(df.columns)-1]
y = df.iloc[:,len(df.columns)-1]ere
This is how I am separating my columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
TTS
logReg = LogisticRegression(n_jobs=-1)
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_train)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:" , mae)
ValueError Traceback (most recent call last)
Cell In [112], line 1
----> 1 mae = mean_absolute_error(y_test, y_pred)
2 print("MAE:" , mae)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:196, in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
141 def mean_absolute_error(
142 y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
143 ):
144 """Mean absolute error regression loss.
145
146 Read more in the :ref:`User Guide <mean_absolute_error>`.
(...)
194 0.85...
195 """
--> 196 y_type, y_true, y_pred, multioutput = _check_reg_targets(
197 y_true, y_pred, multioutput
198 )
199 check_consistent_length(y_true, y_pred, sample_weight)
200 output_errors = np.average(np.abs(y_pred - y_true), weights=sample_weight, axis=0)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:100, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
66 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
67 """Check that y_true and y_pred belong to the same regression task.
68
69 Parameters
(...)
98 correct keyword.
99 """
--> 100 check_consistent_length(y_true, y_pred)
101 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
102 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:387, in check_consistent_length(*arrays)
385 uniques = np.unique(lengths)
386 if len(uniques) > 1:
--> 387 raise ValueError(
388 "Found input variables with inconsistent numbers of samples: %r"
389 % [int(l) for l in lengths]
390 )
ValueError: Found input variables with inconsistent numbers of samples: [25404, 101612]
I thought it was the way I split the columns, but that doesn't seem to be the issue.
It works when the test size is 50/50, but no other test size works.
You are comparing the predicted labels for the train set with the labels for the test set, which are of different sizes, hence the error.
Replace
y_pred = logReg.predict(X_train)
with
y_pred = logReg.predict(X_test)
I have performed a linear regression in Python using scikit-learn. The problem doesn't seem to concern the fit of the variables but the mean squared error and R-squared. I know that the length of "y" doesn't match the length of "pred".
from sklearn.metrics import r2_score, mean_squared_error
mse = mean_squared_error(y, pred)
r2 = r2_score(y,pred)#Best fit lineplt.scatter(x, y)
plt.plot(x,pred, color = 'Black', marker = 'o')
#Results
print("Mean Squared Error : ", mse)
print("R-Squared :" , r2)
print("Y-intercept :" , regressor.intercept_)
print("Slope :" , regressor.coef_)
the error is:
ValueError Traceback (most recent call last)
<ipython-input-126-729d70b35dd9> in <module>
1 from sklearn.metrics import r2_score, mean_squared_error
----> 2 mse = mean_squared_error(y, pred)
3 r2 = r2_score(y,pred)#Best fit lineplt.scatter(x, y)
4 plt.plot(x,pred, color = 'Black', marker = 'o')
5
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
333 0.825...
334 """
--> 335 y_type, y_true, y_pred, multioutput = _check_reg_targets(
336 y_true, y_pred, multioutput)
337 check_consistent_length(y_true, y_pred, sample_weight)
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
86 the dtype argument passed to check_array.
87 """
---> 88 check_consistent_length(y_true, y_pred)
89 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
90 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
260 uniques = np.unique(lengths)
261 if len(uniques) > 1:
--> 262 raise ValueError("Found input variables with inconsistent numbers of"
263 " samples: %r" % [int(l) for l in lengths])
264
ValueError: Found input variables with inconsistent numbers of samples: [1461, 366]
I am currently working on a disease prediction machine learning model. I used a Random Forest Classifier in my model, and now I am trying to get the probabilities of the predicted values, but the code gives me an error. In this program, I want to get the probability of "each" prediction specifically. For example, I enter the symptoms to predict the disease and the predicted disease is "Allergy". Then I want my program to show the probability of the predicted disease "Allergy" as a percentage, but the program gives errors like "classification metrics can't handle a mix of multiclass and unknown targets". I guess I need to use a confusion matrix to show the probabilities, but it also gives the same error about the multiclass problem. To put it more clearly, I just want to show the probability of each predicted value as a "percent". For instance, the probability of the Allergy disease is 90%, etc. How can I do that, and how can I solve my problem?
Here is the relevant code:
p=pickle_model.predict([[22,8,50,9,20,47,50,38,0,0,0]])
actual=np.array((22,8,50,9,20,47,50,38,0,0,0))
pred=pickle_model.predict_proba([[p,0,0,0,0,0,0,0,0,0,0]])
In the code block below:
from sklearn.metrics import confusion_matrix
import sklearn.metrics as mt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
print(accuracy_score(actual, p, normalize=True, sample_weight=None))
I get the error:
ValueError Traceback (most recent call last)
<ipython-input-69-e8980bf68410> in <module>
3 from sklearn.metrics import accuracy_score
4 from sklearn.metrics import precision_score
----> 5 print(accuracy_score(actual, p, normalize=True, sample_weight=None))
6 #precision, recall, fscore, support =
7 #score(y_test, p)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in accuracy_score(y_true, y_pred, normalize, sample_weight)
200
201 # Compute accuracy for each possible representation
--> 202 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
203 check_consistent_length(y_true, y_pred, sample_weight)
204 if y_type.startswith('multilabel'):
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in _check_targets(y_true, y_pred)
81 y_pred : array or indicator matrix
82 """
---> 83 check_consistent_length(y_true, y_pred)
84 type_true = type_of_target(y_true)
85 type_pred = type_of_target(y_pred)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
260 uniques = np.unique(lengths)
261 if len(uniques) > 1:
--> 262 raise ValueError("Found input variables with inconsistent numbers of"
263 " samples: %r" % [int(l) for l in lengths])
264
ValueError: Found input variables with inconsistent numbers of samples: [11, 1]
Also, the other error that I get in this code block is:
ValueError Traceback (most recent call last)
<ipython-input-65-774dbd6b46f7> in <module>
8
9 # specificity
---> 10 tn, fp, fn, tp = mt.confusion_matrix(actual, predict).ravel()
11 specificity = tn / (tn+fp)
12 print(specificity)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight, normalize)
294
295 """
--> 296 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
297 if y_type not in ("binary", "multiclass"):
298 raise ValueError("%s is not supported" % y_type)
~\anaconda3\lib\site-packages\sklearn\metrics\_classification.py in _check_targets(y_true, y_pred)
90
91 if len(y_type) > 1:
---> 92 raise ValueError("Classification metrics can't handle a mix of {0} "
93 "and {1} targets".format(type_true, type_pred))
94
ValueError: Classification metrics can't handle a mix of multiclass and unknown targets
The function you are looking for is predict_proba().
The input to this function has to have exactly the same form as the input your model was trained on, i.e. the same input you pass to predict().
If this part of your code worked for you (since you did not report any error for it):
p=pickle_model.predict([[22,8,50,9,20,47,50,38,0,0,0]])
This should work also:
p=pickle_model.predict_proba([[22,8,50,9,20,47,50,38,0,0,0]])
Note: Pay attention to the brackets "[]"; you may need to remove some of them.
I followed this tutorial to implement sentiment analysis:
https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/
but I'm not a pro, so I don't understand every step in detail.
Now, I wanted to apply it to new data, using this tutorial:
https://stackabuse.com/scikit-learn-save-and-restore-models/
but at the point
score = pickle_model.score(Xtest, Ytest)
I get the error "ValueError: could not convert string to float: 'positive'" ('positive' being a label from the sentiment analysis done earlier). What surprises me is that the error happens even when using X_train and y_train (from the first tutorial), but
text_classifier.fit(X_train, y_train)
works just fine without any errors. So I am assuming that the fit() method does something that the score() method does not, and this creates the problem. However, I have no idea how to fix it.
Here is the full error message:
ValueError Traceback (most recent call last)
<ipython-input-210-070f6faef44c> in <module>
34 print(len(X_train))
35 print(len(y_train))
---> 36 score = pickle_model.score(X_train, y_train)
37 print("Test score: {0:.2f} %".format(100 * score))
38
~\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
408 y_pred = self.predict(X)
409 # XXX: Remove the check in 0.23
--> 410 y_type, _, _, _ = _check_reg_targets(y, y_pred, None)
411 if y_type == 'continuous-multioutput':
412 warnings.warn("The default value of multioutput (not exposed in "
~\Anaconda3\lib\site-packages\sklearn\metrics\regression.py in _check_reg_targets(y_true, y_pred, multioutput)
76 """
77 check_consistent_length(y_true, y_pred)
---> 78 y_true = check_array(y_true, ensure_2d=False)
79 y_pred = check_array(y_pred, ensure_2d=False)
80
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
494 try:
495 warnings.simplefilter('error', ComplexWarning)
--> 496 array = np.asarray(array, dtype=dtype, order=order)
497 except ComplexWarning:
498 raise ValueError("Complex data not supported\n"
~\Anaconda3\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: could not convert string to float: 'positive'
and here's the piece of code the error occurs in:
vectorizer = TfidfVectorizer (max_features=2500, min_df=1, max_df=1, stop_words=stopwords.words('english'))
chat_data = vectorizer.fit_transform(chat_data).toarray()
X_train, X_test, y_train, y_test = train_test_split(chat_data, chat_labels, test_size=0.2, random_state=0)
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)
predictions = text_classifier.predict(X_test)
X_train = np.array(X_train).reshape((-1,1))
y_train = np.array(y_train).reshape((-1,1))
print(len(X_train))
print(len(y_train))
score = pickle_model.score(X_train, y_train)
print("Test score: {0:.2f} %".format(100 * score))