python pipeline does not execute imputer - python
I am analysing the gapminder dataset [1] using a scikit-learn pipeline in Python, but for some reason the imputer does not replace the NaN values. According to the documentation ("For missing values encoded as np.nan, use the string value “NaN”."), I should do it as shown below, but the code crashes with "ValueError: Input contains NaN" at the line "gm_cv.fit(X_train, y_train)". However, gm_cv was created from the pipeline, and the pipeline contains the imputation step, which should remove the NaNs. Why does this not work?
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Reproduction script: fit an ElasticNet regression on the gapminder CSV through
# an imputation -> scaling -> regression pipeline, tuning l1_ratio with a
# cross-validated grid search.
fn = 'gapminder.csv'
df = pd.read_csv(fn, delimiter=',')
# Replace empty / single-space strings with NaN so that they count as missing values.
df.replace('', np.nan, inplace=True)
df.replace(' ', np.nan, inplace=True)
targetVariable = 'lifeexpectancy'
# Features: every column except the target and 'country', as a NumPy array.
X = df.drop([targetVariable, 'country'], axis=1).values
# Target column, taken from the frame as-is; nothing below cleans it.
# NOTE(review): a Pipeline applies its transformer steps to X only and hands y
# to the final estimator unchanged, so any NaN in this column still reaches
# ElasticNet.fit. Presumably this is the source of the reported
# "Input contains NaN" error -- confirm with np.isnan(y).any().
y = df[targetVariable]
# Pipeline steps: mean-impute NaNs in the features (axis=0), standardize,
# then fit the ElasticNet regressor.
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
('scaler', StandardScaler()),
('elasticnet', ElasticNet())]
pipeline = Pipeline(steps)
# Hyperparameter space: 30 evenly spaced l1_ratio values in [0, 1]. The
# 'elasticnet__' prefix addresses the pipeline step named 'elasticnet'.
parameters = {'elasticnet__l1_ratio': np.linspace(0,1,30)}
# Create train and test sets (40% of the rows held out, fixed random seed).
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=98)
# Grid search over `parameters` on the whole pipeline with 3-fold cross-validation.
gm_cv = GridSearchCV(pipeline, parameters, cv=3)
# Fit to the training set; this is the call reported to raise the ValueError.
gm_cv.fit(X_train, y_train)
# Results: score the refitted best pipeline on the held-out test set
# (reported below as R squared).
r2 = gm_cv.score(X_test, y_test)
# NOTE(review): the label says "Alpha", but the only hyperparameter tuned above
# is l1_ratio; best_params_ will contain 'elasticnet__l1_ratio'.
print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
stack trace of full error:
python.exe pipline_and_classification_II.py
Traceback (most recent call last):
File "pipline_and_classification_II.py", line 55, in <module>
gm_cv.fit(X_train, y_train)
File "lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "lib\site-packages\sklearn\pipeline.py", line 250, in fit
self._final_estimator.fit(Xt, y, **fit_params)
File "lib\site-packages\sklearn\linear_model\coordinate_descent.py", line 709, in fit
ensure_2d=False)
File "lib\site-packages\sklearn\utils\validation.py", line 453, in check_array
_assert_all_finite(array)
File "lib\site-packages\sklearn\utils\validation.py", line 44, in _assert_all_finite
" or a value too large for %r." % X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Process finished with exit code 1
Update:
Debugging it slowly (without the pipeline) shows that the Imputer does not like 1d arrays (like y in my code above). When doing the nan-removing manually before with the code below it works.
# Manual workaround: replace every NaN in the target with the mean of the
# non-missing target values (np.nanmean ignores NaNs).
y = np.array(y)
missing = np.isnan(y)  # boolean mask marking the NaN entries
y[missing] = np.nanmean(y)
But this defeats the purpose of the pipeline. Any ideas how to get this running without manual tinkering?
[1] http://makemeanalyst.com/download-and-learn-about-gapminder-dataset/
Related
How can I use a dataframe of multi-value in each cell as an input to machine learning for classification
I build a data frame with multivalued in each cell as picture below and I want to use logistic regression for classification>>>> my code is : fds1 = pd.DataFrame(featuresdata) fds1.fillna('', inplace=True) from sklearn.model_selection import train_test_split, cross_val_score X_train, X_test, y_train, y_test = train_test_split(fds1, y, test_size=0.30, random_state=100) from sklearn.linear_model import LogisticRegression classifier = LogisticRegression() classifier.fit(X_train, y_train) score = classifier.score(X_test, y_test) print("Accuracy for logistic regression:", score) but there was an error with this code: File "C:\Users\hp\PycharmProjects\pythonProject\FE2.py", line 317, in CLS2butclick classifier.fit(X_train, y_train) File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1138, in fit X, y = self._validate_data( File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\base.py", line 596, in _validate_data X, y = check_X_y(X, y, **check_params) File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y X = check_array( File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array array = np.asarray(array, order=order, dtype=dtype) File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\generic.py", line 2064, in __array__ return np.asarray(self._values, dtype=dtype) ValueError: setting an array element with a sequence. How to fix that?
You need to label-encode the data before training, converting string values into a numeric form that the model can understand. Refer to https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
I am trying to predict using SVM but I receive the error AttributeError: 'numpy.ndarray' object has no attribute 'lower' when executing line text_clf.fit(X_train,y_train) of my code. How to fix this and get the probability that my prediction is correct using SVM? I am predicting the first column (gold) of my input file based on the values of the remaining columns. My input file dataExtended.txtis under the form: gold,T-x-T,T-x-N,T-x-U,T-x-NT,T-x-UT,T-x-UN,T-x-UNT,N-x-T,N-x-N,N-x-U,N-x-NT,N-x-UT,N-x-UN,N-x-UNT,U-x-T,U-x-N,U-x-U,U-x-NT,U-x-UT,U-x-UN,U-x-UNT,NT-x-T,NT-x-N,NT-x-U,NT-x-NT,NT-x-UT,NT-x-UN,NT-x-UNT,UT-x-T,UT-x-N,UT-x-U,UT-x-NT,UT-x-UT,UT-x-UN,UT-x-UNT,UN-x-T,UN-x-N,UN-x-U,UN-x-NT,UN-x-UT,UN-x-UN,UN-x-UNT,UNT-x-T,UNT-x-N,UNT-x-U,UNT-x-NT,UNT-x-UT,UNT-x-UN,UNT-x-UNT,T-T-x,T-N-x,T-U-x,T-NT-x,T-UT-x,T-UN-x,T-UNT-x,N-T-x,N-N-x,N-U-x,N-NT-x,N-UT-x,N-UN-x,N-UNT-x,U-T-x,U-N-x,U-U-x,U-NT-x,U-UT-x,U-UN-x,U-UNT-x,NT-T-x,NT-N-x,NT-U-x,NT-NT-x,NT-UT-x,NT-UN-x,NT-UNT-x,UT-T-x,UT-N-x,UT-U-x,UT-NT-x,UT-UT-x,UT-UN-x,UT-UNT-x,UN-T-x,UN-N-x,UN-U-x,UN-NT-x,UN-UT-x,UN-UN-x,UN-UNT-x,UNT-T-x,UNT-N-x,UNT-U-x,UNT-NT-x,UNT-UT-x,UNT-UN-x,UNT-UNT-x,x-T-T,x-T-N,x-T-U,x-T-NT,x-T-UT,x-T-UN,x-T-UNT,x-N-T,x-N-N,x-N-U,x-N-NT,x-N-UT,x-N-UN,x-N-UNT,x-U-T,x-U-N,x-U-U,x-U-NT,x-U-UT,x-U-UN,x-U-UNT,x-NT-T,x-NT-N,x-NT-U,x-NT-NT,x-NT-UT,x-NT-UN,x-NT-UNT,x-UT-T,x-UT-N,x-UT-U,x-UT-NT,x-UT-UT,x-UT-UN,x-UT-UNT,x-UN-T,x-UN-N,x-UN-U,x-UN-NT,x-UN-UT,x-UN-UN,x-UN-UNT,x-UNT-T,x-UNT-N,x-UNT-U,x-UNT-NT,x-UNT-UT,x-UNT-UN,x-UNT-UNT,callersAtLeast1T,CalleesAtLeast1T,callersAllT,calleesAllT,CallersAtLeast1N,CalleesAtLeast1N,CallersAllN,CalleesAllN,childrenAtLeast1T,parentsAtLeast1T,childrenAtLeast1N,parentsAtLeast1N,childrenAllT,parentsAllT,childrenAllN,ParentsAllN,ParametersatLeast1T,FieldMethodsAtLeast1T,ReturnTypeAtLeast1T,ParametersAtLeast1N,FieldMethodsAtLeast1N,ReturnTypeN,ParametersAllT,FieldMethodsAllT,ParametersAllN,FieldMethodsAllN,ClassGoldN,ClassGoldT,Inner,Leaf,Root,Isolated,EmptyCallers,EmptyCallee
s,EmptyCallersCallers,EmptyCalleesCallees,Program,Requirement,MethodID T,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,1,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,2,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,3,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,4,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,5,1 
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,6,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,7,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,8,1 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,1,3 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,2,3 
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,3,3 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,4,3 N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,5,3 Here is my full reproducible code: # Make Predictions with Naive Bayes On The Iris Dataset from sklearn.cross_validation import train_test_split from sklearn import metrics import pandas as pd import numpy as np import seaborn as sns; sns.set() from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score from sklearn.metrics import classification_report import seaborn as sns from sklearn import svm from sklearn.svm import LinearSVC from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import Pipeline data = pd.read_csv( 'dataExtended.txt', sep= ',') row_count, column_count = data.shape # Printing the dataswet shape print ("Dataset Length: ", len(data)) print ("Dataset Shape: ", data.shape) print("Number of columns ", column_count) # Printing the dataset obseravtions print ("Dataset: ",data.head()) 
data['gold'] = data['gold'].astype('category').cat.codes data['Program'] = data['Program'].astype('category').cat.codes # Building Phase Separating the target variable X = data.values[:, 1:column_count] Y = data.values[:, 0] # Splitting the dataset into train and test X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100) #Create a svm Classifier svclassifier = svm.LinearSVC() print('Before fitting') svclassifier.fit(X_train, y_train) predicted = svclassifier.predict(X_test) text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())]) text_clf.fit(X_train,y_train) Traceback leading to error: Traceback (most recent call last): File "<ipython-input-9-8e85a0a9f81c>", line 1, in <module> runfile('C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python/Paper4SVM.py', wdir='C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python') File "C:\Users\mouna\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 668, in runfile execfile(filename, namespace) File "C:\Users\mouna\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile exec(compile(f.read(), filename, 'exec'), namespace) File "C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python/Paper4SVM.py", line 53, in <module> text_clf.fit(X_train,y_train) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit Xt, fit_params = self._fit(X, y, **fit_params) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit **fit_params_steps[name]) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__ return self.func(*args, **kwargs) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one res = transformer.fit_transform(X, y, **fit_params) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1381, in fit_transform X = 
super(TfidfVectorizer, self).fit_transform(raw_documents) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 869, in fit_transform self.fixed_vocabulary_) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 792, in _count_vocab for feature in analyze(doc): File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 266, in <lambda> tokenize(preprocess(self.decode(doc))), stop_words) File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 232, in <lambda> return lambda x: strip_accents(x.lower())
You cannot use TF-IDF-related methods for numeric data; the method is exclusively for use with text data, hence it uses methods such as .lower(), which are by default applicable to strings, hence the error. This is already apparent from the documentation: fit(self, raw_documents, y=None) Learn vocabulary and idf from training set. Parameters raw_documents: iterable An iterable which yields either str, unicode or file objects. I am afraid that your rationale, as explained in the comments ("I'm just trying to get the probability that each prediction is correct and TF-IDF seems to be the only way to do so when using SVM"), is extremely weak. For starters, there is no such thing as "the probability that each prediction is correct" - I take it that you mean probabilistic predictions, in contrast to hard class predictions (see Predict classes or class probabilities?). To get to the point of your actual requirement: in contrast to LinearSVC, which you are using here, SVC does indeed provide a predict_proba method (available when the model is constructed with probability=True), which should do the job (see the docs and the instructions therein). Notice that LinearSVC is not actually an SVM - see answer in Under what parameters are SVC and LinearSVC in scikit-learn equivalent? for details. In short, forget about TF-IDF and switch to SVC instead of LinearSVC.
Incompatible dimension for X and Y matrices
I was wondering what i have wrong here i get the error Traceback (most recent call last): File "main.py", line 37, in <module> y_pred = knn.predict(X_test) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/neighbors/classification.py", line149, in predict neigh_dist, neigh_ind = self.kneighbors(X) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/neighbors/base.py", line 434, in kneighbors **kwds)) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1448, in pairwise_distances_chunked n_jobs=n_jobs, **kwds) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1588, in pairwise_distances return _parallel_pairwise(X, Y, func, n_jobs, **kwds) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1206, in _parallel_pairwise return func(X, Y, **kwds) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 232, ineuclidean_distances X, Y = check_pairwise_arrays(X, Y) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 125, incheck_pairwise_arrays X.shape[1], Y.shape[1])) ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 38 while Y.shape[1] == 43 I'm new to ai and cant find anything on the internet that really solves this problem, any comment appreciated. 
This is my code from sklearn.preprocessing import OneHotEncoder from sklearn import metrics from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split import pandas as pd fileName = "breast-cancer-fixed.csv"; df = pd.read_csv(fileName) X = df[df.columns[:-1]] y = df[df.columns[-1]] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1) X_train = OneHotEncoder().fit_transform(X_train) X_test = OneHotEncoder().fit_transform(X_test) knn = KNeighborsClassifier(n_neighbors=3) knn.fit(X_train, y_train) y_pred = knn.predict(X_test) print("kNN model accuracy:", metrics.accuracy_score(y_test, y_pred)) My csv is massive and i cant upload it here so i put a small snippet in age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class 40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events 50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events 50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events 40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events 40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events 50-59,premeno,25-29,3-5,no,2,right,left_up,yes,no-recurrence-events 50-59,ge40,40-44,0-2,no,3,left,left_up,no,no-recurrence-events 40-49,premeno,10-14,0-2,no,2,left,left_up,no,no-recurrence-events 40-49,premeno,0-4,0-2,no,2,right,right_low,no,no-recurrence-events 40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,no-recurrence-events 50-59,premeno,25-29,0-2,no,2,left,left_low,no,no-recurrence-events 60-69,ge40,15-19,0-2,no,2,right,left_up,no,no-recurrence-events Also if i get rid of the last two line of code ( the prediction code ) it runs fine with no errors
Try adding this line anywhere above the transforms: enc = OneHotEncoder(handle_unknown='ignore'). Then change the transform lines to the following, so that the encoder is fitted once on the training data and the same fitted encoder transforms both sets: enc = enc.fit(X_train) X_train = enc.transform(X_train) X_test = enc.transform(X_test)
I get this error ```Traceback (most recent call last): File "main.py", line 25, in <module> X_test = OneHotEncoder().transform(X_test) File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 726, in transform check_is_fitted(self, 'categories_') File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 914, in check_is_fitted raise NotFittedError(msg % {'name': type(estimator).__name__}) sklearn.exceptions.NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.```
GradientBoostingClassifier implementation
I want to implement Gradient Boosting Classifier to my Titanic ML solution based on sklearn library. I use VS Code on Ubuntu 18.04. I've tried: # Splitting the Training Data from sklearn.model_selection import train_test_split predictors = train.drop(['Survived', 'PassengerId'], axis=1) target = train["Survived"] x_train, x_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.22, random_state = 0) # Gradient Boosting Classifier from sklearn.ensemble import GradientBoostingClassifier gbk = GradientBoostingClassifier() gbk.fit(x_train, y_train) ..which returns: Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/home/sj/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/gradient_boosting.py", line 1395, in fit X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE) File "/home/sj/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 756, in check_X_y estimator=estimator) File "/home/sj/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 527, in check_array array = np.asarray(array, dtype=dtype, order=order) File "/home/sj/anaconda3/lib/python3.7/site-packages/numpy/core/numeric.py", line 501, in asarray return array(a, dtype, copy=False, order=order) ValueError: could not convert string to float: 'Baby' Help would be appreciated. I'm quite new to DS.
I think you may have non-numerical values in your training data. Your classifier can only take numerical inputs. That's why it tries to convert a string, here 'Baby', to a float. As this operation is not supported, it fails. Maybe look again at your data.
Editing entire row in .csv using Pandas and feeding to KNeighborsClassifier
I am new to applied machine learning and there is this dataset which includes a column of percentage of cocoa present in a chocolate. But when I feed that column to fit() function of the KNeighborsClassifer it throws the following error; Traceback (most recent call last): File "/home/himanshu/ML Tut-2/ML_tut2.py", line 13, in <module> knn.fit(X_train, y_train) File "/usr/local/lib/python2.7/dist-packages/sklearn/neighbors/base.py", line 765, in fit X, y = check_X_y(X, y, "csr", multi_output=True) File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 542, in check_X_y ensure_min_features, warn_on_dtype, estimator) File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 402, in check_array array = np.array(array, dtype=dtype, order=order, copy=copy) ValueError: invalid literal for float(): 72% My code is this; from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier import pandas as pd choco = pd.read_csv('flavors_of_cacao.csv') X = choco['Cocoa'] y = choco['Name'] X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0) knn = KNeighborsClassifier(n_neighbors = 5) knn.fit(X_train, y_train) It is clear that the fit() function needs a float number in the cocoa column but it is getting '%' symbol along with the number which cannot be converted to float without manipulations. Please help me to fix this problem. 
EDIT: I have edited my CSV and removed the '%' signs from it, but now I am getting the following error; Traceback (most recent call last): File "/home/himanshu/ML Tut-2/ML_tut2.py", line 14, in <module> knn.fit(X_train, y_train) File "/usr/local/lib/python2.7/dist-packages/sklearn/neighbors/base.py", line 765, in fit X, y = check_X_y(X, y, "csr", multi_output=True) File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 552, in check_X_y check_consistent_length(X, y) File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 173, in check_consistent_length " samples: %r" % [int(l) for l in lengths]) ValueError: Found input variables with inconsistent numbers of samples: [1346, 449] The new code is; from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier import pandas as pd import numpy as np df = pd.read_csv('chocos.csv') X = df[['Cocoa']] y = df['Name'] X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0) knn = KNeighborsClassifier(n_neighbors = 5) knn.fit(X_train, y_train) For reference, the dataset is this.
Just use the values from that column without the percent symbol: X = [[float(val.replace('%',''))] for val in choco['Cocoa']]